""" 에브리타임 VOC 데이터를 ChromaDB에 임베딩해서 저장. + 집계/카테고리 레이어용 사전 요약 JSON 생성. 실행: python build_index.py 출력: - ./chroma_db/ (ChromaDB persistent) - ./aggregates.json (Layer 1 - 집계) - ./category_summary.json (Layer 2 - 카테고리) """ from __future__ import annotations import json import re from pathlib import Path import chromadb import pandas as pd from chromadb.utils import embedding_functions ROOT = Path(__file__).resolve().parent DATA_DIR = ROOT.parent # ~/project/everytime-analysis CHROMA_DIR = ROOT / "chroma_db" EMBED_MODEL = "jhgan/ko-sroberta-multitask" COLLECTION = "voc" def load_posts() -> pd.DataFrame: """concerns_2year + complaints_all 병합, id 기준 중복 제거.""" concerns = pd.read_csv(DATA_DIR / "concerns_2year.csv") complaints = pd.read_csv(DATA_DIR / "complaints_all.csv") concerns["source"] = "concern" complaints["source"] = "complaint" complaints["concern_score"] = complaints["score"] complaints["primary_cat"] = complaints["categories"].astype(str).str.split(";").str[0] cols = ["id", "title", "text", "created_at", "month", "posvote", "comment_count", "concern_score", "primary_cat", "source"] df = pd.concat([concerns[cols], complaints[cols]], ignore_index=True) df = df.drop_duplicates(subset=["id"], keep="first") df["title"] = df["title"].fillna("").astype(str) df["text"] = df["text"].fillna("").astype(str) df["primary_cat"] = df["primary_cat"].fillna("기타").astype(str) df = df[df["title"].str.len() + df["text"].str.len() > 3].reset_index(drop=True) return df def build_chroma(df: pd.DataFrame) -> None: CHROMA_DIR.mkdir(exist_ok=True) client = chromadb.PersistentClient(path=str(CHROMA_DIR)) try: client.delete_collection(COLLECTION) except Exception: pass embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction( model_name=EMBED_MODEL ) collection = client.create_collection( name=COLLECTION, embedding_function=embed_fn, metadata={"hnsw:space": "cosine"}, ) docs = (df["title"] + "\n" + df["text"]).str.slice(0, 1500).tolist() ids = df["id"].astype(str).tolist() metadatas = df[["created_at", "month", "posvote", "comment_count", "concern_score", "primary_cat", "source"]].to_dict("records") # ChromaDB는 metadata 값으로 str/int/float/bool만 허용. NaN 정리. for m in metadatas: for k, v in list(m.items()): if pd.isna(v): m[k] = 0 if k in ("posvote", "comment_count", "concern_score") else "" batch = 1000 for i in range(0, len(docs), batch): collection.add( ids=ids[i:i + batch], documents=docs[i:i + batch], metadatas=metadatas[i:i + batch], ) print(f" embedded {min(i + batch, len(docs))}/{len(docs)}") print(f"[chroma] collection '{COLLECTION}' built: {collection.count()} docs") def build_aggregates(df: pd.DataFrame) -> None: """Layer 1 - 분석 요약 텍스트 + 기본 집계 수치.""" summary_path = DATA_DIR / "analysis_summary.txt" summary = summary_path.read_text(encoding="utf-8") if summary_path.exists() else "" monthly_counts = (df.groupby("month").size() .sort_index().to_dict()) cat_counts = (df["primary_cat"].value_counts().head(15).to_dict()) out = { "analysis_summary_txt": summary, "total_posts_indexed": int(len(df)), "monthly_counts": {k: int(v) for k, v in monthly_counts.items()}, "category_counts_top15": {k: int(v) for k, v in cat_counts.items()}, "period": { "start": str(df["created_at"].min()), "end": str(df["created_at"].max()), }, } (ROOT / "aggregates.json").write_text( json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8" ) print(f"[aggregates] saved. categories={len(cat_counts)}, months={len(monthly_counts)}") def build_category_summary(df: pd.DataFrame, top_n: int = 5) -> None: """Layer 2 - primary_cat별 상위 관심 게시글 top_n.""" df = df.copy() df["engage"] = df["posvote"].fillna(0) + df["comment_count"].fillna(0) out: dict = {} for cat, grp in df.groupby("primary_cat"): grp = grp.sort_values(["concern_score", "engage"], ascending=False).head(top_n) out[cat] = { "total": int(len(df[df["primary_cat"] == cat])), "top_posts": [ { "id": str(r["id"]), "title": r["title"][:120], "text_preview": re.sub(r"\s+", " ", str(r["text"]))[:200], "month": r["month"], "posvote": int(r["posvote"] or 0), "comment_count": int(r["comment_count"] or 0), } for _, r in grp.iterrows() ], } (ROOT / "category_summary.json").write_text( json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8" ) print(f"[category] saved. categories={len(out)}") def main() -> None: print("[1/4] loading CSVs...") df = load_posts() print(f" merged posts: {len(df)}") print("[2/4] building aggregates...") build_aggregates(df) print("[3/4] building category summary...") build_category_summary(df) print("[4/4] building ChromaDB (this is the slow part, grab a coffee)...") build_chroma(df) print("\ndone. next: streamlit run app.py") if __name__ == "__main__": main()