"""Central configuration: filesystem paths, model names, and tunables.

Values that vary per deployment (API keys, log level, CORS origins, raw-data
location) are read from environment variables, optionally pre-populated from
a ``.env`` file sitting next to this module.

Importing this module has one side effect: the data, clean-data, index and
logs directories are created if they do not exist yet.
"""
import os
from pathlib import Path

# ── Load .env if python-dotenv available ──────────────────────────────────────
try:
    from dotenv import load_dotenv

    # override=False: variables already present in the environment win over .env
    load_dotenv(Path(__file__).parent / ".env", override=False)
except ImportError:
    pass  # dotenv optional — env vars can be set by OS / Docker

# ── Directories ────────────────────────────────────────────────────────────────
# Absolute path derived from this file's location, so it works on any machine.
PROJECT_ROOT = Path(__file__).parent.resolve()
DATA_DIR = PROJECT_ROOT / "data"
INDEX_DIR = PROJECT_ROOT / "index"
LOGS_DIR = PROJECT_ROOT / "logs"

# ── Source data (KCC parquet files) ───────────────────────────────────────────
# `or` (rather than a .get() default) so that a variable which is set but empty
# falls back to the default instead of becoming Path("") == current directory.
KCC_DATA_DIR = Path(
    os.environ.get("KCC_DATA_DIR") or PROJECT_ROOT / "data" / "kcc_raw"
)

# ── Step 1 output ──────────────────────────────────────────────────────────────
CLEAN_DATA_DIR = DATA_DIR / "clean_years"
CLEAN_DATA_FILE = DATA_DIR / "kcc_clean.parquet"

# ── Step 2 output ──────────────────────────────────────────────────────────────
FAISS_INDEX_FILE = INDEX_DIR / "faiss.index"
METADATA_FILE = INDEX_DIR / "metadata.parquet"
BM25_INDEX_FILE = INDEX_DIR / "bm25.db"

# ── Embedding model ────────────────────────────────────────────────────────────
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# NOTE(review): presumably must equal the output dimension of EMBEDDING_MODEL —
# confirm whenever the model is changed.
EMBEDDING_DIM = 384
EMBED_BATCH_SIZE = 512

# ── Retrieval settings ─────────────────────────────────────────────────────────
TOP_K = 5

# ── Cross-encoder reranker ─────────────────────────────────────────────────────
RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"
RERANKER_TOP_N = 20
MIN_ANSWER_CHARS = 100

# ── LLM cascade — keys from environment ───────────────────────────────────────
# Keys default to "" when unset; consumers must treat an empty key as "missing".
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL_PRIMARY = "meta-llama/llama-4-scout-17b-16e-instruct"
GROQ_MODEL_FALLBACK = "llama-3.3-70b-versatile"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemma-3-27b-it"

# ── Mandi price API ───────────────────────────────────────────────────────────
DATA_GOV_API_KEY = os.environ.get("DATA_GOV_API_KEY", "")

# ── Environment ───────────────────────────────────────────────────────────────
ENVIRONMENT = os.environ.get("ENVIRONMENT", "development")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# Comma-separated list; surrounding whitespace is trimmed and empty entries
# (e.g. from a trailing comma) are dropped.
ALLOWED_ORIGINS = [
    o.strip()
    for o in os.environ.get(
        "ALLOWED_ORIGINS", "http://localhost:8501,http://localhost:8000"
    ).split(",")
    if o.strip()
]
# NOTE(review): the fallback password is committed in source. Set
# B2B_DEMO_PASSWORD explicitly in every deployed environment.
B2B_DEMO_PASSWORD = os.environ.get("B2B_DEMO_PASSWORD", "DeHaat2025")

# ── Data filters ───────────────────────────────────────────────────────────────
MIN_QUERY_LEN = 10
MIN_ANSWER_LEN = 20
START_YEAR = 2006
CHUNK_SIZE = 100000
# NOTE(review): presumably None disables per-year sampling — confirm in Step 1.
SAMPLE_PER_YEAR = None

# ── Column names ───────────────────────────────────────────────────────────────
QUERY_COL = "QueryText"
ANSWER_COL = "KccAns"

# Create dirs on import so downstream steps can write without checking first.
for _d in [DATA_DIR, CLEAN_DATA_DIR, INDEX_DIR, LOGS_DIR]:
    _d.mkdir(parents=True, exist_ok=True)