"""Central configuration: paths, model names, API keys, and data filters.

Values come from hard-coded defaults, optionally overridden by environment
variables (which may themselves be populated from a ``.env`` file located next
to this module when ``python-dotenv`` is installed).

Importing this module has a side effect: the data, clean-data, index, and log
directories are created if they do not already exist.
"""
from pathlib import Path
import os

# -- Load .env if python-dotenv is available ---------------------------------
try:
    from dotenv import load_dotenv
except ImportError:
    # dotenv is optional -- env vars can be set by the OS / Docker instead.
    pass
else:
    # override=False: variables already present in the process environment
    # take precedence over values in the .env file.
    load_dotenv(Path(__file__).parent / ".env", override=False)

# -- Directories --------------------------------------------------------------
# Derived from this file's location, so it works on any machine.
PROJECT_ROOT = Path(__file__).parent.resolve()
DATA_DIR = PROJECT_ROOT / "data"
INDEX_DIR = PROJECT_ROOT / "index"
LOGS_DIR = PROJECT_ROOT / "logs"

# -- Source data (KCC parquet files) ------------------------------------------
# Overridable via the KCC_DATA_DIR environment variable.
KCC_DATA_DIR = Path(os.environ.get("KCC_DATA_DIR", str(DATA_DIR / "kcc_raw")))

# -- Step 1 output ------------------------------------------------------------
CLEAN_DATA_DIR = DATA_DIR / "clean_years"
CLEAN_DATA_FILE = DATA_DIR / "kcc_clean.parquet"

# -- Step 2 output ------------------------------------------------------------
FAISS_INDEX_FILE = INDEX_DIR / "faiss.index"
METADATA_FILE = INDEX_DIR / "metadata.parquet"
BM25_INDEX_FILE = INDEX_DIR / "bm25.db"

# -- Embedding model ----------------------------------------------------------
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# NOTE(review): presumably must match the output dimension of EMBEDDING_MODEL
# -- confirm if the model is ever changed.
EMBEDDING_DIM = 384
EMBED_BATCH_SIZE = 512

# -- Retrieval settings -------------------------------------------------------
TOP_K = 5

# -- Cross-encoder reranker ---------------------------------------------------
RERANKER_MODEL = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"
RERANKER_TOP_N = 20
MIN_ANSWER_CHARS = 100

# -- LLM cascade -- keys from environment -------------------------------------
# Keys default to the empty string when the variable is unset.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL_PRIMARY = "meta-llama/llama-4-scout-17b-16e-instruct"
GROQ_MODEL_FALLBACK = "llama-3.3-70b-versatile"
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
GEMINI_MODEL = "gemma-3-27b-it"

# -- Mandi price API ----------------------------------------------------------
DATA_GOV_API_KEY = os.environ.get("DATA_GOV_API_KEY", "")

# -- Environment --------------------------------------------------------------
ENVIRONMENT = os.environ.get("ENVIRONMENT", "development")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")

# Comma-separated list in the environment; surrounding whitespace is trimmed
# and empty entries are dropped.
ALLOWED_ORIGINS = [
    origin.strip()
    for origin in os.environ.get(
        "ALLOWED_ORIGINS", "http://localhost:8501,http://localhost:8000"
    ).split(",")
    if origin.strip()
]

# NOTE(review): the fallback below is a credential committed to source
# control. It is kept for backward compatibility; set B2B_DEMO_PASSWORD in the
# environment for any non-demo deployment and consider removing the default.
B2B_DEMO_PASSWORD = os.environ.get("B2B_DEMO_PASSWORD", "DeHaat2025")

# -- Data filters -------------------------------------------------------------
MIN_QUERY_LEN = 10
MIN_ANSWER_LEN = 20
START_YEAR = 2006
CHUNK_SIZE = 100000
# NOTE(review): None presumably means "no per-year sampling" -- confirm
# against the consumer of this setting.
SAMPLE_PER_YEAR = None

# -- Column names -------------------------------------------------------------
QUERY_COL = "QueryText"
ANSWER_COL = "KccAns"

# Create dirs on import so downstream code can write to them unconditionally.
for _d in (DATA_DIR, CLEAN_DATA_DIR, INDEX_DIR, LOGS_DIR):
    _d.mkdir(parents=True, exist_ok=True)