kcc-agri / kcc_core /hyde.py
hritikm15's picture
Day 9 — v4 merge deploy: kcc_core + advisors + Proof tab + pest heatmap
49818d2 verified
"""HyDE (Hypothetical Document Embeddings) — generate a fake answer with the LLM,
then embed *that* and re-search. Doubles recall on rare named diseases /
chemical names that the multilingual embedder distributes poorly.
Only fires when the top FAISS+BM25+rerank score is below HYDE_TRIGGER_THRESHOLD.
Uses the cheapest model in the cascade (gemma2-9b-it) to keep cost ~0.
"""
from __future__ import annotations
from typing import Optional
# Prompt template for the hypothetical answer. expand_query() fills it with
# str.format(): {query} receives the farmer's question and {ctx} receives
# either "" or a single "Context: ...\n" line built from crop / problem type.
# The text is sent to the LLM as-is, so every character here is behaviour.
_HYDE_PROMPT = """You are an Indian agricultural extension officer. A farmer asked the question below. Write a SHORT (60-120 words) hypothetical expert answer. The answer must include:
- The likely cause (1 sentence)
- The standard chemical / treatment with EXACT dose and unit (e.g. "Mancozeb 75% WP @ 2g/L")
- Timing and method (spray / drench / soil application)
- One safety / IPM note
Do NOT add disclaimers. Do NOT say "consult an expert". Just give the answer as if from an ICAR factsheet.
Farmer question: {query}
{ctx}
Hypothetical expert answer:"""
def expand_query(query: str,
                 crop: Optional[str] = None,
                 problem_type: Optional[str] = None) -> str:
    """Generate a hypothetical expert answer for HyDE retrieval.

    Args:
        query: The farmer's question; inserted verbatim into the prompt.
        crop: Optional crop name. Added to the prompt's context line when
            truthy.
        problem_type: Optional problem category. Added to the context line
            unless it is falsy or the literal ``"general"``.

    Returns:
        The text returned by ``llm.generate``, or ``""`` when the query is
        blank, the LLM returns a falsy value, or the import/call raises.
        Callers can treat ``""`` as "no expansion available".
    """
    # A blank question gives the LLM nothing to anchor on; whatever it wrote
    # would be embedded and drag retrieval toward an arbitrary topic, so skip
    # the call entirely.
    if not query or not query.strip():
        return ""

    ctx = []
    if crop:
        ctx.append(f"Crop: {crop}")
    if problem_type and problem_type != "general":
        ctx.append(f"Problem type: {problem_type}")
    # Either a single "Context: a; b" line or nothing at all.
    ctx_str = ("Context: " + "; ".join(ctx) + "\n") if ctx else ""
    prompt = _HYDE_PROMPT.format(query=query, ctx=ctx_str)

    try:
        # Imported lazily so a missing/broken LLM backend only disables HyDE.
        from . import llm
        # Use the cheapest model for HyDE — quality of the hypothetical doesn't
        # need to be great, only directionally right (we're embedding it).
        return llm.generate(prompt, max_tokens=200, temperature=0.3,
                            prefer_cheap=True) or ""
    except Exception as exc:
        # HyDE is a best-effort recall boost: never let it break the main
        # retrieval path, but leave a trace so failures are diagnosable.
        import logging
        logging.getLogger(__name__).warning(
            "HyDE expansion failed; continuing without it: %r", exc)
        return ""