# kcc-agri / kcc_core/topic_guard.py
# hritikm15's picture
# Day 8.5 fix: kcc_core/topic_guard.py
# d297b8c verified
"""Cheap regex/keyword guard. Blocks non-agriculture before any FAISS/LLM cost."""
from __future__ import annotations
import re
# Keywords whose presence marks a query as agriculture-related.
# Entries are lower-case; a few of them are multi-word phrases.
_AGRI_SIGNALS = {
    # --- Crops, inputs and general farming vocabulary (English + romanized Hindi)
    "crop", "crops", "plant", "plants", "planting",
    "seed", "seeds", "soil", "fertilizer",
    "pesticide", "pest", "disease",
    "farm", "farmer", "farming", "kisan", "kheti", "fasal",
    "gehu", "gehun", "dhan", "kapas", "tamatar", "aloo", "pyaz",
    "mandi", "bhav", "rate",
    "harvest", "harvesting", "sow", "sown", "sowing",
    "grow", "grows", "growing", "grown",
    "variety", "varieties", "cultivar", "cultivars",
    "season", "seasonal", "kharif", "rabi", "zaid",
    "irrigation", "spray", "insecticide", "fungicide", "organic",
    "yield", "yields", "blight",
    "wheat", "rice", "paddy", "cotton", "maize", "sugarcane",
    "soybean", "mustard",
    "chilli", "brinjal", "onion", "tomato", "potato", "groundnut",
    "gram", "pulses", "pulse",
    "barley", "jowar", "bajra", "ragi", "arhar", "tur", "moong",
    "urad", "masoor", "chana",

    # --- Pests and diseases
    "aphid", "borer", "mildew", "rust", "wilt", "thrips", "mite",
    "whitefly",
    "caterpillar", "jassid", "leaf spot", "mosaic", "virus", "rot",
    "fungus",
    "blast", "armyworm", "bollworm", "helicoverpa", "spodoptera",
    "girdle",

    # --- Romanized Hindi / regional terms
    "khaad", "dawai", "beej", "sinchai", "pattiya", "pattiyan",
    "keeda", "bimari",
    "rog", "upchar", "khet", "paidavar", "safed makhi", "tela",
    "mahu", "tikda",

    # --- Weather as it affects crops
    "pala", "thand", "frost", "baarish", "drought", "sukha",
    "heat stress",
    "ola", "flood", "andhi",

    # --- Schemes and advisory services
    "kvk", "icar", "pm kisan", "fasal bima", "kcc", "kisan call",
    "advisory",

    # --- Agro-chemicals and inputs (questions about banned chemicals
    #     still count as agriculture)
    "endosulfan", "monocrotophos", "chlorpyrifos", "imidacloprid",
    "emamectin",
    "mancozeb", "propiconazole", "thiamethoxam", "rhizobium", "urea",
    "dap",

    # --- Devanagari
    "फसल", "खेत", "किसान", "खाद", "बीज", "धान", "गेहूं", "कपास",
    "मक्का", "सरसों", "चना", "अरहर", "मूंग", "प्याज", "आलू",
    "कीड़ा", "बीमारी", "दवाइ", "स्प्रे", "उपचार", "सिंचाई",
}
# Whole-word / whole-phrase matcher for clearly off-topic subjects.
# The alternation is assembled from the term list below; matching is
# case-insensitive and anchored on word boundaries at both ends.
_NON_AGRI_RE = re.compile(
    r"\b("
    + "|".join(
        (
            # finance / politics
            "stock market", "share market", "bitcoin", "crypto",
            "politics", "election",
            # entertainment / sport / everyday life
            "movie", "cricket", "football", "recipe", "cooking", "exam",
            "bank account", "insurance claim",
            "marriage", "divorce", "job", "salary", "relationship",
            "celebrity", "news", "tv show",
            "web series", "ipl", "bollywood", "actor", "actress",
            # geopolitics
            "pakistan", "china", "war", "army",
            # personal / tech
            "love", "dating", "girlfriend", "boyfriend",
            "password", "hack", "code",
        )
    )
    + r")\b",
    flags=re.IGNORECASE,
)
# Canned reply for rejected queries: an English paragraph followed by a
# romanized-Hindi paragraph, separated by one blank line.
OFF_TOPIC_MSG = "\n\n".join(
    (
        "I can only help with agriculture-related questions — crops, pests, "
        "diseases, fertilizers, mandi prices, irrigation, seeds, and farming "
        "advice. Please ask a farming question and I'll be happy to help! 🌾",
        "Main sirf kheti-baadi, fasal, keede-makode, bimari, khaad, mandi bhav, "
        "aur kisan-sambandhit sawalon ka jawab de sakta hoon.",
    )
)
def _has_agri_signal(text: str) -> bool:
    """Return True if lower-cased *text* contains any ``_AGRI_SIGNALS`` entry.

    Single-word signals are matched against the word tokens of *text*.
    Multi-word signals (entries containing a space, e.g. "leaf spot") are
    matched as whole-word phrases against the token stream, so punctuation
    or repeated whitespace between the words does not prevent a match.
    """
    tokens = re.findall(r"\w+", text)
    if not _AGRI_SIGNALS.isdisjoint(tokens):
        return True
    # A phrase can never equal a single token, so it needs its own check.
    # Padding with spaces keeps the match on whole-word boundaries
    # ("leaf spot" must not match inside "leaf spotting").
    padded = " " + " ".join(tokens) + " "
    return any(
        " " in signal and f" {signal} " in padded
        for signal in _AGRI_SIGNALS
    )


def is_agriculture_query(query: str) -> bool:
    """Return True if the query looks like an agriculture question.

    Rules, applied in order:

    1. Short follow-ups (<= 3 whitespace-separated words, including the
       empty string) pass when no non-agri keyword is present; they are
       treated as contextual replies such as "ok", "haan", "thanks".
    2. A query containing any Devanagari character (U+0900-U+097F) is
       allowed unless the non-agri regex hits. Such queries are not
       checked against the signal list.
    3. Every other query passes only if it contains at least one
       agriculture signal, either a single word or a multi-word phrase.
       An agriculture signal outweighs a non-agri keyword here.

    Args:
        query: Raw user text.

    Returns:
        True to let the query through, False to treat it as off-topic.
    """
    stripped = query.strip()
    lowered = stripped.lower()
    # Evaluate the off-topic regex once; rules 1 and 2 both depend on it.
    off_topic = _NON_AGRI_RE.search(lowered) is not None

    if len(stripped.split()) <= 3 and not off_topic:
        return True

    if any("\u0900" <= ch <= "\u097f" for ch in stripped):
        return not off_topic

    # Whether or not a non-agri keyword was seen, the verdict for the
    # remaining queries is simply "does it mention anything agricultural?".
    return _has_agri_signal(lowered)