# kcc-agri / kcc_core / classify.py
# hritikm15
# Day 9 — v4 merge deploy: kcc_core + advisors + Proof tab + pest heatmap
# 49818d2 verified
"""Lightweight classifiers for language, crop, and problem type.
These are all keyword-based — fast, deterministic, no model load.
They power retrieval rewriting and prompt routing, not the final answer.
"""
from __future__ import annotations
import re
from typing import Optional
# ── Language detection ────────────────────────────────────────────────────────
# Inclusive Unicode block bounds used to recognise the script of a query.
# Written as escapes so the (partly unassigned) boundary code points survive
# copy/paste. Hindi and Marathi share the Devanagari block, so the script alone
# cannot tell them apart; detect_language() uses _MARATHI_MARKERS for that.
_LANG_SCRIPTS = [
    ("Hindi", ("\u0900", "\u097f")),  # Devanagari
    ("Telugu", ("\u0c00", "\u0c7f")),
    ("Kannada", ("\u0c80", "\u0cff")),
    ("Tamil", ("\u0b80", "\u0bff")),
    ("Bengali", ("\u0980", "\u09ff")),
    ("Marathi", ("\u0900", "\u097f")),  # also Devanagari — disambiguated by keywords
    ("Gujarati", ("\u0a80", "\u0aff")),
    ("Punjabi", ("\u0a00", "\u0a7f")),  # Gurmukhi
]

# Substrings that mark a Devanagari query as Marathi rather than Hindi: the
# language name itself plus two core Marathi function words ("is", "and").
# Deliberately small and conservative; extend as needed.
_MARATHI_MARKERS = ("मराठी", "आहे", "आणि")

# Romanised Hindi words; one hit in a Latin-script query is treated as Hinglish.
_HINGLISH_HINTS = {
    "kar", "ka", "ke", "ki", "ko", "main", "mein", "hai", "hain", "kya", "kaise", "kab",
    "kahan", "kyun", "kyon", "aur", "ya", "par", "se", "tak", "liye", "lekin", "fir",
    "khaad", "beej", "keeda", "bimari", "fasal", "kheti", "kisan", "upchar",
}


def detect_language(query: str) -> str:
    """Detect the language of *query* and return its canonical English name.

    Priority: script → keyword → default English.

    * If any character falls inside one of the ``_LANG_SCRIPTS`` ranges, the
      first matching entry (in table order) decides the language.
    * Devanagari is shared by Hindi and Marathi: the result is "Marathi" when
      the query contains one of ``_MARATHI_MARKERS`` and "Hindi" otherwise.
    * A Latin-script query containing any ``_HINGLISH_HINTS`` word is
      "Hinglish"; everything else (including the empty string) is "English".
    """
    for lang, (lo, hi) in _LANG_SCRIPTS:
        if not any(lo <= ch <= hi for ch in query):
            continue
        if lang in ("Hindi", "Marathi"):
            # Both table entries cover the same range, so the decision has to
            # be made here; waiting for the "Marathi" entry would never work
            # because the "Hindi" entry always matches first.
            is_marathi = any(marker in query for marker in _MARATHI_MARKERS)
            return "Marathi" if is_marathi else "Hindi"
        return lang

    words = set(re.findall(r"\b[a-z]+\b", query.lower()))
    if not _HINGLISH_HINTS.isdisjoint(words):
        return "Hinglish"
    return "English"
# ── Crop detection ────────────────────────────────────────────────────────────
# Canonical crop name → keywords (English, romanised Hindi, Devanagari).
# Order matters: detect_crop() returns the first crop whose keywords match.
_CROP_KEYWORDS = {
    "Wheat": ["wheat", "gehu", "gehun", "गेहूं", "गेहु"],
    "Rice": ["rice", "paddy", "dhan", "chaval", "धान", "चावल"],
    "Cotton": ["cotton", "kapas", "कपास"],
    "Maize": ["maize", "corn", "makka", "makki", "मक्का"],
    "Sugarcane": ["sugarcane", "ganna", "गन्ना"],
    "Soybean": ["soybean", "soyabean", "soya", "सोयाबीन"],
    "Mustard": ["mustard", "sarson", "सरसों"],
    "Chilli": ["chilli", "chili", "mirchi", "mirch", "मिर्च"],
    "Brinjal": ["brinjal", "baingan", "baigan", "बैंगन"],
    "Onion": ["onion", "pyaz", "pyaaz", "प्याज"],
    "Tomato": ["tomato", "tamatar", "टमाटर"],
    "Potato": ["potato", "aloo", "आलू"],
    "Groundnut": ["groundnut", "peanut", "moongphali", "मूंगफली"],
    "Gram": ["gram", "chickpea", "chana", "चना"],
    "Pigeonpea": ["pigeonpea", "arhar", "tur", "अरहर", "तूर"],
    "Mungbean": ["mungbean", "moong", "mung", "मूंग"],
    "Banana": ["banana", "kela", "केला"],
    "Mango": ["mango", "aam", "आम"],
}


def _crop_regex(keywords: list) -> re.Pattern:
    """Compile one crop's keywords into a single boundary-aware pattern.

    Latin keywords must be whole words, optionally followed by an English
    plural ending ("s"/"es"), so "rice" no longer fires inside "price" and
    "tur" no longer fires inside "temperature" or "turmeric".

    Devanagari keywords only need to *start* a word: ``\\b`` is unreliable
    there because vowel signs are not ``\\w`` characters, and leaving the
    right side open keeps inflected forms matching.
    """
    alternatives = []
    for kw in keywords:
        literal = re.escape(kw)
        if kw.isascii():
            alternatives.append(rf"(?<![a-z0-9]){literal}(?:e?s)?(?![a-z0-9])")
        else:
            alternatives.append(rf"(?<![\u0900-\u097f]){literal}")
    return re.compile("|".join(alternatives))


# Compiled once at import time; same iteration order as _CROP_KEYWORDS.
_CROP_PATTERNS = {crop: _crop_regex(kws) for crop, kws in _CROP_KEYWORDS.items()}


def detect_crop(query: str) -> Optional[str]:
    """Return the canonical name of the first crop mentioned in *query*.

    Matching is case-insensitive (the query is lower-cased) and respects word
    boundaries as described in ``_crop_regex``. Crops are tried in
    ``_CROP_KEYWORDS`` order, so when several crops are mentioned the earliest
    table entry wins. Returns ``None`` when no crop keyword is found.
    """
    q = query.lower()
    for crop, pattern in _CROP_PATTERNS.items():
        if pattern.search(q):
            return crop
    return None
# ── Problem classification ────────────────────────────────────────────────────
# Problem category → trigger keywords. Order matters: on a tie the category
# listed first wins (see classify_problem).
_PROBLEM_KEYWORDS = {
    "pest": ["pest", "insect", "keeda", "kida", "कीड़ा", "aphid", "borer",
             "thrips", "mite", "whitefly", "caterpillar", "mahu", "tela", "tikda",
             "safed makhi", "bollworm", "armyworm", "helicoverpa"],
    "disease": ["disease", "bimari", "rog", "बीमारी", "blight", "rust", "wilt",
                "mildew", "blast", "leaf spot", "mosaic", "virus", "rot", "fungus"],
    "nutrient": ["fertilizer", "khaad", "urea", "dap", "npk", "nitrogen", "phosphorus",
                 "potassium", "poshan", "yellow leaves", "peeli", "stunted", "kami",
                 "deficiency"],
    "weather": ["rain", "baarish", "drought", "sukha", "frost", "pala", "heat",
                "thand", "flood", "ola", "hail", "wind", "storm", "andhi"],
    "irrigation": ["irrigation", "sinchai", "water", "drip", "sprinkler", "paani"],
    "price": ["price", "rate", "mandi", "bhav", "kimat", "sell", "bechna"],
    "seed": ["seed", "beej", "variety", "kism", "pusa", "srh", "hybrid"],
    "scheme": ["scheme", "yojana", "subsidy", "kcc", "pm kisan", "pmfby", "loan"],
    "crop_selection": ["which crop", "kaunsi fasal", "कौन सी फसल", "grow what",
                       "best crop", "sowing", "boyenge", "bujayenge"],
    "soil": ["soil test", "mitti", "soil ph", "soil", "kshar", "alkali"],
    "harvest": ["harvest", "kataai", "cutting", "ripe", "pakka", "grain"],
    "storage": ["storage", "bhandaran", "store", "grain pest", "weevil"],
}


def _problem_regex(keyword: str) -> re.Pattern:
    """Compile *keyword* so that it only matches at the start of a word.

    The left-hand guard stops mid-word hits such as "heat" in "wheat", "rog"
    in "nitrogen" or "rain" in "grain". The right-hand side is left open on
    purpose so inflected forms ("pesticide", "harvesting", "seeds") still
    count, as they did with plain substring matching.
    """
    guard = r"(?<![a-z0-9])" if keyword.isascii() else r"(?<![\u0900-\u097f])"
    return re.compile(guard + re.escape(keyword))


# Compiled once at import time. dict.fromkeys() drops repeated keywords so a
# duplicated entry cannot be counted twice.
_PROBLEM_PATTERNS = {
    label: [_problem_regex(kw) for kw in dict.fromkeys(kws)]
    for label, kws in _PROBLEM_KEYWORDS.items()
}


def classify_problem(query: str) -> str:
    """Return the problem category whose keywords occur most often in *query*.

    Each distinct keyword contributes at most one hit per category. Ties are
    resolved in favour of the category listed first in ``_PROBLEM_KEYWORDS``.
    Returns "general" when no keyword matches (including for an empty query).
    """
    q = query.lower()
    best_label, best_hits = "general", 0
    for label, patterns in _PROBLEM_PATTERNS.items():
        hits = sum(1 for pattern in patterns if pattern.search(q))
        # Strict ">" keeps the earliest category on ties.
        if hits > best_hits:
            best_label, best_hits = label, hits
    return best_label
# ── Hindi → English query expansion ───────────────────────────────────────────
# Hindi (Devanagari) and romanised-Hindi terms → English equivalent.
# Table order determines the order in which expansions are appended.
_HINDI_EXPANSIONS = {
    "गेहूं": "wheat", "गेहु": "wheat", "धान": "rice", "चावल": "rice",
    "कपास": "cotton", "मक्का": "maize", "गन्ना": "sugarcane", "सोयाबीन": "soybean",
    "सरसों": "mustard", "मिर्च": "chilli", "बैंगन": "brinjal", "प्याज": "onion",
    "टमाटर": "tomato", "आलू": "potato", "मूंगफली": "groundnut", "चना": "gram",
    "अरहर": "pigeonpea", "मूंग": "mungbean", "केला": "banana", "आम": "mango",
    "खाद": "fertilizer", "बीज": "seed", "कीड़ा": "pest", "बीमारी": "disease",
    "रोग": "disease", "दवाइ": "pesticide", "स्प्रे": "spray", "उपचार": "treatment",
    "सिंचाई": "irrigation", "पाला": "frost", "सूखा": "drought", "बारिश": "rain",
    "मंडी": "mandi", "भाव": "price",
    # Romanised
    "gehu": "wheat", "gehun": "wheat", "dhan": "rice", "kapas": "cotton",
    "makka": "maize", "makki": "maize", "ganna": "sugarcane",
    "sarson": "mustard", "mirchi": "chilli", "mirch": "chilli", "baingan": "brinjal",
    "pyaz": "onion", "pyaaz": "onion", "tamatar": "tomato", "aloo": "potato",
    "moongphali": "groundnut", "chana": "gram", "arhar": "pigeonpea", "tur": "pigeonpea",
    "moong": "mungbean", "kela": "banana", "aam": "mango",
    "khaad": "fertilizer", "beej": "seed", "keeda": "pest", "kida": "pest",
    "bimari": "disease", "rog": "disease", "dawai": "pesticide", "upchar": "treatment",
    "sinchai": "irrigation", "pala": "frost", "sukha": "drought", "baarish": "rain",
    "mandi": "mandi", "bhav": "price",
    "safed makhi": "whitefly", "tela": "jassid", "mahu": "aphid", "tikda": "thrips",
}


def _expansion_regex(term: str) -> str:
    """Return a boundary-aware regex source for one expansion *term*.

    Romanised terms must be whole words ("tur" must not fire inside
    "temperature"). Devanagari terms must start a word but may be followed by
    more letters, because some keys (e.g. "दवाइ") are stems of longer forms.
    """
    literal = re.escape(term)
    if term.isascii():
        return rf"(?<![a-z0-9]){literal}(?![a-z0-9])"
    return rf"(?<![\u0900-\u097f]){literal}"


# One alternation over every term, longest first, so that a long term such as
# "मूंगफली" is consumed whole and its prefix "मूंग" is not reported as well.
_EXPANSION_RE = re.compile(
    "|".join(
        _expansion_regex(term)
        for term in sorted(_HINDI_EXPANSIONS, key=len, reverse=True)
    )
)


def normalize_query(query: str) -> str:
    """Append English equivalents for Hindi/Hinglish terms — helps embedding match.

    The original query text is returned unchanged, followed by a space and the
    English words for every ``_HINDI_EXPANSIONS`` term found in it. Each
    English word is appended at most once, in table order, and is skipped when
    the query already contains it as a whole word. If nothing needs to be
    added, *query* is returned as is.
    """
    q_lower = query.lower()
    matched = {m.group(0) for m in _EXPANSION_RE.finditer(q_lower)}
    if not matched:
        return query

    # Whole-word presence check: "price" must not hide a missing "rice".
    present = set(re.findall(r"[a-z]+", q_lower))
    extras = []
    for term, eng in _HINDI_EXPANSIONS.items():
        if term in matched and eng not in present and eng not in extras:
            extras.append(eng)

    if not extras:
        return query
    return f"{query} {' '.join(extras)}"