"""Lightweight classifiers for language, crop, and problem type. These are all keyword-based — fast, deterministic, no model load. They power retrieval rewriting and prompt routing, not the final answer. """ from __future__ import annotations import re from typing import Optional # ── Language detection ──────────────────────────────────────────────────────── _LANG_SCRIPTS = [ ("Hindi", ('ऀ', 'ॿ')), # Devanagari ("Telugu", ('ఀ', '౿')), ("Kannada", ('ಀ', '೿')), ("Tamil", ('஀', '௿')), ("Bengali", ('ঀ', '৿')), ("Marathi", ('ऀ', 'ॿ')), # also Devanagari — disambiguated by keywords ("Gujarati",('઀', '૿')), ("Punjabi", ('਀', '੿')), ] _HINGLISH_HINTS = { "kar","ka","ke","ki","ko","main","mein","hai","hain","kya","kaise","kab", "kahan","kyun","kyon","aur","ya","par","se","tak","liye","lekin","fir", "khaad","beej","keeda","bimari","fasal","kheti","kisan","upchar" } def detect_language(query: str) -> str: """Detect language of query. Returns canonical English name. Priority: script → keyword → default English. """ for lang, (lo, hi) in _LANG_SCRIPTS: if any(lo <= c <= hi for c in query): # Devanagari is shared by Hindi & Marathi; default to Hindi. return "Marathi" if lang == "Marathi" and "मराठी" in query else ( "Hindi" if lang == "Marathi" else lang) words = set(re.findall(r"\b[a-z]+\b", query.lower())) if words.intersection(_HINGLISH_HINTS): return "Hinglish" return "English" # ── Crop detection ──────────────────────────────────────────────────────────── _CROP_KEYWORDS = { "Wheat": ["wheat","gehu","gehun","गेहूं","गेहु"], "Rice": ["rice","paddy","dhan","chaval","धान","चावल"], "Cotton": ["cotton","kapas","कपास"], "Maize": ["maize","corn","makka","makki","मक्का"], "Sugarcane": ["sugarcane","ganna","गन्ना"], "Soybean": ["soybean","soyabean","soya","सोयाबीन"], "Mustard": ["mustard","sarson","सरसों"], "Chilli": ["chilli","chili","mirchi","mirch","मिर्च"], "Brinjal": ["brinjal","baingan","baigan","बैंगन"], "Onion": ["onion","pyaz","pyaaz","प्याज"], "Tomato": ["tomato","tamatar","टमाटर"], "Potato": ["potato","aloo","आलू"], "Groundnut": ["groundnut","peanut","moongphali","मूंगफली"], "Gram": ["gram","chickpea","chana","चना"], "Pigeonpea": ["pigeonpea","arhar","tur","अरहर","तूर"], "Mungbean": ["mungbean","moong","mung","मूंग"], "Banana": ["banana","kela","केला"], "Mango": ["mango","aam","आम"], } def detect_crop(query: str) -> Optional[str]: q = query.lower() for crop, kws in _CROP_KEYWORDS.items(): for kw in kws: if kw in q: return crop return None # ── Problem classification ──────────────────────────────────────────────────── _PROBLEM_KEYWORDS = { "pest": ["pest","insect","keeda","keeda","kida","कीड़ा","aphid","borer", "thrips","mite","whitefly","caterpillar","mahu","tela","tikda", "safed makhi","bollworm","armyworm","helicoverpa"], "disease": ["disease","bimari","rog","बीमारी","blight","rust","wilt", "mildew","blast","leaf spot","mosaic","virus","rot","fungus"], "nutrient": ["fertilizer","khaad","urea","dap","npk","nitrogen","phosphorus", "potassium","poshan","yellow leaves","peeli","stunted","kami", "deficiency"], "weather": ["rain","baarish","drought","sukha","frost","pala","heat", "thand","flood","ola","hail","wind","storm","andhi"], "irrigation": ["irrigation","sinchai","water","drip","sprinkler","paani"], "price": ["price","rate","mandi","bhav","kimat","sell","bechna"], "seed": ["seed","beej","variety","kism","pusa","srh","hybrid"], "scheme": ["scheme","yojana","subsidy","kcc","pm kisan","pmfby","loan"], "crop_selection": ["which crop","kaunsi fasal","कौन सी फसल","grow what", "best crop","sowing","boyenge","bujayenge"], "soil": ["soil test","mitti","soil ph","soil","kshar","alkali"], "harvest": ["harvest","kataai","cutting","ripe","pakka","grain"], "storage": ["storage","bhandaran","store","grain pest","weevil"], } def classify_problem(query: str) -> str: q = query.lower() counts = {k: sum(1 for kw in v if kw in q) for k, v in _PROBLEM_KEYWORDS.items()} best = max(counts.items(), key=lambda x: x[1]) return best[0] if best[1] > 0 else "general" # ── Hindi → English query expansion ─────────────────────────────────────────── _HINDI_EXPANSIONS = { "गेहूं":"wheat","गेहु":"wheat","धान":"rice","चावल":"rice", "कपास":"cotton","मक्का":"maize","गन्ना":"sugarcane","सोयाबीन":"soybean", "सरसों":"mustard","मिर्च":"chilli","बैंगन":"brinjal","प्याज":"onion", "टमाटर":"tomato","आलू":"potato","मूंगफली":"groundnut","चना":"gram", "अरहर":"pigeonpea","मूंग":"mungbean","केला":"banana","आम":"mango", "खाद":"fertilizer","बीज":"seed","कीड़ा":"pest","बीमारी":"disease", "रोग":"disease","दवाइ":"pesticide","स्प्रे":"spray","उपचार":"treatment", "सिंचाई":"irrigation","पाला":"frost","सूखा":"drought","बारिश":"rain", "मंडी":"mandi","भाव":"price", # Romanised "gehu":"wheat","gehun":"wheat","dhan":"rice","kapas":"cotton", "makka":"maize","makki":"maize","ganna":"sugarcane", "sarson":"mustard","mirchi":"chilli","mirch":"chilli","baingan":"brinjal", "pyaz":"onion","pyaaz":"onion","tamatar":"tomato","aloo":"potato", "moongphali":"groundnut","chana":"gram","arhar":"pigeonpea","tur":"pigeonpea", "moong":"mungbean","kela":"banana","aam":"mango", "khaad":"fertilizer","beej":"seed","keeda":"pest","kida":"pest", "bimari":"disease","rog":"disease","dawai":"pesticide","upchar":"treatment", "sinchai":"irrigation","pala":"frost","sukha":"drought","baarish":"rain", "mandi":"mandi","bhav":"price", "safed makhi":"whitefly","tela":"jassid","mahu":"aphid","tikda":"thrips", } def normalize_query(query: str) -> str: """Append English equivalents for Hindi/Hinglish terms — helps embedding match.""" q_lower = query.lower() extras = [] for term, eng in _HINDI_EXPANSIONS.items(): if term in query or term in q_lower: if eng not in q_lower: extras.append(eng) if extras: return f"{query} {' '.join(extras)}" return query