| """Lightweight classifiers for language, crop, and problem type. |
| |
| These are all keyword-based — fast, deterministic, no model load. |
| They power retrieval rewriting and prompt routing, not the final answer. |
| """ |
| from __future__ import annotations |
| import re |
| from typing import Optional |
|
|
| |
|
|
# Inclusive Unicode block bounds for each script. They are written as \u
# escapes rather than literal characters: several block boundaries are
# unassigned code points that editors and text pipelines silently drop, and an
# empty bound either never matches or matches every character.
_LANG_SCRIPTS = [
    ("Hindi",    ("\u0900", "\u097f")),  # Devanagari
    ("Telugu",   ("\u0c00", "\u0c7f")),
    ("Kannada",  ("\u0c80", "\u0cff")),
    ("Tamil",    ("\u0b80", "\u0bff")),
    ("Bengali",  ("\u0980", "\u09ff")),
    ("Marathi",  ("\u0900", "\u097f")),  # Devanagari, same block as Hindi
    ("Gujarati", ("\u0a80", "\u0aff")),
    ("Punjabi",  ("\u0a00", "\u0a7f")),  # Gurmukhi
]

# Languages that share the Devanagari block and therefore cannot be told
# apart by script alone.
_DEVANAGARI_LANGS = frozenset({"Hindi", "Marathi"})


_HINGLISH_HINTS = {
    "kar", "ka", "ke", "ki", "ko", "main", "mein", "hai", "hain", "kya",
    "kaise", "kab", "kahan", "kyun", "kyon", "aur", "ya", "par", "se", "tak",
    "liye", "lekin", "fir",
    "khaad", "beej", "keeda", "bimari", "fasal", "kheti", "kisan", "upchar",
}


def detect_language(query: str) -> str:
    """Detect language of query. Returns canonical English name.

    Priority: script → keyword → default English.

    Args:
        query: Raw user text; may be empty.

    Returns:
        The name of the first script in ``_LANG_SCRIPTS`` that has at least
        one character in ``query``. Devanagari text is reported as "Marathi"
        only when it contains the word "मराठी", otherwise as "Hindi". With no
        Indic script present, "Hinglish" is returned if any lower-cased Latin
        word is in ``_HINGLISH_HINTS``, else "English".
    """
    for lang, (lo, hi) in _LANG_SCRIPTS:
        if not any(lo <= ch <= hi for ch in query):
            continue
        if lang in _DEVANAGARI_LANGS:
            # Hindi and Marathi share one block, so whichever row matches
            # first must make the same decision.
            return "Marathi" if "मराठी" in query else "Hindi"
        return lang

    words = set(re.findall(r"\b[a-z]+\b", query.lower()))
    if words & _HINGLISH_HINTS:
        return "Hinglish"
    return "English"
|
|
|
|
| |
|
|
_CROP_KEYWORDS = {
    "Wheat":     ["wheat", "gehu", "gehun", "गेहूं", "गेहु"],
    "Rice":      ["rice", "paddy", "dhan", "chaval", "धान", "चावल"],
    "Cotton":    ["cotton", "kapas", "कपास"],
    "Maize":     ["maize", "corn", "makka", "makki", "मक्का"],
    "Sugarcane": ["sugarcane", "ganna", "गन्ना"],
    "Soybean":   ["soybean", "soyabean", "soya", "सोयाबीन"],
    "Mustard":   ["mustard", "sarson", "सरसों"],
    "Chilli":    ["chilli", "chili", "mirchi", "mirch", "मिर्च"],
    "Brinjal":   ["brinjal", "baingan", "baigan", "बैंगन"],
    "Onion":     ["onion", "pyaz", "pyaaz", "प्याज"],
    "Tomato":    ["tomato", "tamatar", "टमाटर"],
    "Potato":    ["potato", "aloo", "आलू"],
    "Groundnut": ["groundnut", "peanut", "moongphali", "मूंगफली"],
    "Gram":      ["gram", "chickpea", "chana", "चना"],
    "Pigeonpea": ["pigeonpea", "arhar", "tur", "अरहर", "तूर"],
    "Mungbean":  ["mungbean", "moong", "mung", "मूंग"],
    "Banana":    ["banana", "kela", "केला"],
    "Mango":     ["mango", "aam", "आम"],
}


def _compile_crop_keyword(keyword):
    """Build the regex that decides whether one crop keyword occurs in a query.

    ASCII keywords must stand as a whole word, optionally followed by an
    English plural ending ("s"/"es"), so "rice" no longer fires inside
    "price" nor "tur" inside "turmeric". Devanagari keywords only need to
    start a word (no Devanagari letter or sign directly before them), which
    keeps suffixed forms matching while rejecting hits such as "धान" inside
    "समाधान". ``\\b`` is not used for Devanagari because vowel signs are not
    ``\\w`` characters in Python's ``re``.
    """
    body = re.escape(keyword)
    if keyword.isascii():
        return re.compile(rf"(?<![a-z0-9]){body}(?:e?s)?(?![a-z0-9])")
    # U+0964/U+0965 (danda, double danda) are punctuation, so they are left
    # out of the "previous character is part of a word" class.
    return re.compile(rf"(?<![\u0900-\u0963\u0966-\u097f]){body}")


# Compiled once at import time from _CROP_KEYWORDS; crop order is preserved.
_CROP_PATTERNS = {
    crop: [_compile_crop_keyword(kw) for kw in keywords]
    for crop, keywords in _CROP_KEYWORDS.items()
}


def detect_crop(query: str) -> Optional[str]:
    """Return the first crop in ``_CROP_KEYWORDS`` mentioned in ``query``.

    Matching is case-insensitive for Latin text. Crops are tried in the
    table's insertion order, so when several crops are mentioned the one
    listed earliest in the table wins, not the one mentioned first.

    Args:
        query: Raw user text; may be empty.

    Returns:
        The canonical crop name, or ``None`` when no keyword matches.
    """
    text = query.lower()
    for crop, patterns in _CROP_PATTERNS.items():
        if any(pattern.search(text) for pattern in patterns):
            return crop
    return None
|
|
|
|
| |
|
|
_PROBLEM_KEYWORDS = {
    "pest": ["pest", "insect", "keeda", "kida", "कीड़ा", "aphid", "borer",
             "thrips", "mite", "whitefly", "caterpillar", "mahu", "tela", "tikda",
             "safed makhi", "bollworm", "armyworm", "helicoverpa"],
    "disease": ["disease", "bimari", "rog", "बीमारी", "blight", "rust", "wilt",
                "mildew", "blast", "leaf spot", "mosaic", "virus", "rot", "fungus"],
    "nutrient": ["fertilizer", "khaad", "urea", "dap", "npk", "nitrogen", "phosphorus",
                 "potassium", "poshan", "yellow leaves", "peeli", "stunted", "kami",
                 "deficiency"],
    "weather": ["rain", "baarish", "drought", "sukha", "frost", "pala", "heat",
                "thand", "flood", "ola", "hail", "wind", "storm", "andhi"],
    "irrigation": ["irrigation", "sinchai", "water", "drip", "sprinkler", "paani"],
    "price": ["price", "rate", "mandi", "bhav", "kimat", "sell", "bechna"],
    "seed": ["seed", "beej", "variety", "kism", "pusa", "srh", "hybrid"],
    "scheme": ["scheme", "yojana", "subsidy", "kcc", "pm kisan", "pmfby", "loan"],
    "crop_selection": ["which crop", "kaunsi fasal", "कौन सी फसल", "grow what",
                       "best crop", "sowing", "boyenge", "bujayenge"],
    "soil": ["soil test", "mitti", "soil ph", "soil", "kshar", "alkali"],
    "harvest": ["harvest", "kataai", "cutting", "ripe", "pakka", "grain"],
    "storage": ["storage", "bhandaran", "store", "grain pest", "weevil"],
}


def _compile_problem_keyword(keyword):
    """Build a regex matching ``keyword`` only where a word starts.

    Requiring a word start stops "heat" firing inside "wheat", "rog" inside
    "nitrogen" and "rain" inside "grain". The right-hand side is left open so
    inflected forms ("harvesting", "flooded", "pests") still count.
    """
    body = re.escape(keyword)
    if keyword.isascii():
        return re.compile(rf"(?<![a-z0-9]){body}")
    # Devanagari: the previous character must not be a Devanagari letter or
    # sign. U+0964/U+0965 (danda marks) are punctuation and stay allowed.
    return re.compile(rf"(?<![\u0900-\u0963\u0966-\u097f]){body}")


# Compiled once at import time; dict.fromkeys drops repeated keywords so a
# keyword listed twice cannot score twice.
_PROBLEM_PATTERNS = {
    category: [_compile_problem_keyword(kw) for kw in dict.fromkeys(keywords)]
    for category, keywords in _PROBLEM_KEYWORDS.items()
}


def classify_problem(query: str) -> str:
    """Classify ``query`` into the problem category with the most keyword hits.

    Each distinct keyword of a category that occurs in the lower-cased query
    (at a word start) scores one point. Ties go to the category listed first
    in ``_PROBLEM_KEYWORDS``.

    Args:
        query: Raw user text; may be empty.

    Returns:
        A key of ``_PROBLEM_KEYWORDS``, or "general" when nothing matches.
    """
    text = query.lower()
    counts = {
        category: sum(1 for pattern in patterns if pattern.search(text))
        for category, patterns in _PROBLEM_PATTERNS.items()
    }
    # max() keeps the first maximal item, which preserves table-order ties.
    best_category, best_hits = max(counts.items(), key=lambda item: item[1])
    return best_category if best_hits > 0 else "general"
|
|
|
|
| |
|
|
_HINDI_EXPANSIONS = {
    # Devanagari terms
    "गेहूं": "wheat", "गेहु": "wheat", "धान": "rice", "चावल": "rice",
    "कपास": "cotton", "मक्का": "maize", "गन्ना": "sugarcane", "सोयाबीन": "soybean",
    "सरसों": "mustard", "मिर्च": "chilli", "बैंगन": "brinjal", "प्याज": "onion",
    "टमाटर": "tomato", "आलू": "potato", "मूंगफली": "groundnut", "चना": "gram",
    "अरहर": "pigeonpea", "मूंग": "mungbean", "केला": "banana", "आम": "mango",
    "खाद": "fertilizer", "बीज": "seed", "कीड़ा": "pest", "बीमारी": "disease",
    "रोग": "disease", "दवाइ": "pesticide", "स्प्रे": "spray", "उपचार": "treatment",
    "सिंचाई": "irrigation", "पाला": "frost", "सूखा": "drought", "बारिश": "rain",
    "मंडी": "mandi", "भाव": "price",
    # Romanised (Hinglish) terms
    "gehu": "wheat", "gehun": "wheat", "dhan": "rice", "kapas": "cotton",
    "makka": "maize", "makki": "maize", "ganna": "sugarcane",
    "sarson": "mustard", "mirchi": "chilli", "mirch": "chilli", "baingan": "brinjal",
    "pyaz": "onion", "pyaaz": "onion", "tamatar": "tomato", "aloo": "potato",
    "moongphali": "groundnut", "chana": "gram", "arhar": "pigeonpea", "tur": "pigeonpea",
    "moong": "mungbean", "kela": "banana", "aam": "mango",
    "khaad": "fertilizer", "beej": "seed", "keeda": "pest", "kida": "pest",
    "bimari": "disease", "rog": "disease", "dawai": "pesticide", "upchar": "treatment",
    "sinchai": "irrigation", "pala": "frost", "sukha": "drought", "baarish": "rain",
    "mandi": "mandi", "bhav": "price",
    "safed makhi": "whitefly", "tela": "jassid", "mahu": "aphid", "tikda": "thrips",
}


def _compile_whole_word(word):
    """Regex matching an ASCII ``word`` that is not embedded in a longer token."""
    return re.compile(rf"(?<![a-z0-9]){re.escape(word)}(?![a-z0-9])")


def _compile_expansion_term(term):
    """Build the regex that decides whether a source term occurs in a query.

    Romanised terms must be whole words, so "rog" does not fire inside
    "nitrogen" and "gehu" does not fire inside "gehun". Devanagari terms only
    need to start a word, which keeps suffixed forms matching; ``\\b`` is
    avoided there because vowel signs are not ``\\w`` characters in ``re``.
    """
    if term.isascii():
        return _compile_whole_word(term)
    # U+0964/U+0965 (danda marks) are punctuation, so they may precede a term.
    return re.compile(rf"(?<![\u0900-\u0963\u0966-\u097f]){re.escape(term)}")


# (term matcher, English word, matcher for "English word already present"),
# compiled once at import time in _HINDI_EXPANSIONS order.
_EXPANSION_PATTERNS = [
    (_compile_expansion_term(term), eng, _compile_whole_word(eng))
    for term, eng in _HINDI_EXPANSIONS.items()
]


def normalize_query(query: str) -> str:
    """Append English equivalents for Hindi/Hinglish terms — helps embedding match.

    Args:
        query: Raw user text; may be empty.

    Returns:
        ``query`` followed by a space-separated list of English equivalents
        for every term of ``_HINDI_EXPANSIONS`` found in it, in table order.
        Each equivalent is added at most once and is skipped when it already
        appears in the query as a whole word. The original string is returned
        unchanged when there is nothing to add.
    """
    text = query.lower()
    extras = []
    for term_pattern, eng, eng_pattern in _EXPANSION_PATTERNS:
        if eng in extras:
            # Several spellings map to one English word; add it only once.
            continue
        if term_pattern.search(text) and not eng_pattern.search(text):
            extras.append(eng)
    if not extras:
        return query
    return f"{query} {' '.join(extras)}"
|
|