Fatmagician committed on
Commit
b9a6758
·
0 Parent(s):

Enhanced AI chatbot

Browse files
Files changed (6) hide show
  1. .gitattributes +35 -0
  2. .gitignore +5 -0
  3. Dockerfile +53 -0
  4. README.md +10 -0
  5. app.py +432 -0
  6. requirements.txt +5 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .venv
2
+ .env
3
+ *.whl
4
+ .venv/
5
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ # FROM python:3.12.3
5
+
6
+ # RUN useradd -m -u 1000 user
7
+ # USER user
8
+ # ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ # WORKDIR /app
11
+
12
+ # COPY --chown=user ./requirements.txt requirements.txt
13
+ # RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+ # RUN mkdir -p /app/models && \
15
+ # wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
16
+ # -O /app/models/llama-2-7b-chat.Q4_K_M.gguf
17
+
18
+ # COPY --chown=user . /app
19
+ # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
20
+
21
FROM python:3.12.3-slim

# --- System dependencies ---
# wget is only needed to fetch the model weights during the build.
RUN apt-get update && apt-get install -y --no-install-recommends \
        wget \
    && rm -rf /var/lib/apt/lists/*

# --- Non-root user ---
RUN useradd -m -u 1000 user
USER user
WORKDIR /app
ENV PATH="/home/user/.local/bin:$PATH"

# --- Copy wheel and requirements first ---
# NOTE(review): .gitignore in this repository lists "*.whl", so this wheel is
# not tracked by git -- confirm it actually reaches the build context.
COPY --chown=user llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl .
COPY --chown=user requirements.txt .

# --- Install dependencies ---
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt && \
    pip install --no-cache-dir llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl

# --- Download model ---
# The file name must match the path app.py looks for
# (/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf); otherwise the baked-in
# weights are ignored and the model is downloaded again at container start.
RUN mkdir -p /app/models && \
    wget -q https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf \
        -O /app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf

# Make the model location explicit for app.py (read via GGUF_MODEL_PATH).
ENV GGUF_MODEL_PATH="/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# --- Copy source code ---
COPY --chown=user . /app

# --- Expose & run ---
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Jet AI
3
+ emoji: 🌍
4
+ colorFrom: pink
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import asyncio
4
+ import time
5
+ import traceback
6
+ from fastapi import FastAPI, HTTPException
7
+ from pydantic import BaseModel
8
+ from llama_cpp import Llama
9
+ from contextlib import asynccontextmanager
10
+ from huggingface_hub import hf_hub_download
11
+ import json
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+
14
+ # Configure logging
15
# Root logger at INFO: the logger.debug(...) calls in this module are
# therefore suppressed unless the level is lowered.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by every function and class below.
logger = logging.getLogger(__name__)
17
+
18
+ # ---------- CPU optimizations ----------
19
def optimize_for_cpu():
    """Apply best-effort CPU tuning for inference (optional).

    Sets the OpenMP/KMP threading environment variables and, when ``psutil``
    is installed, tries to raise the priority of the current process.
    Nothing here is required for correctness, so every failure is tolerated.
    """
    # os.cpu_count() may return None when the count cannot be determined;
    # str(None) would put the literal "None" into OMP_NUM_THREADS.
    cpu_threads = os.cpu_count() or 1
    os.environ['OMP_NUM_THREADS'] = str(cpu_threads)
    os.environ['KMP_BLOCKTIME'] = '1'
    os.environ['KMP_AFFINITY'] = 'granularity=fine,compact,1,0'

    try:
        import psutil
    except ImportError:
        # psutil is an optional extra; without it the priority is left alone.
        return

    try:
        # A negative nice value usually needs elevated privileges, so this is
        # expected to fail for an unprivileged user.
        psutil.Process().nice(-5)
    except Exception as exc:  # best-effort: never abort start-up over this
        logger.debug("Could not raise process priority: %s", exc)
    else:
        logger.debug("Set process to higher priority")
31
+
32
+ optimize_for_cpu()
33
+
34
+ # ---------- Queue management ----------
35
class QueueStatus:
    """Limit how many requests run at once and line up the rest (FIFO).

    A caller obtains a slot with ``await acquire()`` and must give it back
    with ``await release()`` (normally from a ``finally`` block).
    """

    def __init__(self, max_concurrent: int = 5):
        self.max_concurrent = max_concurrent
        self.active_tasks = 0       # slots currently held
        self.pending_queue = []     # futures of callers waiting for a slot, oldest first
        self._lock = asyncio.Lock()

    async def acquire(self):
        """Obtain a processing slot, waiting in line when all are taken.

        Returns:
            ``(True, position)``. ``position`` is 0 when a slot was free
            immediately, otherwise the 1-based place the caller had in the
            queue when it started waiting.

        Previously a full queue returned ``(False, position)`` right away but
        left a future in ``pending_queue`` that nobody awaited; ``release``
        then handed the freed slot to that phantom waiter, so
        ``active_tasks`` never dropped and the server locked up for good at
        ``max_concurrent``. Waiting here keeps slot accounting balanced.
        """
        async with self._lock:
            if self.active_tasks < self.max_concurrent:
                self.active_tasks += 1
                return True, 0  # No queue position
            position = len(self.pending_queue) + 1
            waiter = asyncio.get_running_loop().create_future()
            self.pending_queue.append(waiter)

        try:
            # Resolved by release(), which transfers its slot to this waiter.
            await waiter
        except asyncio.CancelledError:
            if waiter in self.pending_queue:
                # Cancelled while still in line: just leave the queue.
                self.pending_queue.remove(waiter)
            elif waiter.done() and not waiter.cancelled():
                # The slot was already handed over; pass it on instead of leaking it.
                await self.release()
            raise
        return True, position

    async def release(self):
        """Give a slot back, handing it straight to the oldest live waiter."""
        async with self._lock:
            while self.pending_queue:
                waiter = self.pending_queue.pop(0)
                if not waiter.done():
                    # Ownership moves to the waiter, so active_tasks is unchanged.
                    waiter.set_result(True)
                    return
            self.active_tasks -= 1

    def get_status(self):
        """Return a snapshot dict with keys ``active``, ``queued`` and ``max_concurrent``."""
        return {
            "active": self.active_tasks,
            "queued": len(self.pending_queue),
            "max_concurrent": self.max_concurrent
        }
67
+
68
# Single process-wide limiter shared by the /chat, /generate-section and
# /queue-status endpoints.
queue_status = QueueStatus(max_concurrent=5)
69
+
70
+ # ---------- The model class with local GGUF model ----------
71
class MixtralFreeModel:
    """Local GGUF chat model driven through llama-cpp-python.

    Despite the class name, the default weights are
    ``mistral-7b-instruct-v0.2.Q4_K_M.gguf`` and every prompt is written in
    the ``[INST] ... [/INST]`` instruction format.
    """

    # Command prefixes stripped by clean_question().
    _COMMAND_PREFIXES = ['!bot', '!ai', '@bot', 'bot,', '!ai_search']
    # Substrings that mark a question as a "create a guide" request.
    _GUIDE_PHRASES = ["guide", "create a guide", "make a guide", "step by step", "tutorial"]
    # Section list used by the fallback guide payload.
    _GUIDE_SECTIONS = [
        "Overview", "Prerequisites", "Step-by-Step Instructions", "Tools & Assets",
        "Common Mistakes", "Tips for Success", "Next Steps",
    ]

    def __init__(self, model_path: str = None):
        """
        Initialize the local GGUF model using llama-cpp-python.

        Resolution order for the weights: ``model_path`` argument, then the
        ``GGUF_MODEL_PATH`` environment variable, then the fixed path under
        ``/app/models``, and finally a download from the Hugging Face Hub.

        Raises:
            Exception: whatever ``Llama(...)`` or ``hf_hub_download`` raises;
                a load failure is logged and re-raised.
        """
        import threading  # local import keeps the file-level import block untouched

        self.model_name = "local-gguf"
        self.max_tokens = 512
        self.temperature = 0.7
        # Requests reach the model from several worker threads (asyncio.to_thread);
        # this lock serializes access to the single shared Llama instance.
        self._llm_lock = threading.Lock()

        # Determine model path
        if model_path is None:
            model_path = os.environ.get("GGUF_MODEL_PATH", None)

        if model_path and os.path.exists(model_path):
            gguf_file = model_path
            logger.info(f"Using provided model path: {gguf_file}")
        else:
            if model_path:
                logger.warning(f"Configured model path does not exist: {model_path}")
            # Fallback to known local path
            local_path = "/app/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf"
            if os.path.exists(local_path):
                gguf_file = local_path
                logger.info(f"Using local model file: {local_path}")
            else:
                # Download from Hugging Face Hub
                logger.info("Model not found locally. Downloading from Hugging Face Hub...")
                gguf_file = hf_hub_download(
                    repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
                    filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf"
                )
                logger.info(f"Downloaded model to: {gguf_file}")

        logger.info(f"Loading GGUF model from {gguf_file}...")
        start_time = time.time()
        try:
            self.llm = Llama(
                model_path=gguf_file,
                n_ctx=8192,
                n_batch=512,
                n_threads=os.cpu_count(),
                n_threads_batch=os.cpu_count(),
                use_mlock=True,
                use_mmap=True,
                low_vram=False,
                verbose=False,
                seed=42,
            )
            load_time = time.time() - start_time
            logger.info(f"GGUF model loaded successfully in {load_time:.2f}s")
        except Exception as e:
            logger.error(f"Failed to load GGUF model: {e}")
            raise

    async def warm_up(self) -> None:
        """Perform a short test inference to warm up the model.

        A failing warm-up is logged as a warning and otherwise ignored.
        """
        logger.info("Warming up model with test inference...")
        start_time = time.time()
        try:
            await self._generate_completion("Hello", max_tokens=10, temperature=0.1)
            warm_up_time = time.time() - start_time
            logger.info(f"Model warm-up completed in {warm_up_time:.2f}s")
        except Exception as e:
            logger.warning(f"Model warm-up failed: {e}")

    async def _generate_completion(self, prompt: str, max_tokens: int = None, temperature: float = None) -> str:
        """Run one blocking completion in a worker thread and return its text.

        ``max_tokens`` and ``temperature`` fall back to the instance defaults
        when ``None``. The returned text is stripped of surrounding whitespace.
        """
        if max_tokens is None:
            max_tokens = self.max_tokens
        if temperature is None:
            temperature = self.temperature

        def _blocking():
            # One completion at a time: concurrent calls on the same Llama
            # object from different threads are not safe.
            with self._llm_lock:
                start = time.time()
                response = self.llm.create_completion(
                    prompt=prompt,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=0.95,
                    stop=["</s>"],
                    echo=False,
                    stream=False
                )
                elapsed = time.time() - start
            logger.debug(f"Blocking completion took {elapsed:.2f}s")
            return response['choices'][0]['text'].strip()

        return await asyncio.to_thread(_blocking)

    async def generate_response(self, question: str, context: str = "") -> str:
        """
        Generate a response using the local GGUF model.

        For guide creation requests (the question contains one of
        ``_GUIDE_PHRASES``) the result is always a JSON object string with
        ``"action": "generate_guide"``: the model's own object when it is
        valid JSON, otherwise a fixed fallback. Any generation error is
        logged and answered with an apology string instead of raising.
        """
        import re

        # Check if the user is asking to create a guide
        lowered = question.lower()
        is_guide_request = any(phrase in lowered for phrase in self._GUIDE_PHRASES)

        if is_guide_request:
            # Strict system prompt for JSON-only output
            system_prompt = (
                "You are an assistant that creates structured guides.\n"
                "When asked to create a guide, you MUST respond with ONLY a valid JSON object in the exact format below.\n"
                "Do not include any additional text, explanations, markdown, or code fences.\n"
                "The JSON object must contain the keys \"action\", \"summary\", and \"sections\".\n"
                "\n"
                "Format:\n"
                "{\"action\": \"generate_guide\", \"summary\": \"Brief summary of the task\", "
                "\"sections\": [\"Overview\", \"Prerequisites\", \"Step-by-Step Instructions\", "
                "\"Tools & Assets\", \"Common Mistakes\", \"Tips for Success\", \"Next Steps\"]}\n"
                "\n"
                "Conversation context:\n"
                f"{context}\n"
                "\n"
                "Now produce the JSON object for the user's request:"
            )
        else:
            # Normal assistant prompt
            system_prompt = (
                "You are a helpful, accurate, and context-aware assistant. Use the conversation history "
                "below to provide a relevant and useful answer to the question.\n"
                "\n"
                "IMPORTANT:\n"
                "- Answer in the same language as the question\n"
                "- Be concise but comprehensive\n"
                "- Use the conversation context when relevant\n"
                "- If the context doesn't contain relevant information, use your general knowledge\n"
                "\n"
                "Conversation history:\n"
                f"{context}\n"
                "\n"
                "Provide a helpful response"
            )

        # The user's message belongs inside the [INST] block, as in every other
        # prompt of this class; text placed after [/INST] is read by the model
        # as the beginning of its own answer.
        prompt = f"<s>[INST] {system_prompt}\n\nUser message:\n{question} [/INST]"

        try:
            response_text = await self._generate_completion(prompt, max_tokens=512)

            # For guide requests, extract and return only the JSON object
            if is_guide_request:
                # Match a JSON object containing "action": "generate_guide"
                match = re.search(r'\{[^{}]*"action"\s*:\s*"generate_guide"[^{}]*\}', response_text, re.DOTALL)
                if match:
                    candidate = match.group(0)
                    try:
                        # Brace matching alone does not prove the text parses.
                        json.loads(candidate)
                    except ValueError:
                        pass
                    else:
                        return candidate
                # Fallback: return a default JSON (so frontend still works)
                logger.warning("Model did not return valid JSON for guide request. Using fallback.")
                return json.dumps({
                    "action": "generate_guide",
                    "summary": "Create a guide based on the conversation.",
                    "sections": list(self._GUIDE_SECTIONS)
                })
            return response_text

        except Exception as e:
            logger.error(f"Error in generation: {str(e)}")
            return "I apologize, but I'm having trouble responding right now."

    def clean_question(self, question: str) -> str:
        """Remove a leading command prefix (e.g. ``!bot``) from the question.

        Blank input is returned unchanged; otherwise the result is stripped of
        surrounding whitespace, and of the separators `` ,!:@`` that follow a
        removed prefix.
        """
        start = time.time()
        if not question or not question.strip():
            return question
        original_question = question.strip()
        question_lower = original_question.lower()
        # Longest prefix first: otherwise '!ai' matches before '!ai_search'
        # and "!ai_search foo" is cut down to "_search foo".
        for prefix in sorted(self._COMMAND_PREFIXES, key=len, reverse=True):
            if question_lower.startswith(prefix.lower()):
                cleaned = original_question[len(prefix):].lstrip(' ,!:@')
                elapsed = time.time() - start
                logger.debug(f"Cleaned question in {elapsed:.4f}s: '{cleaned}'")
                return cleaned
        elapsed = time.time() - start
        logger.debug(f"No prefix to clean, took {elapsed:.4f}s")
        return original_question

    async def compress_input(self, text: str, max_tokens: int = 500) -> str:
        """Compress long input into a concise summary.

        Text with fewer than ``max_tokens`` whitespace-separated words is
        returned as-is (word count is used as a cheap stand-in for tokens).
        """
        word_count = len(text.split())
        if word_count < max_tokens:
            logger.debug("Input already under token limit, skipping compression")
            return text
        logger.info(f"Compressing input of {word_count} words...")
        start = time.time()
        prompt = f"<s>[INST] Summarize the following text into a concise, structured form (bullet points or key-value pairs) keeping all essential details. Use at most {max_tokens} tokens.\n\nText:\n{text}\n\nSummary: [/INST]"
        summary = await self._generate_completion(prompt, max_tokens=max_tokens, temperature=0.5)
        elapsed = time.time() - start
        logger.info(f"Compression completed in {elapsed:.2f}s")
        return summary

    async def generate_efficient_section(self, section_type: str, context: str, max_tokens: int = 200) -> str:
        """Generate a compressed, efficient language representation of a section."""
        logger.info(f"Generating efficient representation for section '{section_type}'...")
        start = time.time()
        system = f"You are an expert task guide writer. Generate content for the section \"{section_type}\" in an efficient language format.\nUse a structured format like:\n- Key point 1: details\n- Key point 2: details\nOr use JSON if appropriate. Keep it concise and use at most {max_tokens} tokens."
        prompt = f"<s>[INST] {system}\n\nContext: {context}\nGenerate the efficient language for {section_type} section. [/INST]"
        efficient = await self._generate_completion(prompt, max_tokens=max_tokens)
        elapsed = time.time() - start
        logger.info(f"Efficient section generation took {elapsed:.2f}s")
        return efficient

    async def expand_efficient_to_natural(self, efficient_text: str, section_type: str, max_tokens: int = 512) -> str:
        """Expand efficient language into detailed natural language."""
        logger.info(f"Expanding efficient language to natural text for section '{section_type}'...")
        start = time.time()
        system = f"You are an expert task guide writer. Expand the following efficient language into a detailed, clear, and helpful section titled \"{section_type}\".\nUse markdown formatting, bullet points, subheadings, and ensure it's easy to understand. Make it comprehensive. Keep it under {max_tokens} tokens."
        prompt = f"<s>[INST] {system}\n\nEfficient language:\n{efficient_text}\n\nWrite the full {section_type} section now. [/INST]"
        expanded = await self._generate_completion(prompt, max_tokens=max_tokens)
        elapsed = time.time() - start
        logger.info(f"Expansion took {elapsed:.2f}s")
        return expanded

    async def generate_section(self, section_type: str, context: str, compress_input: bool = True) -> str:
        """Produce one guide section in two model passes.

        The context is summarized first only when ``compress_input`` is true
        and it exceeds 4000 words; then a compact outline is generated and
        expanded into the final natural-language text, which is returned.
        """
        total_start = time.time()
        logger.info(f"Starting section generation for '{section_type}' (compress_input={compress_input})")

        # Only compress if context is extremely large (4000+ words)
        if compress_input and len(context.split()) > 4000:  # was 1500
            logger.info("Input context extremely large, compressing...")
            context = await self.compress_input(context, max_tokens=500)
        else:
            logger.info(f"Input context size OK: {len(context.split())} words")

        efficient = await self.generate_efficient_section(section_type, context)
        expanded = await self.expand_efficient_to_natural(efficient, section_type)

        total_time = time.time() - total_start
        logger.info(f"Total section generation time: {total_time:.2f}s")
        return expanded
293
+
294
+ # ---------- Global model variable ----------
295
# Holds the MixtralFreeModel instance once lifespan() has loaded it; stays None
# when loading failed, and is reset to None on shutdown.
model = None
296
+
297
+ # ---------- Lifespan context manager ----------
298
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load and warm up the global model before serving; drop it on shutdown.

    A failed initialization is logged and leaves ``model`` as ``None``; the
    application still starts.
    """
    global model

    # ----- startup -----
    try:
        logger.info("Starting lifespan startup...")
        startup_began = time.time()
        model = MixtralFreeModel()
        await model.warm_up()
        startup_seconds = time.time() - startup_began
        logger.info(f"Model initialized and warmed up successfully in {startup_seconds:.2f}s")
    except Exception as exc:
        logger.error(f"Failed to initialize model: {exc}")
        model = None

    yield  # the application serves requests while suspended here

    # ----- shutdown -----
    logger.info("Shutting down, releasing model resources.")
    model = None
    logger.info("Shutdown complete.")
316
+
317
+ # ---------- FastAPI app ----------
318
# Allowed CORS origins: comma-separated list in CORS_ALLOW_ORIGINS.
# Defaults to "*" (any origin), which is the previous hard-coded behaviour.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app = FastAPI(
    title="Free AI Response API",
    description="Uses local GGUF model with queue management",
    version="1.0",
    lifespan=lifespan
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,  # "*" is for development; set CORS_ALLOW_ORIGINS in production
    # NOTE(review): credentials combined with a wildcard origin is discouraged;
    # confirm whether any client actually sends cookies before keeping this on.
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods, including OPTIONS
    allow_headers=["*"],
)
332
+
333
+ # Request/Response models
334
# Request body of POST /chat.
class ChatRequest(BaseModel):
    # The user's message; command prefixes are stripped by the endpoint.
    question: str
    # Optional prior conversation text that is inserted into the prompt.
    context: str = ""
337
+
338
# Response body of POST /chat.
class ChatResponse(BaseModel):
    # Generated answer text (a JSON object string for guide requests).
    response: str
340
+
341
# Request body of POST /generate-section.
class GenerateSectionRequest(BaseModel):
    # Title of the guide section to write.
    section_type: str
    # Source material the section is generated from.
    context: str
    # When true, a very large context is summarized before generation.
    compress_input: bool = True
345
+
346
# Response body of POST /generate-section.
class GenerateSectionResponse(BaseModel):
    # The expanded natural-language section text.
    content: str
348
+
349
+ # ---------- Endpoints ----------
350
@app.get("/")
async def root():
    """Landing endpoint: confirms the service is up and names the POST routes."""
    banner = "Free AI Response API is running (local GGUF model). Use POST /chat or POST /generate-section."
    return {"message": banner}
353
+
354
@app.get("/queue-status")
async def get_queue_status():
    """Expose the limiter's active/queued/max_concurrent counters for load balancing."""
    snapshot = queue_status.get_status()
    return snapshot
358
+
359
@app.post("/chat", response_model=ChatResponse)
async def chat(request: ChatRequest):
    """Answer one chat question with the local model.

    Raises:
        HTTPException 503: the model is not loaded, or no processing slot
            could be granted (the detail carries the queue position).
        HTTPException 500: any unexpected failure while generating.
    """
    # Fail fast before taking a slot. Raised inside the try block below, this
    # 503 used to be caught by `except Exception` and reported as a 500.
    if model is None:
        raise HTTPException(status_code=503, detail="Model not available")

    queue_start = time.time()
    can_process, queue_position = await queue_status.acquire()
    queue_wait = time.time() - queue_start

    if not can_process:
        logger.info(f"Request queued at position {queue_position} (queue wait {queue_wait:.3f}s)")
        # A plain dict here does not satisfy response_model=ChatResponse and
        # surfaced as a validation error; report "busy" as a proper 503.
        raise HTTPException(
            status_code=503,
            detail={
                "status": "queued",
                "queue_position": queue_position,
                "message": f"Request queued at position {queue_position}"
            },
            headers={"Retry-After": "5"},
        )

    logger.info(f"Request started processing after queue wait {queue_wait:.3f}s")
    req_start = time.time()
    try:
        clean_start = time.time()
        cleaned_question = model.clean_question(request.question)
        clean_time = time.time() - clean_start
        logger.info(f"Cleaned question in {clean_time:.4f}s")

        response_text = await model.generate_response(cleaned_question, request.context)

        total_time = time.time() - req_start
        logger.info(f"Chat request completed in {total_time:.2f}s (including queue wait {queue_wait:.3f}s)")
        return ChatResponse(response=response_text)
    except HTTPException:
        # Deliberate HTTP errors keep their own status code.
        raise
    except Exception as e:
        logger.error(f"Error processing request: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail="Internal server error") from e
    finally:
        # Always give the slot back, whether the request succeeded or failed.
        await queue_status.release()
395
+
396
@app.post("/generate-section", response_model=GenerateSectionResponse)
async def generate_section(request: GenerateSectionRequest):
    """Generate one guide section with the local model.

    Raises:
        HTTPException 503: the model is not loaded, or no processing slot
            could be granted (the detail carries the queue position).
        HTTPException 500: any unexpected failure while generating.
    """
    # Fail fast before taking a slot. Raised inside the try block below, this
    # 503 used to be caught by `except Exception` and reported as a 500.
    if model is None:
        raise HTTPException(status_code=503, detail="Model not available")

    queue_start = time.time()
    can_process, queue_position = await queue_status.acquire()
    queue_wait = time.time() - queue_start

    if not can_process:
        logger.info(f"Request queued at position {queue_position} (queue wait {queue_wait:.3f}s)")
        # A plain dict here does not satisfy response_model=GenerateSectionResponse
        # and surfaced as a validation error; report "busy" as a proper 503.
        raise HTTPException(
            status_code=503,
            detail={
                "status": "queued",
                "queue_position": queue_position,
                "message": f"Request queued at position {queue_position}"
            },
            headers={"Retry-After": "5"},
        )

    logger.info(f"Section generation started after queue wait {queue_wait:.3f}s")
    req_start = time.time()
    try:
        content = await model.generate_section(
            request.section_type, request.context, request.compress_input
        )

        total_time = time.time() - req_start
        logger.info(f"Generate-section request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
        return GenerateSectionResponse(content=content)
    except HTTPException:
        # Deliberate HTTP errors keep their own status code.
        raise
    except Exception as e:
        logger.error(f"Error generating section: {e}")
        logger.error(traceback.format_exc())
        raise HTTPException(status_code=500, detail="Internal server error") from e
    finally:
        # Always give the slot back, whether the request succeeded or failed.
        await queue_status.release()
429
+
430
if __name__ == "__main__":
    import uvicorn

    # Local development entry point. The port defaults to 8000 and can be
    # overridden with the PORT environment variable (the Dockerfile CMD starts
    # uvicorn itself on 7860 and does not go through this branch).
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")), log_level="debug")
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ openai>=1.0.0
4
+ python-dotenv==1.0.0
5
+ huggingface-hub==0.35.1