Fatmagician committed on
Commit
b7e622a
·
1 Parent(s): 4537781

New updates faster

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. Dockerfile +32 -63
  3. app.py +99 -98
.gitignore CHANGED
@@ -1,4 +1,5 @@
1
  .venv
 
2
  .env
3
  *.whl
4
  .venv/
 
1
  .venv
2
+ /.git_hf
3
  .env
4
  *.whl
5
  .venv/
Dockerfile CHANGED
@@ -1,97 +1,66 @@
1
- # # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
- # # you will also find guides on how best to write your Dockerfile
3
 
4
- # # FROM python:3.12.3
5
 
6
- # # RUN useradd -m -u 1000 user
7
- # # USER user
8
- # # ENV PATH="/home/user/.local/bin:$PATH"
9
-
10
- # # WORKDIR /app
11
-
12
- # # COPY --chown=user ./requirements.txt requirements.txt
13
- # # RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
- # # RUN mkdir -p /app/models && \
15
- # # wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
16
- # # -O /app/models/llama-2-7b-chat.Q4_K_M.gguf
17
-
18
- # # COPY --chown=user . /app
19
- # # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
20
-
21
- # FROM python:3.12
22
-
23
- # # --- System dependencies ---
24
- # RUN apt-get update && apt-get install -y --no-install-recommends \
25
- # wget \
26
- # libgomp1 \
27
- # && rm -rf /var/lib/apt/lists/*
28
-
29
- # RUN apt-get update && apt-get install -y build-essential cmake
30
-
31
- # # --- Non-root user ---
32
  # RUN useradd -m -u 1000 user
33
  # USER user
34
- # WORKDIR /app
35
  # ENV PATH="/home/user/.local/bin:$PATH"
36
 
37
- # # --- Copy wheel and requirements first ---
38
- # COPY --chown=user llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl .
39
- # COPY --chown=user requirements.txt .
40
-
41
- # ENV CMAKE_ARGS="-DLLAMA_AVX2=on -DLLAMA_FMA=on -DLLAMA_OPENMP=on"
42
-
43
-
44
- # # --- Install dependencies ---
45
- # RUN pip install --no-cache-dir --upgrade pip && \
46
- # pip install --no-cache-dir -r requirements.txt && \
47
- # pip install --no-cache-dir llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl
48
 
49
- # # --- Download model ---
 
50
  # RUN mkdir -p /app/models && \
51
- # wget --progress=bar:force \
52
- # --retry-connrefused \
53
- # --tries=5 \
54
- # --timeout=30 \
55
- # -O /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf \
56
- # https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
57
-
58
- # RUN ls -lh /app/models && \
59
- # du -h /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
60
-
61
- # # --- Copy source code ---
62
- # COPY --chown=user . /app
63
 
64
- # # --- Expose & run ---
65
- # EXPOSE 7860
66
  # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
67
 
68
  FROM python:3.12
69
 
 
70
  RUN apt-get update && apt-get install -y --no-install-recommends \
71
  wget \
72
  libgomp1 \
73
- build-essential \
74
- cmake \
75
  && rm -rf /var/lib/apt/lists/*
76
 
 
 
 
77
  RUN useradd -m -u 1000 user
78
  USER user
79
  WORKDIR /app
80
  ENV PATH="/home/user/.local/bin:$PATH"
81
 
 
 
82
  COPY --chown=user requirements.txt .
83
 
84
  ENV CMAKE_ARGS="-DLLAMA_AVX2=on -DLLAMA_FMA=on -DLLAMA_OPENMP=on"
85
 
 
 
86
  RUN pip install --no-cache-dir --upgrade pip && \
87
  pip install --no-cache-dir -r requirements.txt && \
88
- pip install --no-cache-dir llama-cpp-python
89
 
 
90
  RUN mkdir -p /app/models && \
91
- wget -O /app/models/model.gguf \
92
- https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
93
-
 
 
 
 
 
 
 
 
94
  COPY --chown=user . /app
95
 
 
96
  EXPOSE 7860
97
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
 
4
+ # FROM python:3.12.3
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  # RUN useradd -m -u 1000 user
7
  # USER user
 
8
  # ENV PATH="/home/user/.local/bin:$PATH"
9
 
10
+ # WORKDIR /app
 
 
 
 
 
 
 
 
 
 
11
 
12
+ # COPY --chown=user ./requirements.txt requirements.txt
13
+ # RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
  # RUN mkdir -p /app/models && \
15
+ # wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
16
+ # -O /app/models/llama-2-7b-chat.Q4_K_M.gguf
 
 
 
 
 
 
 
 
 
 
17
 
18
+ # COPY --chown=user . /app
 
19
  # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
20
 
21
  FROM python:3.12
22
 
23
+ # --- System dependencies ---
24
  RUN apt-get update && apt-get install -y --no-install-recommends \
25
  wget \
26
  libgomp1 \
 
 
27
  && rm -rf /var/lib/apt/lists/*
28
 
29
+ RUN apt-get update && apt-get install -y build-essential cmake
30
+
31
+ # --- Non-root user ---
32
  RUN useradd -m -u 1000 user
33
  USER user
34
  WORKDIR /app
35
  ENV PATH="/home/user/.local/bin:$PATH"
36
 
37
+ # --- Copy wheel and requirements first ---
38
+ COPY --chown=user llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl .
39
  COPY --chown=user requirements.txt .
40
 
41
  ENV CMAKE_ARGS="-DLLAMA_AVX2=on -DLLAMA_FMA=on -DLLAMA_OPENMP=on"
42
 
43
+
44
+ # --- Install dependencies ---
45
  RUN pip install --no-cache-dir --upgrade pip && \
46
  pip install --no-cache-dir -r requirements.txt && \
47
+ pip install --no-cache-dir llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl
48
 
49
+ # --- Download model ---
50
  RUN mkdir -p /app/models && \
51
+ wget --progress=bar:force \
52
+ --retry-connrefused \
53
+ --tries=5 \
54
+ --timeout=30 \
55
+ -O /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf \
56
+ https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
57
+
58
+ RUN ls -lh /app/models && \
59
+ du -h /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
60
+
61
+ # --- Copy source code ---
62
  COPY --chown=user . /app
63
 
64
+ # --- Expose & run ---
65
  EXPOSE 7860
66
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -3,9 +3,9 @@ import logging
3
  import asyncio
4
  import time
5
  import traceback
6
- import json # <-- add this
7
- from fastapi import FastAPI, HTTPException
8
- from fastapi.middleware.cors import CORSMiddleware # <-- add this
9
  from pydantic import BaseModel
10
  from llama_cpp import Llama
11
  from contextlib import asynccontextmanager
@@ -43,7 +43,7 @@ class QueueStatus:
43
  async with self._lock:
44
  if self.active_tasks < self.max_concurrent:
45
  self.active_tasks += 1
46
- return True, 0 # No queue position
47
  else:
48
  position = len(self.pending_queue) + 1
49
  future = asyncio.Future()
@@ -109,7 +109,6 @@ class MixtralFreeModel:
109
  verbose=False,
110
  seed=42,
111
  )
112
-
113
  load_time = time.time() - start_time
114
  logger.info(f"GGUF model loaded successfully in {load_time:.2f}s")
115
  except Exception as e:
@@ -117,7 +116,6 @@ class MixtralFreeModel:
117
  raise
118
 
119
  async def warm_up(self) -> None:
120
- """Perform a short test inference to warm up the model."""
121
  logger.info("Warming up model with test inference...")
122
  start_time = time.time()
123
  try:
@@ -128,11 +126,10 @@ class MixtralFreeModel:
128
  logger.warning(f"Model warm-up failed: {e}")
129
 
130
  async def _generate_completion(self, prompt: str, max_tokens: int = None, temperature: float = None) -> str:
131
- """Helper to run a blocking completion in a thread."""
132
  if max_tokens is None:
133
  max_tokens = self.max_tokens
134
  if temperature is None:
135
- temperature = self.temperature
136
 
137
  def _blocking():
138
  start = time.time()
@@ -141,7 +138,7 @@ class MixtralFreeModel:
141
  max_tokens=max_tokens,
142
  temperature=temperature,
143
  top_p=0.95,
144
- stop=["</s>"],
145
  echo=False,
146
  stream=False
147
  )
@@ -152,11 +149,6 @@ class MixtralFreeModel:
152
  return await asyncio.to_thread(_blocking)
153
 
154
  async def generate_response(self, question: str, context: str = "") -> str:
155
- """
156
- Generate a response using the local GGUF model.
157
- For guide creation requests, enforces a strict JSON output format.
158
- """
159
- # Check if the user is asking to create a guide
160
  is_guide_request = any(phrase in question.lower() for phrase in
161
  ["guide", "create a guide", "make a guide", "step by step", "tutorial"])
162
 
@@ -174,7 +166,6 @@ class MixtralFreeModel:
174
 
175
  Now produce the JSON object for the user's request:"""
176
  else:
177
- # Normal assistant prompt
178
  system_prompt = f"""You are a helpful, accurate, and context-aware assistant. Use the conversation history below to provide a relevant and useful answer to the question.
179
 
180
  IMPORTANT:
@@ -188,20 +179,17 @@ class MixtralFreeModel:
188
 
189
  Provide a helpful response"""
190
 
191
- prompt = f"<s>[INST] {system_prompt} [/INST] {question}"
192
 
193
  try:
194
  response_text = await self._generate_completion(prompt, max_tokens=512)
195
 
196
- # For guide requests, extract and return only the JSON object
197
  if is_guide_request:
198
  import re
199
- # Match a JSON object containing "action": "generate_guide"
200
  match = re.search(r'\{[^{}]*"action"\s*:\s*"generate_guide"[^{}]*\}', response_text, re.DOTALL)
201
  if match:
202
  return match.group(0)
203
  else:
204
- # Fallback: return a default JSON (so frontend still works)
205
  logger.warning("Model did not return valid JSON for guide request. Using fallback.")
206
  return json.dumps({
207
  "action": "generate_guide",
@@ -209,14 +197,11 @@ class MixtralFreeModel:
209
  "sections": ["Overview", "Prerequisites", "Step-by-Step Instructions", "Tools & Assets", "Flow"]
210
  })
211
  return response_text
212
-
213
  except Exception as e:
214
  logger.error(f"Error in generation: {str(e)}")
215
  return "I apologize, but I'm having trouble responding right now."
216
 
217
  def clean_question(self, question: str) -> str:
218
- """Remove command prefixes from the question."""
219
- start = time.time()
220
  prefixes = ['!bot', '!ai', '@bot', 'bot,', '!ai_search']
221
  if not question or not question.strip():
222
  return question
@@ -225,17 +210,11 @@ class MixtralFreeModel:
225
  for prefix in prefixes:
226
  if question_lower.startswith(prefix.lower()):
227
  cleaned = original_question[len(prefix):].lstrip(' ,!:@')
228
- elapsed = time.time() - start
229
- logger.debug(f"Cleaned question in {elapsed:.4f}s: '{cleaned}'")
230
  return cleaned
231
- elapsed = time.time() - start
232
- logger.debug(f"No prefix to clean, took {elapsed:.4f}s")
233
  return original_question
234
 
235
  async def compress_input(self, text: str, max_tokens: int = 500) -> str:
236
- """Compress long input into a concise summary."""
237
  if len(text.split()) < max_tokens:
238
- logger.debug("Input already under token limit, skipping compression")
239
  return text
240
  logger.info(f"Compressing input of {len(text.split())} words...")
241
  start = time.time()
@@ -245,9 +224,8 @@ class MixtralFreeModel:
245
  logger.info(f"Compression completed in {elapsed:.2f}s")
246
  return summary
247
 
248
- async def generate_efficient_section(self, section_type: str, context: str, max_tokens: int = 200) -> str:
249
- """Generate a compressed, efficient language representation of a section."""
250
- logger.info(f"Generating efficient representation for section '{section_type}'...")
251
  start = time.time()
252
  system = f"You are an expert task guide writer. Generate content for the section \"{section_type}\" in an efficient language format.\nUse a structured format like:\n- Key point 1: details\n- Key point 2: details\nOr use JSON if appropriate. Keep it concise and use at most {max_tokens} tokens."
253
  prompt = f"<s>[INST] {system}\n\nContext: {context}\nGenerate the efficient language for {section_type} section. [/INST]"
@@ -256,11 +234,23 @@ class MixtralFreeModel:
256
  logger.info(f"Efficient section generation took {elapsed:.2f}s")
257
  return efficient
258
 
259
- async def expand_efficient_to_natural(self, efficient_text: str, section_type: str, max_tokens: int = 512) -> str:
260
- """Expand efficient language into detailed natural language."""
261
  logger.info(f"Expanding efficient language to natural text for section '{section_type}'...")
262
  start = time.time()
263
- system = f"You are an expert task guide writer. Expand the following efficient language into a detailed, clear, and helpful section titled \"{section_type}\".\nUse markdown formatting, bullet points, subheadings, and ensure it's easy to understand. Make it comprehensive."
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  prompt = f"<s>[INST] {system}\n\nEfficient language:\n{efficient_text}\n\nWrite the full {section_type} section now. [/INST]"
265
  expanded = await self._generate_completion(prompt, max_tokens=max_tokens)
266
  elapsed = time.time() - start
@@ -268,8 +258,6 @@ class MixtralFreeModel:
268
  return expanded
269
 
270
  async def generate_flow_diagram(self, context: str) -> str:
271
- """Generate a Mermaid flowchart for the Flow section."""
272
-
273
  prompt = f"""[INST] You are an expert at creating Mermaid flowcharts for task guides.
274
 
275
  STRICT RULES:
@@ -294,37 +282,20 @@ class MixtralFreeModel:
294
  Now generate the diagram. [/INST]"""
295
 
296
  try:
297
- response = await self._generate_completion(
298
- prompt,
299
- max_tokens=512,
300
- temperature=0.2
301
- )
302
-
303
  response = response.strip()
304
-
305
- # ✅ Case 1: Model already returns proper block
306
  if response.startswith("```mermaid") and response.endswith("```"):
307
  return response
308
-
309
- # ✅ Case 2: Model returns raw flowchart without code block
310
  if "flowchart" in response or "graph" in response:
311
  return f"```mermaid\n{response}\n```"
312
-
313
- # ❌ Case 3: Model output is garbage → fallback
314
  logger.warning("Invalid Mermaid output, using fallback diagram.")
315
-
316
  return """```mermaid
317
-
318
  flowchart TD
319
  A[Start] --> B[Follow the steps above]
320
  B --> C[Complete task]
321
  C --> D[End]"""
322
-
323
-
324
  except Exception as e:
325
  logger.error(f"Flow diagram generation failed: {e}")
326
-
327
- # ❌ Hard fallback (error case)
328
  return """```mermaid
329
  flowchart TD
330
  A[Start] --> B[Error generating diagram]
@@ -333,25 +304,16 @@ class MixtralFreeModel:
333
  ```"""
334
 
335
  async def generate_section(self, section_type: str, context: str, compress_input: bool = True) -> str:
336
- """Generate a detailed section using compress -> efficient -> expand pipeline."""
337
  total_start = time.time()
338
- # Special handling for Flow section
339
  if section_type.lower() == "flow":
340
  return await self.generate_flow_diagram(context)
341
  logger.info(f"Starting section generation for '{section_type}' (compress_input={compress_input})")
342
- # Step 1: compress input if needed
343
  if compress_input and len(context.split()) > 1500:
344
- logger.info("Input context large, compressing...")
345
  context = await self.compress_input(context, max_tokens=1000)
346
  else:
347
  logger.info(f"Input context size OK: {len(context.split())} words")
348
-
349
- # Step 2: generate efficient language
350
  efficient = await self.generate_efficient_section(section_type, context)
351
-
352
- # Step 3: expand to natural language
353
  expanded = await self.expand_efficient_to_natural(efficient, section_type)
354
-
355
  total_time = time.time() - total_start
356
  logger.info(f"Total section generation time: {total_time:.2f}s")
357
  return expanded
@@ -374,7 +336,6 @@ async def lifespan(app: FastAPI):
374
  logger.error(f"Failed to initialize model: {e}")
375
  model = None
376
  yield
377
- # Shutdown
378
  logger.info("Shutting down, releasing model resources.")
379
  model = None
380
  logger.info("Shutdown complete.")
@@ -389,9 +350,9 @@ app = FastAPI(
389
 
390
  app.add_middleware(
391
  CORSMiddleware,
392
- allow_origins=["*"], # For development; restrict in production
393
  allow_credentials=True,
394
- allow_methods=["*"], # Allows all methods, including OPTIONS
395
  allow_headers=["*"],
396
  )
397
 
@@ -405,20 +366,26 @@ class ChatResponse(BaseModel):
405
 
406
  class GenerateSectionRequest(BaseModel):
407
  section_type: str
408
- context: str
 
409
  compress_input: bool = True
410
 
411
  class GenerateSectionResponse(BaseModel):
412
  content: str
413
 
 
 
 
 
 
 
414
  # ---------- Endpoints ----------
415
  @app.get("/")
416
  async def root():
417
- return {"message": "Free AI Response API is running (local GGUF model). Use POST /chat or POST /generate-section."}
418
 
419
  @app.get("/queue-status")
420
  async def get_queue_status():
421
- """Return current queue status for load balancing."""
422
  return queue_status.get_status()
423
 
424
  @app.post("/chat", response_model=ChatResponse)
@@ -428,28 +395,18 @@ async def chat(request: ChatRequest):
428
  queue_wait = time.time() - queue_start
429
 
430
  if not can_process:
431
- logger.info(f"Request queued at position {queue_position} (queue wait {queue_wait:.3f}s)")
432
- return {
433
- "status": "queued",
434
- "queue_position": queue_position,
435
- "message": f"Request queued at position {queue_position}"
436
- }
437
 
438
  logger.info(f"Request started processing after queue wait {queue_wait:.3f}s")
439
  req_start = time.time()
440
  try:
441
  if model is None:
442
  raise HTTPException(status_code=503, detail="Model not available")
443
-
444
- clean_start = time.time()
445
  cleaned_question = model.clean_question(request.question)
446
- clean_time = time.time() - clean_start
447
- logger.info(f"Cleaned question in {clean_time:.4f}s")
448
-
449
  response_text = await model.generate_response(cleaned_question, request.context)
450
-
451
  total_time = time.time() - req_start
452
- logger.info(f"Chat request completed in {total_time:.2f}s (including queue wait {queue_wait:.3f}s)")
453
  return ChatResponse(response=response_text)
454
  except Exception as e:
455
  logger.error(f"Error processing request: {e}")
@@ -459,32 +416,48 @@ async def chat(request: ChatRequest):
459
  await queue_status.release()
460
 
461
  @app.post("/generate-section", response_model=GenerateSectionResponse)
462
- async def generate_section(request: GenerateSectionRequest):
463
  queue_start = time.time()
464
  can_process, queue_position = await queue_status.acquire()
465
  queue_wait = time.time() - queue_start
466
 
467
  if not can_process:
468
- logger.info(f"Request queued at position {queue_position} (queue wait {queue_wait:.3f}s)")
469
- return {
470
- "status": "queued",
471
- "queue_position": queue_position,
472
- "message": f"Request queued at position {queue_position}"
473
- }
474
 
475
  logger.info(f"Section generation started after queue wait {queue_wait:.3f}s")
476
- req_start = time.time()
477
  try:
478
  if model is None:
479
  raise HTTPException(status_code=503, detail="Model not available")
480
-
481
- content = await model.generate_section(
482
- request.section_type, request.context, request.compress_input
483
- )
484
-
485
- total_time = time.time() - req_start
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  logger.info(f"Generate-section request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
487
- return GenerateSectionResponse(content=content)
488
  except Exception as e:
489
  logger.error(f"Error generating section: {e}")
490
  logger.error(traceback.format_exc())
@@ -492,6 +465,34 @@ async def generate_section(request: GenerateSectionRequest):
492
  finally:
493
  await queue_status.release()
494
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
  if __name__ == "__main__":
496
  import uvicorn
497
- uvicorn.run(app, host="0.0.0.0", port=8000, log_level="debug")
 
3
  import asyncio
4
  import time
5
  import traceback
6
+ import json
7
+ from fastapi import FastAPI, HTTPException, Request
8
+ from fastapi.middleware.cors import CORSMiddleware
9
  from pydantic import BaseModel
10
  from llama_cpp import Llama
11
  from contextlib import asynccontextmanager
 
43
  async with self._lock:
44
  if self.active_tasks < self.max_concurrent:
45
  self.active_tasks += 1
46
+ return True, 0
47
  else:
48
  position = len(self.pending_queue) + 1
49
  future = asyncio.Future()
 
109
  verbose=False,
110
  seed=42,
111
  )
 
112
  load_time = time.time() - start_time
113
  logger.info(f"GGUF model loaded successfully in {load_time:.2f}s")
114
  except Exception as e:
 
116
  raise
117
 
118
  async def warm_up(self) -> None:
 
119
  logger.info("Warming up model with test inference...")
120
  start_time = time.time()
121
  try:
 
126
  logger.warning(f"Model warm-up failed: {e}")
127
 
128
  async def _generate_completion(self, prompt: str, max_tokens: int = None, temperature: float = None) -> str:
 
129
  if max_tokens is None:
130
  max_tokens = self.max_tokens
131
  if temperature is None:
132
+ temperature = 0.3
133
 
134
  def _blocking():
135
  start = time.time()
 
138
  max_tokens=max_tokens,
139
  temperature=temperature,
140
  top_p=0.95,
141
+ stop=[],
142
  echo=False,
143
  stream=False
144
  )
 
149
  return await asyncio.to_thread(_blocking)
150
 
151
  async def generate_response(self, question: str, context: str = "") -> str:
 
 
 
 
 
152
  is_guide_request = any(phrase in question.lower() for phrase in
153
  ["guide", "create a guide", "make a guide", "step by step", "tutorial"])
154
 
 
166
 
167
  Now produce the JSON object for the user's request:"""
168
  else:
 
169
  system_prompt = f"""You are a helpful, accurate, and context-aware assistant. Use the conversation history below to provide a relevant and useful answer to the question.
170
 
171
  IMPORTANT:
 
179
 
180
  Provide a helpful response"""
181
 
182
+ prompt = f"<s>[INST] {system_prompt}\n\nNow handle this user request: {question} [/INST]"
183
 
184
  try:
185
  response_text = await self._generate_completion(prompt, max_tokens=512)
186
 
 
187
  if is_guide_request:
188
  import re
 
189
  match = re.search(r'\{[^{}]*"action"\s*:\s*"generate_guide"[^{}]*\}', response_text, re.DOTALL)
190
  if match:
191
  return match.group(0)
192
  else:
 
193
  logger.warning("Model did not return valid JSON for guide request. Using fallback.")
194
  return json.dumps({
195
  "action": "generate_guide",
 
197
  "sections": ["Overview", "Prerequisites", "Step-by-Step Instructions", "Tools & Assets", "Flow"]
198
  })
199
  return response_text
 
200
  except Exception as e:
201
  logger.error(f"Error in generation: {str(e)}")
202
  return "I apologize, but I'm having trouble responding right now."
203
 
204
  def clean_question(self, question: str) -> str:
 
 
205
  prefixes = ['!bot', '!ai', '@bot', 'bot,', '!ai_search']
206
  if not question or not question.strip():
207
  return question
 
210
  for prefix in prefixes:
211
  if question_lower.startswith(prefix.lower()):
212
  cleaned = original_question[len(prefix):].lstrip(' ,!:@')
 
 
213
  return cleaned
 
 
214
  return original_question
215
 
216
  async def compress_input(self, text: str, max_tokens: int = 500) -> str:
 
217
  if len(text.split()) < max_tokens:
 
218
  return text
219
  logger.info(f"Compressing input of {len(text.split())} words...")
220
  start = time.time()
 
224
  logger.info(f"Compression completed in {elapsed:.2f}s")
225
  return summary
226
 
227
+ async def generate_efficient_section(self, section_type: str, context: str, max_tokens: int = 300) -> str:
228
+ logger.info(f"Generating efficient representation for '{section_type}'...")
 
229
  start = time.time()
230
  system = f"You are an expert task guide writer. Generate content for the section \"{section_type}\" in an efficient language format.\nUse a structured format like:\n- Key point 1: details\n- Key point 2: details\nOr use JSON if appropriate. Keep it concise and use at most {max_tokens} tokens."
231
  prompt = f"<s>[INST] {system}\n\nContext: {context}\nGenerate the efficient language for {section_type} section. [/INST]"
 
234
  logger.info(f"Efficient section generation took {elapsed:.2f}s")
235
  return efficient
236
 
237
+ async def expand_efficient_to_natural(self, efficient_text: str, section_type: str, max_tokens: int = 300) -> str:
 
238
  logger.info(f"Expanding efficient language to natural text for section '{section_type}'...")
239
  start = time.time()
240
+ system = f"""You are an expert task guide writer.
241
+ Expand the efficient language into a **short but helpful** section titled "{section_type}".
242
+
243
+ STRICT RULES:
244
+ - Maximum 120 words total.
245
+ - Use markdown subheadings (###) and bullet points.
246
+ - No long paragraphs – break into 3-5 bullet points or short phrases.
247
+ - Skip introductions, conclusions, and fluff.
248
+ - Keep the tone professional and clear.
249
+
250
+ Efficient language:
251
+ {efficient_text}
252
+
253
+ Write the {section_type} section now:"""
254
  prompt = f"<s>[INST] {system}\n\nEfficient language:\n{efficient_text}\n\nWrite the full {section_type} section now. [/INST]"
255
  expanded = await self._generate_completion(prompt, max_tokens=max_tokens)
256
  elapsed = time.time() - start
 
258
  return expanded
259
 
260
  async def generate_flow_diagram(self, context: str) -> str:
 
 
261
  prompt = f"""[INST] You are an expert at creating Mermaid flowcharts for task guides.
262
 
263
  STRICT RULES:
 
282
  Now generate the diagram. [/INST]"""
283
 
284
  try:
285
+ response = await self._generate_completion(prompt, max_tokens=512, temperature=0.2)
 
 
 
 
 
286
  response = response.strip()
 
 
287
  if response.startswith("```mermaid") and response.endswith("```"):
288
  return response
 
 
289
  if "flowchart" in response or "graph" in response:
290
  return f"```mermaid\n{response}\n```"
 
 
291
  logger.warning("Invalid Mermaid output, using fallback diagram.")
 
292
  return """```mermaid
 
293
  flowchart TD
294
  A[Start] --> B[Follow the steps above]
295
  B --> C[Complete task]
296
  C --> D[End]"""
 
 
297
  except Exception as e:
298
  logger.error(f"Flow diagram generation failed: {e}")
 
 
299
  return """```mermaid
300
  flowchart TD
301
  A[Start] --> B[Error generating diagram]
 
304
  ```"""
305
 
306
  async def generate_section(self, section_type: str, context: str, compress_input: bool = True) -> str:
 
307
  total_start = time.time()
 
308
  if section_type.lower() == "flow":
309
  return await self.generate_flow_diagram(context)
310
  logger.info(f"Starting section generation for '{section_type}' (compress_input={compress_input})")
 
311
  if compress_input and len(context.split()) > 1500:
 
312
  context = await self.compress_input(context, max_tokens=1000)
313
  else:
314
  logger.info(f"Input context size OK: {len(context.split())} words")
 
 
315
  efficient = await self.generate_efficient_section(section_type, context)
 
 
316
  expanded = await self.expand_efficient_to_natural(efficient, section_type)
 
317
  total_time = time.time() - total_start
318
  logger.info(f"Total section generation time: {total_time:.2f}s")
319
  return expanded
 
336
  logger.error(f"Failed to initialize model: {e}")
337
  model = None
338
  yield
 
339
  logger.info("Shutting down, releasing model resources.")
340
  model = None
341
  logger.info("Shutdown complete.")
 
350
 
351
  app.add_middleware(
352
  CORSMiddleware,
353
+ allow_origins=["*"],
354
  allow_credentials=True,
355
+ allow_methods=["*"],
356
  allow_headers=["*"],
357
  )
358
 
 
366
 
367
  class GenerateSectionRequest(BaseModel):
368
  section_type: str
369
+ context: str = "" # legacy, optional
370
+ compressed_context: str = None # new field (skip efficient phase)
371
  compress_input: bool = True
372
 
373
  class GenerateSectionResponse(BaseModel):
374
  content: str
375
 
376
+ class CompressQueryRequest(BaseModel):
377
+ prompt: str
378
+
379
+ class CompressQueryResponse(BaseModel):
380
+ compressed: str
381
+
382
  # ---------- Endpoints ----------
383
  @app.get("/")
384
  async def root():
385
+ return {"message": "Free AI Response API is running. Use POST /chat, POST /generate-section, or POST /compress-query."}
386
 
387
  @app.get("/queue-status")
388
  async def get_queue_status():
 
389
  return queue_status.get_status()
390
 
391
  @app.post("/chat", response_model=ChatResponse)
 
395
  queue_wait = time.time() - queue_start
396
 
397
  if not can_process:
398
+ logger.info(f"Request queued at position {queue_position}")
399
+ return {"status": "queued", "queue_position": queue_position}
 
 
 
 
400
 
401
  logger.info(f"Request started processing after queue wait {queue_wait:.3f}s")
402
  req_start = time.time()
403
  try:
404
  if model is None:
405
  raise HTTPException(status_code=503, detail="Model not available")
 
 
406
  cleaned_question = model.clean_question(request.question)
 
 
 
407
  response_text = await model.generate_response(cleaned_question, request.context)
 
408
  total_time = time.time() - req_start
409
+ logger.info(f"Chat request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
410
  return ChatResponse(response=response_text)
411
  except Exception as e:
412
  logger.error(f"Error processing request: {e}")
 
416
  await queue_status.release()
417
 
418
  @app.post("/generate-section", response_model=GenerateSectionResponse)
419
+ async def generate_section_endpoint(request: GenerateSectionRequest):
420
  queue_start = time.time()
421
  can_process, queue_position = await queue_status.acquire()
422
  queue_wait = time.time() - queue_start
423
 
424
  if not can_process:
425
+ return {"status": "queued", "queue_position": queue_position}
 
 
 
 
 
426
 
427
  logger.info(f"Section generation started after queue wait {queue_wait:.3f}s")
 
428
  try:
429
  if model is None:
430
  raise HTTPException(status_code=503, detail="Model not available")
431
+
432
+ # SPECIAL CASE: Flow section -> generate Mermaid diagram
433
+ if request.section_type.lower() == "flow":
434
+ # For Flow, always generate a Mermaid diagram instead of running the efficient/expand steps
435
+ # Diagram context: prefer compressed_context when provided, otherwise fall back to context
436
+ if request.compressed_context:
437
+ context = request.compressed_context
438
+ else:
439
+ context = request.context
440
+ diagram = await model.generate_flow_diagram(context)
441
+ total_time = time.time() - queue_start
442
+ logger.info(f"Flow diagram generated in {total_time:.2f}s")
443
+ return GenerateSectionResponse(content=diagram)
444
+
445
+ # Normal sections: use compressed_context if provided, else efficient+expand
446
+ if request.compressed_context:
447
+ efficient_repr = request.compressed_context
448
+ logger.info(f"Using provided compressed context for section '{request.section_type}'")
449
+ else:
450
+ context_to_use = request.context
451
+ if request.compress_input and len(context_to_use.split()) > 1500:
452
+ logger.info("Input context large, compressing...")
453
+ context_to_use = await model.compress_input(context_to_use, max_tokens=1000)
454
+ efficient_repr = await model.generate_efficient_section(request.section_type, context_to_use)
455
+
456
+ expanded = await model.expand_efficient_to_natural(efficient_repr, request.section_type)
457
+
458
+ total_time = time.time() - queue_start
459
  logger.info(f"Generate-section request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
460
+ return GenerateSectionResponse(content=expanded)
461
  except Exception as e:
462
  logger.error(f"Error generating section: {e}")
463
  logger.error(traceback.format_exc())
 
465
  finally:
466
  await queue_status.release()
467
 
468
+ @app.post("/compress-query", response_model=CompressQueryResponse)
469
+ async def compress_query_endpoint(request: CompressQueryRequest):
470
+ queue_start = time.time()
471
+ can_process, queue_position = await queue_status.acquire()
472
+ queue_wait = time.time() - queue_start
473
+
474
+ if not can_process:
475
+ return {"status": "queued", "queue_position": queue_position}
476
+
477
+ logger.info(f"Compress-query started after queue wait {queue_wait:.3f}s")
478
+ try:
479
+ if model is None:
480
+ raise HTTPException(status_code=503, detail="Model not available")
481
+
482
+ # Use generate_efficient_section with a special context to compress the user prompt
483
+ compressed = await model.generate_efficient_section(
484
+ section_type="QueryCompression",
485
+ context=f"User request: {request.prompt}\nProduce a dense, efficient representation (bullet points or key-value pairs) of the user's intent, steps, and requirements. Keep under 300 tokens."
486
+ )
487
+ total_time = time.time() - queue_start
488
+ logger.info(f"Compress-query completed in {total_time:.2f}s")
489
+ return CompressQueryResponse(compressed=compressed)
490
+ except Exception as e:
491
+ logger.error(f"Error compressing query: {e}")
492
+ raise HTTPException(status_code=500, detail="Internal server error")
493
+ finally:
494
+ await queue_status.release()
495
+
496
  if __name__ == "__main__":
497
  import uvicorn
498
+ uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")