Spaces:

CiegCieg
/

InstructJet-AI

Sleeping

App Files Files Community

Fatmagician committed on May 2

Commit

b7e622a

1 Parent(s): 4537781

New updates faster

Browse files

Files changed (3) hide show

.gitignore +1 -0
Dockerfile +32 -63
app.py +99 -98

.gitignore CHANGED Viewed

@@ -1,4 +1,5 @@
 .venv
 .env
 *.whl
 .venv/

 .venv
+/.git_hf
 .env
 *.whl
 .venv/

Dockerfile CHANGED Viewed

@@ -1,97 +1,66 @@
-# # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
-# # you will also find guides on how best to write your Dockerfile
-# # FROM python:3.12.3
-# # RUN useradd -m -u 1000 user
-# # USER user
-# # ENV PATH="/home/user/.local/bin:$PATH"
-# # WORKDIR /app
-# # COPY --chown=user ./requirements.txt requirements.txt
-# # RUN pip install --no-cache-dir --upgrade -r requirements.txt
-# # RUN mkdir -p /app/models && \
-# #     wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
-# #          -O /app/models/llama-2-7b-chat.Q4_K_M.gguf
-# # COPY --chown=user . /app
-# # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
-# FROM python:3.12
-# # --- System dependencies ---
-# RUN apt-get update && apt-get install -y --no-install-recommends \
-#     wget \
-#     libgomp1 \
-#     && rm -rf /var/lib/apt/lists/*
-# RUN apt-get update && apt-get install -y build-essential cmake
-# # --- Non-root user ---
 # RUN useradd -m -u 1000 user
 # USER user
-# WORKDIR /app
 # ENV PATH="/home/user/.local/bin:$PATH"
-# # --- Copy wheel and requirements first ---
-# COPY --chown=user llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl .
-# COPY --chown=user requirements.txt .
-# ENV CMAKE_ARGS="-DLLAMA_AVX2=on -DLLAMA_FMA=on -DLLAMA_OPENMP=on"
-# # --- Install dependencies ---
-# RUN pip install --no-cache-dir --upgrade pip && \
-#     pip install --no-cache-dir -r requirements.txt && \
-#     pip install --no-cache-dir llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl
-# # --- Download model ---
 # RUN mkdir -p /app/models && \
-#     wget --progress=bar:force \
-#          --retry-connrefused \
-#          --tries=5 \
-#          --timeout=30 \
-#          -O /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf \
-#          https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
-# RUN ls -lh /app/models && \
-#     du -h /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
-# # --- Copy source code ---
-# COPY --chown=user . /app
-# # --- Expose & run ---
-# EXPOSE 7860
 # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 FROM python:3.12
 RUN apt-get update && apt-get install -y --no-install-recommends \
     wget \
     libgomp1 \
-    build-essential \
-    cmake \
     && rm -rf /var/lib/apt/lists/*
 RUN useradd -m -u 1000 user
 USER user
 WORKDIR /app
 ENV PATH="/home/user/.local/bin:$PATH"
 COPY --chown=user requirements.txt .
 ENV CMAKE_ARGS="-DLLAMA_AVX2=on -DLLAMA_FMA=on -DLLAMA_OPENMP=on"
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt && \
-    pip install --no-cache-dir llama-cpp-python
 RUN mkdir -p /app/models && \
-    wget -O /app/models/model.gguf \
-    https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
 COPY --chown=user . /app
 EXPOSE 7860
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+# FROM python:3.12.3
 # RUN useradd -m -u 1000 user
 # USER user
 # ENV PATH="/home/user/.local/bin:$PATH"
+# WORKDIR /app
+# COPY --chown=user ./requirements.txt requirements.txt
+# RUN pip install --no-cache-dir --upgrade -r requirements.txt
 # RUN mkdir -p /app/models && \
+#     wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
+#          -O /app/models/llama-2-7b-chat.Q4_K_M.gguf
+# COPY --chown=user . /app
 # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 FROM python:3.12
+# --- System dependencies ---
 RUN apt-get update && apt-get install -y --no-install-recommends \
     wget \
     libgomp1 \
     && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y build-essential cmake
+# --- Non-root user ---
 RUN useradd -m -u 1000 user
 USER user
 WORKDIR /app
 ENV PATH="/home/user/.local/bin:$PATH"
+# --- Copy wheel and requirements first ---
+COPY --chown=user llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl .
 COPY --chown=user requirements.txt .
 ENV CMAKE_ARGS="-DLLAMA_AVX2=on -DLLAMA_FMA=on -DLLAMA_OPENMP=on"
+# --- Install dependencies ---
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt && \
+    pip install --no-cache-dir llama_cpp_python-0.3.20-py3-none-linux_x86_64.whl
+# --- Download model ---
 RUN mkdir -p /app/models && \
+    wget --progress=bar:force \
+         --retry-connrefused \
+         --tries=5 \
+         --timeout=30 \
+         -O /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf \
+         https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-GGUF/resolve/main/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
+RUN ls -lh /app/models && \
+    du -h /app/models/Ministral-3-3B-Instruct-2512-Q4_K_M.gguf
+# --- Copy source code ---
 COPY --chown=user . /app
+# --- Expose & run ---
 EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py CHANGED Viewed

@@ -3,9 +3,9 @@ import logging
 import asyncio
 import time
 import traceback
-import json                     # <-- add this
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware   # <-- add this
 from pydantic import BaseModel
 from llama_cpp import Llama
 from contextlib import asynccontextmanager
@@ -43,7 +43,7 @@ class QueueStatus:
         async with self._lock:
             if self.active_tasks < self.max_concurrent:
                 self.active_tasks += 1
-                return True, 0  # No queue position
             else:
                 position = len(self.pending_queue) + 1
                 future = asyncio.Future()
@@ -109,7 +109,6 @@ class MixtralFreeModel:
                 verbose=False,
                 seed=42,
             )
             load_time = time.time() - start_time
             logger.info(f"GGUF model loaded successfully in {load_time:.2f}s")
         except Exception as e:
@@ -117,7 +116,6 @@ class MixtralFreeModel:
             raise
     async def warm_up(self) -> None:
-        """Perform a short test inference to warm up the model."""
         logger.info("Warming up model with test inference...")
         start_time = time.time()
         try:
@@ -128,11 +126,10 @@ class MixtralFreeModel:
             logger.warning(f"Model warm-up failed: {e}")
     async def _generate_completion(self, prompt: str, max_tokens: int = None, temperature: float = None) -> str:
-        """Helper to run a blocking completion in a thread."""
         if max_tokens is None:
             max_tokens = self.max_tokens
         if temperature is None:
-            temperature = self.temperature
         def _blocking():
             start = time.time()
@@ -141,7 +138,7 @@ class MixtralFreeModel:
                 max_tokens=max_tokens,
                 temperature=temperature,
                 top_p=0.95,
-                stop=["</s>"],
                 echo=False,
                 stream=False
             )
@@ -152,11 +149,6 @@ class MixtralFreeModel:
         return await asyncio.to_thread(_blocking)
     async def generate_response(self, question: str, context: str = "") -> str:
-        """
-        Generate a response using the local GGUF model.
-        For guide creation requests, enforces a strict JSON output format.
-        """
-        # Check if the user is asking to create a guide
         is_guide_request = any(phrase in question.lower() for phrase in
                             ["guide", "create a guide", "make a guide", "step by step", "tutorial"])
@@ -174,7 +166,6 @@ class MixtralFreeModel:
         Now produce the JSON object for the user's request:"""
         else:
-            # Normal assistant prompt
             system_prompt = f"""You are a helpful, accurate, and context-aware assistant. Use the conversation history below to provide a relevant and useful answer to the question.
     IMPORTANT:
@@ -188,20 +179,17 @@ class MixtralFreeModel:
     Provide a helpful response"""
-        prompt = f"<s>[INST] {system_prompt} [/INST] {question}"
         try:
             response_text = await self._generate_completion(prompt, max_tokens=512)
-            # For guide requests, extract and return only the JSON object
             if is_guide_request:
                 import re
-                # Match a JSON object containing "action": "generate_guide"
                 match = re.search(r'\{[^{}]*"action"\s*:\s*"generate_guide"[^{}]*\}', response_text, re.DOTALL)
                 if match:
                     return match.group(0)
                 else:
-                    # Fallback: return a default JSON (so frontend still works)
                     logger.warning("Model did not return valid JSON for guide request. Using fallback.")
                     return json.dumps({
                         "action": "generate_guide",
@@ -209,14 +197,11 @@ class MixtralFreeModel:
                         "sections": ["Overview", "Prerequisites", "Step-by-Step Instructions", "Tools & Assets", "Flow"]
                     })
             return response_text
         except Exception as e:
             logger.error(f"Error in generation: {str(e)}")
             return "I apologize, but I'm having trouble responding right now."
     def clean_question(self, question: str) -> str:
-        """Remove command prefixes from the question."""
-        start = time.time()
         prefixes = ['!bot', '!ai', '@bot', 'bot,', '!ai_search']
         if not question or not question.strip():
             return question
@@ -225,17 +210,11 @@ class MixtralFreeModel:
         for prefix in prefixes:
             if question_lower.startswith(prefix.lower()):
                 cleaned = original_question[len(prefix):].lstrip(' ,!:@')
-                elapsed = time.time() - start
-                logger.debug(f"Cleaned question in {elapsed:.4f}s: '{cleaned}'")
                 return cleaned
-        elapsed = time.time() - start
-        logger.debug(f"No prefix to clean, took {elapsed:.4f}s")
         return original_question
     async def compress_input(self, text: str, max_tokens: int = 500) -> str:
-        """Compress long input into a concise summary."""
         if len(text.split()) < max_tokens:
-            logger.debug("Input already under token limit, skipping compression")
             return text
         logger.info(f"Compressing input of {len(text.split())} words...")
         start = time.time()
@@ -245,9 +224,8 @@ class MixtralFreeModel:
         logger.info(f"Compression completed in {elapsed:.2f}s")
         return summary
-    async def generate_efficient_section(self, section_type: str, context: str, max_tokens: int = 200) -> str:
-        """Generate a compressed, efficient language representation of a section."""
-        logger.info(f"Generating efficient representation for section '{section_type}'...")
         start = time.time()
         system = f"You are an expert task guide writer. Generate content for the section \"{section_type}\" in an efficient language format.\nUse a structured format like:\n- Key point 1: details\n- Key point 2: details\nOr use JSON if appropriate. Keep it concise and use at most {max_tokens} tokens."
         prompt = f"<s>[INST] {system}\n\nContext: {context}\nGenerate the efficient language for {section_type} section. [/INST]"
@@ -256,11 +234,23 @@ class MixtralFreeModel:
         logger.info(f"Efficient section generation took {elapsed:.2f}s")
         return efficient
-    async def expand_efficient_to_natural(self, efficient_text: str, section_type: str, max_tokens: int = 512) -> str:
-        """Expand efficient language into detailed natural language."""
         logger.info(f"Expanding efficient language to natural text for section '{section_type}'...")
         start = time.time()
-        system = f"You are an expert task guide writer. Expand the following efficient language into a detailed, clear, and helpful section titled \"{section_type}\".\nUse markdown formatting, bullet points, subheadings, and ensure it's easy to understand. Make it comprehensive."
         prompt = f"<s>[INST] {system}\n\nEfficient language:\n{efficient_text}\n\nWrite the full {section_type} section now. [/INST]"
         expanded = await self._generate_completion(prompt, max_tokens=max_tokens)
         elapsed = time.time() - start
@@ -268,8 +258,6 @@ class MixtralFreeModel:
         return expanded
     async def generate_flow_diagram(self, context: str) -> str:
-        """Generate a Mermaid flowchart for the Flow section."""
         prompt = f"""[INST] You are an expert at creating Mermaid flowcharts for task guides.
         STRICT RULES:
@@ -294,37 +282,20 @@ class MixtralFreeModel:
         Now generate the diagram. [/INST]"""
         try:
-            response = await self._generate_completion(
-                prompt,
-                max_tokens=512,
-                temperature=0.2
-            )
             response = response.strip()
-            # ✅ Case 1: Model already returns proper block
             if response.startswith("```mermaid") and response.endswith("```"):
                 return response
-            # ✅ Case 2: Model returns raw flowchart without code block
             if "flowchart" in response or "graph" in response:
                 return f"```mermaid\n{response}\n```"
-            # ❌ Case 3: Model output is garbage → fallback
             logger.warning("Invalid Mermaid output, using fallback diagram.")
             return """```mermaid
         flowchart TD
         A[Start] --> B[Follow the steps above]
         B --> C[Complete task]
         C --> D[End]"""
         except Exception as e:
             logger.error(f"Flow diagram generation failed: {e}")
-            # ❌ Hard fallback (error case)
             return """```mermaid
             flowchart TD
             A[Start] --> B[Error generating diagram]
@@ -333,25 +304,16 @@ class MixtralFreeModel:
             ```"""
     async def generate_section(self, section_type: str, context: str, compress_input: bool = True) -> str:
-        """Generate a detailed section using compress -> efficient -> expand pipeline."""
         total_start = time.time()
-        # Special handling for Flow section
         if section_type.lower() == "flow":
             return await self.generate_flow_diagram(context)
         logger.info(f"Starting section generation for '{section_type}' (compress_input={compress_input})")
-        # Step 1: compress input if needed
         if compress_input and len(context.split()) > 1500:
-            logger.info("Input context large, compressing...")
             context = await self.compress_input(context, max_tokens=1000)
         else:
             logger.info(f"Input context size OK: {len(context.split())} words")
-        # Step 2: generate efficient language
         efficient = await self.generate_efficient_section(section_type, context)
-        # Step 3: expand to natural language
         expanded = await self.expand_efficient_to_natural(efficient, section_type)
         total_time = time.time() - total_start
         logger.info(f"Total section generation time: {total_time:.2f}s")
         return expanded
@@ -374,7 +336,6 @@ async def lifespan(app: FastAPI):
         logger.error(f"Failed to initialize model: {e}")
         model = None
     yield
-    # Shutdown
     logger.info("Shutting down, releasing model resources.")
     model = None
     logger.info("Shutdown complete.")
@@ -389,9 +350,9 @@ app = FastAPI(
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],          # For development; restrict in production
     allow_credentials=True,
-    allow_methods=["*"],          # Allows all methods, including OPTIONS
     allow_headers=["*"],
 )
@@ -405,20 +366,26 @@ class ChatResponse(BaseModel):
 class GenerateSectionRequest(BaseModel):
     section_type: str
-    context: str
     compress_input: bool = True
 class GenerateSectionResponse(BaseModel):
     content: str
 # ---------- Endpoints ----------
 @app.get("/")
 async def root():
-    return {"message": "Free AI Response API is running (local GGUF model). Use POST /chat or POST /generate-section."}
 @app.get("/queue-status")
 async def get_queue_status():
-    """Return current queue status for load balancing."""
     return queue_status.get_status()
 @app.post("/chat", response_model=ChatResponse)
@@ -428,28 +395,18 @@ async def chat(request: ChatRequest):
     queue_wait = time.time() - queue_start
     if not can_process:
-        logger.info(f"Request queued at position {queue_position} (queue wait {queue_wait:.3f}s)")
-        return {
-            "status": "queued",
-            "queue_position": queue_position,
-            "message": f"Request queued at position {queue_position}"
-        }
     logger.info(f"Request started processing after queue wait {queue_wait:.3f}s")
     req_start = time.time()
     try:
         if model is None:
             raise HTTPException(status_code=503, detail="Model not available")
-        clean_start = time.time()
         cleaned_question = model.clean_question(request.question)
-        clean_time = time.time() - clean_start
-        logger.info(f"Cleaned question in {clean_time:.4f}s")
         response_text = await model.generate_response(cleaned_question, request.context)
         total_time = time.time() - req_start
-        logger.info(f"Chat request completed in {total_time:.2f}s (including queue wait {queue_wait:.3f}s)")
         return ChatResponse(response=response_text)
     except Exception as e:
         logger.error(f"Error processing request: {e}")
@@ -459,32 +416,48 @@ async def chat(request: ChatRequest):
         await queue_status.release()
 @app.post("/generate-section", response_model=GenerateSectionResponse)
-async def generate_section(request: GenerateSectionRequest):
     queue_start = time.time()
     can_process, queue_position = await queue_status.acquire()
     queue_wait = time.time() - queue_start
     if not can_process:
-        logger.info(f"Request queued at position {queue_position} (queue wait {queue_wait:.3f}s)")
-        return {
-            "status": "queued",
-            "queue_position": queue_position,
-            "message": f"Request queued at position {queue_position}"
-        }
     logger.info(f"Section generation started after queue wait {queue_wait:.3f}s")
-    req_start = time.time()
     try:
         if model is None:
             raise HTTPException(status_code=503, detail="Model not available")
-        content = await model.generate_section(
-            request.section_type, request.context, request.compress_input
-        )
-        total_time = time.time() - req_start
         logger.info(f"Generate-section request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
-        return GenerateSectionResponse(content=content)
     except Exception as e:
         logger.error(f"Error generating section: {e}")
         logger.error(traceback.format_exc())
@@ -492,6 +465,34 @@ async def generate_section(request: GenerateSectionRequest):
     finally:
         await queue_status.release()
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="debug")

 import asyncio
 import time
 import traceback
+import json
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
 from llama_cpp import Llama
 from contextlib import asynccontextmanager
         async with self._lock:
             if self.active_tasks < self.max_concurrent:
                 self.active_tasks += 1
+                return True, 0
             else:
                 position = len(self.pending_queue) + 1
                 future = asyncio.Future()
                 verbose=False,
                 seed=42,
             )
             load_time = time.time() - start_time
             logger.info(f"GGUF model loaded successfully in {load_time:.2f}s")
         except Exception as e:
             raise
     async def warm_up(self) -> None:
         logger.info("Warming up model with test inference...")
         start_time = time.time()
         try:
             logger.warning(f"Model warm-up failed: {e}")
     async def _generate_completion(self, prompt: str, max_tokens: int = None, temperature: float = None) -> str:
         if max_tokens is None:
             max_tokens = self.max_tokens
         if temperature is None:
+            temperature = 0.3
         def _blocking():
             start = time.time()
                 max_tokens=max_tokens,
                 temperature=temperature,
                 top_p=0.95,
+                stop=[],
                 echo=False,
                 stream=False
             )
         return await asyncio.to_thread(_blocking)
     async def generate_response(self, question: str, context: str = "") -> str:
         is_guide_request = any(phrase in question.lower() for phrase in
                             ["guide", "create a guide", "make a guide", "step by step", "tutorial"])
         Now produce the JSON object for the user's request:"""
         else:
             system_prompt = f"""You are a helpful, accurate, and context-aware assistant. Use the conversation history below to provide a relevant and useful answer to the question.
     IMPORTANT:
     Provide a helpful response"""
+        prompt = f"<s>[INST] {system_prompt}\n\nNow handle this user request: {question} [/INST]"
         try:
             response_text = await self._generate_completion(prompt, max_tokens=512)
             if is_guide_request:
                 import re
                 match = re.search(r'\{[^{}]*"action"\s*:\s*"generate_guide"[^{}]*\}', response_text, re.DOTALL)
                 if match:
                     return match.group(0)
                 else:
                     logger.warning("Model did not return valid JSON for guide request. Using fallback.")
                     return json.dumps({
                         "action": "generate_guide",
                         "sections": ["Overview", "Prerequisites", "Step-by-Step Instructions", "Tools & Assets", "Flow"]
                     })
             return response_text
         except Exception as e:
             logger.error(f"Error in generation: {str(e)}")
             return "I apologize, but I'm having trouble responding right now."
     def clean_question(self, question: str) -> str:
         prefixes = ['!bot', '!ai', '@bot', 'bot,', '!ai_search']
         if not question or not question.strip():
             return question
         for prefix in prefixes:
             if question_lower.startswith(prefix.lower()):
                 cleaned = original_question[len(prefix):].lstrip(' ,!:@')
                 return cleaned
         return original_question
     async def compress_input(self, text: str, max_tokens: int = 500) -> str:
         if len(text.split()) < max_tokens:
             return text
         logger.info(f"Compressing input of {len(text.split())} words...")
         start = time.time()
         logger.info(f"Compression completed in {elapsed:.2f}s")
         return summary
+    async def generate_efficient_section(self, section_type: str, context: str, max_tokens: int = 300) -> str:
+        logger.info(f"Generating efficient representation for '{section_type}'...")
         start = time.time()
         system = f"You are an expert task guide writer. Generate content for the section \"{section_type}\" in an efficient language format.\nUse a structured format like:\n- Key point 1: details\n- Key point 2: details\nOr use JSON if appropriate. Keep it concise and use at most {max_tokens} tokens."
         prompt = f"<s>[INST] {system}\n\nContext: {context}\nGenerate the efficient language for {section_type} section. [/INST]"
         logger.info(f"Efficient section generation took {elapsed:.2f}s")
         return efficient
+    async def expand_efficient_to_natural(self, efficient_text: str, section_type: str, max_tokens: int = 300) -> str:
         logger.info(f"Expanding efficient language to natural text for section '{section_type}'...")
         start = time.time()
+        system = f"""You are an expert task guide writer.
+        Expand the efficient language into a **short but helpful** section titled "{section_type}".
+        STRICT RULES:
+        - Maximum 120 words total.
+        - Use markdown subheadings (###) and bullet points.
+        - No long paragraphs – break into 3-5 bullet points or short phrases.
+        - Skip introductions, conclusions, and fluff.
+        - Keep the tone professional and clear.
+        Efficient language:
+        {efficient_text}
+        Write the {section_type} section now:"""
         prompt = f"<s>[INST] {system}\n\nEfficient language:\n{efficient_text}\n\nWrite the full {section_type} section now. [/INST]"
         expanded = await self._generate_completion(prompt, max_tokens=max_tokens)
         elapsed = time.time() - start
         return expanded
     async def generate_flow_diagram(self, context: str) -> str:
         prompt = f"""[INST] You are an expert at creating Mermaid flowcharts for task guides.
         STRICT RULES:
         Now generate the diagram. [/INST]"""
         try:
+            response = await self._generate_completion(prompt, max_tokens=512, temperature=0.2)
             response = response.strip()
             if response.startswith("```mermaid") and response.endswith("```"):
                 return response
             if "flowchart" in response or "graph" in response:
                 return f"```mermaid\n{response}\n```"
             logger.warning("Invalid Mermaid output, using fallback diagram.")
             return """```mermaid
         flowchart TD
         A[Start] --> B[Follow the steps above]
         B --> C[Complete task]
         C --> D[End]"""
         except Exception as e:
             logger.error(f"Flow diagram generation failed: {e}")
             return """```mermaid
             flowchart TD
             A[Start] --> B[Error generating diagram]
             ```"""
     async def generate_section(self, section_type: str, context: str, compress_input: bool = True) -> str:
         total_start = time.time()
         if section_type.lower() == "flow":
             return await self.generate_flow_diagram(context)
         logger.info(f"Starting section generation for '{section_type}' (compress_input={compress_input})")
         if compress_input and len(context.split()) > 1500:
             context = await self.compress_input(context, max_tokens=1000)
         else:
             logger.info(f"Input context size OK: {len(context.split())} words")
         efficient = await self.generate_efficient_section(section_type, context)
         expanded = await self.expand_efficient_to_natural(efficient, section_type)
         total_time = time.time() - total_start
         logger.info(f"Total section generation time: {total_time:.2f}s")
         return expanded
         logger.error(f"Failed to initialize model: {e}")
         model = None
     yield
     logger.info("Shutting down, releasing model resources.")
     model = None
     logger.info("Shutdown complete.")
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
+    allow_methods=["*"],
     allow_headers=["*"],
 )
 class GenerateSectionRequest(BaseModel):
     section_type: str
+    context: str = ""               # legacy, optional
+    compressed_context: str = None  # new field (skip efficient phase)
     compress_input: bool = True
 class GenerateSectionResponse(BaseModel):
     content: str
+class CompressQueryRequest(BaseModel):
+    prompt: str
+class CompressQueryResponse(BaseModel):
+    compressed: str
 # ---------- Endpoints ----------
 @app.get("/")
 async def root():
+    return {"message": "Free AI Response API is running. Use POST /chat, POST /generate-section, or POST /compress-query."}
 @app.get("/queue-status")
 async def get_queue_status():
     return queue_status.get_status()
 @app.post("/chat", response_model=ChatResponse)
     queue_wait = time.time() - queue_start
     if not can_process:
+        logger.info(f"Request queued at position {queue_position}")
+        return {"status": "queued", "queue_position": queue_position}
     logger.info(f"Request started processing after queue wait {queue_wait:.3f}s")
     req_start = time.time()
     try:
         if model is None:
             raise HTTPException(status_code=503, detail="Model not available")
         cleaned_question = model.clean_question(request.question)
         response_text = await model.generate_response(cleaned_question, request.context)
         total_time = time.time() - req_start
+        logger.info(f"Chat request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
         return ChatResponse(response=response_text)
     except Exception as e:
         logger.error(f"Error processing request: {e}")
         await queue_status.release()
 @app.post("/generate-section", response_model=GenerateSectionResponse)
+async def generate_section_endpoint(request: GenerateSectionRequest):
     queue_start = time.time()
     can_process, queue_position = await queue_status.acquire()
     queue_wait = time.time() - queue_start
     if not can_process:
+        return {"status": "queued", "queue_position": queue_position}
     logger.info(f"Section generation started after queue wait {queue_wait:.3f}s")
     try:
         if model is None:
             raise HTTPException(status_code=503, detail="Model not available")
+        # SPECIAL CASE: Flow section -> generate Mermaid diagram
+        if request.section_type.lower() == "flow":
+            # For Flow, we ignore compressed_context and always generate a diagram
+            # But we can optionally use compressed_context as additional context
+            if request.compressed_context:
+                context = request.compressed_context
+            else:
+                context = request.context
+            diagram = await model.generate_flow_diagram(context)
+            total_time = time.time() - queue_start
+            logger.info(f"Flow diagram generated in {total_time:.2f}s")
+            return GenerateSectionResponse(content=diagram)
+        # Normal sections: use compressed_context if provided, else efficient+expand
+        if request.compressed_context:
+            efficient_repr = request.compressed_context
+            logger.info(f"Using provided compressed context for section '{request.section_type}'")
+        else:
+            context_to_use = request.context
+            if request.compress_input and len(context_to_use.split()) > 1500:
+                logger.info("Input context large, compressing...")
+                context_to_use = await model.compress_input(context_to_use, max_tokens=1000)
+            efficient_repr = await model.generate_efficient_section(request.section_type, context_to_use)
+        expanded = await model.expand_efficient_to_natural(efficient_repr, request.section_type)
+        total_time = time.time() - queue_start
         logger.info(f"Generate-section request completed in {total_time:.2f}s (queue wait {queue_wait:.3f}s)")
+        return GenerateSectionResponse(content=expanded)
     except Exception as e:
         logger.error(f"Error generating section: {e}")
         logger.error(traceback.format_exc())
     finally:
         await queue_status.release()
+@app.post("/compress-query", response_model=CompressQueryResponse)
+async def compress_query_endpoint(request: CompressQueryRequest):
+    queue_start = time.time()
+    can_process, queue_position = await queue_status.acquire()
+    queue_wait = time.time() - queue_start
+    if not can_process:
+        return {"status": "queued", "queue_position": queue_position}
+    logger.info(f"Compress-query started after queue wait {queue_wait:.3f}s")
+    try:
+        if model is None:
+            raise HTTPException(status_code=503, detail="Model not available")
+        # Use generate_efficient_section with a special context to compress the user prompt
+        compressed = await model.generate_efficient_section(
+            section_type="QueryCompression",
+            context=f"User request: {request.prompt}\nProduce a dense, efficient representation (bullet points or key-value pairs) of the user's intent, steps, and requirements. Keep under 300 tokens."
+        )
+        total_time = time.time() - queue_start
+        logger.info(f"Compress-query completed in {total_time:.2f}s")
+        return CompressQueryResponse(compressed=compressed)
+    except Exception as e:
+        logger.error(f"Error compressing query: {e}")
+        raise HTTPException(status_code=500, detail="Internal server error")
+    finally:
+        await queue_status.release()
 if __name__ == "__main__":
     import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="info")