Spaces:

minhvtt
/

EBD_Fest

Sleeping

App Files Files Community

minhvtt committed on Apr 6

Commit

44f3755

verified ·

1 Parent(s): e9b5da3

Upload 9 files

Browse files

Files changed (3) hide show

requirements.txt +1 -0
routes_team_chat.py +4 -0
services_legacy.py +263 -8

requirements.txt CHANGED Viewed

@@ -12,3 +12,4 @@ soundfile
 numpy
 scipy
 pillow

 numpy
 scipy
 pillow
+pypdf

routes_team_chat.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import re
 import asyncio
 import logging
 import time
 from typing import Any, Dict, List, Optional
@@ -37,6 +38,9 @@ from services import (
 router = APIRouter()
 team_doc_route_logger = logging.getLogger("nomus.team_documents.route")
 def _json_safe(value: Any) -> Any:

 import re
 import asyncio
 import logging
+import os
 import time
 from typing import Any, Dict, List, Optional
 router = APIRouter()
 team_doc_route_logger = logging.getLogger("nomus.team_documents.route")
+team_doc_route_logger.setLevel(
+    getattr(logging, os.environ.get("TEAM_DOC_LOG_LEVEL", "INFO").upper(), logging.INFO)
+)
 def _json_safe(value: Any) -> Any:

services_legacy.py CHANGED Viewed

@@ -27,6 +27,11 @@ from PIL import Image, ImageOps
 from pymongo.errors import OperationFailure
 from transformers import AutoTokenizer, VitsModel
 from core import (
     AUTO_COMPACT_COOLDOWN_SEC,
     AUTO_COMPACT_ENABLED,
@@ -75,6 +80,18 @@ TEAM_DOC_NODE_CONTENT_LIMIT = int(os.environ.get("TEAM_DOC_NODE_CONTENT_LIMIT",
 TEAM_DOC_NODE_CHUNK_SIZE = int(os.environ.get("TEAM_DOC_NODE_CHUNK_SIZE", "80"))
 TEAM_DOC_SLOW_LOG_SEC = float(os.environ.get("TEAM_DOC_SLOW_LOG_SEC", "2.5"))
 TEAM_DOC_HEAVY_NODE_WARN = int(os.environ.get("TEAM_DOC_HEAVY_NODE_WARN", "1200"))
 def get_vn_now() -> datetime:
@@ -639,6 +656,172 @@ def get_selected_team_messages(team_id: str, selected_ids: List[str], project_id
 def _safe_decode_text(raw_bytes: bytes, fallback_name: str = "") -> str:
     encodings = ["utf-8", "utf-8-sig", "cp1258", "latin-1"]
     for enc in encodings:
         try:
@@ -798,6 +981,45 @@ def _load_team_document_nodes(doc_id: str) -> List[Dict[str, Any]]:
     return nodes
 def _resolve_document_tree(doc: Dict[str, Any]) -> Dict[str, Any]:
     inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
     if inline_tree and isinstance(inline_tree.get("nodes"), list):
@@ -1155,20 +1377,44 @@ def list_team_documents(team_id: str, project_id: Optional[str] = None) -> List[
     )
     for doc in docs:
         doc_started = time.perf_counter()
-        tree = _resolve_document_tree(doc)
-        nodes = tree.get("nodes") or []
         doc["tree"] = {
-            "root_id": tree.get("root_id", "root"),
-            "total_nodes": int(tree.get("total_nodes") or len(nodes)),
         }
         path_cache: Dict[str, Dict[str, Any]] = {}
         node_catalog: List[Dict[str, Any]] = []
-        for node in nodes:
             node_id = str(node.get("id") or "")
             if not node_id:
                 continue
             if node_id not in path_cache:
-                path_cache[node_id] = _build_node_path(tree, node_id)
             path_info = path_cache[node_id]
             node_catalog.append(
                 {
@@ -1186,11 +1432,20 @@ def list_team_documents(team_id: str, project_id: Optional[str] = None) -> List[
         doc["node_catalog"] = node_catalog
         doc_elapsed = time.perf_counter() - doc_started
-        if len(nodes) >= TEAM_DOC_HEAVY_NODE_WARN or doc_elapsed >= TEAM_DOC_SLOW_LOG_SEC:
             team_doc_logger.warning(
                 "[team-doc] list doc heavy doc_id=%s nodes=%d duration=%.3fs",
                 doc.get("id"),
-                len(nodes),
                 doc_elapsed,
             )

 from pymongo.errors import OperationFailure
 from transformers import AutoTokenizer, VitsModel
+try:
+    from pypdf import PdfReader
+except Exception:
+    PdfReader = None
 from core import (
     AUTO_COMPACT_COOLDOWN_SEC,
     AUTO_COMPACT_ENABLED,
 TEAM_DOC_NODE_CHUNK_SIZE = int(os.environ.get("TEAM_DOC_NODE_CHUNK_SIZE", "80"))
 TEAM_DOC_SLOW_LOG_SEC = float(os.environ.get("TEAM_DOC_SLOW_LOG_SEC", "2.5"))
 TEAM_DOC_HEAVY_NODE_WARN = int(os.environ.get("TEAM_DOC_HEAVY_NODE_WARN", "1200"))
+TEAM_DOC_NODE_CATALOG_LIMIT = int(os.environ.get("TEAM_DOC_NODE_CATALOG_LIMIT", "1200"))
+TEAM_DOC_PDF_MAX_PAGES = int(os.environ.get("TEAM_DOC_PDF_MAX_PAGES", "240"))
+TEAM_DOC_LOG_LEVEL = os.environ.get("TEAM_DOC_LOG_LEVEL", "INFO").upper()
+TEAM_DOC_OCR_ENABLED = os.environ.get("TEAM_DOC_OCR_ENABLED", "true").strip().lower() == "true"
+TEAM_DOC_OCR_MAX_PAGES = int(os.environ.get("TEAM_DOC_OCR_MAX_PAGES", "12"))
+TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE = int(os.environ.get("TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE", "1"))
+TEAM_DOC_OCR_MODEL = os.environ.get("TEAM_DOC_OCR_MODEL", os.environ.get("MODEL_NAME", "gemini-flash-lite-latest"))
+TEAM_DOC_OCR_IMAGE_MAX_SIZE = int(os.environ.get("TEAM_DOC_OCR_IMAGE_MAX_SIZE", "1700"))
+TEAM_DOC_OCR_MAX_CHARS = int(os.environ.get("TEAM_DOC_OCR_MAX_CHARS", "160000"))
+TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK = os.environ.get("TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK", "false").strip().lower() == "true"
+team_doc_logger.setLevel(getattr(logging, TEAM_DOC_LOG_LEVEL, logging.INFO))
 def get_vn_now() -> datetime:
 def _safe_decode_text(raw_bytes: bytes, fallback_name: str = "") -> str:
+    normalized_name = str(fallback_name or "").lower()
+    looks_like_pdf = normalized_name.endswith(".pdf") or raw_bytes[:4] == b"%PDF"
+    def _extract_page_images(page: Any) -> List[bytes]:
+        images_attr = getattr(page, "images", None)
+        if images_attr is None:
+            return []
+        image_bytes_list: List[bytes] = []
+        try:
+            images_seq = list(images_attr)
+        except Exception:
+            images_seq = []
+        for image_obj in images_seq[: max(1, TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE)]:
+            image_data = getattr(image_obj, "data", None)
+            if isinstance(image_data, (bytes, bytearray)) and image_data:
+                image_bytes_list.append(bytes(image_data))
+                continue
+            pil_image = getattr(image_obj, "image", None)
+            if pil_image is not None:
+                try:
+                    output = io.BytesIO()
+                    pil_image.save(output, format="PNG")
+                    image_bytes_list.append(output.getvalue())
+                except Exception:
+                    continue
+        return image_bytes_list
+    def _normalize_image_for_ocr(image_bytes: bytes) -> tuple[bytes, str]:
+        try:
+            image = Image.open(io.BytesIO(image_bytes))
+            image = ImageOps.exif_transpose(image).convert("RGB")
+            image.thumbnail((TEAM_DOC_OCR_IMAGE_MAX_SIZE, TEAM_DOC_OCR_IMAGE_MAX_SIZE), Image.Resampling.LANCZOS)
+            output = io.BytesIO()
+            image.save(output, format="JPEG", quality=82, optimize=True)
+            return output.getvalue(), "image/jpeg"
+        except Exception:
+            return image_bytes, "image/png"
+    def _ocr_image_with_llm(image_bytes: bytes, mime_type: str, page_number: int) -> str:
+        try:
+            response = model_client.models.generate_content(
+                model=TEAM_DOC_OCR_MODEL,
+                contents=[
+                    types.Part.from_text(
+                        text=(
+                            "Bạn là OCR engine. Trích xuất nguyên văn text có thể đọc được từ ảnh tài liệu. "
+                            "Giữ bố cục cơ bản bằng xuống dòng. Không thêm giải thích."
+                        )
+                    ),
+                    types.Part.from_bytes(data=image_bytes, mime_type=mime_type),
+                ],
+                config=types.GenerateContentConfig(temperature=0.0),
+            )
+            text = (response.text or "").strip()
+            if not text:
+                return ""
+            text = re.sub(r"^```(?:text)?\\s*", "", text)
+            text = re.sub(r"\\s*```$", "", text)
+            if len(text) > TEAM_DOC_OCR_MAX_CHARS:
+                text = text[:TEAM_DOC_OCR_MAX_CHARS]
+            team_doc_logger.info(
+                "[team-doc] OCR page success file=%s page=%d chars=%d",
+                fallback_name,
+                page_number,
+                len(text),
+            )
+            return text
+        except Exception:
+            team_doc_logger.exception(
+                "[team-doc] OCR page failed file=%s page=%d",
+                fallback_name,
+                page_number,
+            )
+            return ""
+    def _extract_pdf_text_by_ocr(reader: Any) -> str:
+        if not TEAM_DOC_OCR_ENABLED:
+            return ""
+        started = time.perf_counter()
+        page_limit = max(1, TEAM_DOC_OCR_MAX_PAGES)
+        page_count = min(len(reader.pages), page_limit)
+        collected_pages: List[str] = []
+        for page_idx in range(page_count):
+            page = reader.pages[page_idx]
+            page_images = _extract_page_images(page)
+            if not page_images:
+                continue
+            page_text_parts: List[str] = []
+            for image_bytes in page_images[: max(1, TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE)]:
+                normalized_bytes, mime_type = _normalize_image_for_ocr(image_bytes)
+                ocr_text = _ocr_image_with_llm(normalized_bytes, mime_type, page_idx + 1)
+                if ocr_text:
+                    page_text_parts.append(ocr_text)
+            if page_text_parts:
+                collected_pages.append("\n".join(page_text_parts).strip())
+        text = "\n\n".join(part for part in collected_pages if part).strip()
+        elapsed = time.perf_counter() - started
+        team_doc_logger.info(
+            "[team-doc] OCR fallback done file=%s pages_scanned=%d pages_with_text=%d chars=%d duration=%.3fs",
+            fallback_name,
+            page_count,
+            len(collected_pages),
+            len(text),
+            elapsed,
+        )
+        return text
+    if looks_like_pdf:
+        if PdfReader is None:
+            team_doc_logger.warning(
+                "[team-doc] PDF parser unavailable (pypdf not installed), fallback to byte decode file=%s",
+                fallback_name,
+            )
+        else:
+            try:
+                reader = PdfReader(io.BytesIO(raw_bytes))
+                pages = reader.pages[:TEAM_DOC_PDF_MAX_PAGES]
+                page_texts: List[str] = []
+                for page in pages:
+                    extracted = page.extract_text() or ""
+                    if extracted.strip():
+                        page_texts.append(extracted)
+                text = "\n\n".join(page_texts).strip()
+                if text:
+                    team_doc_logger.info(
+                        "[team-doc] PDF extracted file=%s pages=%d chars=%d",
+                        fallback_name,
+                        len(pages),
+                        len(text),
+                    )
+                    return text
+                team_doc_logger.warning(
+                    "[team-doc] PDF extraction returned empty text file=%s pages=%d",
+                    fallback_name,
+                    len(pages),
+                )
+                ocr_text = _extract_pdf_text_by_ocr(reader)
+                if ocr_text:
+                    return ocr_text
+            except Exception:
+                team_doc_logger.exception(
+                    "[team-doc] PDF extraction failed file=%s, fallback to byte decode",
+                    fallback_name,
+                )
+        if TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK:
+            team_doc_logger.warning(
+                "[team-doc] PDF textual extraction unavailable, binary decode fallback enabled file=%s",
+                fallback_name,
+            )
+        else:
+            team_doc_logger.warning(
+                "[team-doc] PDF textual extraction unavailable, returning placeholder to avoid binary noise file=%s",
+                fallback_name,
+            )
+            return "Khong the trich xuat van ban tu PDF nay. Vui long upload PDF co text layer hoac thu OCR fallback."
     encodings = ["utf-8", "utf-8-sig", "cp1258", "latin-1"]
     for enc in encodings:
         try:
     return nodes
+def _load_team_document_nodes_preview(
+    doc_id: str,
+    max_nodes: int,
+    chunk_size_hint: int = TEAM_DOC_NODE_CHUNK_SIZE,
+) -> List[Dict[str, Any]]:
+    normalized_doc_id = str(doc_id or "").strip()
+    if not normalized_doc_id or max_nodes <= 0:
+        return []
+    effective_chunk_size = max(1, int(chunk_size_hint or TEAM_DOC_NODE_CHUNK_SIZE))
+    chunk_limit = max(1, math.ceil(max_nodes / effective_chunk_size))
+    projection = {"_id": 0, "chunk_index": 1, "nodes": 1}
+    try:
+        cursor = (
+            team_doc_chunks_collection.find({"doc_id": normalized_doc_id}, projection)
+            .hint("idx_team_doc_chunks_doc_id_chunk")
+            .sort("chunk_index", 1)
+            .limit(chunk_limit)
+        )
+        if hasattr(cursor, "allow_disk_use"):
+            cursor = cursor.allow_disk_use(True)
+        rows = list(cursor)
+    except (OperationFailure, ValueError):
+        rows = list(team_doc_chunks_collection.find({"doc_id": normalized_doc_id}, projection))
+        rows.sort(key=lambda row: int(row.get("chunk_index", 0)))
+        rows = rows[:chunk_limit]
+    nodes: List[Dict[str, Any]] = []
+    for row in rows:
+        chunk_nodes = row.get("nodes")
+        if not isinstance(chunk_nodes, list):
+            continue
+        nodes.extend(chunk_nodes)
+        if len(nodes) >= max_nodes:
+            return nodes[:max_nodes]
+    return nodes
 def _resolve_document_tree(doc: Dict[str, Any]) -> Dict[str, Any]:
     inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
     if inline_tree and isinstance(inline_tree.get("nodes"), list):
     )
     for doc in docs:
         doc_started = time.perf_counter()
+        tree_meta = doc.get("tree_meta") if isinstance(doc.get("tree_meta"), dict) else {}
+        inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
+        root_id = str(tree_meta.get("root_id") or "root")
+        total_nodes = int(tree_meta.get("total_nodes") or 0)
+        all_nodes: List[Dict[str, Any]] = []
+        if inline_tree and isinstance(inline_tree.get("nodes"), list):
+            root_id = str(inline_tree.get("root_id") or root_id)
+            all_nodes = inline_tree.get("nodes") or []
+            total_nodes = int(inline_tree.get("total_nodes") or len(all_nodes))
+        else:
+            chunk_size_hint = int(tree_meta.get("chunk_size") or TEAM_DOC_NODE_CHUNK_SIZE)
+            all_nodes = _load_team_document_nodes_preview(
+                str(doc.get("id") or ""),
+                TEAM_DOC_NODE_CATALOG_LIMIT,
+                chunk_size_hint=chunk_size_hint,
+            )
+        preview_nodes = all_nodes[:TEAM_DOC_NODE_CATALOG_LIMIT] if TEAM_DOC_NODE_CATALOG_LIMIT > 0 else []
+        tree_for_paths = {"root_id": root_id, "nodes": preview_nodes}
         doc["tree"] = {
+            "root_id": root_id,
+            "total_nodes": int(total_nodes or len(all_nodes)),
         }
+        doc["node_catalog_total"] = int(total_nodes or len(all_nodes))
+        doc["node_catalog_limit"] = TEAM_DOC_NODE_CATALOG_LIMIT
+        doc["node_catalog_truncated"] = doc["node_catalog_total"] > len(preview_nodes)
         path_cache: Dict[str, Dict[str, Any]] = {}
         node_catalog: List[Dict[str, Any]] = []
+        for node in preview_nodes:
             node_id = str(node.get("id") or "")
             if not node_id:
                 continue
             if node_id not in path_cache:
+                path_cache[node_id] = _build_node_path(tree_for_paths, node_id)
             path_info = path_cache[node_id]
             node_catalog.append(
                 {
         doc["node_catalog"] = node_catalog
         doc_elapsed = time.perf_counter() - doc_started
+        if doc.get("node_catalog_truncated"):
+            team_doc_logger.warning(
+                "[team-doc] list doc catalog truncated doc_id=%s returned=%d total=%d limit=%d",
+                doc.get("id"),
+                len(node_catalog),
+                doc.get("node_catalog_total"),
+                TEAM_DOC_NODE_CATALOG_LIMIT,
+            )
+        if int(doc.get("node_catalog_total") or 0) >= TEAM_DOC_HEAVY_NODE_WARN or doc_elapsed >= TEAM_DOC_SLOW_LOG_SEC:
             team_doc_logger.warning(
                 "[team-doc] list doc heavy doc_id=%s nodes=%d duration=%.3fs",
                 doc.get("id"),
+                int(doc.get("node_catalog_total") or 0),
                 doc_elapsed,
             )