minhvtt commited on
Commit
44f3755
·
verified ·
1 Parent(s): e9b5da3

Upload 9 files

Browse files
Files changed (3) hide show
  1. requirements.txt +1 -0
  2. routes_team_chat.py +4 -0
  3. services_legacy.py +263 -8
requirements.txt CHANGED
@@ -12,3 +12,4 @@ soundfile
12
  numpy
13
  scipy
14
  pillow
 
 
12
  numpy
13
  scipy
14
  pillow
15
+ pypdf
routes_team_chat.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import re
3
  import asyncio
4
  import logging
 
5
  import time
6
  from typing import Any, Dict, List, Optional
7
 
@@ -37,6 +38,9 @@ from services import (
37
 
38
  router = APIRouter()
39
  team_doc_route_logger = logging.getLogger("nomus.team_documents.route")
 
 
 
40
 
41
 
42
  def _json_safe(value: Any) -> Any:
 
2
  import re
3
  import asyncio
4
  import logging
5
+ import os
6
  import time
7
  from typing import Any, Dict, List, Optional
8
 
 
38
 
39
  router = APIRouter()
40
  team_doc_route_logger = logging.getLogger("nomus.team_documents.route")
41
+ team_doc_route_logger.setLevel(
42
+ getattr(logging, os.environ.get("TEAM_DOC_LOG_LEVEL", "INFO").upper(), logging.INFO)
43
+ )
44
 
45
 
46
  def _json_safe(value: Any) -> Any:
services_legacy.py CHANGED
@@ -27,6 +27,11 @@ from PIL import Image, ImageOps
27
  from pymongo.errors import OperationFailure
28
  from transformers import AutoTokenizer, VitsModel
29
 
 
 
 
 
 
30
  from core import (
31
  AUTO_COMPACT_COOLDOWN_SEC,
32
  AUTO_COMPACT_ENABLED,
@@ -75,6 +80,18 @@ TEAM_DOC_NODE_CONTENT_LIMIT = int(os.environ.get("TEAM_DOC_NODE_CONTENT_LIMIT",
75
  TEAM_DOC_NODE_CHUNK_SIZE = int(os.environ.get("TEAM_DOC_NODE_CHUNK_SIZE", "80"))
76
  TEAM_DOC_SLOW_LOG_SEC = float(os.environ.get("TEAM_DOC_SLOW_LOG_SEC", "2.5"))
77
  TEAM_DOC_HEAVY_NODE_WARN = int(os.environ.get("TEAM_DOC_HEAVY_NODE_WARN", "1200"))
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
 
80
  def get_vn_now() -> datetime:
@@ -639,6 +656,172 @@ def get_selected_team_messages(team_id: str, selected_ids: List[str], project_id
639
 
640
 
641
  def _safe_decode_text(raw_bytes: bytes, fallback_name: str = "") -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
642
  encodings = ["utf-8", "utf-8-sig", "cp1258", "latin-1"]
643
  for enc in encodings:
644
  try:
@@ -798,6 +981,45 @@ def _load_team_document_nodes(doc_id: str) -> List[Dict[str, Any]]:
798
  return nodes
799
 
800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
801
  def _resolve_document_tree(doc: Dict[str, Any]) -> Dict[str, Any]:
802
  inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
803
  if inline_tree and isinstance(inline_tree.get("nodes"), list):
@@ -1155,20 +1377,44 @@ def list_team_documents(team_id: str, project_id: Optional[str] = None) -> List[
1155
  )
1156
  for doc in docs:
1157
  doc_started = time.perf_counter()
1158
- tree = _resolve_document_tree(doc)
1159
- nodes = tree.get("nodes") or []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1160
  doc["tree"] = {
1161
- "root_id": tree.get("root_id", "root"),
1162
- "total_nodes": int(tree.get("total_nodes") or len(nodes)),
1163
  }
 
 
 
 
1164
  path_cache: Dict[str, Dict[str, Any]] = {}
1165
  node_catalog: List[Dict[str, Any]] = []
1166
- for node in nodes:
1167
  node_id = str(node.get("id") or "")
1168
  if not node_id:
1169
  continue
1170
  if node_id not in path_cache:
1171
- path_cache[node_id] = _build_node_path(tree, node_id)
1172
  path_info = path_cache[node_id]
1173
  node_catalog.append(
1174
  {
@@ -1186,11 +1432,20 @@ def list_team_documents(team_id: str, project_id: Optional[str] = None) -> List[
1186
  doc["node_catalog"] = node_catalog
1187
 
1188
  doc_elapsed = time.perf_counter() - doc_started
1189
- if len(nodes) >= TEAM_DOC_HEAVY_NODE_WARN or doc_elapsed >= TEAM_DOC_SLOW_LOG_SEC:
 
 
 
 
 
 
 
 
 
1190
  team_doc_logger.warning(
1191
  "[team-doc] list doc heavy doc_id=%s nodes=%d duration=%.3fs",
1192
  doc.get("id"),
1193
- len(nodes),
1194
  doc_elapsed,
1195
  )
1196
 
 
27
  from pymongo.errors import OperationFailure
28
  from transformers import AutoTokenizer, VitsModel
29
 
30
+ try:
31
+ from pypdf import PdfReader
32
+ except Exception:
33
+ PdfReader = None
34
+
35
  from core import (
36
  AUTO_COMPACT_COOLDOWN_SEC,
37
  AUTO_COMPACT_ENABLED,
 
80
  TEAM_DOC_NODE_CHUNK_SIZE = int(os.environ.get("TEAM_DOC_NODE_CHUNK_SIZE", "80"))
81
  TEAM_DOC_SLOW_LOG_SEC = float(os.environ.get("TEAM_DOC_SLOW_LOG_SEC", "2.5"))
82
  TEAM_DOC_HEAVY_NODE_WARN = int(os.environ.get("TEAM_DOC_HEAVY_NODE_WARN", "1200"))
83
+ TEAM_DOC_NODE_CATALOG_LIMIT = int(os.environ.get("TEAM_DOC_NODE_CATALOG_LIMIT", "1200"))
84
+ TEAM_DOC_PDF_MAX_PAGES = int(os.environ.get("TEAM_DOC_PDF_MAX_PAGES", "240"))
85
+ TEAM_DOC_LOG_LEVEL = os.environ.get("TEAM_DOC_LOG_LEVEL", "INFO").upper()
86
+ TEAM_DOC_OCR_ENABLED = os.environ.get("TEAM_DOC_OCR_ENABLED", "true").strip().lower() == "true"
87
+ TEAM_DOC_OCR_MAX_PAGES = int(os.environ.get("TEAM_DOC_OCR_MAX_PAGES", "12"))
88
+ TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE = int(os.environ.get("TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE", "1"))
89
+ TEAM_DOC_OCR_MODEL = os.environ.get("TEAM_DOC_OCR_MODEL", os.environ.get("MODEL_NAME", "gemini-flash-lite-latest"))
90
+ TEAM_DOC_OCR_IMAGE_MAX_SIZE = int(os.environ.get("TEAM_DOC_OCR_IMAGE_MAX_SIZE", "1700"))
91
+ TEAM_DOC_OCR_MAX_CHARS = int(os.environ.get("TEAM_DOC_OCR_MAX_CHARS", "160000"))
92
+ TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK = os.environ.get("TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK", "false").strip().lower() == "true"
93
+
94
+ team_doc_logger.setLevel(getattr(logging, TEAM_DOC_LOG_LEVEL, logging.INFO))
95
 
96
 
97
  def get_vn_now() -> datetime:
 
656
 
657
 
658
  def _safe_decode_text(raw_bytes: bytes, fallback_name: str = "") -> str:
659
+ normalized_name = str(fallback_name or "").lower()
660
+ looks_like_pdf = normalized_name.endswith(".pdf") or raw_bytes[:4] == b"%PDF"
661
+
662
+ def _extract_page_images(page: Any) -> List[bytes]:
663
+ images_attr = getattr(page, "images", None)
664
+ if images_attr is None:
665
+ return []
666
+
667
+ image_bytes_list: List[bytes] = []
668
+ try:
669
+ images_seq = list(images_attr)
670
+ except Exception:
671
+ images_seq = []
672
+
673
+ for image_obj in images_seq[: max(1, TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE)]:
674
+ image_data = getattr(image_obj, "data", None)
675
+ if isinstance(image_data, (bytes, bytearray)) and image_data:
676
+ image_bytes_list.append(bytes(image_data))
677
+ continue
678
+
679
+ pil_image = getattr(image_obj, "image", None)
680
+ if pil_image is not None:
681
+ try:
682
+ output = io.BytesIO()
683
+ pil_image.save(output, format="PNG")
684
+ image_bytes_list.append(output.getvalue())
685
+ except Exception:
686
+ continue
687
+ return image_bytes_list
688
+
689
+ def _normalize_image_for_ocr(image_bytes: bytes) -> tuple[bytes, str]:
690
+ try:
691
+ image = Image.open(io.BytesIO(image_bytes))
692
+ image = ImageOps.exif_transpose(image).convert("RGB")
693
+ image.thumbnail((TEAM_DOC_OCR_IMAGE_MAX_SIZE, TEAM_DOC_OCR_IMAGE_MAX_SIZE), Image.Resampling.LANCZOS)
694
+ output = io.BytesIO()
695
+ image.save(output, format="JPEG", quality=82, optimize=True)
696
+ return output.getvalue(), "image/jpeg"
697
+ except Exception:
698
+ return image_bytes, "image/png"
699
+
700
+ def _ocr_image_with_llm(image_bytes: bytes, mime_type: str, page_number: int) -> str:
701
+ try:
702
+ response = model_client.models.generate_content(
703
+ model=TEAM_DOC_OCR_MODEL,
704
+ contents=[
705
+ types.Part.from_text(
706
+ text=(
707
+ "Bạn là OCR engine. Trích xuất nguyên văn text có thể đọc được từ ảnh tài liệu. "
708
+ "Giữ bố cục cơ bản bằng xuống dòng. Không thêm giải thích."
709
+ )
710
+ ),
711
+ types.Part.from_bytes(data=image_bytes, mime_type=mime_type),
712
+ ],
713
+ config=types.GenerateContentConfig(temperature=0.0),
714
+ )
715
+ text = (response.text or "").strip()
716
+ if not text:
717
+ return ""
718
+ text = re.sub(r"^```(?:text)?\\s*", "", text)
719
+ text = re.sub(r"\\s*```$", "", text)
720
+ if len(text) > TEAM_DOC_OCR_MAX_CHARS:
721
+ text = text[:TEAM_DOC_OCR_MAX_CHARS]
722
+ team_doc_logger.info(
723
+ "[team-doc] OCR page success file=%s page=%d chars=%d",
724
+ fallback_name,
725
+ page_number,
726
+ len(text),
727
+ )
728
+ return text
729
+ except Exception:
730
+ team_doc_logger.exception(
731
+ "[team-doc] OCR page failed file=%s page=%d",
732
+ fallback_name,
733
+ page_number,
734
+ )
735
+ return ""
736
+
737
+ def _extract_pdf_text_by_ocr(reader: Any) -> str:
738
+ if not TEAM_DOC_OCR_ENABLED:
739
+ return ""
740
+
741
+ started = time.perf_counter()
742
+ page_limit = max(1, TEAM_DOC_OCR_MAX_PAGES)
743
+ page_count = min(len(reader.pages), page_limit)
744
+ collected_pages: List[str] = []
745
+
746
+ for page_idx in range(page_count):
747
+ page = reader.pages[page_idx]
748
+ page_images = _extract_page_images(page)
749
+ if not page_images:
750
+ continue
751
+
752
+ page_text_parts: List[str] = []
753
+ for image_bytes in page_images[: max(1, TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE)]:
754
+ normalized_bytes, mime_type = _normalize_image_for_ocr(image_bytes)
755
+ ocr_text = _ocr_image_with_llm(normalized_bytes, mime_type, page_idx + 1)
756
+ if ocr_text:
757
+ page_text_parts.append(ocr_text)
758
+
759
+ if page_text_parts:
760
+ collected_pages.append("\n".join(page_text_parts).strip())
761
+
762
+ text = "\n\n".join(part for part in collected_pages if part).strip()
763
+ elapsed = time.perf_counter() - started
764
+ team_doc_logger.info(
765
+ "[team-doc] OCR fallback done file=%s pages_scanned=%d pages_with_text=%d chars=%d duration=%.3fs",
766
+ fallback_name,
767
+ page_count,
768
+ len(collected_pages),
769
+ len(text),
770
+ elapsed,
771
+ )
772
+ return text
773
+
774
+ if looks_like_pdf:
775
+ if PdfReader is None:
776
+ team_doc_logger.warning(
777
+ "[team-doc] PDF parser unavailable (pypdf not installed), fallback to byte decode file=%s",
778
+ fallback_name,
779
+ )
780
+ else:
781
+ try:
782
+ reader = PdfReader(io.BytesIO(raw_bytes))
783
+ pages = reader.pages[:TEAM_DOC_PDF_MAX_PAGES]
784
+ page_texts: List[str] = []
785
+ for page in pages:
786
+ extracted = page.extract_text() or ""
787
+ if extracted.strip():
788
+ page_texts.append(extracted)
789
+ text = "\n\n".join(page_texts).strip()
790
+ if text:
791
+ team_doc_logger.info(
792
+ "[team-doc] PDF extracted file=%s pages=%d chars=%d",
793
+ fallback_name,
794
+ len(pages),
795
+ len(text),
796
+ )
797
+ return text
798
+ team_doc_logger.warning(
799
+ "[team-doc] PDF extraction returned empty text file=%s pages=%d",
800
+ fallback_name,
801
+ len(pages),
802
+ )
803
+
804
+ ocr_text = _extract_pdf_text_by_ocr(reader)
805
+ if ocr_text:
806
+ return ocr_text
807
+ except Exception:
808
+ team_doc_logger.exception(
809
+ "[team-doc] PDF extraction failed file=%s, fallback to byte decode",
810
+ fallback_name,
811
+ )
812
+
813
+ if TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK:
814
+ team_doc_logger.warning(
815
+ "[team-doc] PDF textual extraction unavailable, binary decode fallback enabled file=%s",
816
+ fallback_name,
817
+ )
818
+ else:
819
+ team_doc_logger.warning(
820
+ "[team-doc] PDF textual extraction unavailable, returning placeholder to avoid binary noise file=%s",
821
+ fallback_name,
822
+ )
823
+ return "Khong the trich xuat van ban tu PDF nay. Vui long upload PDF co text layer hoac thu OCR fallback."
824
+
825
  encodings = ["utf-8", "utf-8-sig", "cp1258", "latin-1"]
826
  for enc in encodings:
827
  try:
 
981
  return nodes
982
 
983
 
984
+ def _load_team_document_nodes_preview(
985
+ doc_id: str,
986
+ max_nodes: int,
987
+ chunk_size_hint: int = TEAM_DOC_NODE_CHUNK_SIZE,
988
+ ) -> List[Dict[str, Any]]:
989
+ normalized_doc_id = str(doc_id or "").strip()
990
+ if not normalized_doc_id or max_nodes <= 0:
991
+ return []
992
+
993
+ effective_chunk_size = max(1, int(chunk_size_hint or TEAM_DOC_NODE_CHUNK_SIZE))
994
+ chunk_limit = max(1, math.ceil(max_nodes / effective_chunk_size))
995
+ projection = {"_id": 0, "chunk_index": 1, "nodes": 1}
996
+
997
+ try:
998
+ cursor = (
999
+ team_doc_chunks_collection.find({"doc_id": normalized_doc_id}, projection)
1000
+ .hint("idx_team_doc_chunks_doc_id_chunk")
1001
+ .sort("chunk_index", 1)
1002
+ .limit(chunk_limit)
1003
+ )
1004
+ if hasattr(cursor, "allow_disk_use"):
1005
+ cursor = cursor.allow_disk_use(True)
1006
+ rows = list(cursor)
1007
+ except (OperationFailure, ValueError):
1008
+ rows = list(team_doc_chunks_collection.find({"doc_id": normalized_doc_id}, projection))
1009
+ rows.sort(key=lambda row: int(row.get("chunk_index", 0)))
1010
+ rows = rows[:chunk_limit]
1011
+
1012
+ nodes: List[Dict[str, Any]] = []
1013
+ for row in rows:
1014
+ chunk_nodes = row.get("nodes")
1015
+ if not isinstance(chunk_nodes, list):
1016
+ continue
1017
+ nodes.extend(chunk_nodes)
1018
+ if len(nodes) >= max_nodes:
1019
+ return nodes[:max_nodes]
1020
+ return nodes
1021
+
1022
+
1023
  def _resolve_document_tree(doc: Dict[str, Any]) -> Dict[str, Any]:
1024
  inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
1025
  if inline_tree and isinstance(inline_tree.get("nodes"), list):
 
1377
  )
1378
  for doc in docs:
1379
  doc_started = time.perf_counter()
1380
+ tree_meta = doc.get("tree_meta") if isinstance(doc.get("tree_meta"), dict) else {}
1381
+ inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
1382
+
1383
+ root_id = str(tree_meta.get("root_id") or "root")
1384
+ total_nodes = int(tree_meta.get("total_nodes") or 0)
1385
+ all_nodes: List[Dict[str, Any]] = []
1386
+
1387
+ if inline_tree and isinstance(inline_tree.get("nodes"), list):
1388
+ root_id = str(inline_tree.get("root_id") or root_id)
1389
+ all_nodes = inline_tree.get("nodes") or []
1390
+ total_nodes = int(inline_tree.get("total_nodes") or len(all_nodes))
1391
+ else:
1392
+ chunk_size_hint = int(tree_meta.get("chunk_size") or TEAM_DOC_NODE_CHUNK_SIZE)
1393
+ all_nodes = _load_team_document_nodes_preview(
1394
+ str(doc.get("id") or ""),
1395
+ TEAM_DOC_NODE_CATALOG_LIMIT,
1396
+ chunk_size_hint=chunk_size_hint,
1397
+ )
1398
+
1399
+ preview_nodes = all_nodes[:TEAM_DOC_NODE_CATALOG_LIMIT] if TEAM_DOC_NODE_CATALOG_LIMIT > 0 else []
1400
+ tree_for_paths = {"root_id": root_id, "nodes": preview_nodes}
1401
+
1402
  doc["tree"] = {
1403
+ "root_id": root_id,
1404
+ "total_nodes": int(total_nodes or len(all_nodes)),
1405
  }
1406
+ doc["node_catalog_total"] = int(total_nodes or len(all_nodes))
1407
+ doc["node_catalog_limit"] = TEAM_DOC_NODE_CATALOG_LIMIT
1408
+ doc["node_catalog_truncated"] = doc["node_catalog_total"] > len(preview_nodes)
1409
+
1410
  path_cache: Dict[str, Dict[str, Any]] = {}
1411
  node_catalog: List[Dict[str, Any]] = []
1412
+ for node in preview_nodes:
1413
  node_id = str(node.get("id") or "")
1414
  if not node_id:
1415
  continue
1416
  if node_id not in path_cache:
1417
+ path_cache[node_id] = _build_node_path(tree_for_paths, node_id)
1418
  path_info = path_cache[node_id]
1419
  node_catalog.append(
1420
  {
 
1432
  doc["node_catalog"] = node_catalog
1433
 
1434
  doc_elapsed = time.perf_counter() - doc_started
1435
+ if doc.get("node_catalog_truncated"):
1436
+ team_doc_logger.warning(
1437
+ "[team-doc] list doc catalog truncated doc_id=%s returned=%d total=%d limit=%d",
1438
+ doc.get("id"),
1439
+ len(node_catalog),
1440
+ doc.get("node_catalog_total"),
1441
+ TEAM_DOC_NODE_CATALOG_LIMIT,
1442
+ )
1443
+
1444
+ if int(doc.get("node_catalog_total") or 0) >= TEAM_DOC_HEAVY_NODE_WARN or doc_elapsed >= TEAM_DOC_SLOW_LOG_SEC:
1445
  team_doc_logger.warning(
1446
  "[team-doc] list doc heavy doc_id=%s nodes=%d duration=%.3fs",
1447
  doc.get("id"),
1448
+ int(doc.get("node_catalog_total") or 0),
1449
  doc_elapsed,
1450
  )
1451