Upload 9 files
Browse files- requirements.txt +1 -0
- routes_team_chat.py +4 -0
- services_legacy.py +263 -8
requirements.txt
CHANGED
|
@@ -12,3 +12,4 @@ soundfile
|
|
| 12 |
numpy
|
| 13 |
scipy
|
| 14 |
pillow
|
|
|
|
|
|
| 12 |
numpy
|
| 13 |
scipy
|
| 14 |
pillow
|
| 15 |
+
pypdf
|
routes_team_chat.py
CHANGED
|
@@ -2,6 +2,7 @@ import json
|
|
| 2 |
import re
|
| 3 |
import asyncio
|
| 4 |
import logging
|
|
|
|
| 5 |
import time
|
| 6 |
from typing import Any, Dict, List, Optional
|
| 7 |
|
|
@@ -37,6 +38,9 @@ from services import (
|
|
| 37 |
|
| 38 |
router = APIRouter()
|
| 39 |
team_doc_route_logger = logging.getLogger("nomus.team_documents.route")
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
def _json_safe(value: Any) -> Any:
|
|
|
|
| 2 |
import re
|
| 3 |
import asyncio
|
| 4 |
import logging
|
| 5 |
+
import os
|
| 6 |
import time
|
| 7 |
from typing import Any, Dict, List, Optional
|
| 8 |
|
|
|
|
| 38 |
|
| 39 |
router = APIRouter()
|
| 40 |
team_doc_route_logger = logging.getLogger("nomus.team_documents.route")
|
| 41 |
+
team_doc_route_logger.setLevel(
|
| 42 |
+
getattr(logging, os.environ.get("TEAM_DOC_LOG_LEVEL", "INFO").upper(), logging.INFO)
|
| 43 |
+
)
|
| 44 |
|
| 45 |
|
| 46 |
def _json_safe(value: Any) -> Any:
|
services_legacy.py
CHANGED
|
@@ -27,6 +27,11 @@ from PIL import Image, ImageOps
|
|
| 27 |
from pymongo.errors import OperationFailure
|
| 28 |
from transformers import AutoTokenizer, VitsModel
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
from core import (
|
| 31 |
AUTO_COMPACT_COOLDOWN_SEC,
|
| 32 |
AUTO_COMPACT_ENABLED,
|
|
@@ -75,6 +80,18 @@ TEAM_DOC_NODE_CONTENT_LIMIT = int(os.environ.get("TEAM_DOC_NODE_CONTENT_LIMIT",
|
|
| 75 |
TEAM_DOC_NODE_CHUNK_SIZE = int(os.environ.get("TEAM_DOC_NODE_CHUNK_SIZE", "80"))
|
| 76 |
TEAM_DOC_SLOW_LOG_SEC = float(os.environ.get("TEAM_DOC_SLOW_LOG_SEC", "2.5"))
|
| 77 |
TEAM_DOC_HEAVY_NODE_WARN = int(os.environ.get("TEAM_DOC_HEAVY_NODE_WARN", "1200"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
def get_vn_now() -> datetime:
|
|
@@ -639,6 +656,172 @@ def get_selected_team_messages(team_id: str, selected_ids: List[str], project_id
|
|
| 639 |
|
| 640 |
|
| 641 |
def _safe_decode_text(raw_bytes: bytes, fallback_name: str = "") -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
encodings = ["utf-8", "utf-8-sig", "cp1258", "latin-1"]
|
| 643 |
for enc in encodings:
|
| 644 |
try:
|
|
@@ -798,6 +981,45 @@ def _load_team_document_nodes(doc_id: str) -> List[Dict[str, Any]]:
|
|
| 798 |
return nodes
|
| 799 |
|
| 800 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
def _resolve_document_tree(doc: Dict[str, Any]) -> Dict[str, Any]:
|
| 802 |
inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
|
| 803 |
if inline_tree and isinstance(inline_tree.get("nodes"), list):
|
|
@@ -1155,20 +1377,44 @@ def list_team_documents(team_id: str, project_id: Optional[str] = None) -> List[
|
|
| 1155 |
)
|
| 1156 |
for doc in docs:
|
| 1157 |
doc_started = time.perf_counter()
|
| 1158 |
-
|
| 1159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1160 |
doc["tree"] = {
|
| 1161 |
-
"root_id":
|
| 1162 |
-
"total_nodes": int(
|
| 1163 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1164 |
path_cache: Dict[str, Dict[str, Any]] = {}
|
| 1165 |
node_catalog: List[Dict[str, Any]] = []
|
| 1166 |
-
for node in
|
| 1167 |
node_id = str(node.get("id") or "")
|
| 1168 |
if not node_id:
|
| 1169 |
continue
|
| 1170 |
if node_id not in path_cache:
|
| 1171 |
-
path_cache[node_id] = _build_node_path(
|
| 1172 |
path_info = path_cache[node_id]
|
| 1173 |
node_catalog.append(
|
| 1174 |
{
|
|
@@ -1186,11 +1432,20 @@ def list_team_documents(team_id: str, project_id: Optional[str] = None) -> List[
|
|
| 1186 |
doc["node_catalog"] = node_catalog
|
| 1187 |
|
| 1188 |
doc_elapsed = time.perf_counter() - doc_started
|
| 1189 |
-
if
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1190 |
team_doc_logger.warning(
|
| 1191 |
"[team-doc] list doc heavy doc_id=%s nodes=%d duration=%.3fs",
|
| 1192 |
doc.get("id"),
|
| 1193 |
-
|
| 1194 |
doc_elapsed,
|
| 1195 |
)
|
| 1196 |
|
|
|
|
| 27 |
from pymongo.errors import OperationFailure
|
| 28 |
from transformers import AutoTokenizer, VitsModel
|
| 29 |
|
| 30 |
+
try:
|
| 31 |
+
from pypdf import PdfReader
|
| 32 |
+
except Exception:
|
| 33 |
+
PdfReader = None
|
| 34 |
+
|
| 35 |
from core import (
|
| 36 |
AUTO_COMPACT_COOLDOWN_SEC,
|
| 37 |
AUTO_COMPACT_ENABLED,
|
|
|
|
| 80 |
TEAM_DOC_NODE_CHUNK_SIZE = int(os.environ.get("TEAM_DOC_NODE_CHUNK_SIZE", "80"))
|
| 81 |
TEAM_DOC_SLOW_LOG_SEC = float(os.environ.get("TEAM_DOC_SLOW_LOG_SEC", "2.5"))
|
| 82 |
TEAM_DOC_HEAVY_NODE_WARN = int(os.environ.get("TEAM_DOC_HEAVY_NODE_WARN", "1200"))
|
| 83 |
+
TEAM_DOC_NODE_CATALOG_LIMIT = int(os.environ.get("TEAM_DOC_NODE_CATALOG_LIMIT", "1200"))
|
| 84 |
+
TEAM_DOC_PDF_MAX_PAGES = int(os.environ.get("TEAM_DOC_PDF_MAX_PAGES", "240"))
|
| 85 |
+
TEAM_DOC_LOG_LEVEL = os.environ.get("TEAM_DOC_LOG_LEVEL", "INFO").upper()
|
| 86 |
+
TEAM_DOC_OCR_ENABLED = os.environ.get("TEAM_DOC_OCR_ENABLED", "true").strip().lower() == "true"
|
| 87 |
+
TEAM_DOC_OCR_MAX_PAGES = int(os.environ.get("TEAM_DOC_OCR_MAX_PAGES", "12"))
|
| 88 |
+
TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE = int(os.environ.get("TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE", "1"))
|
| 89 |
+
TEAM_DOC_OCR_MODEL = os.environ.get("TEAM_DOC_OCR_MODEL", os.environ.get("MODEL_NAME", "gemini-flash-lite-latest"))
|
| 90 |
+
TEAM_DOC_OCR_IMAGE_MAX_SIZE = int(os.environ.get("TEAM_DOC_OCR_IMAGE_MAX_SIZE", "1700"))
|
| 91 |
+
TEAM_DOC_OCR_MAX_CHARS = int(os.environ.get("TEAM_DOC_OCR_MAX_CHARS", "160000"))
|
| 92 |
+
TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK = os.environ.get("TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK", "false").strip().lower() == "true"
|
| 93 |
+
|
| 94 |
+
team_doc_logger.setLevel(getattr(logging, TEAM_DOC_LOG_LEVEL, logging.INFO))
|
| 95 |
|
| 96 |
|
| 97 |
def get_vn_now() -> datetime:
|
|
|
|
| 656 |
|
| 657 |
|
| 658 |
def _safe_decode_text(raw_bytes: bytes, fallback_name: str = "") -> str:
|
| 659 |
+
normalized_name = str(fallback_name or "").lower()
|
| 660 |
+
looks_like_pdf = normalized_name.endswith(".pdf") or raw_bytes[:4] == b"%PDF"
|
| 661 |
+
|
| 662 |
+
def _extract_page_images(page: Any) -> List[bytes]:
|
| 663 |
+
images_attr = getattr(page, "images", None)
|
| 664 |
+
if images_attr is None:
|
| 665 |
+
return []
|
| 666 |
+
|
| 667 |
+
image_bytes_list: List[bytes] = []
|
| 668 |
+
try:
|
| 669 |
+
images_seq = list(images_attr)
|
| 670 |
+
except Exception:
|
| 671 |
+
images_seq = []
|
| 672 |
+
|
| 673 |
+
for image_obj in images_seq[: max(1, TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE)]:
|
| 674 |
+
image_data = getattr(image_obj, "data", None)
|
| 675 |
+
if isinstance(image_data, (bytes, bytearray)) and image_data:
|
| 676 |
+
image_bytes_list.append(bytes(image_data))
|
| 677 |
+
continue
|
| 678 |
+
|
| 679 |
+
pil_image = getattr(image_obj, "image", None)
|
| 680 |
+
if pil_image is not None:
|
| 681 |
+
try:
|
| 682 |
+
output = io.BytesIO()
|
| 683 |
+
pil_image.save(output, format="PNG")
|
| 684 |
+
image_bytes_list.append(output.getvalue())
|
| 685 |
+
except Exception:
|
| 686 |
+
continue
|
| 687 |
+
return image_bytes_list
|
| 688 |
+
|
| 689 |
+
def _normalize_image_for_ocr(image_bytes: bytes) -> tuple[bytes, str]:
|
| 690 |
+
try:
|
| 691 |
+
image = Image.open(io.BytesIO(image_bytes))
|
| 692 |
+
image = ImageOps.exif_transpose(image).convert("RGB")
|
| 693 |
+
image.thumbnail((TEAM_DOC_OCR_IMAGE_MAX_SIZE, TEAM_DOC_OCR_IMAGE_MAX_SIZE), Image.Resampling.LANCZOS)
|
| 694 |
+
output = io.BytesIO()
|
| 695 |
+
image.save(output, format="JPEG", quality=82, optimize=True)
|
| 696 |
+
return output.getvalue(), "image/jpeg"
|
| 697 |
+
except Exception:
|
| 698 |
+
return image_bytes, "image/png"
|
| 699 |
+
|
| 700 |
+
def _ocr_image_with_llm(image_bytes: bytes, mime_type: str, page_number: int) -> str:
|
| 701 |
+
try:
|
| 702 |
+
response = model_client.models.generate_content(
|
| 703 |
+
model=TEAM_DOC_OCR_MODEL,
|
| 704 |
+
contents=[
|
| 705 |
+
types.Part.from_text(
|
| 706 |
+
text=(
|
| 707 |
+
"Bạn là OCR engine. Trích xuất nguyên văn text có thể đọc được từ ảnh tài liệu. "
|
| 708 |
+
"Giữ bố cục cơ bản bằng xuống dòng. Không thêm giải thích."
|
| 709 |
+
)
|
| 710 |
+
),
|
| 711 |
+
types.Part.from_bytes(data=image_bytes, mime_type=mime_type),
|
| 712 |
+
],
|
| 713 |
+
config=types.GenerateContentConfig(temperature=0.0),
|
| 714 |
+
)
|
| 715 |
+
text = (response.text or "").strip()
|
| 716 |
+
if not text:
|
| 717 |
+
return ""
|
| 718 |
+
text = re.sub(r"^```(?:text)?\\s*", "", text)
|
| 719 |
+
text = re.sub(r"\\s*```$", "", text)
|
| 720 |
+
if len(text) > TEAM_DOC_OCR_MAX_CHARS:
|
| 721 |
+
text = text[:TEAM_DOC_OCR_MAX_CHARS]
|
| 722 |
+
team_doc_logger.info(
|
| 723 |
+
"[team-doc] OCR page success file=%s page=%d chars=%d",
|
| 724 |
+
fallback_name,
|
| 725 |
+
page_number,
|
| 726 |
+
len(text),
|
| 727 |
+
)
|
| 728 |
+
return text
|
| 729 |
+
except Exception:
|
| 730 |
+
team_doc_logger.exception(
|
| 731 |
+
"[team-doc] OCR page failed file=%s page=%d",
|
| 732 |
+
fallback_name,
|
| 733 |
+
page_number,
|
| 734 |
+
)
|
| 735 |
+
return ""
|
| 736 |
+
|
| 737 |
+
def _extract_pdf_text_by_ocr(reader: Any) -> str:
|
| 738 |
+
if not TEAM_DOC_OCR_ENABLED:
|
| 739 |
+
return ""
|
| 740 |
+
|
| 741 |
+
started = time.perf_counter()
|
| 742 |
+
page_limit = max(1, TEAM_DOC_OCR_MAX_PAGES)
|
| 743 |
+
page_count = min(len(reader.pages), page_limit)
|
| 744 |
+
collected_pages: List[str] = []
|
| 745 |
+
|
| 746 |
+
for page_idx in range(page_count):
|
| 747 |
+
page = reader.pages[page_idx]
|
| 748 |
+
page_images = _extract_page_images(page)
|
| 749 |
+
if not page_images:
|
| 750 |
+
continue
|
| 751 |
+
|
| 752 |
+
page_text_parts: List[str] = []
|
| 753 |
+
for image_bytes in page_images[: max(1, TEAM_DOC_OCR_MAX_IMAGES_PER_PAGE)]:
|
| 754 |
+
normalized_bytes, mime_type = _normalize_image_for_ocr(image_bytes)
|
| 755 |
+
ocr_text = _ocr_image_with_llm(normalized_bytes, mime_type, page_idx + 1)
|
| 756 |
+
if ocr_text:
|
| 757 |
+
page_text_parts.append(ocr_text)
|
| 758 |
+
|
| 759 |
+
if page_text_parts:
|
| 760 |
+
collected_pages.append("\n".join(page_text_parts).strip())
|
| 761 |
+
|
| 762 |
+
text = "\n\n".join(part for part in collected_pages if part).strip()
|
| 763 |
+
elapsed = time.perf_counter() - started
|
| 764 |
+
team_doc_logger.info(
|
| 765 |
+
"[team-doc] OCR fallback done file=%s pages_scanned=%d pages_with_text=%d chars=%d duration=%.3fs",
|
| 766 |
+
fallback_name,
|
| 767 |
+
page_count,
|
| 768 |
+
len(collected_pages),
|
| 769 |
+
len(text),
|
| 770 |
+
elapsed,
|
| 771 |
+
)
|
| 772 |
+
return text
|
| 773 |
+
|
| 774 |
+
if looks_like_pdf:
|
| 775 |
+
if PdfReader is None:
|
| 776 |
+
team_doc_logger.warning(
|
| 777 |
+
"[team-doc] PDF parser unavailable (pypdf not installed), fallback to byte decode file=%s",
|
| 778 |
+
fallback_name,
|
| 779 |
+
)
|
| 780 |
+
else:
|
| 781 |
+
try:
|
| 782 |
+
reader = PdfReader(io.BytesIO(raw_bytes))
|
| 783 |
+
pages = reader.pages[:TEAM_DOC_PDF_MAX_PAGES]
|
| 784 |
+
page_texts: List[str] = []
|
| 785 |
+
for page in pages:
|
| 786 |
+
extracted = page.extract_text() or ""
|
| 787 |
+
if extracted.strip():
|
| 788 |
+
page_texts.append(extracted)
|
| 789 |
+
text = "\n\n".join(page_texts).strip()
|
| 790 |
+
if text:
|
| 791 |
+
team_doc_logger.info(
|
| 792 |
+
"[team-doc] PDF extracted file=%s pages=%d chars=%d",
|
| 793 |
+
fallback_name,
|
| 794 |
+
len(pages),
|
| 795 |
+
len(text),
|
| 796 |
+
)
|
| 797 |
+
return text
|
| 798 |
+
team_doc_logger.warning(
|
| 799 |
+
"[team-doc] PDF extraction returned empty text file=%s pages=%d",
|
| 800 |
+
fallback_name,
|
| 801 |
+
len(pages),
|
| 802 |
+
)
|
| 803 |
+
|
| 804 |
+
ocr_text = _extract_pdf_text_by_ocr(reader)
|
| 805 |
+
if ocr_text:
|
| 806 |
+
return ocr_text
|
| 807 |
+
except Exception:
|
| 808 |
+
team_doc_logger.exception(
|
| 809 |
+
"[team-doc] PDF extraction failed file=%s, fallback to byte decode",
|
| 810 |
+
fallback_name,
|
| 811 |
+
)
|
| 812 |
+
|
| 813 |
+
if TEAM_DOC_PDF_ALLOW_BINARY_FALLBACK:
|
| 814 |
+
team_doc_logger.warning(
|
| 815 |
+
"[team-doc] PDF textual extraction unavailable, binary decode fallback enabled file=%s",
|
| 816 |
+
fallback_name,
|
| 817 |
+
)
|
| 818 |
+
else:
|
| 819 |
+
team_doc_logger.warning(
|
| 820 |
+
"[team-doc] PDF textual extraction unavailable, returning placeholder to avoid binary noise file=%s",
|
| 821 |
+
fallback_name,
|
| 822 |
+
)
|
| 823 |
+
return "Khong the trich xuat van ban tu PDF nay. Vui long upload PDF co text layer hoac thu OCR fallback."
|
| 824 |
+
|
| 825 |
encodings = ["utf-8", "utf-8-sig", "cp1258", "latin-1"]
|
| 826 |
for enc in encodings:
|
| 827 |
try:
|
|
|
|
| 981 |
return nodes
|
| 982 |
|
| 983 |
|
| 984 |
+
def _load_team_document_nodes_preview(
|
| 985 |
+
doc_id: str,
|
| 986 |
+
max_nodes: int,
|
| 987 |
+
chunk_size_hint: int = TEAM_DOC_NODE_CHUNK_SIZE,
|
| 988 |
+
) -> List[Dict[str, Any]]:
|
| 989 |
+
normalized_doc_id = str(doc_id or "").strip()
|
| 990 |
+
if not normalized_doc_id or max_nodes <= 0:
|
| 991 |
+
return []
|
| 992 |
+
|
| 993 |
+
effective_chunk_size = max(1, int(chunk_size_hint or TEAM_DOC_NODE_CHUNK_SIZE))
|
| 994 |
+
chunk_limit = max(1, math.ceil(max_nodes / effective_chunk_size))
|
| 995 |
+
projection = {"_id": 0, "chunk_index": 1, "nodes": 1}
|
| 996 |
+
|
| 997 |
+
try:
|
| 998 |
+
cursor = (
|
| 999 |
+
team_doc_chunks_collection.find({"doc_id": normalized_doc_id}, projection)
|
| 1000 |
+
.hint("idx_team_doc_chunks_doc_id_chunk")
|
| 1001 |
+
.sort("chunk_index", 1)
|
| 1002 |
+
.limit(chunk_limit)
|
| 1003 |
+
)
|
| 1004 |
+
if hasattr(cursor, "allow_disk_use"):
|
| 1005 |
+
cursor = cursor.allow_disk_use(True)
|
| 1006 |
+
rows = list(cursor)
|
| 1007 |
+
except (OperationFailure, ValueError):
|
| 1008 |
+
rows = list(team_doc_chunks_collection.find({"doc_id": normalized_doc_id}, projection))
|
| 1009 |
+
rows.sort(key=lambda row: int(row.get("chunk_index", 0)))
|
| 1010 |
+
rows = rows[:chunk_limit]
|
| 1011 |
+
|
| 1012 |
+
nodes: List[Dict[str, Any]] = []
|
| 1013 |
+
for row in rows:
|
| 1014 |
+
chunk_nodes = row.get("nodes")
|
| 1015 |
+
if not isinstance(chunk_nodes, list):
|
| 1016 |
+
continue
|
| 1017 |
+
nodes.extend(chunk_nodes)
|
| 1018 |
+
if len(nodes) >= max_nodes:
|
| 1019 |
+
return nodes[:max_nodes]
|
| 1020 |
+
return nodes
|
| 1021 |
+
|
| 1022 |
+
|
| 1023 |
def _resolve_document_tree(doc: Dict[str, Any]) -> Dict[str, Any]:
|
| 1024 |
inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
|
| 1025 |
if inline_tree and isinstance(inline_tree.get("nodes"), list):
|
|
|
|
| 1377 |
)
|
| 1378 |
for doc in docs:
|
| 1379 |
doc_started = time.perf_counter()
|
| 1380 |
+
tree_meta = doc.get("tree_meta") if isinstance(doc.get("tree_meta"), dict) else {}
|
| 1381 |
+
inline_tree = doc.get("tree") if isinstance(doc.get("tree"), dict) else None
|
| 1382 |
+
|
| 1383 |
+
root_id = str(tree_meta.get("root_id") or "root")
|
| 1384 |
+
total_nodes = int(tree_meta.get("total_nodes") or 0)
|
| 1385 |
+
all_nodes: List[Dict[str, Any]] = []
|
| 1386 |
+
|
| 1387 |
+
if inline_tree and isinstance(inline_tree.get("nodes"), list):
|
| 1388 |
+
root_id = str(inline_tree.get("root_id") or root_id)
|
| 1389 |
+
all_nodes = inline_tree.get("nodes") or []
|
| 1390 |
+
total_nodes = int(inline_tree.get("total_nodes") or len(all_nodes))
|
| 1391 |
+
else:
|
| 1392 |
+
chunk_size_hint = int(tree_meta.get("chunk_size") or TEAM_DOC_NODE_CHUNK_SIZE)
|
| 1393 |
+
all_nodes = _load_team_document_nodes_preview(
|
| 1394 |
+
str(doc.get("id") or ""),
|
| 1395 |
+
TEAM_DOC_NODE_CATALOG_LIMIT,
|
| 1396 |
+
chunk_size_hint=chunk_size_hint,
|
| 1397 |
+
)
|
| 1398 |
+
|
| 1399 |
+
preview_nodes = all_nodes[:TEAM_DOC_NODE_CATALOG_LIMIT] if TEAM_DOC_NODE_CATALOG_LIMIT > 0 else []
|
| 1400 |
+
tree_for_paths = {"root_id": root_id, "nodes": preview_nodes}
|
| 1401 |
+
|
| 1402 |
doc["tree"] = {
|
| 1403 |
+
"root_id": root_id,
|
| 1404 |
+
"total_nodes": int(total_nodes or len(all_nodes)),
|
| 1405 |
}
|
| 1406 |
+
doc["node_catalog_total"] = int(total_nodes or len(all_nodes))
|
| 1407 |
+
doc["node_catalog_limit"] = TEAM_DOC_NODE_CATALOG_LIMIT
|
| 1408 |
+
doc["node_catalog_truncated"] = doc["node_catalog_total"] > len(preview_nodes)
|
| 1409 |
+
|
| 1410 |
path_cache: Dict[str, Dict[str, Any]] = {}
|
| 1411 |
node_catalog: List[Dict[str, Any]] = []
|
| 1412 |
+
for node in preview_nodes:
|
| 1413 |
node_id = str(node.get("id") or "")
|
| 1414 |
if not node_id:
|
| 1415 |
continue
|
| 1416 |
if node_id not in path_cache:
|
| 1417 |
+
path_cache[node_id] = _build_node_path(tree_for_paths, node_id)
|
| 1418 |
path_info = path_cache[node_id]
|
| 1419 |
node_catalog.append(
|
| 1420 |
{
|
|
|
|
| 1432 |
doc["node_catalog"] = node_catalog
|
| 1433 |
|
| 1434 |
doc_elapsed = time.perf_counter() - doc_started
|
| 1435 |
+
if doc.get("node_catalog_truncated"):
|
| 1436 |
+
team_doc_logger.warning(
|
| 1437 |
+
"[team-doc] list doc catalog truncated doc_id=%s returned=%d total=%d limit=%d",
|
| 1438 |
+
doc.get("id"),
|
| 1439 |
+
len(node_catalog),
|
| 1440 |
+
doc.get("node_catalog_total"),
|
| 1441 |
+
TEAM_DOC_NODE_CATALOG_LIMIT,
|
| 1442 |
+
)
|
| 1443 |
+
|
| 1444 |
+
if int(doc.get("node_catalog_total") or 0) >= TEAM_DOC_HEAVY_NODE_WARN or doc_elapsed >= TEAM_DOC_SLOW_LOG_SEC:
|
| 1445 |
team_doc_logger.warning(
|
| 1446 |
"[team-doc] list doc heavy doc_id=%s nodes=%d duration=%.3fs",
|
| 1447 |
doc.get("id"),
|
| 1448 |
+
int(doc.get("node_catalog_total") or 0),
|
| 1449 |
doc_elapsed,
|
| 1450 |
)
|
| 1451 |
|