| import os |
| import asyncio |
| from langchain.document_loaders import PyPDFLoader |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from langchain.vectorstores import FAISS |
| from langchain_huggingface.embeddings import HuggingFaceEmbeddings |
|
|
async def process_and_store_file(file_path, user_id, websocket=None, upload_directory="./uploaded_files"):
    """Index an uploaded PDF into a FAISS vector store, then delete the upload.

    Pipeline:
      1. Load the PDF at ``file_path`` with ``PyPDFLoader``.
      2. Split it with ``RecursiveCharacterTextSplitter`` (chunk size 3000,
         overlap 500).
      3. Embed the chunks with the
         ``sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`` model,
         build a FAISS index and save it to ``<upload_directory>/faiss_index``.

    The uploaded file is removed afterwards regardless of which step failed.

    Args:
        file_path: Path of the PDF to process; the file is deleted on exit.
        user_id: Currently unused by this function.
            NOTE(review): the index path does not include ``user_id``, so every
            call writes to the same ``faiss_index`` location — confirm intended.
        websocket: Optional object exposing an awaitable ``send_text(str)``;
            when truthy, progress and error messages are sent through it.
        upload_directory: Directory under which ``faiss_index`` is saved.

    Returns:
        None. Failures in a pipeline step are reported through ``websocket``
        (if given) and end processing early instead of being raised.
    """

    async def notify(message):
        # Progress reporting is optional; stay silent without a websocket.
        if websocket:
            await websocket.send_text(message)

    # NOTE: the Korean status strings below were restored from mis-decoded
    # (mojibake) text in the original source.
    try:
        # Step 1: load the PDF.
        try:
            await notify("1. PDF 파일 로드 중...")
            loader = PyPDFLoader(file_path)
            # Parsing is blocking work; run it off the event loop.
            documents = await asyncio.to_thread(loader.load)
            await notify(f"PDF 파일 로드 완료: {len(documents)} 문서")
        except Exception as e:
            await notify(f"PDF 파일 로드 오류: {e}")
            return

        # Step 2: split the documents into overlapping chunks.
        try:
            await notify("2. 텍스트 분할 중...")
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500)
            docs = await asyncio.to_thread(text_splitter.split_documents, documents)
            await notify(f"텍스트 분할 완료: {len(docs)} 청크")
        except Exception as e:
            await notify(f"텍스트 분할 오류: {e}")
            return

        # Step 3: embed the chunks and persist the FAISS index.
        try:
            await notify("3. 임베딩 생성 및 벡터화 중...")
            db_path = os.path.join(upload_directory, "faiss_index")

            def build_and_save_index():
                # Model loading, embedding and saving are all blocking, so they
                # are grouped here and executed in a worker thread.
                embeddings = HuggingFaceEmbeddings(
                    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
                )
                vectors = FAISS.from_documents(docs, embeddings)
                vectors.save_local(db_path)

            await asyncio.to_thread(build_and_save_index)
            await notify(f"FAISS 인덱스 저장 완료: {db_path}")
        except Exception as e:
            await notify(f"벡터화 오류: {e}")
            return
    finally:
        # Clean up the upload on every exit path, including the early returns
        # after a load or split failure (previously only step 3 was covered).
        try:
            if os.path.exists(file_path):
                os.remove(file_path)
                await notify(f"파일 삭제 완료: {file_path}")
        except Exception as e:
            await notify(f"파일 삭제 오류: {e}")
|
|