Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import easyocr | |
| import fitz | |
| import docx | |
| import os | |
| import numpy as np | |
| from PIL import Image | |
# Build the shared EasyOCR engine once at module import; every OCR call below
# reuses this instance. Recognises Arabic and English, running on CPU only.
reader = easyocr.Reader(
    lang_list=["ar", "en"],
    gpu=False,
)
def _ocr_pil_image(image):
    """Run OCR on a PIL image and return its paragraphs joined by newlines.

    The image is written as a PNG into a private temporary directory that is
    removed when the call finishes (even if OCR raises), so concurrent
    requests never overwrite each other's scratch file and nothing is left
    behind in the working directory.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.TemporaryDirectory() as scratch_dir:
        png_path = os.path.join(scratch_dir, "ocr_input.png")
        image.save(png_path)
        paragraphs = reader.readtext(png_path, detail=0, paragraph=True)
    return "\n".join(paragraphs)


def _extract_file_chunks(path):
    """Return the text chunks extracted from one uploaded file.

    Images yield one chunk, PDFs one chunk per page, .docx and .txt one chunk
    each. Files with any other extension yield no chunks.
    """
    ext = os.path.splitext(path)[1].lower()

    # Raster images go straight to the OCR engine.
    if ext in (".jpg", ".jpeg", ".png"):
        return ["\n".join(reader.readtext(path, detail=0, paragraph=True))]

    if ext == ".pdf":
        chunks = []
        doc = fitz.open(path)
        try:
            for page in doc:
                # Render each page at 2x zoom before running OCR on it.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                page_image = Image.frombytes(
                    "RGB", [pix.width, pix.height], pix.samples
                )
                chunks.append(_ocr_pil_image(page_image))
        finally:
            # Close the document even when rendering or OCR fails mid-way.
            doc.close()
        return chunks

    if ext == ".docx":
        document = docx.Document(path)
        return ["\n".join([p.text for p in document.paragraphs])]

    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return [f.read()]

    return []


def process_raw_ocr(file_objs, img_input, current_text):
    """Extract raw text from uploaded files and/or an image and append it.

    Args:
        file_objs: Iterable of uploaded files, or None/empty. Each item is
            either an object exposing the file path as ``.name`` or a plain
            path string. Supported extensions: .jpg, .jpeg, .png, .pdf,
            .docx, .txt; anything else is skipped.
        img_input: Optional image, either a NumPy array (cast to uint8) or an
            object with a PIL-style ``save`` method. None skips this step.
        current_text: Text already shown in the output box; None is treated
            as an empty string.

    Returns:
        ``current_text`` followed by a blank line and the newly extracted
        text, with leading/trailing whitespace stripped.
    """
    chunks = []

    # 1. Uploaded files.
    for file in file_objs or []:
        # Accept both file wrappers (with .name) and bare path strings.
        path = getattr(file, "name", file)
        chunks.extend(_extract_file_chunks(path))

    # 2. Image / camera input.
    if img_input is not None:
        if isinstance(img_input, np.ndarray):
            img_pil = Image.fromarray(img_input.astype("uint8"))
        else:
            img_pil = img_input
        chunks.append(_ocr_pil_image(img_pil))

    # Every chunk is terminated by a newline, as in the original output format.
    text_output = "".join(chunk + "\n" for chunk in chunks)
    previous_text = current_text or ""
    return f"{previous_text}\n\n{text_output}".strip()
# Two-column UI: inputs (file upload, image, trigger button) on the left,
# extracted text on the right.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Raw OCR Engine (نسخة خام)")
    with gr.Row():
        with gr.Column():
            f_in = gr.File(label="الملفات", file_count="multiple")
            i_in = gr.Image(label="الكاميرا")
            btn = gr.Button("استخراج النص الخام", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="النص المستخرج كما هو", lines=25)
    # The textbox is both an input and the output, so each click appends the
    # newly extracted text to whatever is already displayed.
    btn.click(process_raw_ocr, inputs=[f_in, i_in, out], outputs=[out])

# Start the server only when run as a script, so importing this module
# (e.g. from tests or tooling) does not launch the app as a side effect.
if __name__ == "__main__":
    demo.launch()