import os
import tempfile

import docx
import easyocr
import fitz
import gradio as gr
import numpy as np
from PIL import Image

# Load the OCR engine once at import time (Arabic + English models, GPU disabled)
# so every request reuses the same reader instance.
reader = easyocr.Reader(['ar', 'en'], gpu=False)

# File extensions that are sent straight to the OCR engine as images.
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _ocr_path(image_path):
    """Run OCR on the image stored at *image_path* and return one text chunk.

    The recognized paragraphs are joined with newlines and terminated by a
    trailing newline, matching the format used for every other source.
    """
    paragraphs = reader.readtext(image_path, detail=0, paragraph=True)
    return "\n".join(paragraphs) + "\n"


def _ocr_pil_image(image):
    """Run OCR on an in-memory PIL image via a private temporary PNG file.

    A fresh temporary directory is used per call so that concurrent requests
    never overwrite each other's image, and the file is removed even when the
    OCR call raises.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, "ocr_input.png")
        image.save(tmp_path)
        return _ocr_path(tmp_path)


def _extract_from_file(path):
    """Return the text extracted from one uploaded file, chosen by extension.

    Supported: .jpg/.jpeg/.png (OCR), .pdf (each page rendered then OCR'd),
    .docx (paragraph text), .txt (read as UTF-8, undecodable bytes ignored).
    Any other extension yields an empty string.
    """
    ext = os.path.splitext(path)[1].lower()

    if ext in _IMAGE_EXTENSIONS:
        return _ocr_path(path)

    if ext == ".pdf":
        chunks = []
        # The context manager closes the document even if a page fails.
        with fitz.open(path) as doc:
            for page in doc:
                # Render with the original 2x zoom matrix before OCR.
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                page_image = Image.frombytes(
                    "RGB", [pix.width, pix.height], pix.samples
                )
                chunks.append(_ocr_pil_image(page_image))
        return "".join(chunks)

    if ext == ".docx":
        document = docx.Document(path)
        return "\n".join(p.text for p in document.paragraphs) + "\n"

    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as handle:
            return handle.read() + "\n"

    # Unsupported file types are skipped silently, as before.
    return ""


def process_raw_ocr(file_objs, img_input, current_text):
    """Extract raw text from uploaded files and/or an image and append it.

    Args:
        file_objs: Iterable of uploaded files (objects exposing the path as
            ``.name``, or plain path strings), or a falsy value for none.
        img_input: Image from the image/camera widget as a NumPy array or an
            object with a PIL-style ``save`` method, or ``None``.
        current_text: Text already present in the output box; the new text is
            appended after it. ``None`` is treated as empty.

    Returns:
        The previous text, two newlines, and the newly extracted text, with
        leading/trailing whitespace stripped.
    """
    chunks = []

    # 1. Process the uploaded files.
    for file in file_objs or []:
        # Accept both file-wrapper objects (path in .name) and plain paths.
        path = getattr(file, "name", file)
        chunks.append(_extract_from_file(path))

    # 2. Process the image / camera input.
    if img_input is not None:
        if isinstance(img_input, np.ndarray):
            img_pil = Image.fromarray(img_input.astype('uint8'))
        else:
            img_pil = img_input
        chunks.append(_ocr_pil_image(img_pil))

    text_output = "".join(chunks)
    # Guard against None so the literal string "None" never reaches the output.
    previous = "" if current_text is None else current_text
    return f"{previous}\n\n{text_output}".strip()


with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Raw OCR Engine (نسخة خام)")
    with gr.Row():
        with gr.Column():
            f_in = gr.File(label="الملفات", file_count="multiple")
            i_in = gr.Image(label="الكاميرا")
            btn = gr.Button("استخراج النص الخام", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="النص المستخرج كما هو", lines=25)
    # The current textbox content is fed back in so new text is appended to it.
    btn.click(process_raw_ocr, inputs=[f_in, i_in, out], outputs=[out])

demo.launch()