import gradio as gr
import easyocr
import fitz
import docx
import os
import numpy as np
from PIL import Image
# Load the OCR engine once at module import (Arabic + English, CPU only).
reader = easyocr.Reader(lang_list=['ar', 'en'], gpu=False)
def _ocr_lines_from_image(image):
    """Run OCR on a PIL image and return the recognised paragraphs.

    The image is written to a uniquely named temporary PNG so that
    concurrent requests never overwrite each other's scratch file, and the
    file is removed even when saving or OCR raises.
    """
    import tempfile  # local import: keeps the file-level import block as is

    fd, tmp_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        image.save(tmp_path)
        return reader.readtext(tmp_path, detail=0, paragraph=True)
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _extract_pdf_chunks(path):
    """Render each PDF page at 2x zoom, OCR it, and return one chunk per page."""
    chunks = []
    doc = fitz.open(path)
    try:
        for page in doc:
            # Original resolution choice: a 2x zoom matrix on both axes.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            chunks.append("\n".join(_ocr_lines_from_image(img)) + "\n")
    finally:
        # Close the document even if rendering or OCR fails part-way.
        doc.close()
    return chunks


def process_raw_ocr(file_objs, img_input, current_text):
    """Extract raw text from uploaded files and/or an image and append it.

    Args:
        file_objs: Iterable of uploads, or ``None``/empty. Each item is
            either an object exposing the file path as ``.name`` or a plain
            path (``str`` / ``os.PathLike``). Supported extensions are
            ``.jpg``, ``.jpeg``, ``.png`` (OCR), ``.pdf`` (each page rendered
            and OCR'd), ``.docx`` (paragraph text) and ``.txt`` (read as
            UTF-8, undecodable bytes ignored). Other extensions are skipped.
        img_input: Optional image, either a NumPy array (cast to ``uint8``)
            or an object with a PIL-style ``save(path)`` method.
        current_text: Text already shown to the user; ``None`` is treated
            as empty.

    Returns:
        ``current_text`` followed by a blank line and the newly extracted
        text, with leading/trailing whitespace stripped.
    """
    chunks = []

    # 1. Uploaded files.
    for file in file_objs or []:
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        ext = os.path.splitext(path)[1].lower()

        if ext in (".jpg", ".jpeg", ".png"):
            res = reader.readtext(path, detail=0, paragraph=True)
            chunks.append("\n".join(res) + "\n")
        elif ext == ".pdf":
            chunks.extend(_extract_pdf_chunks(path))
        elif ext == ".docx":
            d = docx.Document(path)
            chunks.append("\n".join(p.text for p in d.paragraphs) + "\n")
        elif ext == ".txt":
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                chunks.append(f.read() + "\n")

    # 2. Image / camera input.
    if img_input is not None:
        if isinstance(img_input, np.ndarray):
            img_pil = Image.fromarray(img_input.astype("uint8"))
        else:
            img_pil = img_input
        chunks.append("\n".join(_ocr_lines_from_image(img_pil)) + "\n")

    # A missing previous value must not be rendered as the string "None".
    previous = current_text or ""
    return f"{previous}\n\n{''.join(chunks)}".strip()
# Gradio UI: file upload, image input and the trigger button on one side,
# the extracted text on the other.
# NOTE(review): the row/column nesting below is reconstructed from the
# statement order; confirm it matches the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Raw OCR Engine (نسخة خام)")
    with gr.Row():
        with gr.Column():
            f_in = gr.File(label="الملفات", file_count="multiple")
            i_in = gr.Image(label="الكاميرا")
            btn = gr.Button("استخراج النص الخام", variant="primary")
        with gr.Column():
            out = gr.Textbox(label="النص المستخرج كما هو", lines=25)
    # `out` is passed as an input as well as the output, so the handler
    # receives the text currently in the box as its third argument.
    btn.click(process_raw_ocr, inputs=[f_in, i_in, out], outputs=[out])

demo.launch()