File size: 2,598 Bytes
20f67e5
 
 
 
 
 
 
 
f6d6e11
20f67e5
 
 
 
 
f6d6e11
20f67e5
 
64e9cfe
bd58369
 
 
 
 
 
 
64e9cfe
20f67e5
bd58369
64e9cfe
20f67e5
64e9cfe
 
20f67e5
 
 
64e9cfe
20f67e5
 
64e9cfe
20f67e5
 
f6d6e11
20f67e5
64e9cfe
 
 
 
 
bd58369
64e9cfe
 
20f67e5
 
 
 
64e9cfe
20f67e5
 
64e9cfe
20f67e5
64e9cfe
20f67e5
64e9cfe
20f67e5
 
 
7284cb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import tempfile

import docx
import easyocr
import fitz
import gradio as gr
import numpy as np
from PIL import Image

# Load the OCR engine once at import time (Arabic + English, CPU only).
reader = easyocr.Reader(lang_list=["ar", "en"], gpu=False)

def _ocr_via_png(image, png_path):
    """Save *image* as a PNG at *png_path* and return its OCR text plus a newline."""
    # The OCR engine is handed a PNG file on disk rather than the in-memory image.
    image.save(png_path)
    lines = reader.readtext(png_path, detail=0, paragraph=True)
    return "\n".join(lines) + "\n"


def _pdf_to_text(pdf_path, png_path):
    """Render each PDF page at 2x zoom and return the OCR text of all pages."""
    chunks = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            chunks.append(_ocr_via_png(img, png_path))
    finally:
        # Close the document even when rendering or OCR of a page fails.
        doc.close()
    return "".join(chunks)


def _file_to_text(path, png_path):
    """Return the text of one file, chosen by extension ('' if unsupported)."""
    ext = os.path.splitext(path)[1].lower()
    if ext in (".jpg", ".jpeg", ".png"):
        lines = reader.readtext(path, detail=0, paragraph=True)
        return "\n".join(lines) + "\n"
    if ext == ".pdf":
        return _pdf_to_text(path, png_path)
    if ext == ".docx":
        document = docx.Document(path)
        return "\n".join(p.text for p in document.paragraphs) + "\n"
    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as fh:
            return fh.read() + "\n"
    # Any other extension is skipped without an error.
    return ""


def process_raw_ocr(file_objs, img_input, current_text):
    """Extract raw text from files and/or an image and append it to existing text.

    Args:
        file_objs: Iterable of uploaded files, or None/empty. Each entry may be
            a path (``str`` or ``os.PathLike``) or an object exposing the path
            as ``.name``. Supported extensions (case-insensitive): .jpg, .jpeg,
            .png (OCR), .pdf (each page rendered at 2x zoom, then OCR),
            .docx (paragraph text) and .txt (read as UTF-8, undecodable bytes
            ignored). Other extensions are skipped.
        img_input: A NumPy array (cast to uint8) or an object with a
            ``save(path)`` method such as a PIL image, or None.
        current_text: Text already shown to the user; None is treated as empty.

    Returns:
        ``current_text``, a blank line, and the newly extracted text, with
        leading and trailing whitespace stripped.
    """
    chunks = []

    # A per-call scratch directory keeps overlapping calls from overwriting
    # each other's intermediate PNGs, and it is removed even if OCR raises.
    with tempfile.TemporaryDirectory() as work_dir:
        png_path = os.path.join(work_dir, "page.png")

        # 1. Uploaded files
        for entry in file_objs or []:
            if isinstance(entry, (str, os.PathLike)):
                # str() also turns str subclasses into a plain str path.
                path = str(entry)
            else:
                path = entry.name
            chunks.append(_file_to_text(path, png_path))

        # 2. Image / camera input
        if img_input is not None:
            if isinstance(img_input, np.ndarray):
                img_pil = Image.fromarray(img_input.astype("uint8"))
            else:
                img_pil = img_input
            chunks.append(_ocr_via_png(img_pil, png_path))

    previous = "" if current_text is None else current_text
    extracted = "".join(chunks)
    return f"{previous}\n\n{extracted}".strip()

# UI definition: one row with two columns. The first column holds the inputs
# and the trigger button; the second holds the text box with the result.
with gr.Blocks() as demo:
    gr.Markdown("# 🔍 Raw OCR Engine (نسخة خام)")

    with gr.Row():
        with gr.Column():
            f_in = gr.File(file_count="multiple", label="الملفات")
            i_in = gr.Image(label="الكاميرا")
            btn = gr.Button(value="استخراج النص الخام", variant="primary")
        with gr.Column():
            out = gr.Textbox(lines=25, label="النص المستخرج كما هو")

    # The text box is passed both as an input and as the output of the click
    # handler, so the handler receives the text currently displayed.
    btn.click(
        fn=process_raw_ocr,
        inputs=[f_in, i_in, out],
        outputs=[out],
    )

# Start the web server as soon as this module is executed.
demo.launch()