Spaces:
Sleeping
Sleeping
Claude committed on
feat: support ALTO XML et PAGE XML comme GT dans l'upload de corpus
Browse files
- Ajoute `_detect_xml_gt` qui détecte le format (ALTO/PAGE) et extrait le texte GT
- Ajoute `_extract_alto_text` : concatène les CONTENT des <String> par ligne
- Ajoute `_extract_page_text` : concatène les <Unicode> en ordre de lecture
- `_analyze_corpus_dir` reconnaît les paires image + .xml ; génère le .gt.txt
correspondant et indique le format dans chaque paire et le résumé global
- `_flatten_zip_to_dir` et l'endpoint upload acceptent désormais les .xml
- 11 nouveaux tests couvrant ALTO XML, PAGE XML, texte brut et XML inconnu
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/web/app.py +99 -7
- tests/test_sprint6_web_interface.py +155 -0
picarones/web/app.py
CHANGED
|
@@ -33,6 +33,7 @@ import shutil
|
|
| 33 |
import threading
|
| 34 |
import time
|
| 35 |
import uuid
|
|
|
|
| 36 |
import zipfile
|
| 37 |
from dataclasses import dataclass, field
|
| 38 |
from datetime import datetime, timezone
|
|
@@ -539,6 +540,76 @@ async def api_corpus_browse(path: str = Query(default=".", description="Chemin
|
|
| 539 |
# API — corpus upload
|
| 540 |
# ---------------------------------------------------------------------------
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
def _analyze_corpus_dir(path: Path) -> dict:
|
| 543 |
"""Analyse un dossier et retourne un résumé des paires image/GT détectées."""
|
| 544 |
images = sorted(f.name for f in path.iterdir() if f.suffix.lower() in _IMAGE_EXTS)
|
|
@@ -546,11 +617,31 @@ def _analyze_corpus_dir(path: Path) -> dict:
|
|
| 546 |
missing_gt: list[str] = []
|
| 547 |
for img in images:
|
| 548 |
stem = Path(img).stem
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
else:
|
| 553 |
missing_gt.append(img)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
return {
|
| 555 |
"doc_count": len(pairs),
|
| 556 |
"pairs": pairs[:20],
|
|
@@ -559,19 +650,20 @@ def _analyze_corpus_dir(path: Path) -> dict:
|
|
| 559 |
"has_missing_gt": len(missing_gt) > 0,
|
| 560 |
"warnings": [f"GT manquant : {img}" for img in missing_gt[:5]],
|
| 561 |
"usable": len(pairs) > 0,
|
|
|
|
| 562 |
}
|
| 563 |
|
| 564 |
|
| 565 |
def _flatten_zip_to_dir(zf: zipfile.ZipFile, dest: Path) -> None:
|
| 566 |
-
"""Extrait un ZIP en aplatissant les paires image/.gt.txt dans dest."""
|
| 567 |
dest.mkdir(parents=True, exist_ok=True)
|
| 568 |
for member in zf.infolist():
|
| 569 |
if member.is_dir():
|
| 570 |
continue
|
| 571 |
p = Path(member.filename)
|
| 572 |
name = p.name
|
| 573 |
-
# Accepter images
|
| 574 |
-
if p.suffix.lower() in _IMAGE_EXTS or name.endswith(".gt.txt"):
|
| 575 |
data = zf.read(member.filename)
|
| 576 |
(dest / name).write_bytes(data)
|
| 577 |
|
|
@@ -594,7 +686,7 @@ async def api_corpus_upload(files: list[UploadFile] = File(...)) -> dict:
|
|
| 594 |
import io
|
| 595 |
with zipfile.ZipFile(io.BytesIO(data)) as zf:
|
| 596 |
_flatten_zip_to_dir(zf, corpus_dir)
|
| 597 |
-
elif suffix in _IMAGE_EXTS or filename.endswith(".gt.txt") or suffix
|
| 598 |
(corpus_dir / filename).write_bytes(data)
|
| 599 |
# Ignorer les autres types
|
| 600 |
|
|
|
|
| 33 |
import threading
|
| 34 |
import time
|
| 35 |
import uuid
|
| 36 |
+
import xml.etree.ElementTree as ET
|
| 37 |
import zipfile
|
| 38 |
from dataclasses import dataclass, field
|
| 39 |
from datetime import datetime, timezone
|
|
|
|
| 540 |
# API — corpus upload
|
| 541 |
# ---------------------------------------------------------------------------
|
| 542 |
|
| 543 |
+
def _detect_xml_gt(xml_bytes: bytes) -> tuple[str, str] | None:
|
| 544 |
+
"""Détecte si xml_bytes est un fichier ALTO ou PAGE XML et extrait le texte GT.
|
| 545 |
+
|
| 546 |
+
Retourne (format_label, texte_gt) ou None si le format n'est pas reconnu.
|
| 547 |
+
"""
|
| 548 |
+
try:
|
| 549 |
+
root = ET.fromstring(xml_bytes)
|
| 550 |
+
except ET.ParseError:
|
| 551 |
+
return None
|
| 552 |
+
|
| 553 |
+
tag = root.tag # peut être "{namespace}alto" ou "alto" ou "{ns}PcGts"
|
| 554 |
+
|
| 555 |
+
# --- ALTO XML ---
|
| 556 |
+
# Namespace contient loc.gov/standards/alto ou balise racine "alto"
|
| 557 |
+
ns_alto = "http://www.loc.gov/standards/alto"
|
| 558 |
+
is_alto = (
|
| 559 |
+
ns_alto in tag
|
| 560 |
+
or tag.lower() == "alto"
|
| 561 |
+
or (tag.startswith("{") and tag.split("}")[1].lower() in ("alto",))
|
| 562 |
+
)
|
| 563 |
+
if is_alto:
|
| 564 |
+
text = _extract_alto_text(root)
|
| 565 |
+
return ("ALTO XML", text)
|
| 566 |
+
|
| 567 |
+
# --- PAGE XML ---
|
| 568 |
+
# Balise racine PcGts (avec ou sans namespace)
|
| 569 |
+
local = tag.split("}")[-1] if "}" in tag else tag
|
| 570 |
+
if local == "PcGts":
|
| 571 |
+
text = _extract_page_text(root)
|
| 572 |
+
return ("PAGE XML", text)
|
| 573 |
+
|
| 574 |
+
return None
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
def _extract_alto_text(root: ET.Element) -> str:
    """Return the plain text of an ALTO XML tree.

    Every ``TextLine`` element (namespaced or not) becomes one output line
    made of the non-empty ``CONTENT`` attributes of the ``String`` elements it
    contains, joined with single spaces. Lines that yield no word are
    dropped; the remaining lines are joined with newlines in document order.
    """

    def bare(tag: str) -> str:
        # Strip a leading "{namespace}" from a Clark-notation tag.
        return tag.split("}")[-1] if "}" in tag else tag

    rendered: list[str] = []
    for node in root.iter():
        if bare(node.tag) != "TextLine":
            continue
        tokens = [
            item.get("CONTENT", "")
            for item in node.iter()
            if bare(item.tag) == "String"
        ]
        tokens = [token for token in tokens if token]
        if tokens:
            rendered.append(" ".join(tokens))
    return "\n".join(rendered)
|
| 598 |
+
|
| 599 |
+
|
| 600 |
+
def _extract_page_text(root: ET.Element) -> str:
    """Return the plain text of a PAGE XML tree, one output line per text line.

    PAGE XML may carry the same transcription at several levels at once
    (``TextRegion``, ``TextLine``, ``Word``), each in its own
    ``TextEquiv/Unicode`` child. Collecting every ``<Unicode>`` element would
    repeat the text once per level, so a single level is used for each piece
    of text:

    * a ``TextLine`` contributes its own ``TextEquiv`` text, or, when it has
      none, the texts of its ``Word`` children joined with single spaces;
    * a ``TextRegion`` containing neither ``TextLine`` nor nested
      ``TextRegion`` contributes its own ``TextEquiv`` text;
    * if neither rule yields anything, every non-empty ``<Unicode>`` element
      of the tree is used, so unusual layouts still produce text.

    Elements are visited in document order; the ``ReadingOrder`` element is
    not consulted.

    Args:
        root: Root element of the parsed PAGE document (namespaced or not).

    Returns:
        The collected texts joined with newlines, or ``""`` if none is found.
    """

    def local_name(elem: ET.Element) -> str:
        # Strip a leading "{namespace}"; non-string tags count as unnamed.
        tag = elem.tag
        return tag.rpartition("}")[2] if isinstance(tag, str) else ""

    def own_text(elem: ET.Element) -> str:
        # Text of the element's *direct* TextEquiv/Unicode, ignoring descendants.
        for equiv in elem:
            if local_name(equiv) != "TextEquiv":
                continue
            for node in equiv:
                if local_name(node) == "Unicode":
                    text = (node.text or "").strip()
                    if text:
                        return text
        return ""

    def line_text(line: ET.Element) -> str:
        # Prefer the line-level transcription; rebuild it from words otherwise.
        text = own_text(line)
        if text:
            return text
        words = [own_text(w) for w in line if local_name(w) == "Word"]
        return " ".join(w for w in words if w)

    def is_leaf_region(region: ET.Element) -> bool:
        # True when no finer-grained container below could supply the text.
        return not any(
            local_name(sub) in ("TextLine", "TextRegion")
            for sub in region.iter()
            if sub is not region
        )

    chunks: list[str] = []
    for elem in root.iter():
        name = local_name(elem)
        if name == "TextLine":
            text = line_text(elem)
        elif name == "TextRegion" and is_leaf_region(elem):
            text = own_text(elem)
        else:
            continue
        if text:
            chunks.append(text)

    if not chunks:
        # Last resort: flat scan of every <Unicode> element.
        for elem in root.iter():
            if local_name(elem) == "Unicode":
                text = (elem.text or "").strip()
                if text:
                    chunks.append(text)

    return "\n".join(chunks)
|
| 611 |
+
|
| 612 |
+
|
| 613 |
def _analyze_corpus_dir(path: Path) -> dict:
|
| 614 |
"""Analyse un dossier et retourne un résumé des paires image/GT détectées."""
|
| 615 |
images = sorted(f.name for f in path.iterdir() if f.suffix.lower() in _IMAGE_EXTS)
|
|
|
|
| 617 |
missing_gt: list[str] = []
|
| 618 |
for img in images:
|
| 619 |
stem = Path(img).stem
|
| 620 |
+
gt_txt = path / (stem + ".gt.txt")
|
| 621 |
+
gt_xml = path / (stem + ".xml")
|
| 622 |
+
if gt_txt.exists():
|
| 623 |
+
pairs.append({"image": img, "gt": stem + ".gt.txt", "gt_format": "texte brut"})
|
| 624 |
+
elif gt_xml.exists():
|
| 625 |
+
result = _detect_xml_gt(gt_xml.read_bytes())
|
| 626 |
+
if result is not None:
|
| 627 |
+
fmt, text = result
|
| 628 |
+
# Matérialiser le GT en .gt.txt pour le chargeur de corpus
|
| 629 |
+
gt_txt.write_text(text, encoding="utf-8")
|
| 630 |
+
pairs.append({"image": img, "gt": stem + ".gt.txt", "gt_format": fmt})
|
| 631 |
+
else:
|
| 632 |
+
missing_gt.append(img)
|
| 633 |
else:
|
| 634 |
missing_gt.append(img)
|
| 635 |
+
|
| 636 |
+
# Détecter le format dominant pour le résumé global
|
| 637 |
+
formats = {p["gt_format"] for p in pairs}
|
| 638 |
+
if len(formats) == 1:
|
| 639 |
+
dominant_format: str = formats.pop()
|
| 640 |
+
elif formats:
|
| 641 |
+
dominant_format = "mixte"
|
| 642 |
+
else:
|
| 643 |
+
dominant_format = "texte brut"
|
| 644 |
+
|
| 645 |
return {
|
| 646 |
"doc_count": len(pairs),
|
| 647 |
"pairs": pairs[:20],
|
|
|
|
| 650 |
"has_missing_gt": len(missing_gt) > 0,
|
| 651 |
"warnings": [f"GT manquant : {img}" for img in missing_gt[:5]],
|
| 652 |
"usable": len(pairs) > 0,
|
| 653 |
+
"gt_format": dominant_format,
|
| 654 |
}
|
| 655 |
|
| 656 |
|
| 657 |
def _flatten_zip_to_dir(zf: zipfile.ZipFile, dest: Path) -> None:
    """Extract the corpus files of a ZIP archive directly into ``dest``.

    Directory structure inside the archive is discarded: each accepted member
    is written as ``dest / <base name>``. Because only the base name is used,
    a member cannot be written outside ``dest``; two members sharing a base
    name overwrite each other, the later one winning.

    Accepted members are images (suffix in ``_IMAGE_EXTS``), ``.gt.txt``
    ground-truth files and ``.xml`` files (ALTO/PAGE candidates). macOS
    archive metadata (``__MACOSX/`` entries and ``._*`` AppleDouble files) is
    skipped, since it reuses the real files' extensions and would otherwise be
    extracted as bogus images or XML.

    Args:
        zf: Open archive to read from.
        dest: Target directory; created if missing.
    """
    dest.mkdir(parents=True, exist_ok=True)
    for member in zf.infolist():
        if member.is_dir():
            continue
        member_path = Path(member.filename)
        name = member_path.name
        # Resource-fork companions added by macOS archivers are not corpus data.
        if "__MACOSX" in member_path.parts or name.startswith("._"):
            continue
        suffix = member_path.suffix.lower()
        # Accept images, .gt.txt and .xml (ALTO/PAGE); ignore everything else.
        if suffix in _IMAGE_EXTS or name.endswith(".gt.txt") or suffix == ".xml":
            (dest / name).write_bytes(zf.read(member))
|
| 669 |
|
|
|
|
| 686 |
import io
|
| 687 |
with zipfile.ZipFile(io.BytesIO(data)) as zf:
|
| 688 |
_flatten_zip_to_dir(zf, corpus_dir)
|
| 689 |
+
elif suffix in _IMAGE_EXTS or filename.endswith(".gt.txt") or suffix in (".txt", ".xml"):
|
| 690 |
(corpus_dir / filename).write_bytes(data)
|
| 691 |
# Ignorer les autres types
|
| 692 |
|
tests/test_sprint6_web_interface.py
CHANGED
|
@@ -1337,3 +1337,158 @@ class TestFastAPICorpusUpload:
|
|
| 1337 |
# corpus_id containing ".." (without slash — FastAPI strips slashes from path params)
|
| 1338 |
r = client.delete("/api/corpus/uploads/..malicious..")
|
| 1339 |
assert r.status_code in (400, 404)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1337 |
# corpus_id containing ".." (without slash — FastAPI strips slashes from path params)
|
| 1338 |
r = client.delete("/api/corpus/uploads/..malicious..")
|
| 1339 |
assert r.status_code in (400, 404)
|
| 1340 |
+
|
| 1341 |
+
# --- ALTO XML ---
|
| 1342 |
+
|
| 1343 |
+
@pytest.fixture
def alto_xml_bytes(self):
    """Return the bytes of a minimal, valid ALTO XML document."""
    fragments = [
        b'<?xml version="1.0" encoding="UTF-8"?>',
        b'<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#">',
        b"<Layout><Page><PrintSpace>",
        b"<TextBlock><TextLine>",
        b'<String CONTENT="Bonjour"/>',
        b'<String CONTENT="monde"/>',
        b"</TextLine></TextBlock>",
        b"</PrintSpace></Page></Layout>",
        b"</alto>",
    ]
    return b"".join(fragments)
|
| 1357 |
+
|
| 1358 |
+
@pytest.fixture
def tmp_alto_zip(self, alto_xml_bytes):
    """Return the bytes of a ZIP holding one image + ALTO XML pair."""
    import io
    import zipfile

    members = (
        ("page001.png", b"\x89PNG"),
        ("page001.xml", alto_xml_bytes),
    )
    stream = io.BytesIO()
    with zipfile.ZipFile(stream, "w") as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)
    # getvalue() returns the whole buffer regardless of the stream position.
    return stream.getvalue()
|
| 1370 |
+
|
| 1371 |
+
def test_upload_alto_zip_returns_200(self, client, tmp_alto_zip):
    """Uploading a ZIP with an image + ALTO XML pair answers HTTP 200."""
    upload = ("corpus.zip", tmp_alto_zip, "application/zip")
    response = client.post("/api/corpus/upload", files=[("files", upload)])
    assert response.status_code == 200
|
| 1377 |
+
|
| 1378 |
+
def test_upload_alto_zip_doc_count(self, client, tmp_alto_zip):
    """The image + ALTO XML pair is counted as exactly one document."""
    upload = ("corpus.zip", tmp_alto_zip, "application/zip")
    response = client.post("/api/corpus/upload", files=[("files", upload)])
    assert response.json()["doc_count"] == 1
|
| 1384 |
+
|
| 1385 |
+
def test_upload_alto_zip_format(self, client, tmp_alto_zip):
    """Both the summary and the first pair report the ALTO XML format."""
    upload = ("corpus.zip", tmp_alto_zip, "application/zip")
    summary = client.post("/api/corpus/upload", files=[("files", upload)]).json()
    assert summary["gt_format"] == "ALTO XML"
    assert summary["pairs"][0]["gt_format"] == "ALTO XML"
|
| 1393 |
+
|
| 1394 |
+
def test_upload_alto_individual_files(self, client, alto_xml_bytes):
    """An image and its ALTO XML sent as separate files form one ALTO pair."""
    image_part = ("files", ("img001.png", b"\x89PNG", "image/png"))
    xml_part = ("files", ("img001.xml", alto_xml_bytes, "application/xml"))
    response = client.post("/api/corpus/upload", files=[image_part, xml_part])
    assert response.status_code == 200
    assert response.json()["doc_count"] == 1
    assert response.json()["gt_format"] == "ALTO XML"
|
| 1403 |
+
|
| 1404 |
+
def test_alto_text_extraction(self, alto_xml_bytes):
    """_detect_xml_gt recognises ALTO XML and extracts its words."""
    from picarones.web.app import _detect_xml_gt

    detected = _detect_xml_gt(alto_xml_bytes)
    assert detected is not None
    label, extracted = detected
    assert label == "ALTO XML"
    for word in ("Bonjour", "monde"):
        assert word in extracted
|
| 1413 |
+
|
| 1414 |
+
# --- PAGE XML ---
|
| 1415 |
+
|
| 1416 |
+
@pytest.fixture
def page_xml_bytes(self):
    """Return the bytes of a minimal, valid PAGE XML document."""
    fragments = [
        b'<?xml version="1.0" encoding="UTF-8"?>',
        b'<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15">',
        b"<Page><TextRegion><TextLine>",
        b"<TextEquiv><Unicode>Texte de la ligne</Unicode></TextEquiv>",
        b"</TextLine></TextRegion></Page>",
        b"</PcGts>",
    ]
    return b"".join(fragments)
|
| 1427 |
+
|
| 1428 |
+
@pytest.fixture
def tmp_page_zip(self, page_xml_bytes):
    """Return the bytes of a ZIP holding one image + PAGE XML pair."""
    import io
    import zipfile

    members = (
        ("page002.png", b"\x89PNG"),
        ("page002.xml", page_xml_bytes),
    )
    stream = io.BytesIO()
    with zipfile.ZipFile(stream, "w") as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)
    # getvalue() returns the whole buffer regardless of the stream position.
    return stream.getvalue()
|
| 1440 |
+
|
| 1441 |
+
def test_upload_page_zip_returns_200(self, client, tmp_page_zip):
    """Uploading a ZIP with an image + PAGE XML pair answers HTTP 200."""
    upload = ("corpus.zip", tmp_page_zip, "application/zip")
    response = client.post("/api/corpus/upload", files=[("files", upload)])
    assert response.status_code == 200
|
| 1447 |
+
|
| 1448 |
+
def test_upload_page_zip_format(self, client, tmp_page_zip):
    """Both the summary and the first pair report the PAGE XML format."""
    upload = ("corpus.zip", tmp_page_zip, "application/zip")
    summary = client.post("/api/corpus/upload", files=[("files", upload)]).json()
    assert summary["gt_format"] == "PAGE XML"
    assert summary["pairs"][0]["gt_format"] == "PAGE XML"
|
| 1456 |
+
|
| 1457 |
+
def test_page_text_extraction(self, page_xml_bytes):
    """_detect_xml_gt recognises PAGE XML and extracts the line text."""
    from picarones.web.app import _detect_xml_gt

    detected = _detect_xml_gt(page_xml_bytes)
    assert detected is not None
    label, extracted = detected
    assert label == "PAGE XML"
    assert "Texte de la ligne" in extracted
|
| 1465 |
+
|
| 1466 |
+
# --- Texte brut ---
|
| 1467 |
+
|
| 1468 |
+
def test_upload_plain_txt_format_reported(self, client, tmp_corpus_zip):
    """A classic .gt.txt corpus must report 'texte brut' in the summary."""
    upload = ("corpus.zip", tmp_corpus_zip, "application/zip")
    response = client.post("/api/corpus/upload", files=[("files", upload)])
    assert response.status_code == 200
    assert response.json()["gt_format"] == "texte brut"
|
| 1476 |
+
|
| 1477 |
+
# --- XML inconnu ignoré ---
|
| 1478 |
+
|
| 1479 |
+
def test_unknown_xml_not_valid_pair(self, client):
    """An XML file that is neither ALTO nor PAGE does not form a valid pair."""
    import io
    import zipfile

    members = (
        ("pageX.png", b"\x89PNG"),
        ("pageX.xml", b'<?xml version="1.0"?><root><item>foo</item></root>'),
    )
    stream = io.BytesIO()
    with zipfile.ZipFile(stream, "w") as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)

    upload = ("corpus.zip", stream.getvalue(), "application/zip")
    response = client.post("/api/corpus/upload", files=[("files", upload)])
    assert response.status_code == 422
|