pharmaspine-backend / scripts /restore_postgres_fixtures.py
ashish1265659565's picture
Upload folder using huggingface_hub
08fd094 verified
Raw
History Blame
6.66 kB
"""Restore canonical Postgres integration-test fixtures.
The experimental ingest pipeline replaced stable chunk IDs (``chk-ret-*``,
``chk-test-*``) with random UUIDs and broke retrieval/integration tests.
This script puts the database back to the known-good fixture state defined in:
- tests/integration/test_retrieval_api_postgres.py
- tests/integration/test_memory_api_postgres.py
Usage:
python3 scripts/restore_postgres_fixtures.py
"""
from __future__ import annotations
import json
import os
import sys
from datetime import UTC, datetime
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parent.parent
_env_file = REPO_ROOT / ".env"
if _env_file.exists():
for raw_line in _env_file.read_text().splitlines():
line = raw_line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, value = line.split("=", 1)
os.environ.setdefault(key.strip(), value.strip())
import psycopg # noqa: E402
DSN = os.getenv(
"AKS_DATABASE_URL",
"postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine",
).replace("postgresql+psycopg://", "postgresql://", 1)
NOW = datetime.now(UTC)
# Stable fixture chunks — do not change IDs without updating integration tests.
FIXTURE_CHUNKS = [
{
"chunk_id": "chk-ret-lbl-1",
"source_id": "LBL-NSCLC-RET-EMA-2026",
"version_id": "ver-ret-lbl-1",
"text": "The approved dose is 80 mg once daily in first-line metastatic NSCLC within EU label boundaries.",
"claim_type": "DOSE",
"section_path": "2 POSOLOGY",
"page_start": 2,
"page_end": 2,
"token_count": 15,
"audience_fit": ["HCP", "Internal"],
},
{
"chunk_id": "chk-ret-csr-1",
"source_id": "DOC-CSR-NSCLC-RET-2026",
"version_id": "ver-ret-csr-1",
"text": "Progression-free survival improved in EGFR-positive NSCLC for DRUG-A.",
"claim_type": "EFFICACY",
"section_path": "RESULTS",
"page_start": 5,
"page_end": 5,
"token_count": 10,
"audience_fit": ["HCP", "Internal"],
},
{
"chunk_id": "chk-ret-sop-1",
"source_id": "SOP-MED-NSCLC-RET-2026",
"version_id": "ver-ret-sop-1",
"text": "Internal responders should preserve dose boundaries and medical citation discipline.",
"claim_type": "DOSE",
"section_path": "DOSING GUIDANCE",
"page_start": 1,
"page_end": 1,
"token_count": 10,
"audience_fit": ["Internal"],
},
{
"chunk_id": "chk-test-lbl-1",
"source_id": "LBL-NSCLC-TEST-EMA-2026",
"version_id": "ver-test-lbl-1",
"text": "The recommended dose is 80 mg once daily in first-line metastatic NSCLC within approved EU label boundaries.",
"claim_type": "DOSE",
"section_path": "2 POSOLOGY",
"page_start": 2,
"page_end": 2,
"token_count": 16,
"audience_fit": ["HCP", "Internal"],
},
{
"chunk_id": "chk-test-csr-1",
"source_id": "DOC-CSR-NSCLC-TEST-2026",
"version_id": "ver-test-csr-1",
"text": "DRUG-A improved progression-free survival in EGFR-positive NSCLC in the study population.",
"claim_type": "EFFICACY",
"section_path": "RESULTS",
"page_start": 5,
"page_end": 5,
"token_count": 12,
"audience_fit": ["HCP", "Internal"],
},
]
FIXTURE_LINKS = [
("clm-ret-lbl-1", "chk-ret-lbl-1", "LBL-NSCLC-RET-EMA-2026"),
("clm-ret-csr-1", "chk-ret-csr-1", "DOC-CSR-NSCLC-RET-2026"),
("clm-test-lbl-1", "chk-test-lbl-1", "LBL-NSCLC-TEST-EMA-2026"),
("clm-test-csr-1", "chk-test-csr-1", "DOC-CSR-NSCLC-TEST-2026"),
]
PROTECTED_SOURCE_IDS = {row["source_id"] for row in FIXTURE_CHUNKS}
def restore() -> None:
conn = psycopg.connect(DSN)
try:
with conn:
with conn.cursor() as cur:
cur.execute(
"DELETE FROM chunks WHERE source_id = ANY(%s)",
(list(PROTECTED_SOURCE_IDS),),
)
deleted_chunks = cur.rowcount
for row in FIXTURE_CHUNKS:
cur.execute(
"""
INSERT INTO chunks (
chunk_id, source_id, version_id, text, claim_type,
section_path, page_start, page_end, token_count,
audience_fit, geography_fit, therapy_area,
embedding_id, created_at
) VALUES (
%(chunk_id)s, %(source_id)s, %(version_id)s, %(text)s, %(claim_type)s,
%(section_path)s, %(page_start)s, %(page_end)s, %(token_count)s,
%(audience_fit)s::json, 'EU / EMA', 'NSCLC',
NULL, %(created_at)s
)
""",
{**row, "audience_fit": json.dumps(row["audience_fit"]), "created_at": NOW},
)
cur.execute(
"DELETE FROM claim_evidence_links WHERE claim_id = ANY(%s)",
([link[0] for link in FIXTURE_LINKS],),
)
for claim_id, chunk_id, source_id in FIXTURE_LINKS:
cur.execute(
"""
INSERT INTO claim_evidence_links (
claim_id, chunk_id, source_id, support_type,
extraction_confidence, is_primary_support
) VALUES (%s, %s, %s, 'PRIMARY', 0.99, TRUE)
ON CONFLICT DO NOTHING
""",
(claim_id, chunk_id, source_id),
)
print(f"Restored {len(FIXTURE_CHUNKS)} fixture chunks (removed {deleted_chunks} prior rows).")
print(f"Restored {len(FIXTURE_LINKS)} claim_evidence_links.")
cur.execute("SELECT COUNT(*) FROM chunks")
print(f"chunks total: {cur.fetchone()[0]}")
cur.execute(
"""
DELETE FROM chunk_embeddings ce
WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.chunk_id = ce.chunk_id)
"""
)
orphans_removed = cur.rowcount or 0
if orphans_removed:
print(f"Removed {orphans_removed} orphan chunk_embeddings.")
finally:
conn.close()
if __name__ == "__main__":
restore()
sys.exit(0)