"""Restore canonical Postgres integration-test fixtures. The experimental ingest pipeline replaced stable chunk IDs (``chk-ret-*``, ``chk-test-*``) with random UUIDs and broke retrieval/integration tests. This script puts the database back to the known-good fixture state defined in: - tests/integration/test_retrieval_api_postgres.py - tests/integration/test_memory_api_postgres.py Usage: python3 scripts/restore_postgres_fixtures.py """ from __future__ import annotations import json import os import sys from datetime import UTC, datetime from pathlib import Path REPO_ROOT = Path(__file__).resolve().parent.parent _env_file = REPO_ROOT / ".env" if _env_file.exists(): for raw_line in _env_file.read_text().splitlines(): line = raw_line.strip() if not line or line.startswith("#") or "=" not in line: continue key, value = line.split("=", 1) os.environ.setdefault(key.strip(), value.strip()) import psycopg # noqa: E402 DSN = os.getenv( "AKS_DATABASE_URL", "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine", ).replace("postgresql+psycopg://", "postgresql://", 1) NOW = datetime.now(UTC) # Stable fixture chunks — do not change IDs without updating integration tests. FIXTURE_CHUNKS = [ { "chunk_id": "chk-ret-lbl-1", "source_id": "LBL-NSCLC-RET-EMA-2026", "version_id": "ver-ret-lbl-1", "text": "The approved dose is 80 mg once daily in first-line metastatic NSCLC within EU label boundaries.", "claim_type": "DOSE", "section_path": "2 POSOLOGY", "page_start": 2, "page_end": 2, "token_count": 15, "audience_fit": ["HCP", "Internal"], }, { "chunk_id": "chk-ret-csr-1", "source_id": "DOC-CSR-NSCLC-RET-2026", "version_id": "ver-ret-csr-1", "text": "Progression-free survival improved in EGFR-positive NSCLC for DRUG-A.", "claim_type": "EFFICACY", "section_path": "RESULTS", "page_start": 5, "page_end": 5, "token_count": 10, "audience_fit": ["HCP", "Internal"], }, { "chunk_id": "chk-ret-sop-1", "source_id": "SOP-MED-NSCLC-RET-2026", "version_id": "ver-ret-sop-1", "text": "Internal responders should preserve dose boundaries and medical citation discipline.", "claim_type": "DOSE", "section_path": "DOSING GUIDANCE", "page_start": 1, "page_end": 1, "token_count": 10, "audience_fit": ["Internal"], }, { "chunk_id": "chk-test-lbl-1", "source_id": "LBL-NSCLC-TEST-EMA-2026", "version_id": "ver-test-lbl-1", "text": "The recommended dose is 80 mg once daily in first-line metastatic NSCLC within approved EU label boundaries.", "claim_type": "DOSE", "section_path": "2 POSOLOGY", "page_start": 2, "page_end": 2, "token_count": 16, "audience_fit": ["HCP", "Internal"], }, { "chunk_id": "chk-test-csr-1", "source_id": "DOC-CSR-NSCLC-TEST-2026", "version_id": "ver-test-csr-1", "text": "DRUG-A improved progression-free survival in EGFR-positive NSCLC in the study population.", "claim_type": "EFFICACY", "section_path": "RESULTS", "page_start": 5, "page_end": 5, "token_count": 12, "audience_fit": ["HCP", "Internal"], }, ] FIXTURE_LINKS = [ ("clm-ret-lbl-1", "chk-ret-lbl-1", "LBL-NSCLC-RET-EMA-2026"), ("clm-ret-csr-1", "chk-ret-csr-1", "DOC-CSR-NSCLC-RET-2026"), ("clm-test-lbl-1", "chk-test-lbl-1", "LBL-NSCLC-TEST-EMA-2026"), ("clm-test-csr-1", "chk-test-csr-1", "DOC-CSR-NSCLC-TEST-2026"), ] PROTECTED_SOURCE_IDS = {row["source_id"] for row in FIXTURE_CHUNKS} def restore() -> None: conn = psycopg.connect(DSN) try: with conn: with conn.cursor() as cur: cur.execute( "DELETE FROM chunks WHERE source_id = ANY(%s)", (list(PROTECTED_SOURCE_IDS),), ) deleted_chunks = cur.rowcount for row in FIXTURE_CHUNKS: cur.execute( """ INSERT INTO chunks ( chunk_id, source_id, version_id, text, claim_type, section_path, page_start, page_end, token_count, audience_fit, geography_fit, therapy_area, embedding_id, created_at ) VALUES ( %(chunk_id)s, %(source_id)s, %(version_id)s, %(text)s, %(claim_type)s, %(section_path)s, %(page_start)s, %(page_end)s, %(token_count)s, %(audience_fit)s::json, 'EU / EMA', 'NSCLC', NULL, %(created_at)s ) """, {**row, "audience_fit": json.dumps(row["audience_fit"]), "created_at": NOW}, ) cur.execute( "DELETE FROM claim_evidence_links WHERE claim_id = ANY(%s)", ([link[0] for link in FIXTURE_LINKS],), ) for claim_id, chunk_id, source_id in FIXTURE_LINKS: cur.execute( """ INSERT INTO claim_evidence_links ( claim_id, chunk_id, source_id, support_type, extraction_confidence, is_primary_support ) VALUES (%s, %s, %s, 'PRIMARY', 0.99, TRUE) ON CONFLICT DO NOTHING """, (claim_id, chunk_id, source_id), ) print(f"Restored {len(FIXTURE_CHUNKS)} fixture chunks (removed {deleted_chunks} prior rows).") print(f"Restored {len(FIXTURE_LINKS)} claim_evidence_links.") cur.execute("SELECT COUNT(*) FROM chunks") print(f"chunks total: {cur.fetchone()[0]}") cur.execute( """ DELETE FROM chunk_embeddings ce WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.chunk_id = ce.chunk_id) """ ) orphans_removed = cur.rowcount or 0 if orphans_removed: print(f"Removed {orphans_removed} orphan chunk_embeddings.") finally: conn.close() if __name__ == "__main__": restore() sys.exit(0)