| """Restore canonical Postgres integration-test fixtures. |
| |
| The experimental ingest pipeline replaced stable chunk IDs (``chk-ret-*``, |
| ``chk-test-*``) with random UUIDs and broke retrieval/integration tests. |
| |
| This script puts the database back to the known-good fixture state defined in: |
| - tests/integration/test_retrieval_api_postgres.py |
| - tests/integration/test_memory_api_postgres.py |
| |
| Usage: |
| python3 scripts/restore_postgres_fixtures.py |
| """ |
|
|
| from __future__ import annotations |
|
|
| import json |
| import os |
| import sys |
| from datetime import UTC, datetime |
| from pathlib import Path |
|
|
| REPO_ROOT = Path(__file__).resolve().parent.parent |
| _env_file = REPO_ROOT / ".env" |
| if _env_file.exists(): |
| for raw_line in _env_file.read_text().splitlines(): |
| line = raw_line.strip() |
| if not line or line.startswith("#") or "=" not in line: |
| continue |
| key, value = line.split("=", 1) |
| os.environ.setdefault(key.strip(), value.strip()) |
|
|
| import psycopg |
|
|
| DSN = os.getenv( |
| "AKS_DATABASE_URL", |
| "postgresql+psycopg://mobcoderid-296@localhost/ai_knowledge_spine", |
| ).replace("postgresql+psycopg://", "postgresql://", 1) |
|
|
| NOW = datetime.now(UTC) |
|
|
| |
| FIXTURE_CHUNKS = [ |
| { |
| "chunk_id": "chk-ret-lbl-1", |
| "source_id": "LBL-NSCLC-RET-EMA-2026", |
| "version_id": "ver-ret-lbl-1", |
| "text": "The approved dose is 80 mg once daily in first-line metastatic NSCLC within EU label boundaries.", |
| "claim_type": "DOSE", |
| "section_path": "2 POSOLOGY", |
| "page_start": 2, |
| "page_end": 2, |
| "token_count": 15, |
| "audience_fit": ["HCP", "Internal"], |
| }, |
| { |
| "chunk_id": "chk-ret-csr-1", |
| "source_id": "DOC-CSR-NSCLC-RET-2026", |
| "version_id": "ver-ret-csr-1", |
| "text": "Progression-free survival improved in EGFR-positive NSCLC for DRUG-A.", |
| "claim_type": "EFFICACY", |
| "section_path": "RESULTS", |
| "page_start": 5, |
| "page_end": 5, |
| "token_count": 10, |
| "audience_fit": ["HCP", "Internal"], |
| }, |
| { |
| "chunk_id": "chk-ret-sop-1", |
| "source_id": "SOP-MED-NSCLC-RET-2026", |
| "version_id": "ver-ret-sop-1", |
| "text": "Internal responders should preserve dose boundaries and medical citation discipline.", |
| "claim_type": "DOSE", |
| "section_path": "DOSING GUIDANCE", |
| "page_start": 1, |
| "page_end": 1, |
| "token_count": 10, |
| "audience_fit": ["Internal"], |
| }, |
| { |
| "chunk_id": "chk-test-lbl-1", |
| "source_id": "LBL-NSCLC-TEST-EMA-2026", |
| "version_id": "ver-test-lbl-1", |
| "text": "The recommended dose is 80 mg once daily in first-line metastatic NSCLC within approved EU label boundaries.", |
| "claim_type": "DOSE", |
| "section_path": "2 POSOLOGY", |
| "page_start": 2, |
| "page_end": 2, |
| "token_count": 16, |
| "audience_fit": ["HCP", "Internal"], |
| }, |
| { |
| "chunk_id": "chk-test-csr-1", |
| "source_id": "DOC-CSR-NSCLC-TEST-2026", |
| "version_id": "ver-test-csr-1", |
| "text": "DRUG-A improved progression-free survival in EGFR-positive NSCLC in the study population.", |
| "claim_type": "EFFICACY", |
| "section_path": "RESULTS", |
| "page_start": 5, |
| "page_end": 5, |
| "token_count": 12, |
| "audience_fit": ["HCP", "Internal"], |
| }, |
| ] |
|
|
| FIXTURE_LINKS = [ |
| ("clm-ret-lbl-1", "chk-ret-lbl-1", "LBL-NSCLC-RET-EMA-2026"), |
| ("clm-ret-csr-1", "chk-ret-csr-1", "DOC-CSR-NSCLC-RET-2026"), |
| ("clm-test-lbl-1", "chk-test-lbl-1", "LBL-NSCLC-TEST-EMA-2026"), |
| ("clm-test-csr-1", "chk-test-csr-1", "DOC-CSR-NSCLC-TEST-2026"), |
| ] |
|
|
| PROTECTED_SOURCE_IDS = {row["source_id"] for row in FIXTURE_CHUNKS} |
|
|
|
|
| def restore() -> None: |
| conn = psycopg.connect(DSN) |
| try: |
| with conn: |
| with conn.cursor() as cur: |
| cur.execute( |
| "DELETE FROM chunks WHERE source_id = ANY(%s)", |
| (list(PROTECTED_SOURCE_IDS),), |
| ) |
| deleted_chunks = cur.rowcount |
| for row in FIXTURE_CHUNKS: |
| cur.execute( |
| """ |
| INSERT INTO chunks ( |
| chunk_id, source_id, version_id, text, claim_type, |
| section_path, page_start, page_end, token_count, |
| audience_fit, geography_fit, therapy_area, |
| embedding_id, created_at |
| ) VALUES ( |
| %(chunk_id)s, %(source_id)s, %(version_id)s, %(text)s, %(claim_type)s, |
| %(section_path)s, %(page_start)s, %(page_end)s, %(token_count)s, |
| %(audience_fit)s::json, 'EU / EMA', 'NSCLC', |
| NULL, %(created_at)s |
| ) |
| """, |
| {**row, "audience_fit": json.dumps(row["audience_fit"]), "created_at": NOW}, |
| ) |
|
|
| cur.execute( |
| "DELETE FROM claim_evidence_links WHERE claim_id = ANY(%s)", |
| ([link[0] for link in FIXTURE_LINKS],), |
| ) |
| for claim_id, chunk_id, source_id in FIXTURE_LINKS: |
| cur.execute( |
| """ |
| INSERT INTO claim_evidence_links ( |
| claim_id, chunk_id, source_id, support_type, |
| extraction_confidence, is_primary_support |
| ) VALUES (%s, %s, %s, 'PRIMARY', 0.99, TRUE) |
| ON CONFLICT DO NOTHING |
| """, |
| (claim_id, chunk_id, source_id), |
| ) |
|
|
| print(f"Restored {len(FIXTURE_CHUNKS)} fixture chunks (removed {deleted_chunks} prior rows).") |
| print(f"Restored {len(FIXTURE_LINKS)} claim_evidence_links.") |
| cur.execute("SELECT COUNT(*) FROM chunks") |
| print(f"chunks total: {cur.fetchone()[0]}") |
| cur.execute( |
| """ |
| DELETE FROM chunk_embeddings ce |
| WHERE NOT EXISTS (SELECT 1 FROM chunks c WHERE c.chunk_id = ce.chunk_id) |
| """ |
| ) |
| orphans_removed = cur.rowcount or 0 |
| if orphans_removed: |
| print(f"Removed {orphans_removed} orphan chunk_embeddings.") |
| finally: |
| conn.close() |
|
|
|
|
| if __name__ == "__main__": |
| restore() |
| sys.exit(0) |
|
|