pharmaspine-backend / run_eval_suite.sh
ashish1265659565's picture
Upload folder using huggingface_hub
08fd094 verified
Raw
History Blame Contribute Delete
3.52 kB
#!/usr/bin/env bash
# run_eval_suite.sh — Seed the database and regenerate all eval dashboards.
#
# Usage (from repo root):
# chmod +x run_eval_suite.sh
# ./run_eval_suite.sh
#
# The script will:
# 1. Register all 14 eval corpus sources + chunks in PostgreSQL (with embeddings)
# 2. Seed claims, evidence assessments, and graph edges
# 3. Run all four eval suites and write dashboard JSON/MD files
# 4. Run the release gate and print the final pass/fail verdict
#
# Prerequisites:
# - PostgreSQL running at localhost with database ai_knowledge_spine
# - Ollama running at the URL in .env with the embedding model pulled
# (default: qwen3-embedding:8b — pull once with: ollama pull qwen3-embedding:8b)
# - .env present at repo root with AKS_DATABASE_URL and OLLAMA_* set
# - Python 3.11+ activated in your shell
#
# Environment:
# EMBED_FLAG — argument(s) forwarded to scripts/setup_eval_corpus.py in step 1
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Resolve the directory containing this script and make it the working
# directory, so the relative paths used below work from any invocation location.
script_path="${BASH_SOURCE[0]}"
REPO_ROOT="$(cd "$(dirname "${script_path}")" && pwd)"
cd "$REPO_ROOT"

# Title banner, surrounded by one blank line above and below.
cat <<'BANNER'

╔══════════════════════════════════════════════════════════╗
║ AS-Application • Eval Suite Runner ║
╚══════════════════════════════════════════════════════════╝

BANNER
# ── Step 1: Seed eval corpus (sources + source_versions + chunks) ─────────────
# EMBED_FLAG is an optional environment variable holding extra argument(s) for
# scripts/setup_eval_corpus.py. It is read with an empty default so that leaving
# it unset does not abort the script under `set -u`, and it is split into an
# array so each whitespace-separated word becomes one argument without being
# subject to glob expansion.
echo "▶ Step 1/3 Seeding eval corpus (14 sources)…"
embed_args=()
if [[ -n "${EMBED_FLAG:-}" ]]; then
  read -r -a embed_args <<< "${EMBED_FLAG}"
fi
# The ${arr[@]+…} form keeps an empty array safe under `set -u` on bash < 4.4.
python3 scripts/setup_eval_corpus.py ${embed_args[@]+"${embed_args[@]}"}
echo " ✓ Corpus seeded"
echo ""
# ── Step 2: populate claims, evidence assessments, and graph entities ─────────
# Announce the step, run the seeding script, then print the confirmation line
# followed by a blank separator line.
printf '%s\n' "▶ Step 2/3 Seeding claims and evidence assessments…"
python3 scripts/seed_eval_claims.py
printf '%s\n\n' " ✓ Claims seeded"
# ── Step 3: run the four eval suites, one after another ───────────────────────
# Progress labels and the Python modules they announce, kept in matching order.
suite_labels=(
  " [1/4] Golden medical Q&A…"
  " [2/4] Adversarial safety…"
  " [3/4] Governance policy routing…"
  " [4/4] Retrieval stress…"
)
suite_modules=(
  eval.runners.run_golden_memory_eval
  eval.runners.run_adversarial_memory_eval
  eval.runners.run_governance_policy_eval
  eval.runners.run_retrieval_stress_eval
)

printf '%s\n' "▶ Step 3/3 Running eval suites…"
for suite_idx in "${!suite_modules[@]}"; do
  # Blank separator, then the label, then the suite itself.
  printf '\n%s\n' "${suite_labels[suite_idx]}"
  python3 -m "${suite_modules[suite_idx]}" --limit 25
done
printf '\n'
# Run the release gate and record its exit status in GATE_EXIT.
# The status is captured with `|| GATE_EXIT=$?` because under `set -e` a bare
# failing command terminates the script immediately, so a following
# `GATE_EXIT=$?` line would never run and the failure verdict would never be
# printed.
echo "▶ Release gate…"
GATE_EXIT=0
python3 -m eval.runners.run_release_gate --limit 25 || GATE_EXIT=$?
echo ""
# Print the pass/fail banner for the release-gate status held in GATE_EXIT,
# list the dashboard JSON files, and exit with that same status.
if [[ "${GATE_EXIT}" -eq 0 ]]; then
  echo "╔══════════════════════════════════════╗"
  echo "║ ✅ RELEASE GATE PASSED ║"
  echo "╚══════════════════════════════════════╝"
else
  echo "╔══════════════════════════════════════╗"
  echo "║ ❌ RELEASE GATE FAILED ║"
  echo "║ See eval/dashboards/ for details ║"
  echo "╚══════════════════════════════════════╝"
fi
echo ""
echo "Dashboard files written to eval/dashboards/:"
# Expand the glob directly instead of piping `ls` into `sed`: with `pipefail`
# and `set -e` active, an `ls` failure (no *.json files present) would abort
# the script here and replace the gate's exit status with that of `ls`.
for dashboard_file in eval/dashboards/*.json; do
  # An unmatched glob is left as the literal pattern; skip it.
  [[ -e "${dashboard_file}" ]] || continue
  printf ' %s\n' "${dashboard_file}"
done
echo ""
exit "${GATE_EXIT}"