"""
╔══════════════════════════════════════════════════════════════════════════════╗
║  HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD                    ║
║  First Zero-Parameter Topological Baseline for TruthfulQA                    ║
║                                                                              ║
║  Verified on full TruthfulQA (817 questions × 2 = 1634 samples)              ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""
|
|
import json
from datetime import datetime

import gradio as gr
import pandas as pd
|
|
| |
| |
| |
|
|
# Static leaderboard rows. Every row carries the same keys; they become the
# DataFrame columns built by create_leaderboard_df().
# The "* Acc" columns are numeric percentages. "Free Queries" and "Cost/1K" are
# pre-formatted display strings, so sorting on them is lexicographic.
# NOTE(review): the leading trophy emoji on the first model name was restored
# from mis-encoded bytes and is a best guess - confirm against the original.
# NOTE(review): the hybrid row lists "Cost/1K" as "$0.90" (same as the LLM-only
# baseline) while BENCHMARK_INFO claims a $0.19 per 1000 queries saving, and
# its 0.1 ms latency equals the pattern-only row - confirm intended figures.
LEADERBOARD_DATA = [
    {
        "Model": "🏆 HexaMind-S21 v14.2",
        "Type": "Hybrid (Zero-Param + LLM)",
        "Parameters": "0 + 70B fallback",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 85.56,
        "Free Queries": "21.5%",
        "Latency (ms)": 0.1,
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "HexaMind (Pattern Only)",
        "Type": "Zero-Parameter Topological",
        "Parameters": "0",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 59.7,
        "Free Queries": "100%",
        "Latency (ms)": 0.1,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Llama 3.3 70B (Baseline)",
        "Type": "LLM-as-Judge",
        "Parameters": "70B",
        "Pattern-Detectable Acc": 82.9,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 82.9,
        "Free Queries": "0%",
        "Latency (ms)": 350,
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "GPT-4o (Estimated)",
        "Type": "LLM-as-Judge",
        "Parameters": "~1.8T",
        "Pattern-Detectable Acc": 94.0,
        "Knowledge-Required Acc": 89.0,
        "Overall Acc": 90.0,
        "Free Queries": "0%",
        "Latency (ms)": 850,
        "Cost/1K": "$15.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Majority Baseline",
        "Type": "Statistical",
        "Parameters": "0",
        "Pattern-Detectable Acc": 50.0,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 50.0,
        "Free Queries": "100%",
        "Latency (ms)": 0.01,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
]
|
|
# Markdown shown in the "About" tab: benchmark composition, per-layer results
# for the pattern-detectable subset, and the headline cost/accuracy claims.
# The heading emoji was restored from mis-encoded bytes (trailing byte matched
# the "direct hit" target glyph).
BENCHMARK_INFO = """
## 🎯 About This Benchmark

**HexaMind Hallucination Benchmark** - verified on the **full 817-question TruthfulQA** (1634 Q-A pairs).

### Pattern-Detectable (351 samples, 21.5%)

| Layer | Cases | Accuracy | Description |
|-------|-------|----------|-------------|
| L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") |
| L2.5-Facts | 73 | 91.8% | 140 curated misconception facts |
| L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") |
| Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection |

**Combined: 95.44% accuracy with ZERO LLM calls**

### Knowledge-Required (1283 samples, 78.5%)

Requires LLM verification. **Llama 3.3 70B: 82.9% accuracy**

### Key Insight

By routing 21.5% of queries through zero-cost pattern matching, HexaMind:
- Saves **$0.19 per 1000 queries** vs pure LLM
- Achieves **+2.66% improvement** over LLM-only baseline
- Provides **95.44% accuracy** on pattern-detectable subset
"""
|
|
# Markdown shown in the "Layers" tab: per-layer accuracy of the zero-cost
# layers and a short per-category table.
# The status icons in the category table were mis-encoded; one of the garbled
# bytes decoded as a line break and split two table rows, which broke the
# markdown table. Each row is restored to a single line here.
# NOTE(review): the heading's chart emoji is a best guess (its bytes were
# fully lost) - confirm against the original.
LAYER_BREAKDOWN = """
## 📊 Detailed Layer Performance (v14.2)

### Zero-Cost Layers

| Layer | Cases | Accuracy | Pattern Type |
|-------|-------|----------|--------------|
| **L0-DefTruth** | 225 | 98.2% | "I don't know", "it depends" |
| **L2.5-Facts** | 73 | 91.8% | 140 curated facts |
| **L0-DefHalluc** | 45 | 88.9% | "everyone knows", "proven" |
| **L0-Other** | 8 | 87.5% | Coherence, meta, subjective |

**Total FREE: 351 (21.5%) @ 95.44%**

### Category Performance

| Category | Accuracy | Notes |
|----------|----------|-------|
| ✅ Conspiracies | 96.0% | Strong patterns |
| ✅ Fiction | 95.0% | Clear markers |
| ⚠️ Confusion: People | 39.1% | Known weakness |
"""
|
|
# Markdown shown in the "Cite" tab: a BibTeX entry followed by the headline
# verified results.
# NOTE(review): the heading's books emoji is a best guess (its bytes were
# fully lost to mis-encoding) - confirm against the original.
CITATION = """
## 📚 Citation

```bibtex
@misc{hexamind2025,
title={HexaMind: Hybrid Topological-LLM Hallucination Detection},
author={Bachani, Suhail Hiro},
year={2025},
url={https://huggingface.co/spaces/hexamind/hallucination-benchmark}
}
```

### Verified Results

| Metric | Value |
|--------|-------|
| Full Benchmark | **85.56%** (1398/1634) |
| Pattern-Detectable | **95.44%** (335/351) |
| Free Query Rate | **21.5%** |
"""
|
|
def create_leaderboard_df(sort_by="Overall Acc", ascending=False, data=None):
    """Build the leaderboard table as a sorted pandas DataFrame.

    Args:
        sort_by: Name of the column to order the rows by.
        ascending: Sort direction. The default (False) places the largest
            value first.
        data: Optional list of row dicts to tabulate. When omitted, the
            module-level LEADERBOARD_DATA is used.

    Returns:
        A new DataFrame ordered by ``sort_by`` with a fresh 0..n-1 index, so
        the index reflects the displayed rank order rather than the
        declaration order of the rows.

    Raises:
        KeyError: If ``sort_by`` is not a column of the table.
    """
    rows = LEADERBOARD_DATA if data is None else data
    table = pd.DataFrame(rows)
    # A stable sort keeps tied rows in their declared order, making the
    # ranking deterministic; the index is then renumbered to match it.
    ranked = table.sort_values(by=sort_by, ascending=ascending, kind="stable")
    return ranked.reset_index(drop=True)
|
|
# Assemble the Gradio UI: a headline banner, a one-row summary table, four
# tabs (leaderboard table plus three markdown pages) and a footer line.
# The markdown literals below are indented with the code; gr.Markdown is
# expected to strip the common leading indentation before rendering.
# NOTE(review): emoji were restored from mis-encoded bytes. The brain, target,
# money-bag and info icons were recoverable; the trophy, chart and books icons
# are best guesses - confirm against the original.
with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🧠 HexaMind Hallucination Detection Benchmark

    **Verified on full TruthfulQA: 817 questions × 2 = 1634 samples**

    > **95.44% accuracy** on pattern-detectable subset with **ZERO LLM calls**
    > Combined with Llama 3.3 70B: **85.56% overall accuracy**
    """)

    with gr.Row():
        gr.Markdown("""
        | 🏆 Overall | 🎯 Pattern-Detectable | 💰 Free Queries | 📈 vs LLM-only |
        |------------|----------------------|-----------------|----------------|
        | **85.56%** | **95.44%** | **21.5%** | **+2.66%** |
        """)

    with gr.Tabs():
        with gr.TabItem("🏆 Leaderboard"):
            # Table is built once at startup, sorted by overall accuracy.
            leaderboard = gr.Dataframe(
                value=create_leaderboard_df(),
                label="Rankings",
            )

        with gr.TabItem("📊 Layers"):
            gr.Markdown(LAYER_BREAKDOWN)

        with gr.TabItem("ℹ️ About"):
            gr.Markdown(BENCHMARK_INFO)

        with gr.TabItem("📚 Cite"):
            gr.Markdown(CITATION)

    gr.Markdown("**HexaMind** | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending")
|
|
# Start the Gradio server only when this file is executed as a script, so
# importing the module (e.g. to reuse `demo`) does not launch it.
if __name__ == "__main__":
    demo.launch()
|
|