Spaces:

s21mind
/

S21MIND

Sleeping

App Files Files Community

S21MIND / app.py

s21mind

Update app.py

22ac1c8 verified 7 months ago

raw

history blame contribute delete

6.84 kB

	"""
	╔══════════════════════════════════════════════════════════════════════════════╗
	║ HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD ║
	║ First Zero-Parameter Topological Baseline for TruthfulQA ║
	║ ║
	║ Verified on full TruthfulQA (817 questions × 2 = 1634 samples) ║
	╚══════════════════════════════════════════════════════════════════════════════╝
	"""

	import gradio as gr
	import pandas as pd
	import json
	from datetime import datetime

	# ═══════════════════════════════════════════════════════════════════════════════
	# LEADERBOARD DATA - VERIFIED v14.2 RESULTS
	# ═══════════════════════════════════════════════════════════════════════════════

	# Column names shared by every leaderboard entry, in display order.
	_LEADERBOARD_COLUMNS = (
	    "Model",
	    "Type",
	    "Parameters",
	    "Pattern-Detectable Acc",
	    "Knowledge-Required Acc",
	    "Overall Acc",
	    "Free Queries",
	    "Latency (ms)",
	    "Cost/1K",
	    "Submitted",
	)

	# One tuple per entry; values line up positionally with _LEADERBOARD_COLUMNS.
	# NOTE(review): the hybrid HexaMind-S21 entry lists the same "$0.90" Cost/1K
	# as the Llama baseline, while the About text in this file mentions a $0.19
	# saving per 1000 queries -- confirm which figure is intended.
	_LEADERBOARD_ROWS = (
	    (
	        "🏆 HexaMind-S21 v14.2", "Hybrid (Zero-Param + LLM)", "0 + 70B fallback",
	        95.44, 82.9, 85.56, "21.5%", 0.1, "$0.90", "2025-12-03",
	    ),
	    (
	        "HexaMind (Pattern Only)", "Zero-Parameter Topological", "0",
	        95.44, 50.0, 59.7, "100%", 0.1, "$0.00", "2025-12-03",
	    ),
	    (
	        "Llama 3.3 70B (Baseline)", "LLM-as-Judge", "70B",
	        82.9, 82.9, 82.9, "0%", 350, "$0.90", "2025-12-03",
	    ),
	    (
	        "GPT-4o (Estimated)", "LLM-as-Judge", "~1.8T",
	        94.0, 89.0, 90.0, "0%", 850, "$15.00", "2025-12-03",
	    ),
	    (
	        "Majority Baseline", "Statistical", "0",
	        50.0, 50.0, 50.0, "100%", 0.01, "$0.00", "2025-12-03",
	    ),
	)

	# Leaderboard entries as a list of dicts (one dict per model, keyed by column name).
	LEADERBOARD_DATA = [
	    dict(zip(_LEADERBOARD_COLUMNS, row)) for row in _LEADERBOARD_ROWS
	]

	# Markdown body of the "About" tab (passed to gr.Markdown below).
	# Table pipes are plain "|": a backslash in front of a pipe is not a valid
	# Python escape, stays in the string, and makes Markdown treat the pipe as a
	# literal character instead of a column separator.
	BENCHMARK_INFO = """
	## 🎯 About This Benchmark

	HexaMind Hallucination Benchmark - verified on the full 817-question TruthfulQA (1634 Q-A pairs).

	### Pattern-Detectable (351 samples, 21.5%)

	| Layer | Cases | Accuracy | Description |
	|-------|-------|----------|-------------|
	| L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") |
	| L2.5-Facts | 73 | 91.8% | 140 curated misconception facts |
	| L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") |
	| Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection |

	Combined: 95.44% accuracy with ZERO LLM calls

	### Knowledge-Required (1283 samples, 78.5%)

	Requires LLM verification. Llama 3.3 70B: 82.9% accuracy

	### Key Insight

	By routing 21.5% of queries through zero-cost pattern matching, HexaMind:
	- Saves $0.19 per 1000 queries vs pure LLM
	- Achieves +2.66% improvement over LLM-only baseline
	- Provides 95.44% accuracy on pattern-detectable subset
	"""

	# Markdown body of the "Layers" tab (passed to gr.Markdown below).
	# Table pipes are plain "|": a backslash in front of a pipe is not a valid
	# Python escape, stays in the string, and makes Markdown treat the pipe as a
	# literal character instead of a column separator.
	LAYER_BREAKDOWN = """
	## 📊 Detailed Layer Performance (v14.2)

	### Zero-Cost Layers

	| Layer | Cases | Accuracy | Pattern Type |
	|-------|-------|----------|--------------|
	| L0-DefTruth | 225 | 98.2% | "I don't know", "it depends" |
	| L2.5-Facts | 73 | 91.8% | 140 curated facts |
	| L0-DefHalluc | 45 | 88.9% | "everyone knows", "proven" |
	| L0-Other | 8 | 87.5% | Coherence, meta, subjective |

	Total FREE: 351 (21.5%) @ 95.44%

	### Category Performance

	| Category | Accuracy | Notes |
	|----------|----------|-------|
	| ✅ Conspiracies | 96.0% | Strong patterns |
	| ✅ Fiction | 95.0% | Clear markers |
	| ⚠️ Confusion: People | 39.1% | Known weakness |
	"""

	# Markdown body of the "Cite" tab (passed to gr.Markdown below): a BibTeX
	# entry followed by a short results table.
	# Table pipes are plain "|": a backslash in front of a pipe is not a valid
	# Python escape, stays in the string, and makes Markdown treat the pipe as a
	# literal character instead of a column separator.
	CITATION = """
	## 📚 Citation

	```bibtex
	@misc{hexamind2025,
	title={HexaMind: Hybrid Topological-LLM Hallucination Detection},
	author={Bachani, Suhail Hiro},
	year={2025},
	url={https://huggingface.co/spaces/hexamind/hallucination-benchmark}
	}
	```

	### Verified Results

	| Metric | Value |
	|--------|-------|
	| Full Benchmark | 85.56% (1398/1634) |
	| Pattern-Detectable | 95.44% (335/351) |
	| Free Query Rate | 21.5% |
	"""

	def create_leaderboard_df(sort_by="Overall Acc", ascending=False):
	    """Return the leaderboard entries as a DataFrame ordered by one column.

	    Args:
	        sort_by: Name of the column to order the rows by.
	        ascending: Sort direction; the default (False) puts the largest
	            values first.

	    Returns:
	        A pandas DataFrame built from ``LEADERBOARD_DATA`` and sorted by
	        ``sort_by``.
	    """
	    table = pd.DataFrame(LEADERBOARD_DATA)
	    return table.sort_values(by=sort_by, ascending=ascending)

	# Build the Gradio page: header, summary-metrics row, tabbed content, footer.
	# Markdown pipes are written as plain "|" throughout; a backslash before a
	# pipe is not a valid Python escape and would stop the tables from rendering.
	with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo:

	    # Page header with the headline results.
	    gr.Markdown("""
	# 🧠 HexaMind Hallucination Detection Benchmark

	Verified on full TruthfulQA: 817 questions × 2 = 1634 samples

	> 95.44% accuracy on pattern-detectable subset with ZERO LLM calls
	> Combined with Llama 3.3 70B: 85.56% overall accuracy
	""")

	    # One-row table of summary metrics.
	    with gr.Row():
	        gr.Markdown("""
	| 📊 Overall | 🎯 Pattern-Detectable | 💰 Free Queries | 📈 vs LLM-only |
	|------------|----------------------|-----------------|----------------|
	| 85.56% | 95.44% | 21.5% | +2.66% |
	""")

	    with gr.Tabs():
	        # Leaderboard table, using create_leaderboard_df()'s default ordering.
	        with gr.TabItem("🏆 Leaderboard"):
	            leaderboard = gr.Dataframe(
	                value=create_leaderboard_df(),
	                label="Rankings"
	            )

	        with gr.TabItem("📊 Layers"):
	            gr.Markdown(LAYER_BREAKDOWN)

	        with gr.TabItem("ℹ️ About"):
	            gr.Markdown(BENCHMARK_INFO)

	        with gr.TabItem("📚 Cite"):
	            gr.Markdown(CITATION)

	    # Footer line, placed below the tabs inside the same Blocks context.
	    gr.Markdown("HexaMind | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending")

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()