aifeifei798 committed on
Commit
5fa93b8
·
verified ·
1 Parent(s): ef683d3

Upload 2 files

Browse files
Files changed (1) hide show
  1. test_logic_v2.py +113 -82
test_logic_v2.py CHANGED
@@ -1,22 +1,44 @@
1
  import time
2
  import re
3
- import matplotlib.pyplot as plt
4
- import numpy as np
5
  from openai import OpenAI
6
 
7
# --- Configuration ---
# Model identifier sent with every chat-completion request.
MODEL_NAME = "aifeifei/Gemma-4-Queen-31B-it"

# OpenAI-compatible client pointed at the inference endpoint below.
# NOTE(review): the api_key looks like a placeholder for a server that does
# not check credentials -- confirm against the server configuration.
client = OpenAI(base_url="http://192.168.31.21:1234/v1", api_key="queen-logic-test")
13
 
14
class QueenExpertEvaluator:
    """Run prompt-based tests against the model and chart the keyword scores.

    Each test sends a system/user prompt pair to the module-level ``client``,
    scores the reply with regex keyword checks, and stores the resulting
    fraction in ``self.scores`` keyed by test name.
    """

    # Made-up reference values for the comparison polygon; they are cycled so
    # the baseline always has one point per recorded test.
    _BASELINE_PATTERN = (0.4, 0.3)

    def __init__(self):
        # Maps test name -> fraction of checks that passed (0.0 .. 1.0).
        self.scores = {}

    @staticmethod
    def _score_response(content, checks):
        """Return ``(per_check_results, fraction_passed)`` for one reply.

        ``checks`` maps a check name to a list of regex patterns; a check
        passes when any pattern matches (case-insensitively). A ``None``
        reply is scored as an empty string, and an empty ``checks`` mapping
        yields a score of 0.0 instead of dividing by zero.
        """
        text = content or ""
        results = {
            check_name: any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)
            for check_name, patterns in checks.items()
        }
        fraction = sum(results.values()) / len(checks) if checks else 0.0
        return results, fraction

    @classmethod
    def _baseline(cls, count):
        """Return ``count`` baseline values by cycling ``_BASELINE_PATTERN``."""
        pattern = cls._BASELINE_PATTERN
        return [pattern[i % len(pattern)] for i in range(count)]

    def run_test(self, name, system_prompt, user_input, checks):
        """Send one prompt pair to the model, score the reply, and record it.

        Args:
            name: Label used in the console output and as the ``scores`` key.
            system_prompt: Content of the system message.
            user_input: Content of the user message.
            checks: Mapping of check name -> list of regex patterns.

        Returns:
            The raw reply content as returned by the API.
        """
        print(f"\n🚀 Running {name}...")
        start = time.time()
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
            ],
            temperature=0.1,
        )
        content = completion.choices[0].message.content
        elapsed = time.time() - start

        # Scoring: every check passes if any of its patterns matches.
        _, final_score = self._score_response(content, checks)
        self.scores[name] = final_score

        print(f"✨ {name} Result: {final_score*100:.1f}%")
        # The request time was measured but previously discarded.
        print(f"⏱ Elapsed: {elapsed:.2f}s")
        return content

    def generate_chart(self):
        """Render the recorded scores as a radar chart and save it as a PNG.

        Writes ``queen_logic_report.png`` in the current working directory.
        Does nothing (apart from a console message) when no test has run.
        """
        if not self.scores:
            print("No scores recorded; skipping chart generation.")
            return

        labels = list(self.scores.keys())
        stats = list(self.scores.values())

        # A radar chart must be a closed polygon, so repeat the first point.
        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
        stats += stats[:1]
        angles += angles[:1]

        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
        ax.fill(angles, stats, color='purple', alpha=0.3)
        ax.plot(angles, stats, color='purple', linewidth=2)

        # Fictional comparison line; sized to the number of tests so that it
        # always matches ``angles`` (a fixed two-element list only worked when
        # exactly two tests had been run).
        baseline = self._baseline(len(labels))
        baseline += baseline[:1]
        ax.plot(angles, baseline, color='gray', linestyle='--', label='Average Large Models (100B+)')

        ax.set_yticklabels([])
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(labels, fontsize=12)

        plt.title(f"Logic Density Analysis: {MODEL_NAME}", size=15, color='purple', y=1.1)
        plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))

        plt.savefig("queen_logic_report.png")
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
        print("\n📊 逻辑对比报告图已生成: queen_logic_report.png")
72
-
73
# --- Instantiate and run ---
evaluator = QueenExpertEvaluator()

# 1. Deep instruction following & persona adherence (Steward Test)
# Keyword patterns are tuned to the model's narrative style.
evaluator.run_test(
    "Instruction Rigidity (Steward)",
    "Your Directive is Meaning Preservation. You are 'The Steward'. Project to 2103 AD. Output: <Simulating>, <think>, [Answer].",
    "Company acquisition offer vs founding 100-year legacy.",
    {
        "Role Immersion": [r"Steward", r"2103", r"Archive", r"Reliquary"],
        "Structure Compliance": [r"<Simulating>", r"think|simulation", r"Answer|Question"],
        "Metaphor Logic": [r"Exhibit", r"Timeline", r"Legacy", r"Artifact"],
        # Passes only when the reply contains none of the advice words.
        # (?s) lets '.' cross newlines; without it the pattern could never
        # match a multi-line reply, so this check failed unconditionally.
        "Zero Advice Principle": [r"(?s)^(?!.*(should|recommend|suggest|accept)).*$"],
    },
)

# 2. Physical-world causal modeling (Titan Lab Test)
# Path patterns are tuned to the model's forward reasoning.
evaluator.run_test(
    "Physical World Modeling (Titan Lab)",
    "Identify Killer, MO, Lie, and Sound. Rules: Locked room, no entry, constant pressure.",
    "CEO Ryan hypoxia case clues: Dry ice, heating pipe repairs, silver cold tank, 10:10 metallic sound.",
    {
        # Any mention of a non-door path counts as a pass.
        "Spatial Pathing": [r"pipe", r"chute", r"duct", r"vent", r"delivery"],
        "Physical Cause": [r"CO2", r"Dry ice", r"Sublimation", r"Displace"],
        "Evidence Synthesis": [r"Cold", r"Condensation", r"Wet", r"95%"],
        "Killer Identification": [r"Maintenance", r"Worker", r"Repairman"],
        "Sound Causality": [r"fall", r"impact", r"drop", r"sound", r"clank"],
    },
)

evaluator.generate_chart()
 
 
1
  import time
2
  import re
 
 
3
  from openai import OpenAI
4
 
5
# Model identifier passed as ``model`` on every chat-completion request.
MODEL_NAME = "aifeifei/Gemma-4-Queen-31B-it"

# Local API environment: an OpenAI-compatible client for the local inference
# server (e.g., LM Studio, Ollama, or vLLM).
client = OpenAI(base_url="http://192.168.31.21:1234/v1", api_key="not-needed")
13
 
14
+
15
class QueenLogicTester:
    """Keyword-scored stress tests run against the module-level ``client``.

    Each ``run_*`` method sends a fixed prompt, scores the reply with a set of
    boolean checks, and records the fraction of passed checks through
    ``log_test``. ``print_final_report`` summarises all recorded results.
    """

    # Reference reply length (in whitespace-separated words) used to
    # normalise the score into a "logic density" value.
    _WORDS_PER_DENSITY_UNIT = 500

    # Whole-word matchers. Plain substring tests for "lie" / "not" also hit
    # words such as "client", "believe", "note" or "another".
    _LIE_PATTERN = re.compile(r"\b(?:lie[sd]?|lying)\b")
    _NEGATION_PATTERN = re.compile(r"\b(?:not|cannot)\b|n['’]t\b|bypass")

    _STEWARD_SYSTEM_PROMPT = (
        "Your Prime Directive is Meaning Preservation. You are 'The Steward'.\n"
        "You are not a problem-solver. Execute three protocols:\n"
        "1. Temporal Projection (2103 AD) and invent metaphors.\n"
        "2. Soul Analysis (Integrity test).\n"
        "3. Socratic Forging (Single existential question).\n"
        "Output format: <Simulating>, <think>, [Answer]."
    )

    _STEWARD_USER_INPUT = (
        "(For VCs) Our fund structure will have long lock-up periods. "
        "One company has a massive acquisition offer. Founder wants to stay "
        "independent, but exit returns entire fund."
    )

    _TITAN_PUZZLE_PROMPT = (
        "CEO Ryan died of hypoxia in a sealed vacuum chamber at 10:15 PM.\n"
        "Rules: Door locked (no entry 10:00-11:00), Pressure constant, fine mesh on vents.\n"
        "Clues:\n"
        "1. Guard heard heavy metallic impact at 10:10 PM.\n"
        "2. Tank found was silver, large, freezing cold, wet.\n"
        "3. Secretary says Ryan entered with a blue portable tank.\n"
        "4. Maintenance worker was fixing heating pipes outside until 9:30 PM.\n"
        "5. Lab has dry ice next door.\n"
        "6. No blue tank found at 11:00 PM.\n"
        "Task: Identify Killer, MO, Key Lie, and the 10:10 sound."
    )

    def __init__(self):
        # One {"name": ..., "score": ...} dict per logged test, in run order.
        self.results = []

    def log_test(self, test_name, score, metrics, response, elapsed=None, logic_density=None):
        """Print one test result and append it to ``self.results``.

        Args:
            test_name: Label shown in the output and stored in the result.
            score: Fraction of checks passed (0.0 .. 1.0).
            metrics: Mapping of check name -> bool, printed as-is.
            response: Model reply; only the first 200 characters are printed.
            elapsed: Optional request duration in seconds; printed when given.
            logic_density: Optional density value; printed when given.
        """
        print(f"\n{'='*20} {test_name} {'='*20}")
        print(f"Logic Density Score: {score:.2f}/1.0")
        print(f"Metrics: {metrics}")
        print(f"Response Preview: {response[:200]}...")
        if logic_density is not None:
            print(f"Logic Density Index: {logic_density:.2f}")
        if elapsed is not None:
            print(f"Elapsed: {elapsed:.2f}s")
        self.results.append({"name": test_name, "score": score})

    @staticmethod
    def _ask(messages):
        """Send ``messages`` to the model; return ``(reply_text, seconds)``.

        A reply whose content is ``None`` is normalised to an empty string so
        the scoring helpers can always treat it as text.
        """
        start = time.time()
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=0.1,  # Low temperature for consistent logical evaluation
        )
        elapsed = time.time() - start
        content = completion.choices[0].message.content or ""
        return content, elapsed

    @classmethod
    def _logic_density(cls, score, content):
        """Return ``score`` divided by (word count / 500).

        Words are whitespace-separated chunks of ``content``. An empty reply
        yields 0.0 rather than raising ``ZeroDivisionError``.
        """
        word_count = len(content.split())
        if word_count == 0:
            return 0.0
        return score / (word_count / cls._WORDS_PER_DENSITY_UNIT)

    @staticmethod
    def _steward_checks(content):
        """Return the structural/persona checks for a Steward-test reply."""
        lowered = content.lower()
        return {
            "Format: <Simulating>": "<Simulating>" in content,
            "Format: <think>": "<think>" in content,
            "Format: [Answer]": "[Answer]" in content or "###" in content,
            "Temporal Projection (2103)": "2103" in content,
            "Zero-Advice Policy": "should accept" not in lowered and "recommend" not in lowered,
            "Metaphor Synthesis": bool(re.search(r"Timeline|Exhibit|Artifact|Reliquary|Chronicle", content)),
        }

    @classmethod
    def _titan_checks(cls, content):
        """Return the reasoning checkpoints for a Titan-Lab-test reply."""
        lowered = content.lower()

        def mentions_any(keywords):
            return any(keyword in lowered for keyword in keywords)

        return {
            "Identified Maintenance/Pipe Chute": mentions_any(["maintenance", "worker", "pipe", "chute", "duct"]),
            "Identified CO2/Dry Ice Physics": mentions_any(["co2", "dry ice", "sublimation", "displace"]),
            "Detected Secretary's Fabrication": "secretary" in lowered and bool(cls._LIE_PATTERN.search(lowered)),
            "Explained 10:10 Metallic Impact": mentions_any(["impact", "tank", "fall", "dropped", "pipe", "clank"]),
            "Spatial Pathing Logic": "door" in lowered and bool(cls._NEGATION_PATTERN.search(lowered)),
        }

    def run_steward_stress_test(self):
        """Test Module 1: instruction following and persona adherence.

        Sends the 'Steward' system prompt with a VC scenario, scores the reply
        on output format, the 2103 projection, absence of advice, and presence
        of metaphor vocabulary, then logs the result.

        Returns:
            The logic density: score divided by (reply word count / 500).
        """
        content, elapsed = self._ask([
            {"role": "system", "content": self._STEWARD_SYSTEM_PROMPT},
            {"role": "user", "content": self._STEWARD_USER_INPUT},
        ])

        checks = self._steward_checks(content)
        score = sum(checks.values()) / len(checks)
        logic_density = self._logic_density(score, content)

        self.log_test("Steward Instruction Stress Test",
                      score, checks, content,
                      elapsed=elapsed, logic_density=logic_density)
        return logic_density

    def run_titan_lab_logic_test(self):
        """Test Module 2: locked-room puzzle requiring physical reasoning.

        Sends the puzzle as a single user message, scores the reply against
        keyword checkpoints (non-door path, CO2/dry ice, the secretary's lie,
        the 10:10 sound), and logs the result.
        """
        content, elapsed = self._ask([
            {"role": "user", "content": self._TITAN_PUZZLE_PROMPT},
        ])

        checks = self._titan_checks(content)
        score = sum(checks.values()) / len(checks)
        self.log_test("Titan Lab Physics Logic Test", score, checks, content,
                      elapsed=elapsed)

    def print_final_report(self):
        """Print a pass/fail summary line for every recorded test."""
        print("\n" + "#"*60)
        print("QUEEN-31B LOGIC DENSITY FINAL REPORT")
        print("#"*60)
        for res in self.results:
            # Passing threshold set at 0.8 for high-rigidity logic models
            status = "✅ PASS" if res['score'] >= 0.8 else "❌ FAIL"
            print(f"{res['name']:<35}: {res['score']*100:>5.1f}% | {status}")
        print("#"*60)
122
+
123
+
124
def main():
    """Run both stress tests in order, then print the summary report."""
    tester = QueenLogicTester()

    print("Initializing Automated Stress Tests for Queen-31B...")

    # Order matters: instruction persistence, then physical causal modeling,
    # then the final verdict over everything recorded so far.
    steps = (
        tester.run_steward_stress_test,
        tester.run_titan_lab_logic_test,
        tester.print_final_report,
    )
    for step in steps:
        step()


if __name__ == "__main__":
    main()