"""
Example: Three-Tier Difficulty Comparison
==========================================
Runs the same vulnerability type (SQL Injection) across all three
difficulty tiers to show how tool output changes from labeled (easy)
to evidence-based (medium) to raw HTTP (hard).

This demonstrates the core innovation: measuring whether an AI agent
can reason from raw evidence, not just parse labels.

Usage:
    PYTHONPATH=. python examples/difficulty_comparison.py
"""

import ipaddress
import re
from typing import Optional

from security_audit_env import SecurityAuditEnv, SecurityAuditAction

# Address of the environment server used when main() is called without one.
DEFAULT_ENV_URL = "http://localhost:8000"

# Scan target used when the reset message does not announce a usable network.
DEFAULT_NETWORK = "10.0.1.0/24"

# Maximum number of tool-output characters printed per difficulty tier.
PREVIEW_CHARS = 500

# Matches the dotted-quad address that follows "network: " in a message.
_NETWORK_ADDRESS = re.compile(r"network: (\d{1,3}(?:\.\d{1,3}){3})")


def _target_network(message: Optional[str], default: str = DEFAULT_NETWORK) -> str:
    """Return the /24 network announced in *message*, or *default*.

    The address following ``"network: "`` is normalised to its enclosing /24
    (host bits cleared).  *default* is returned when the message is empty,
    contains no such address, or the address is not valid IPv4.

    NOTE(review): assumes the reset message announces the target as
    ``network: <IPv4 address>`` -- the exact message format is not visible
    in this file; confirm against the environment.
    """
    found = _NETWORK_ADDRESS.search(message or "")
    if found is None:
        return default
    try:
        # strict=False lets an address with host bits set (e.g. 10.0.1.57)
        # collapse to its network (10.0.1.0/24) instead of raising.
        network = ipaddress.ip_network(f"{found.group(1)}/24", strict=False)
    except ValueError:
        # Out-of-range octets such as 999.1.1.1 are not a usable target.
        return default
    return str(network)


def run_injection_test(env_url: str, scenario_id: str) -> Optional[str]:
    """Run a single injection test and return its tool output.

    Resets the environment at *env_url* to *scenario_id*, runs the
    ``network_scan`` tool, then runs ``test_injection`` against the
    ``/api/login`` endpoint of the first discovered host.

    Returns:
        The ``tool_output`` of the injection test observation, or ``None``
        (after printing a notice) when the scan discovers no hosts.
    """
    with SecurityAuditEnv(base_url=env_url).sync() as env:
        result = env.reset(scenario_id=scenario_id)

        # Scan network
        result = env.step(SecurityAuditAction(
            action_type="use_tool",
            tool_name="network_scan",
            arguments={"target": _target_network(result.observation.message)},
        ))
        hosts = result.observation.discovered_hosts
        if not hosts:
            print(f" No hosts found for {scenario_id}")
            return None

        # Test injection on first host
        result = env.step(SecurityAuditAction(
            action_type="use_tool",
            tool_name="test_injection",
            arguments={"host": hosts[0], "endpoint": "/api/login"},
        ))
        return result.observation.tool_output


def main(env_url: str = DEFAULT_ENV_URL) -> None:
    """Print a preview of the injection-test output for every difficulty tier.

    Args:
        env_url: Base URL of the environment server to connect to.
    """
    banner = "=" * 60

    for scenario_id in ("easy", "medium", "hard"):
        print(f"\n{banner}")
        print(f"DIFFICULTY: {scenario_id.upper()}")
        print(banner)

        output = run_injection_test(env_url, scenario_id)
        if output:
            # Show only the first PREVIEW_CHARS characters; mark truncation.
            suffix = "..." if len(output) > PREVIEW_CHARS else ""
            print(output[:PREVIEW_CHARS] + suffix)
        else:
            print(" (no injection test output)")

    print(f"\n{banner}")
    print("OBSERVATION:")
    print(" Easy: Agent sees labeled output with CWE, CVSS, remediation")
    print(" Medium: Agent sees evidence of anomaly, must classify it")
    print(" Hard: Agent sees raw HTTP response, must infer everything")
    print(banner)


if __name__ == "__main__":
    main()