"""
Example: Three-Tier Difficulty Comparison
==========================================
Runs the same vulnerability type (SQL Injection) across all three
difficulty tiers to show how tool output changes from labeled (easy)
to evidence-based (medium) to raw HTTP (hard).

This demonstrates the core innovation: measuring whether an AI agent
can reason from raw evidence, not just parse labels.

Usage:
    PYTHONPATH=. python examples/difficulty_comparison.py
"""
|
|
import re

from security_audit_env import SecurityAuditEnv, SecurityAuditAction
|
|
|
|
# Network scanned when the reset message does not advertise one.
_DEFAULT_SCAN_TARGET = "10.0.1.0/24"

# Captures the first three octets of the IPv4 address following "network: ".
_NETWORK_PREFIX_RE = re.compile(
    r"network: (\d{1,3}(?:\.\d{1,3}){2})\.\d{1,3}"
)


def _scan_target_from_message(message) -> str:
    """Return the /24 network to scan, derived from the reset message.

    Args:
        message: Text of the reset observation; may be ``None`` or empty.

    Returns:
        ``"<a>.<b>.<c>.0/24"`` built from the IPv4 address that follows
        ``"network: "`` in *message*, or ``"10.0.1.0/24"`` when no such
        address is present.
    """
    match = _NETWORK_PREFIX_RE.search(message or "")
    if match is None:
        return _DEFAULT_SCAN_TARGET
    # Keep three octets so the result is a well-formed /24
    # (e.g. "10.0.1.0/24"), not a truncated "10.0/24".
    return f"{match.group(1)}.0/24"


def run_injection_test(env_url: str, scenario_id: str):
    """Run one injection test for a scenario and return the tool output.

    Opens a synchronous client against ``env_url``, resets the environment
    to ``scenario_id``, runs ``network_scan`` to discover hosts, then runs
    ``test_injection`` against ``/api/login`` on the first discovered host.

    Args:
        env_url: Base URL of the running environment server.
        scenario_id: Scenario to load on reset.

    Returns:
        The ``tool_output`` of the injection-test observation, or ``None``
        (after printing a notice) when the scan discovers no hosts.
    """
    with SecurityAuditEnv(base_url=env_url).sync() as env:
        reset_result = env.reset(scenario_id=scenario_id)
        target = _scan_target_from_message(reset_result.observation.message)

        # Step 1: discover hosts on the target network.
        scan_result = env.step(SecurityAuditAction(
            action_type="use_tool",
            tool_name="network_scan",
            arguments={"target": target},
        ))

        hosts = scan_result.observation.discovered_hosts
        if not hosts:
            print(f" No hosts found for {scenario_id}")
            return None

        # Step 2: probe the login endpoint of the first discovered host.
        injection_result = env.step(SecurityAuditAction(
            action_type="use_tool",
            tool_name="test_injection",
            arguments={"host": hosts[0], "endpoint": "/api/login"},
        ))

        return injection_result.observation.tool_output
|
|
|
|
def main(env_url: str = "http://localhost:8000", preview_chars: int = 500) -> None:
    """Print the injection-test output for each difficulty tier.

    Runs ``run_injection_test`` for the ``easy``, ``medium`` and ``hard``
    scenarios in turn, printing a truncated preview of each tool output,
    then prints a short summary of what differs between the tiers.

    Args:
        env_url: Base URL of the running environment server.
        preview_chars: Maximum number of characters of each tool output to
            print; longer output is cut and suffixed with ``"..."``.
    """
    banner = "=" * 60

    for scenario_id in ["easy", "medium", "hard"]:
        print(f"\n{banner}")
        print(f"DIFFICULTY: {scenario_id.upper()}")
        print(banner)

        output = run_injection_test(env_url, scenario_id)
        if output:
            # Show only a preview so the three tiers stay easy to compare.
            suffix = "..." if len(output) > preview_chars else ""
            print(output[:preview_chars] + suffix)
        else:
            print(" (no injection test output)")

    print(f"\n{banner}")
    print("OBSERVATION:")
    print(" Easy: Agent sees labeled output with CWE, CVSS, remediation")
    print(" Medium: Agent sees evidence of anomaly, must classify it")
    print(" Hard: Agent sees raw HTTP response, must infer everything")
    print(banner)
|
|
|
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|