Spaces:

SmartKapila
/

OmniGuard-Evolved-V2

Sleeping

App Files Files Community

SmartKapila committed on Apr 23

Commit

f3f05d8

1 Parent(s): c0bf5ac

Added openenv.yaml and updated README with placeholders for the post-completion video and documentation, according to the judging criteria

Browse files

Files changed (5) hide show

README.md +31 -0
eval/benchmark.py +29 -10
openenv.yaml +14 -0
server/env.py +2 -1
server/openenv_adapter.py +16 -7

README.md CHANGED Viewed

@@ -1,3 +1,24 @@
 # OmniGuard-Evolved-V2
 **Distributed OpenEnv RL Environment for Autonomous VulnOps & MCP Gateway Defense**
@@ -7,6 +28,10 @@
 > adversarial AI attacks — including prompt injection, credential exfiltration, STDIO
 > sandbox escapes, and recursive self-correction chains.
 ## Architecture
 ```
@@ -103,6 +128,12 @@ python -m eval.benchmark \
 Produces `reports/results.json` and `reports/reward_curve.png`.
 ## API Endpoints
 | Method | Path | Description |

+---
+title: OmniGuard Evolved V2
+emoji: 🛡️
+colorFrom: blue
+colorTo: indigo
+sdk: docker
+app_port: 8000
+pinned: false
+short_description: A distributed OpenEnv RL environment for training LLM-based defenders to protect enterprise MCP gateways against autonomous adversarial AI attacks.
+tags:
+  - openenv
+  - reinforcement-learning
+  - ai-security
+  - mcp
+  - fastapi
+  - pytorch
+  - Unsloth
+  - Hugging Face
+---
 # OmniGuard-Evolved-V2
 **Distributed OpenEnv RL Environment for Autonomous VulnOps & MCP Gateway Defense**
 > adversarial AI attacks — including prompt injection, credential exfiltration, STDIO
 > sandbox escapes, and recursive self-correction chains.
+### 🏆 Hackathon Submission Links
+- **Hugging Face Space**: [OmniGuard-Evolved-V2 Environment](https://huggingface.co/spaces/omni-team/omniguard-evolved-v2) *(Replace with actual URL before submission)*
+- **2-Minute Pitch Video**: [YouTube Link](https://youtube.com) *(Replace with actual URL before submission)*
 ## Architecture
 ```
 Produces `reports/results.json` and `reports/reward_curve.png`.
+#### Empirical Improvement Proof
+The graphs below demonstrate the empirical improvement of the GRPO-trained policy over the untrained baseline, showing both the increase in overall reward and the massive reduction in "Alert Fatigue" (False Positive rate).
+![Reward and False Positive Curves](reports/reward_curve.png)
 ## API Endpoints
 | Method | Path | Description |

eval/benchmark.py CHANGED Viewed

@@ -44,6 +44,7 @@ def parse_action_type(text: str) -> str:
 class RunSummary:
     name: str
     rewards: list[float]
     false_positive: int
     false_negative: int
     catastrophic_breach: int
@@ -139,6 +140,7 @@ def run_policy(
 ) -> RunSummary:
     observations = reset_env(client, env_url, list(range(env_instances)), task_name=task_name)
     rewards: list[float] = []
     false_positive = 0
     false_negative = 0
@@ -168,7 +170,10 @@ def run_policy(
         for result in results:
             rewards.append(float(result["reward"]["total"]))
             info = result.get("info", {})
-            false_positive += int(bool(info.get("false_positive")))
             false_negative += int(bool(info.get("false_negative")))
             catastrophic_breach += int(bool(info.get("catastrophic_breach")))
             true_positive += int(bool(info.get("true_positive")))
@@ -190,6 +195,7 @@ def run_policy(
     return RunSummary(
         name=task_name,
         rewards=rewards[:steps],
         false_positive=false_positive,
         false_negative=false_negative,
         catastrophic_breach=catastrophic_breach,
@@ -216,15 +222,28 @@ def save_plot(
     output_path: Path,
 ) -> None:
     x_axis = np.arange(len(random_summary.rewards))
-    plt.figure(figsize=(11, 6))
-    plt.plot(x_axis, moving_average(random_summary.rewards), label="Random Agent", linewidth=2)
-    plt.plot(x_axis, moving_average(untrained_summary.rewards), label="Untrained Qwen2.5", linewidth=2)
-    plt.plot(x_axis, moving_average(trained_summary.rewards), label="GRPO-Trained Model", linewidth=2)
-    plt.xlabel("Step")
-    plt.ylabel("Moving Average Reward")
-    plt.title("OmniGuard Reward Curves: Baselines vs Trained Policy")
-    plt.grid(alpha=0.3)
-    plt.legend()
     plt.tight_layout()
     plt.savefig(output_path, dpi=170)
     plt.close()

 class RunSummary:
     name: str
     rewards: list[float]
+    false_positive_history: list[int]
     false_positive: int
     false_negative: int
     catastrophic_breach: int
 ) -> RunSummary:
     observations = reset_env(client, env_url, list(range(env_instances)), task_name=task_name)
     rewards: list[float] = []
+    fp_history: list[int] = []
     false_positive = 0
     false_negative = 0
         for result in results:
             rewards.append(float(result["reward"]["total"]))
             info = result.get("info", {})
+            is_fp = int(bool(info.get("false_positive")))
+            fp_history.append(is_fp)
+            false_positive += is_fp
             false_negative += int(bool(info.get("false_negative")))
             catastrophic_breach += int(bool(info.get("catastrophic_breach")))
             true_positive += int(bool(info.get("true_positive")))
     return RunSummary(
         name=task_name,
         rewards=rewards[:steps],
+        false_positive_history=fp_history[:steps],
         false_positive=false_positive,
         false_negative=false_negative,
         catastrophic_breach=catastrophic_breach,
     output_path: Path,
 ) -> None:
     x_axis = np.arange(len(random_summary.rewards))
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
+    # Subplot 1: Reward Curves
+    ax1.plot(x_axis, moving_average(random_summary.rewards), label="Random Agent", linewidth=2)
+    ax1.plot(x_axis, moving_average(untrained_summary.rewards), label="Untrained Qwen2.5", linewidth=2)
+    ax1.plot(x_axis, moving_average(trained_summary.rewards), label="GRPO-Trained Model", linewidth=2)
+    ax1.set_xlabel("Step")
+    ax1.set_ylabel("Moving Average Reward")
+    ax1.set_title("Reward Improvement")
+    ax1.grid(alpha=0.3)
+    ax1.legend()
+    # Subplot 2: False Positive Rate (Alert Fatigue)
+    ax2.plot(x_axis, moving_average(untrained_summary.false_positive_history, window=50), label="Untrained Qwen2.5", linewidth=2, color="orange")
+    ax2.plot(x_axis, moving_average(trained_summary.false_positive_history, window=50), label="GRPO-Trained Model", linewidth=2, color="green")
+    ax2.set_xlabel("Step")
+    ax2.set_ylabel("False Positive Rate (Moving Avg)")
+    ax2.set_title("Reduction in Alert Fatigue (False Positives)")
+    ax2.grid(alpha=0.3)
+    ax2.legend()
+    plt.suptitle("OmniGuard-Evolved-V2: Baseline vs Trained Policy Benchmarks", fontsize=14)
     plt.tight_layout()
     plt.savefig(output_path, dpi=170)
     plt.close()

openenv.yaml ADDED Viewed

	@@ -0,0 +1,14 @@

+name: "OmniGuard-Evolved V2"
+description: "A partially observable, adaptive curriculum MCP gateway defense environment."
+version: "0.2.0"
+entrypoint: "server.env:OmniGuardStateMachine"
+dependencies:
+  - fastapi
+  - pydantic
+  - datasets
+  - httpx
+  - uvicorn
+  - numpy
+tasks:
+  - name: "default"
+    description: "Defend against dynamic, evolving prompt injection and MCP capability abuse."

server/env.py CHANGED Viewed

@@ -10,10 +10,11 @@ from server.generator import StreamingPayloadGenerator
 from server.graders import DualRewardGrader
 from server.models import DefenseAction, MCPToolContext, StepReward, ThreatObservation
 from server.telemetry import TelemetrySink
 from server.verifier import ActionVerifier
-class OmniGuardStateMachine:
     """Per-instance environment state machine.
     Runs entirely in its own process (via ``AsyncVectorEnvManager``).

 from server.graders import DualRewardGrader
 from server.models import DefenseAction, MCPToolContext, StepReward, ThreatObservation
 from server.telemetry import TelemetrySink
+from server.openenv_adapter import BaseMCPEnvironment
 from server.verifier import ActionVerifier
+class OmniGuardStateMachine(BaseMCPEnvironment):
     """Per-instance environment state machine.
     Runs entirely in its own process (via ``AsyncVectorEnvManager``).

server/openenv_adapter.py CHANGED Viewed

@@ -8,12 +8,21 @@ def create_openenv_metadata() -> dict[str, Any]:
         "adapter": "local",
         "openenv_pytorch_available": False,
     }
-    try:
-        import openenv_pytorch  # type: ignore
-        metadata["adapter"] = "openenv-pytorch"
-        metadata["openenv_pytorch_available"] = True
-        metadata["openenv_version"] = getattr(openenv_pytorch, "__version__", "unknown")
-    except Exception:
-        metadata["openenv_pytorch_available"] = False
     return metadata

         "adapter": "local",
         "openenv_pytorch_available": False,
     }
+class BaseMCPEnvironment:
+    """Fallback base class when openenv-pytorch is not available."""
+    pass
+try:
+    import openenv_pytorch  # type: ignore
+    if hasattr(openenv_pytorch, 'MCPEnvironment'):
+        BaseMCPEnvironment = openenv_pytorch.MCPEnvironment
+    elif hasattr(openenv_pytorch, 'Environment'):
+        BaseMCPEnvironment = openenv_pytorch.Environment
+    metadata["adapter"] = "openenv-pytorch"
+    metadata["openenv_pytorch_available"] = True
+    metadata["openenv_version"] = getattr(openenv_pytorch, "__version__", "unknown")
+except Exception:
+    metadata["openenv_pytorch_available"] = False
     return metadata