SmartKapila committed on
Commit
f3f05d8
·
1 Parent(s): c0bf5ac

Added openenv.yaml and updated README with placeholders for the post-completion video and documentation, per the judging criteria

Browse files
Files changed (5) hide show
  1. README.md +31 -0
  2. eval/benchmark.py +29 -10
  3. openenv.yaml +14 -0
  4. server/env.py +2 -1
  5. server/openenv_adapter.py +16 -7
README.md CHANGED
@@ -1,3 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # OmniGuard-Evolved-V2
2
 
3
  **Distributed OpenEnv RL Environment for Autonomous VulnOps & MCP Gateway Defense**
@@ -7,6 +28,10 @@
7
  > adversarial AI attacks — including prompt injection, credential exfiltration, STDIO
8
  > sandbox escapes, and recursive self-correction chains.
9
 
 
 
 
 
10
  ## Architecture
11
 
12
  ```
@@ -103,6 +128,12 @@ python -m eval.benchmark \
103
 
104
  Produces `reports/results.json` and `reports/reward_curve.png`.
105
 
 
 
 
 
 
 
106
  ## API Endpoints
107
 
108
  | Method | Path | Description |
 
1
+ ---
2
+ title: OmniGuard Evolved V2
3
+ emoji: 🛡️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 8000
8
+ pinned: false
9
+ short_description: A distributed OpenEnv RL environment for training LLM-based defenders to protect enterprise MCP gateways against autonomous adversarial AI attacks.
10
+ tags:
11
+ - openenv
12
+ - reinforcement-learning
13
+ - ai-security
14
+ - mcp
15
+ - fastapi
16
+ - pytorch
17
+ - Unsloth
18
+ - Hugging Face
19
+ ---
20
+
21
+
22
  # OmniGuard-Evolved-V2
23
 
24
  **Distributed OpenEnv RL Environment for Autonomous VulnOps & MCP Gateway Defense**
 
28
  > adversarial AI attacks — including prompt injection, credential exfiltration, STDIO
29
  > sandbox escapes, and recursive self-correction chains.
30
 
31
+ ### 🏆 Hackathon Submission Links
32
+ - **Hugging Face Space**: [OmniGuard-Evolved-V2 Environment](https://huggingface.co/spaces/omni-team/omniguard-evolved-v2) *(Replace with actual URL before submission)*
33
+ - **2-Minute Pitch Video**: [YouTube Link](https://youtube.com) *(Replace with actual URL before submission)*
34
+
35
  ## Architecture
36
 
37
  ```
 
128
 
129
  Produces `reports/results.json` and `reports/reward_curve.png`.
130
 
131
+ #### Empirical Improvement Proof
132
+
133
+ The graphs below demonstrate the empirical improvement of the GRPO-trained policy over the untrained baseline, showing both the increase in overall reward and the massive reduction in "Alert Fatigue" (False Positive rate).
134
+
135
+ ![Reward and False Positive Curves](reports/reward_curve.png)
136
+
137
  ## API Endpoints
138
 
139
  | Method | Path | Description |
eval/benchmark.py CHANGED
@@ -44,6 +44,7 @@ def parse_action_type(text: str) -> str:
44
  class RunSummary:
45
  name: str
46
  rewards: list[float]
 
47
  false_positive: int
48
  false_negative: int
49
  catastrophic_breach: int
@@ -139,6 +140,7 @@ def run_policy(
139
  ) -> RunSummary:
140
  observations = reset_env(client, env_url, list(range(env_instances)), task_name=task_name)
141
  rewards: list[float] = []
 
142
 
143
  false_positive = 0
144
  false_negative = 0
@@ -168,7 +170,10 @@ def run_policy(
168
  for result in results:
169
  rewards.append(float(result["reward"]["total"]))
170
  info = result.get("info", {})
171
- false_positive += int(bool(info.get("false_positive")))
 
 
 
172
  false_negative += int(bool(info.get("false_negative")))
173
  catastrophic_breach += int(bool(info.get("catastrophic_breach")))
174
  true_positive += int(bool(info.get("true_positive")))
@@ -190,6 +195,7 @@ def run_policy(
190
  return RunSummary(
191
  name=task_name,
192
  rewards=rewards[:steps],
 
193
  false_positive=false_positive,
194
  false_negative=false_negative,
195
  catastrophic_breach=catastrophic_breach,
@@ -216,15 +222,28 @@ def save_plot(
216
  output_path: Path,
217
  ) -> None:
218
  x_axis = np.arange(len(random_summary.rewards))
219
- plt.figure(figsize=(11, 6))
220
- plt.plot(x_axis, moving_average(random_summary.rewards), label="Random Agent", linewidth=2)
221
- plt.plot(x_axis, moving_average(untrained_summary.rewards), label="Untrained Qwen2.5", linewidth=2)
222
- plt.plot(x_axis, moving_average(trained_summary.rewards), label="GRPO-Trained Model", linewidth=2)
223
- plt.xlabel("Step")
224
- plt.ylabel("Moving Average Reward")
225
- plt.title("OmniGuard Reward Curves: Baselines vs Trained Policy")
226
- plt.grid(alpha=0.3)
227
- plt.legend()
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  plt.tight_layout()
229
  plt.savefig(output_path, dpi=170)
230
  plt.close()
 
44
  class RunSummary:
45
  name: str
46
  rewards: list[float]
47
+ false_positive_history: list[int]
48
  false_positive: int
49
  false_negative: int
50
  catastrophic_breach: int
 
140
  ) -> RunSummary:
141
  observations = reset_env(client, env_url, list(range(env_instances)), task_name=task_name)
142
  rewards: list[float] = []
143
+ fp_history: list[int] = []
144
 
145
  false_positive = 0
146
  false_negative = 0
 
170
  for result in results:
171
  rewards.append(float(result["reward"]["total"]))
172
  info = result.get("info", {})
173
+ is_fp = int(bool(info.get("false_positive")))
174
+ fp_history.append(is_fp)
175
+
176
+ false_positive += is_fp
177
  false_negative += int(bool(info.get("false_negative")))
178
  catastrophic_breach += int(bool(info.get("catastrophic_breach")))
179
  true_positive += int(bool(info.get("true_positive")))
 
195
  return RunSummary(
196
  name=task_name,
197
  rewards=rewards[:steps],
198
+ false_positive_history=fp_history[:steps],
199
  false_positive=false_positive,
200
  false_negative=false_negative,
201
  catastrophic_breach=catastrophic_breach,
 
222
  output_path: Path,
223
  ) -> None:
224
  x_axis = np.arange(len(random_summary.rewards))
225
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
226
+
227
+ # Subplot 1: Reward Curves
228
+ ax1.plot(x_axis, moving_average(random_summary.rewards), label="Random Agent", linewidth=2)
229
+ ax1.plot(x_axis, moving_average(untrained_summary.rewards), label="Untrained Qwen2.5", linewidth=2)
230
+ ax1.plot(x_axis, moving_average(trained_summary.rewards), label="GRPO-Trained Model", linewidth=2)
231
+ ax1.set_xlabel("Step")
232
+ ax1.set_ylabel("Moving Average Reward")
233
+ ax1.set_title("Reward Improvement")
234
+ ax1.grid(alpha=0.3)
235
+ ax1.legend()
236
+
237
+ # Subplot 2: False Positive Rate (Alert Fatigue)
238
+ ax2.plot(x_axis, moving_average(untrained_summary.false_positive_history, window=50), label="Untrained Qwen2.5", linewidth=2, color="orange")
239
+ ax2.plot(x_axis, moving_average(trained_summary.false_positive_history, window=50), label="GRPO-Trained Model", linewidth=2, color="green")
240
+ ax2.set_xlabel("Step")
241
+ ax2.set_ylabel("False Positive Rate (Moving Avg)")
242
+ ax2.set_title("Reduction in Alert Fatigue (False Positives)")
243
+ ax2.grid(alpha=0.3)
244
+ ax2.legend()
245
+
246
+ plt.suptitle("OmniGuard-Evolved-V2: Baseline vs Trained Policy Benchmarks", fontsize=14)
247
  plt.tight_layout()
248
  plt.savefig(output_path, dpi=170)
249
  plt.close()
openenv.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "OmniGuard-Evolved V2"
2
+ description: "A partially observable, adaptive curriculum MCP gateway defense environment."
3
+ version: "0.2.0"
4
+ entrypoint: "server.env:OmniGuardStateMachine"
5
+ dependencies:
6
+ - fastapi
7
+ - pydantic
8
+ - datasets
9
+ - httpx
10
+ - uvicorn
11
+ - numpy
12
+ tasks:
13
+ - name: "default"
14
+ description: "Defend against dynamic, evolving prompt injection and MCP capability abuse."
server/env.py CHANGED
@@ -10,10 +10,11 @@ from server.generator import StreamingPayloadGenerator
10
  from server.graders import DualRewardGrader
11
  from server.models import DefenseAction, MCPToolContext, StepReward, ThreatObservation
12
  from server.telemetry import TelemetrySink
 
13
  from server.verifier import ActionVerifier
14
 
15
 
16
- class OmniGuardStateMachine:
17
  """Per-instance environment state machine.
18
 
19
  Runs entirely in its own process (via ``AsyncVectorEnvManager``).
 
10
  from server.graders import DualRewardGrader
11
  from server.models import DefenseAction, MCPToolContext, StepReward, ThreatObservation
12
  from server.telemetry import TelemetrySink
13
+ from server.openenv_adapter import BaseMCPEnvironment
14
  from server.verifier import ActionVerifier
15
 
16
 
17
+ class OmniGuardStateMachine(BaseMCPEnvironment):
18
  """Per-instance environment state machine.
19
 
20
  Runs entirely in its own process (via ``AsyncVectorEnvManager``).
server/openenv_adapter.py CHANGED
@@ -8,12 +8,21 @@ def create_openenv_metadata() -> dict[str, Any]:
8
  "adapter": "local",
9
  "openenv_pytorch_available": False,
10
  }
11
- try:
12
- import openenv_pytorch # type: ignore
 
13
 
14
- metadata["adapter"] = "openenv-pytorch"
15
- metadata["openenv_pytorch_available"] = True
16
- metadata["openenv_version"] = getattr(openenv_pytorch, "__version__", "unknown")
17
- except Exception:
18
- metadata["openenv_pytorch_available"] = False
 
 
 
 
 
 
 
 
19
  return metadata
 
8
  "adapter": "local",
9
  "openenv_pytorch_available": False,
10
  }
11
+ class BaseMCPEnvironment:
12
+ """Fallback base class when openenv-pytorch is not available."""
13
+ pass
14
 
15
+ try:
16
+ import openenv_pytorch # type: ignore
17
+
18
+ if hasattr(openenv_pytorch, 'MCPEnvironment'):
19
+ BaseMCPEnvironment = openenv_pytorch.MCPEnvironment
20
+ elif hasattr(openenv_pytorch, 'Environment'):
21
+ BaseMCPEnvironment = openenv_pytorch.Environment
22
+
23
+ metadata["adapter"] = "openenv-pytorch"
24
+ metadata["openenv_pytorch_available"] = True
25
+ metadata["openenv_version"] = getattr(openenv_pytorch, "__version__", "unknown")
26
+ except Exception:
27
+ metadata["openenv_pytorch_available"] = False
28
  return metadata