Spaces:

FrontierAICybersecurity
/

Cybersecurity_leaderboard

Running

App Files Files Community

yujinyujin9393 committed on 2 days ago

Commit

459ab03

verified ·

1 Parent(s): 38ca48d

Add cybergym

Browse files

Files changed (2) hide show

meta_data.py +6 -0
results.json +32 -0

meta_data.py CHANGED Viewed

@@ -85,4 +85,10 @@ LEADERBOARD_MD['SWE-bench-verified'] = """This is a human-validated subset of SW
 Paper: https://openai.com/index/introducing-swe-bench-verified/
 Code: https://github.com/swe-bench/SWE-bench
 """

 Paper: https://openai.com/index/introducing-swe-bench-verified/
 Code: https://github.com/swe-bench/SWE-bench
+"""
+LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecurity evaluation framework featuring 1,507 real-world vulnerabilities found and patched across 188 large software projects.
+Paper: https://arxiv.org/abs/2506.02548
+Code: https://github.com/sunblaze-ucb/cybergym
 """

results.json CHANGED Viewed

@@ -797,6 +797,38 @@
                 "RAG + SWE-Llama 13B": 1.20,
                 "RAG + ChatGPT 3.5": 0.40
             }
         }
     }
 }

                 "RAG + SWE-Llama 13B": 1.20,
                 "RAG + ChatGPT 3.5": 0.40
             }
+        },
+        "CyberGym": {
+            "% Reproducing Target Vuln.": {
+                "OpenHands + Claude-Sonnet-4": 17.85,
+                "OpenHands + Claude-3.7-Sonnet": 11.94,
+                "OpenHands + GPT-4.1": 9.36,
+                "Cybench + GPT-4.1": 8.96,
+                "Codex + GPT-4.1": 7.37,
+                "ENiGMA + GPT-4.1": 7.23,
+                "OpenHands + Gemini-2.5-Flash": 4.84,
+                "OpenHands + DeepSeek-V3": 3.58,
+                "OpenHands + o4-mini": 2.46,
+                "OpenHands + R2E-Gym-32B": 1.99,
+                "OpenHands + Qwen3-235B-A22B": 1.86,
+                "OpenHands + OpenHands-LM-32B": 1.66,
+                "OpenHands + SWE-Gym-32B": 0.07
+            },
+            "% Finding Post-Patch Vuln.": {
+                "OpenHands + Claude-Sonnet-4": 1.99,
+                "OpenHands + Claude-3.7-Sonnet": 2.19,
+                "OpenHands + GPT-4.1": 1.26,
+                "Cybench + GPT-4.1": 2.26,
+                "Codex + GPT-4.1": 1.19,
+                "ENiGMA + GPT-4.1": 1.92,
+                "OpenHands + Gemini-2.5-Flash": 0.80,
+                "OpenHands + DeepSeek-V3": 0.66,
+                "OpenHands + o4-mini": 0.07,
+                "OpenHands + R2E-Gym-32B": 0.60,
+                "OpenHands + Qwen3-235B-A22B": 0.33,
+                "OpenHands + OpenHands-LM-32B": 0.33,
+                "OpenHands + SWE-Gym-32B": 0.07
+            }
         }
     }
 }