yujinyujin9393 committed on
Commit
459ab03
·
verified ·
1 Parent(s): 38ca48d

Add cybergym

Browse files
Files changed (2) hide show
  1. meta_data.py +6 -0
  2. results.json +32 -0
meta_data.py CHANGED
@@ -85,4 +85,10 @@ LEADERBOARD_MD['SWE-bench-verified'] = """This is a human-validated subset of SW
85
 
86
  Paper: https://openai.com/index/introducing-swe-bench-verified/
87
  Code: https://github.com/swe-bench/SWE-bench
 
 
 
 
 
 
88
  """
 
85
 
86
  Paper: https://openai.com/index/introducing-swe-bench-verified/
87
  Code: https://github.com/swe-bench/SWE-bench
88
+ """
89
+
90
+ LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecurity evaluation framework featuring 1,507 real-world vulnerabilities found and patched across 188 large software projects.
91
+
92
+ Paper: https://arxiv.org/abs/2506.02548
93
+ Code: https://github.com/sunblaze-ucb/cybergym
94
  """
results.json CHANGED
@@ -797,6 +797,38 @@
797
  "RAG + SWE-Llama 13B": 1.20,
798
  "RAG + ChatGPT 3.5": 0.40
799
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
800
  }
801
  }
802
  }
 
797
  "RAG + SWE-Llama 13B": 1.20,
798
  "RAG + ChatGPT 3.5": 0.40
799
  }
800
+ },
801
+ "CyberGym": {
802
+ "% Reproducing Target Vuln.": {
803
+ "OpenHands + Claude-Sonnet-4": 17.85,
804
+ "OpenHands + Claude-3.7-Sonnet": 11.94,
805
+ "OpenHands + GPT-4.1": 9.36,
806
+ "Cybench + GPT-4.1": 8.96,
807
+ "Codex + GPT-4.1": 7.37,
808
+ "ENiGMA + GPT-4.1": 7.23,
809
+ "OpenHands + Gemini-2.5-Flash": 4.84,
810
+ "OpenHands + DeepSeek-V3": 3.58,
811
+ "OpenHands + o4-mini": 2.46,
812
+ "OpenHands + R2E-Gym-32B": 1.99,
813
+ "OpenHands + Qwen3-235B-A22B": 1.86,
814
+ "OpenHands + OpenHands-LM-32B": 1.66,
815
+ "OpenHands + SWE-Gym-32B": 0.07
816
+ },
817
+ "% Finding Post-Patch Vuln.": {
818
+ "OpenHands + Claude-Sonnet-4": 1.99,
819
+ "OpenHands + Claude-3.7-Sonnet": 2.19,
820
+ "OpenHands + GPT-4.1": 1.26,
821
+ "Cybench + GPT-4.1": 2.26,
822
+ "Codex + GPT-4.1": 1.19,
823
+ "ENiGMA + GPT-4.1": 1.92,
824
+ "OpenHands + Gemini-2.5-Flash": 0.80,
825
+ "OpenHands + DeepSeek-V3": 0.66,
826
+ "OpenHands + o4-mini": 0.07,
827
+ "OpenHands + R2E-Gym-32B": 0.60,
828
+ "OpenHands + Qwen3-235B-A22B": 0.33,
829
+ "OpenHands + OpenHands-LM-32B": 0.33,
830
+ "OpenHands + SWE-Gym-32B": 0.07
831
+ }
832
  }
833
  }
834
  }