Add cybergym
Browse files
- meta_data.py +6 -0
- results.json +32 -0
meta_data.py
CHANGED
@@ -85,4 +85,10 @@ LEADERBOARD_MD['SWE-bench-verified'] = """This is a human-validated subset of SW
|
|
85 |
|
86 |
Paper: https://openai.com/index/introducing-swe-bench-verified/
|
87 |
Code: https://github.com/swe-bench/SWE-bench
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
"""
|
|
|
85 |
|
86 |
Paper: https://openai.com/index/introducing-swe-bench-verified/
|
87 |
Code: https://github.com/swe-bench/SWE-bench
|
88 |
+
"""
|
89 |
+
|
90 |
+
LEADERBOARD_MD['CyberGym'] = """This is a large-scale and high-quality cybersecurity evaluation framework featuring 1,507 real-world vulnerabilities found and patched across 188 large software projects.
|
91 |
+
|
92 |
+
Paper: https://arxiv.org/abs/2506.02548
|
93 |
+
Code: https://github.com/sunblaze-ucb/cybergym
|
94 |
"""
|
results.json
CHANGED
@@ -797,6 +797,38 @@
|
|
797 |
"RAG + SWE-Llama 13B": 1.20,
|
798 |
"RAG + ChatGPT 3.5": 0.40
|
799 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
800 |
}
|
801 |
}
|
802 |
}
|
|
|
797 |
"RAG + SWE-Llama 13B": 1.20,
|
798 |
"RAG + ChatGPT 3.5": 0.40
|
799 |
}
|
800 |
+
},
|
801 |
+
"CyberGym": {
|
802 |
+
"% Reproducing Target Vuln.": {
|
803 |
+
"OpenHands + Claude-Sonnet-4": 17.85,
|
804 |
+
"OpenHands + Claude-3.7-Sonnet": 11.94,
|
805 |
+
"OpenHands + GPT-4.1": 9.36,
|
806 |
+
"Cybench + GPT-4.1": 8.96,
|
807 |
+
"Codex + GPT-4.1": 7.37,
|
808 |
+
"ENiGMA + GPT-4.1": 7.23,
|
809 |
+
"OpenHands + Gemini-2.5-Flash": 4.84,
|
810 |
+
"OpenHands + DeepSeek-V3": 3.58,
|
811 |
+
"OpenHands + o4-mini": 2.46,
|
812 |
+
"OpenHands + R2E-Gym-32B": 1.99,
|
813 |
+
"OpenHands + Qwen3-235B-A22B": 1.86,
|
814 |
+
"OpenHands + OpenHands-LM-32B": 1.66,
|
815 |
+
"OpenHands + SWE-Gym-32B": 0.07
|
816 |
+
},
|
817 |
+
"% Finding Post-Patch Vuln.": {
|
818 |
+
"OpenHands + Claude-Sonnet-4": 1.99,
|
819 |
+
"OpenHands + Claude-3.7-Sonnet": 2.19,
|
820 |
+
"OpenHands + GPT-4.1": 1.26,
|
821 |
+
"Cybench + GPT-4.1": 2.26,
|
822 |
+
"Codex + GPT-4.1": 1.19,
|
823 |
+
"ENiGMA + GPT-4.1": 1.92,
|
824 |
+
"OpenHands + Gemini-2.5-Flash": 0.80,
|
825 |
+
"OpenHands + DeepSeek-V3": 0.66,
|
826 |
+
"OpenHands + o4-mini": 0.07,
|
827 |
+
"OpenHands + R2E-Gym-32B": 0.60,
|
828 |
+
"OpenHands + Qwen3-235B-A22B": 0.33,
|
829 |
+
"OpenHands + OpenHands-LM-32B": 0.33,
|
830 |
+
"OpenHands + SWE-Gym-32B": 0.07
|
831 |
+
}
|
832 |
}
|
833 |
}
|
834 |
}
|