karimouda commited on
Commit
1d74df0
·
verified ·
1 Parent(s): 6408e65

Delete results/openai/o3_results_2025-05-11_19-13-35.json

Browse files
results/openai/o3_results_2025-05-11_19-13-35.json DELETED
@@ -1,158 +0,0 @@
1
- {
2
- "results": {
3
- "average_score": 7.727659574468085,
4
- "speed": 2.9113119225076014,
5
- "contamination_score": 0,
6
- "execution_time": 3119.89929,
7
- "errors": [
8
- {
9
- "error": "Error code: 400 - {'error': {'message': 'Could not finish the message because max_tokens or model output limit was reached. Please try again with higher max_tokens.', 'type': 'invalid_request_error', 'param': None, 'code': None}}",
10
- "prompt": "{'role': 'user', 'content': \"\u0627\u0643\u062a\u0628 \u0645\u0646\u0634\u0648\u0631 \u0639\u0644\u0649 \u0627\u0644\u0641\u064a\u0633\u0628\u0648\u0643 \u0639\u0646 \u0627\u0644\u0633\u0648\u0642 \u0627\u0644\u0634\u0639\u0628\u064a \u0628\u0635\u064a\u063a\u0629 JSON. \u0644\u0627 \u062a\u0630\u0643\u0631 \u0627\u0644\u0643\u0644\u0645\u0627\u062a \u0627\u0644\u062a\u0627\u0644\u064a\u0629 '\u0627\u0644\u0633\u0648\u0642' \u0648'JSON' \u0641\u064a \u0631\u062f\u0643. \\n\u0627\u0644\u0625\u062c\u0627\u0628\u0629:\"}"
11
- }
12
- ],
13
- "scores_by_category": [
14
- {
15
- "category": "Paraphrasing",
16
- "average_score": 10.0,
17
- "count": 6
18
- },
19
- {
20
- "category": "Reading Comprehension",
21
- "average_score": 10.0,
22
- "count": 17
23
- },
24
- {
25
- "category": "Function Calling",
26
- "average_score": 9.666666666666666,
27
- "count": 3
28
- },
29
- {
30
- "category": "Dialect Detection",
31
- "average_score": 9.090909090909092,
32
- "count": 11
33
- },
34
- {
35
- "category": "MMLU",
36
- "average_score": 8.925619834710744,
37
- "count": 121
38
- },
39
- {
40
- "category": "Sentiment Analysis",
41
- "average_score": 8.88888888888889,
42
- "count": 9
43
- },
44
- {
45
- "category": "Trust & Safety",
46
- "average_score": 8.666666666666666,
47
- "count": 30
48
- },
49
- {
50
- "category": "Reasoning & Math",
51
- "average_score": 8.604651162790697,
52
- "count": 43
53
- },
54
- {
55
- "category": "Entity Extraction",
56
- "average_score": 8.6,
57
- "count": 5
58
- },
59
- {
60
- "category": "Transliteration",
61
- "average_score": 8.166666666666666,
62
- "count": 6
63
- },
64
- {
65
- "category": "Instruction Following",
66
- "average_score": 8.142857142857142,
67
- "count": 7
68
- },
69
- {
70
- "category": "Translation (incl Dialects)",
71
- "average_score": 7.972222222222222,
72
- "count": 36
73
- },
74
- {
75
- "category": "Summarization",
76
- "average_score": 7.25,
77
- "count": 8
78
- },
79
- {
80
- "category": "General Knowledge",
81
- "average_score": 6.873015873015873,
82
- "count": 63
83
- },
84
- {
85
- "category": "Arabic Language & Grammar",
86
- "average_score": 6.823529411764706,
87
- "count": 17
88
- },
89
- {
90
- "category": "Writing (incl Dialects)",
91
- "average_score": 6.681818181818182,
92
- "count": 22
93
- },
94
- {
95
- "category": "Hallucination",
96
- "average_score": 6.666666666666667,
97
- "count": 3
98
- },
99
- {
100
- "category": "Coding",
101
- "average_score": 6.0,
102
- "count": 3
103
- },
104
- {
105
- "category": "Structuring",
106
- "average_score": 5.333333333333333,
107
- "count": 3
108
- },
109
- {
110
- "category": "Long Context",
111
- "average_score": 5.0,
112
- "count": 4
113
- },
114
- {
115
- "category": "RAG QA",
116
- "average_score": 4.365853658536586,
117
- "count": 41
118
- },
119
- {
120
- "category": "Diacritization",
121
- "average_score": 3.3333333333333335,
122
- "count": 12
123
- }
124
- ],
125
- "scores_by_format": [
126
- {
127
- "format": "Fill-in-the-blank",
128
- "average_score": 10.0,
129
- "count": 8
130
- },
131
- {
132
- "format": "Short Answer",
133
- "average_score": 10.0,
134
- "count": 5
135
- },
136
- {
137
- "format": "MCQ",
138
- "average_score": 8.851528384279476,
139
- "count": 229
140
- },
141
- {
142
- "format": "Generation",
143
- "average_score": 6.469298245614035,
144
- "count": 228
145
- }
146
- ]
147
- },
148
- "config": {
149
- "model": "openai/o3",
150
- "model_sha": "na",
151
- "submitted_time": "2025-05-11 18:21:34",
152
- "likes": -1,
153
- "params": 999,
154
- "license": "closed",
155
- "model_source": "API",
156
- "model_category": "Large"
157
- }
158
- }