Rename prompts for LS, SU, NER, and REL
- app.py +102 -25
- src/tasks.py +8 -8
app.py
CHANGED
@@ -47,6 +47,9 @@ def mean_of_max_per_field(df):
 
 
 def boxplot_per_task(dataframe=None, baselines=None):
+
+    print(dataframe.columns)
+
     tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
 
     if dataframe is None:
@@ -56,7 +59,6 @@ def boxplot_per_task(dataframe=None, baselines=None):
         for task in tasks
     })
 
-    # baseline for each task (if none is passed, use random values between 50 and 70)
    if baselines is None:
        baselines = {task: np.random.randint(50, 70) for task in tasks}
 
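Note that `np.random.randint(50, 70)` samples integers from 50 through 69 (the upper bound is exclusive), so these fallback baselines are placeholders rather than real supervised scores. A quick standalone check:

```python
import numpy as np

tasks = ["TE", "SA", "HS"]
# Upper bound is exclusive: values fall in 50..69.
baselines = {task: np.random.randint(50, 70) for task in tasks}
print(baselines)  # e.g. {'TE': 63, 'SA': 51, 'HS': 69}
```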
@@ -73,27 +75,26 @@ def boxplot_per_task(dataframe=None, baselines=None):
         fig.add_trace(go.Box(
             y=y_data,
             name=task,
-
-
-            line=dict(color=
+            marker=dict(color=colors[i]),
+            # Change: give the box outline a different color from the fill
+            line=dict(color="black", width=2),
             fillcolor=colors[i],
             opacity=0.7,
-            hovertemplate=
-            width=0.6
+            hovertemplate="<b>"+task+"</b><br>Accuracy: %{y:.2f}%<extra></extra>",
+            width=0.6,
+            whiskerwidth=0.2,
+            quartilemethod="linear"
         ))
 
-        # baseline
+        # baseline
         if task in baselines and baselines[task] is not None:
-            # baseline as a horizontal line
             fig.add_shape(
                 type="line",
-                x0=i-0.3, x1=i+0.3,
+                x0=i-0.3, x1=i+0.3,
                 y0=baselines[task], y1=baselines[task],
                 line=dict(color="black", width=2, dash="dash"),
                 xref="x", yref="y"
             )
-
-            # label with the baseline value
             fig.add_annotation(
                 x=i, y=baselines[task],
                 text=f"{baselines[task]}%",
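For readers skimming the diff, here is the new box styling in isolation, a minimal self-contained sketch with fabricated data (the color and scores are placeholders, not values from the leaderboard):

```python
import numpy as np
import plotly.graph_objects as go

task = "SA"
y_data = np.random.uniform(50, 85, size=20)  # fabricated accuracy scores

fig = go.Figure()
fig.add_trace(go.Box(
    y=y_data,
    name=task,
    marker=dict(color="#1f77b4"),       # outlier markers keep the fill color
    line=dict(color="black", width=2),  # black outline for contrast with the fill
    fillcolor="#1f77b4",
    opacity=0.7,
    hovertemplate="<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
    width=0.6,
    whiskerwidth=0.2,
    quartilemethod="linear",            # quartiles via linear interpolation
))
fig.show()
```

The `<extra></extra>` suffix suppresses the secondary hover box Plotly would otherwise append.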
@@ -103,19 +104,19 @@ def boxplot_per_task(dataframe=None, baselines=None):
             )
 
     fig.update_layout(
-        title="Distribution of Model Accuracy by Task
+        title="Distribution of Model Accuracy by Task",
         xaxis_title="Task",
         yaxis_title="Accuracy (%)",
         template="plotly_white",
         boxmode="group",
         dragmode=False,
         font=dict(family="Arial", size=13),
-        margin=dict(b=
-        annotations
+        margin=dict(b=140),
+        annotations=[
             dict(
                 text=(
-                    "Boxplots show LLM accuracy in zero/few-shot settings. <br>"
-                    "
+                    "Boxplots show LLM accuracy in zero/few-shot settings. Black dashed lines<br>"
+                    "indicate best-performing supervised models evaluated on EVALITA."
                 ),
                 xref="paper", yref="paper",
                 x=0.5, y=-0.33,
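The caption added here is anchored in paper coordinates, so it sits below the axes regardless of the data range. A minimal sketch (the figure content is arbitrary):

```python
import plotly.graph_objects as go

fig = go.Figure(go.Scatter(y=[1, 3, 2]))
fig.update_layout(
    margin=dict(b=140),  # extra bottom margin so the caption is not clipped
    annotations=[dict(
        text="Caption line one.<br>Caption line two.",
        xref="paper", yref="paper",  # (0,0)=bottom-left, (1,1)=top-right of plot area
        x=0.5, y=-0.33,              # horizontally centered, below the x-axis
        showarrow=False,
    )],
)
fig.show()
```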
@@ -124,7 +125,6 @@ def boxplot_per_task(dataframe=None, baselines=None):
             )
         ]
     )
-    #fig.update_yaxes(fixedrange=True)
     fig.update_yaxes(range=[0, 100], fixedrange=True)
 
     return fig
@@ -137,6 +137,74 @@ BASELINES = {
 }
 
 
+def boxplot_prompts_per_task(dataframe, tasks=None):
+    if tasks is None:
+        tasks = ["TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
+
+    fig = go.Figure()
+
+    # Lists so that Average and Best each get a single legend entry
+    avg_x, avg_y = [], []
+    best_x, best_y, best_text = [], [], []
+
+    for task in tasks:
+        avg_col = f"{task} Prompt Average"
+        best_col = f"{task} Best Prompt"
+        best_id_col = f"{task} Best Prompt Id"
+
+        if all(col in dataframe.columns for col in [avg_col, best_col, best_id_col]):
+            avg_value = dataframe[avg_col].mean()
+            avg_x.append(task)
+            avg_y.append(avg_value)
+
+            best_value = dataframe[best_col].mean()
+            best_x.append(task)
+            best_y.append(best_value)
+            best_id = dataframe[best_id_col].mode()[0]  # Most frequent best prompt id
+            best_text.append(f"P:{best_id}")
+
+    # Average Accuracy bars (blue)
+    fig.add_trace(go.Bar(
+        x=avg_x,
+        y=avg_y,
+        name="Average Accuracy",
+        marker_color="#1f77b4",
+        #hovertemplate="%{y:.2f}%<extra></extra>"
+        #hovertemplate="<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
+    ))
+
+    # Best Prompt bars (red)
+    fig.add_trace(go.Bar(
+        x=best_x,
+        y=best_y,
+        name="Best Prompt",
+        marker_color="#d62728",
+        #hovertemplate="%{y:.2f}%<extra></extra>"
+        #hovertemplate = "<b>" + task + "</b><br>Accuracy: %{y:.2f}%<extra></extra>",
+    ))
+
+    # Text above the Best Prompt bars showing the prompt ID
+    for x, y, text in zip(best_x, best_y, best_text):
+        fig.add_annotation(
+            x=x,
+            y=y + 1,  # slightly above the bar
+            text=text,
+            showarrow=False,
+            font=dict(size=12, color="black")
+        )
+
+    fig.update_layout(
+        title="Comparison of Average Prompt Accuracy vs Best Prompt Accuracy per Task",
+        xaxis_title="Task",
+        yaxis_title="Accuracy (%)",
+        barmode='group',
+        template="plotly_white",
+        font=dict(family="Arial", size=13),
+        yaxis=dict(range=[0, 100], fixedrange=True)
+    )
+
+    return fig
+
 
 
 def line_chart(dataframe):
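Despite the boxplot-style name, the new function draws grouped bars: one blue bar for the mean of the per-model prompt averages, one red bar for the mean best-prompt score, annotated with the most frequent best-prompt id. A quick way to exercise it in isolation; the frame below fabricates the three per-task columns the function expects:

```python
import pandas as pd

# Fabricated leaderboard slice: two models, two tasks.
df = pd.DataFrame({
    "LS Prompt Average": [61.0, 55.5],
    "LS Best Prompt": [68.0, 60.0],
    "LS Best Prompt Id": [7, 7],
    "SU Prompt Average": [40.0, 44.0],
    "SU Best Prompt": [47.5, 50.0],
    "SU Best Prompt Id": [8, 7],
})

fig = boxplot_prompts_per_task(df, tasks=["LS", "SU"])
fig.show()
```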
@@ -255,11 +323,11 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
 
     for _, row in sorted_dataframe.iterrows():
         if row['IS_FS']: # 5-Few-Shot
-            if row["#Params (B)"] >
-                new_model_column.append(f"{row['Model']}
+            if row["#Params (B)"] > 50 and not large_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 1️⃣0️⃣0️⃣🅱️🏆")
                 large_medal_fs_assigned = True
-            elif 10 < row["#Params (B)"] <=
-                new_model_column.append(f"{row['Model']}
+            elif 10 < row["#Params (B)"] <= 50 and not medium_medal_fs_assigned:
+                new_model_column.append(f"{row['Model']} 5️⃣0️⃣🅱️🏆")
                 medium_medal_fs_assigned = True
             elif row["#Params (B)"] <= 10 and not small_medal_fs_assigned:
                 new_model_column.append(f"{row['Model']} 1️⃣0️⃣🅱️🏆")
@@ -267,11 +335,11 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
             else:
                 new_model_column.append(row["Model"])
         else: # 0-Shot
-            if row["#Params (B)"] >
-                new_model_column.append(f"{row['Model']}
+            if row["#Params (B)"] > 50 and not large_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 1️⃣0️⃣0️⃣🅱️🎖️")
                 large_medal_0shot_assigned = True
-            elif 10 < row["#Params (B)"] <=
-                new_model_column.append(f"{row['Model']}
+            elif 10 < row["#Params (B)"] <= 50 and not medium_medal_0shot_assigned:
+                new_model_column.append(f"{row['Model']} 5️⃣0️⃣🅱️🎖️")
                 medium_medal_0shot_assigned = True
             elif row["#Params (B)"] <= 10 and not small_medal_0shot_assigned:
                 new_model_column.append(f"{row['Model']} 1️⃣0️⃣🅱️🎖️")
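Across both settings the medal logic awards at most one badge per parameter-size bucket (>50B shown as 1️⃣0️⃣0️⃣🅱️, 10-50B as 5️⃣0️⃣🅱️, ≤10B as 1️⃣0️⃣🅱️; 🏆 for few-shot, 🎖️ for zero-shot), going to the first, i.e. highest-ranked, model in each bucket since rows arrive pre-sorted. A distilled sketch (the function name and return values are mine, not from the commit):

```python
def medal_bucket(params_b, assigned):
    """Return the size bucket to decorate; each bucket is awarded at most once."""
    if params_b > 50 and "large" not in assigned:
        assigned.add("large")
        return "large"   # rendered as 1️⃣0️⃣0️⃣🅱️
    if 10 < params_b <= 50 and "medium" not in assigned:
        assigned.add("medium")
        return "medium"  # rendered as 5️⃣0️⃣🅱️
    if params_b <= 10 and "small" not in assigned:
        assigned.add("small")
        return "small"   # rendered as 1️⃣0️⃣🅱️
    return None

assigned = set()
for params in [70, 40, 7, 65, 8]:  # rows pre-sorted by score, best first
    print(params, medal_bucket(params, assigned))
# 70 large, 40 medium, 7 small, 65 None, 8 None
```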
@@ -279,6 +347,14 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
             else:
                 new_model_column.append(row["Model"])
 
+
+    # List of the columns to update
+    cols_to_update = ["REL Best Prompt Id", "NER Best Prompt Id", "SU Best Prompt Id", "LS Best Prompt Id"]
+    # Apply the transformation
+    for col in cols_to_update:
+        dataframe[col] = dataframe[col].replace({1: 7, 2: 8})
+
+
     # Update the Model column
     sorted_dataframe["Model"] = new_model_column
 
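The renumbering mirrors the prompt tables updated in src/tasks.py below: old prompt ids 1 and 2 become 7 and 8. A minimal pandas illustration of the same `replace` call:

```python
import pandas as pd

df = pd.DataFrame({"LS Best Prompt Id": [1, 2, 2, 1]})
df["LS Best Prompt Id"] = df["LS Best Prompt Id"].replace({1: 7, 2: 8})
print(df["LS Best Prompt Id"].tolist())  # [7, 8, 8, 7]
```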
@@ -503,6 +579,7 @@ with demo:
             #gr.Plot(value=line_chart_interactive_test(), label="Andamento interattivo")
             gr.Plot(value=line_chart(LEADERBOARD_DF))
             gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES))
+            gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF))
 
         # About tab
         with gr.TabItem("📝 About"):
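For context, the plots live in a Gradio Blocks app with tabs; a stripped-down sketch of the structure (tab labels and the figure are placeholders):

```python
import gradio as gr
import plotly.graph_objects as go

fig = go.Figure(go.Bar(x=["TE", "SA"], y=[60, 70]))

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("📈 Charts"):
            gr.Plot(value=fig)
        with gr.TabItem("📝 About"):
            gr.Markdown("About this leaderboard.")

demo.launch()
```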
src/tasks.py
CHANGED
@@ -125,8 +125,8 @@ LS_DESCRIPTION = """### Lexical Substitution (LS) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
+| 8 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
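The Combined Performance formula rewards a high best-prompt score and penalizes spread between the prompts. A worked example (scores invented):

```python
def combined_performance(best_prompt, prompt_average):
    """(1 - (Best Prompt - Prompt Average) / 100) * Best Prompt"""
    return (1 - (best_prompt - prompt_average) / 100) * best_prompt

# Same best score, different robustness across the 2 prompts:
print(combined_performance(70.0, 68.0))  # 0.98 * 70 = 68.6
print(combined_performance(70.0, 50.0))  # 0.80 * 70 = 56.0
```

The same formula applies to the SU, NER, and REL tables below.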
@@ -137,8 +137,8 @@ SU_DESCRIPTION = """### Summarization (SUM) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
+| 8 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
@@ -149,8 +149,8 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
+| 8 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 
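The NER prompts pin down a machine-parseable output contract: `Entità$Tipo` pairs joined by ',', with `&&NOENT&&` for the empty case (the REL prompts below do the same with `misurazione$esame` pairs joined by '%'). A hypothetical parser, mine rather than from this repo, just to make the contract concrete:

```python
def parse_ner_output(raw):
    """Parse 'Entità$Tipo' pairs separated by ','; '&&NOENT&&' means no entities."""
    raw = raw.strip()
    if raw == "&&NOENT&&":
        return []
    pairs = []
    for chunk in raw.split(","):
        entity, _, etype = chunk.strip().rpartition("$")
        if entity and etype in {"PER", "LOC", "ORG"}:
            pairs.append((entity, etype))
    return pairs

print(parse_ner_output("Mario Rossi$PER, Roma$LOC"))  # [('Mario Rossi', 'PER'), ('Roma', 'LOC')]
print(parse_ner_output("&&NOENT&&"))                  # []
```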
@@ -161,8 +161,8 @@ REL_DESCRIPTION = """### Relation Extraction (REL) --- *Generative task*
 
 | # | Prompt |
 |-----|--------------------------------------------------------------------------------|
-
-
+| 7 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
+| 8 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
 
 <small>**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = F1 averaged over the 2 prompts. **Best Prompt** = F1 of the best prompt. **Prompt ID** = ID of the best prompt (see legend above). </small>
 