Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
import html
|
4 |
+
import markdown
|
5 |
+
from typing import List, Dict, Any, Tuple
|
6 |
+
|
7 |
+
import gradio as gr
|
8 |
+
import pandas as pd
|
9 |
+
from datasets import load_dataset
|
# df = pd.read_json("selected_battles.json")

# Load arena battles. `split="train"` already yields the train split as a
# `datasets.Dataset`, so it must NOT be indexed with ds['train'] again.
ds = load_dataset("lmarena-ai/arena-human-preference-100k", split="train")
# BUG FIX: was `battles = ds['train'].to_pandas()`, which raises (a split
# Dataset is not subscriptable by split name) and also left `df` — the name
# every helper below reads — undefined. Bind both names for compatibility.
battles = ds.to_pandas()
df = battles

# Expected columns in this dataset family:
# ['question_id','model_a','model_b','winner','conversation_a','conversation_b',
#  'turn','anony','language','tstamp','conv_metadata','is_code','is_refusal',
#  'dedup_tag','category_tag','judge_hash', ...]
# See the HF dataset card: winner ∈ {model_a, model_b, tie, both_bad};
# conversations are full message threads.
# Dropdown options - sorted by frequency
def get_sorted_options(column_name):
    """Return dropdown choices for *column_name*.

    The list starts with the sentinel "(Any)" followed by the column's
    distinct values ordered from most to least frequent. Columns missing
    from the battles table yield just ["(Any)"].
    """
    if column_name not in df.columns:
        return ["(Any)"]
    frequency = df[column_name].dropna().value_counts()
    return ["(Any)", *frequency.index]
# Pre-computed dropdown choices ("(Any)" first, then frequency-sorted values).
models_a, models_b, languages = (
    get_sorted_options(column) for column in ("model_a", "model_b", "language")
)
35 |
+
def _ensure_messages(x: Any) -> List[Dict[str, Any]]:
|
36 |
+
"""
|
37 |
+
conversation_a / conversation_b can be:
|
38 |
+
- a Python list of {role, content} dicts
|
39 |
+
- a JSON string encoding that list
|
40 |
+
Normalize to a list of dicts with 'role' and 'content'.
|
41 |
+
"""
|
42 |
+
if isinstance(x, list):
|
43 |
+
return x
|
44 |
+
if isinstance(x, str):
|
45 |
+
try:
|
46 |
+
val = json.loads(x)
|
47 |
+
if isinstance(val, list):
|
48 |
+
return val
|
49 |
+
except Exception:
|
50 |
+
pass
|
51 |
+
# Last resort: wrap as a single assistant message
|
52 |
+
return [{"role": "assistant", "content": str(x)}]
|
53 |
+
|
54 |
+
def _winner_text(row: pd.Series) -> str:
|
55 |
+
w = str(row.get("winner", "")).strip().lower()
|
56 |
+
mapping = {
|
57 |
+
"model_a": "Preference: Model A",
|
58 |
+
"model_b": "Preference: Model B",
|
59 |
+
"tie": "Preference: Tie",
|
60 |
+
"both_bad": "Preference: Tie (both bad)",
|
61 |
+
}
|
62 |
+
return mapping.get(w, "Preference: (unknown)")
|
63 |
+
|
64 |
+
def _bubble_html(messages: List[Dict[str, Any]], side_label: str) -> str:
|
65 |
+
"""
|
66 |
+
Make a chat-like interface with proper user/assistant bubbles.
|
67 |
+
User messages are on the left, assistant messages on the right.
|
68 |
+
"""
|
69 |
+
# Tailwind-like inline styles (no external CSS)
|
70 |
+
css = """
|
71 |
+
<style>
|
72 |
+
.chat-container {padding:12px; border-radius:16px; background:#fafafa; box-shadow:0 1px 3px rgba(0,0,0,.08);}
|
73 |
+
.model-label {font-weight:600; font-size:14px; margin-bottom:12px; opacity:.8; text-align:center;}
|
74 |
+
.message {margin:12px 0; display:flex; align-items:flex-start;}
|
75 |
+
.message.user {justify-content:flex-start;}
|
76 |
+
.message.assistant {justify-content:flex-end;}
|
77 |
+
.bubble {max-width:70%; padding:10px 14px; border-radius:18px; word-wrap:break-word;}
|
78 |
+
.bubble.user {background:#e9eef7; color:#2c3e50; margin-right:auto;}
|
79 |
+
.bubble.assistant {background:#eaf7ea; color:#2c3e50; margin-left:auto;}
|
80 |
+
.role-label {font-size:11px; font-weight:500; margin-bottom:4px; opacity:.7;}
|
81 |
+
.role-label.assistant {text-align:right;}
|
82 |
+
.bubble pre {background:#f5f5f5; padding:8px; border-radius:4px; overflow-x:auto; margin:8px 0;}
|
83 |
+
.bubble code {background:#f0f0f0; padding:2px 4px; border-radius:3px; font-family:monospace;}
|
84 |
+
.bubble p {margin:8px 0;}
|
85 |
+
.bubble ul, .bubble ol {margin:8px 0; padding-left:20px;}
|
86 |
+
.bubble blockquote {border-left:3px solid #ddd; padding-left:12px; margin:8px 0; color:#666;}
|
87 |
+
</style>
|
88 |
+
"""
|
89 |
+
body = [f'<div class="chat-container">']
|
90 |
+
|
91 |
+
# Only show model label at top for User side
|
92 |
+
if side_label != "Assistant":
|
93 |
+
body.append(f'<div class="model-label">{side_label}</div>')
|
94 |
+
|
95 |
+
first_assistant_message = True
|
96 |
+
for m in messages:
|
97 |
+
role = (m.get("role") or "").lower()
|
98 |
+
content = str(m.get("content", "")).strip()
|
99 |
+
if not content:
|
100 |
+
continue
|
101 |
+
|
102 |
+
# Convert markdown to HTML
|
103 |
+
try:
|
104 |
+
rendered_content = markdown.markdown(content, extensions=['fenced_code', 'codehilite', 'tables'])
|
105 |
+
except:
|
106 |
+
# Fallback to escaped content if markdown rendering fails
|
107 |
+
rendered_content = html.escape(content)
|
108 |
+
|
109 |
+
if role in ("user", "system"):
|
110 |
+
role_display = "User" if role == "user" else "System"
|
111 |
+
body.append(f'''
|
112 |
+
<div class="message user">
|
113 |
+
<div>
|
114 |
+
<div class="role-label">{role_display}</div>
|
115 |
+
<div class="bubble user">{rendered_content}</div>
|
116 |
+
</div>
|
117 |
+
</div>
|
118 |
+
''')
|
119 |
+
else:
|
120 |
+
# For assistant messages, include the model name in the first message
|
121 |
+
if first_assistant_message and side_label == "Assistant":
|
122 |
+
content = f"{side_label}: {content}"
|
123 |
+
try:
|
124 |
+
rendered_content = markdown.markdown(content, extensions=['fenced_code', 'codehilite', 'tables'])
|
125 |
+
except:
|
126 |
+
rendered_content = html.escape(content)
|
127 |
+
first_assistant_message = False
|
128 |
+
|
129 |
+
body.append(f'''
|
130 |
+
<div class="message assistant">
|
131 |
+
<div>
|
132 |
+
<div class="role-label assistant">Assistant</div>
|
133 |
+
<div class="bubble assistant">{rendered_content}</div>
|
134 |
+
</div>
|
135 |
+
</div>
|
136 |
+
''')
|
137 |
+
|
138 |
+
body.append("</div>")
|
139 |
+
return css + "\n".join(body)
|
140 |
+
|
def filter_df(model_a_sel: str, model_b_sel: str, lang_sel: str) -> pd.DataFrame:
    """Subset of the battles table matching the three dropdown selections.

    A selection of "(Any)" disables that filter; the language filter is also
    skipped when the table has no `language` column. The result is
    re-indexed from zero.
    """
    out = df
    for column, choice in (("model_a", model_a_sel), ("model_b", model_b_sel)):
        if choice != "(Any)":
            out = out[out[column] == choice]
    if lang_sel != "(Any)" and "language" in out.columns:
        out = out[out["language"].astype(str) == lang_sel]
    return out.reset_index(drop=True)
def format_row(row: pd.Series) -> Tuple[str, str, str, str, str]:
    """
    Build the display payload for one battle row.

    Returns a 5-tuple matching the Gradio output slots:
    (header_md, left_html, right_html, footer_html, meta_md).
    The header and meta slots are intentionally returned empty.

    Note: the original also extracted the first user message into an unused
    local (`first_user`); that dead code has been removed.
    """
    msgs_a = _ensure_messages(row["conversation_a"])
    msgs_b = _ensure_messages(row["conversation_b"])

    left = _bubble_html(msgs_a, f"Model A: {row['model_a']}")
    right = _bubble_html(msgs_b, f"Model B: {row['model_b']}")

    # Subtle preference footer with a soft yellow background.
    preference_text = _winner_text(row)
    footer_html = f"""
    <div style="
        background: #fff8e1;
        color: #5d4037;
        padding: 10px 16px;
        margin: 12px 0;
        border-radius: 6px;
        font-weight: 600;
        font-size: 14px;
        text-align: center;
        box-shadow: 0 1px 3px rgba(0,0,0,0.08);
        border: 1px solid #ffcc02;
    ">
        {preference_text}
    </div>
    """

    return "", left, right, footer_html, ""
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo")) as demo:
    gr.Markdown("# Chatbot Arena Battle Viewer (100k)")
    gr.Markdown(
        "Filter by **Model A**, **Model B**, and **Language**, then browse side-by-side conversations. "
        "Data: `lmarena-ai/arena-human-preference-100k`."
    )

    with gr.Row():
        dd_a = gr.Dropdown(models_a, label="Model A", value="(Any)")
        dd_b = gr.Dropdown(models_b, label="Model B", value="(Any)")
        dd_l = gr.Dropdown(languages, label="Language", value=languages[0])

    with gr.Row():
        btn_rand = gr.Button("Random match")
        btn_prev = gr.Button("◀ Prev")
        btn_next = gr.Button("Next ▶")

    # Navigation state: the filtered row indices and the current pointer
    # (-1 means "no rows match the filters").
    st_indices = gr.State([])
    st_ptr = gr.State(0)

    header_md = gr.Markdown()
    with gr.Row():
        left_html = gr.HTML()
        right_html = gr.HTML()
    footer_md = gr.HTML()
    meta_md = gr.Markdown()

    def apply_filters(a, b, l):
        """Re-filter and show the first matching battle (or a notice)."""
        sub = filter_df(a, b, l)
        idxs = list(range(len(sub)))
        ptr = 0 if idxs else -1
        if ptr >= 0:
            head, left, right, foot, meta = format_row(sub.iloc[ptr])
        else:
            head = left = right = foot = meta = "_No rows match your filters._"
        return idxs, ptr, head, left, right, foot, meta

    def nav(a, b, l, indices, ptr, direction):
        """Step the pointer forward/backward (wrapping) in the filtered set."""
        sub = filter_df(a, b, l)
        if not len(sub):
            return [], -1, "_No rows match your filters._", "", "", "", ""
        idxs = list(range(len(sub)))
        # Reset stale pointers: the filters may have changed since last nav.
        if ptr is None or ptr < 0 or ptr >= len(sub):
            ptr = 0
        if direction == "next":
            ptr = (ptr + 1) % len(sub)
        elif direction == "prev":
            ptr = (ptr - 1) % len(sub)
        head, left, right, foot, meta = format_row(sub.iloc[ptr])
        return idxs, ptr, head, left, right, foot, meta

    def rand(a, b, l):
        """Jump to a uniformly random battle within the filtered set."""
        sub = filter_df(a, b, l)
        if not len(sub):
            return [], -1, "_No rows match your filters._", "", "", "", ""
        r = random.randrange(len(sub))
        head, left, right, foot, meta = format_row(sub.iloc[r])
        return list(range(len(sub))), r, head, left, right, foot, meta

    # All handlers write the same outputs; define the list once.
    _outputs = [st_indices, st_ptr, header_md, left_html, right_html, footer_md, meta_md]

    # Auto-update when any dropdown changes (one listener instead of three
    # identical .change() wirings).
    gr.on(
        [dd_a.change, dd_b.change, dd_l.change],
        apply_filters,
        [dd_a, dd_b, dd_l],
        _outputs,
    )

    btn_next.click(nav, [dd_a, dd_b, dd_l, st_indices, st_ptr, gr.State("next")], _outputs)
    btn_prev.click(nav, [dd_a, dd_b, dd_l, st_indices, st_ptr, gr.State("prev")], _outputs)
    btn_rand.click(rand, [dd_a, dd_b, dd_l], _outputs)

    # FIX: populate the viewer on page load via the canonical Blocks.load hook
    # (the original passed `demo.load` as a trigger to gr.on, which is not the
    # documented usage).
    demo.load(apply_filters, [dd_a, dd_b, dd_l], _outputs)

if __name__ == "__main__":
    demo.launch()