Suzana committed on
Commit
66bd7e3
·
verified ·
1 Parent(s): efeca40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -47
app.py CHANGED
@@ -2,40 +2,40 @@ import gradio as gr
2
  import pandas as pd
3
  from pathlib import Path
4
 
5
- token_df = pd.DataFrame() # global store
6
 
7
- # ───────────────────────── helpers ──────────────────────────────
 
 
8
  def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
9
- """Explode dataframe into token rows with default 'O' label."""
10
- records = []
11
  if "text" in df.columns:
12
  lines = df["text"].astype(str)
13
- else: # user+assistant dialog
 
14
  lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
15
  for sid, line in enumerate(lines):
16
  for tok in line.split():
17
- records.append({"sentence_id": sid, "token": tok, "label": "O"})
18
- return pd.DataFrame(records)
19
 
20
- # ───────────────────────── callbacks ────────────────────────────
21
  def load_data(file):
22
  global token_df
23
  df = pd.read_csv(file.name)
24
 
25
  if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
26
- return (None, "❌ CSV must have `text` **or** `user`+`assistant` columns.",
 
27
  gr.update(visible=False), gr.update(visible=False))
28
 
29
  token_df = tokenize_df(df)
30
 
31
  return (
32
- # Show table with correct row_count
33
- gr.update(value=token_df.values.tolist(), # list-of-lists
34
- row_count=len(token_df),
35
- visible=True),
36
  f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
37
- gr.update(visible=True), # show action row
38
- gr.update(visible=False) # hide downloads until first export
39
  )
40
 
41
  def save_edits(table_data):
@@ -43,34 +43,31 @@ def save_edits(table_data):
43
  token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
44
  return "💾 Edits saved."
45
 
46
- def make_tokens_csv():
47
  path = "raw_tokens.csv"
48
  token_df.to_csv(path, index=False)
49
  return Path(path)
50
 
51
- def make_iob_csv():
52
- iob_tags, prev = [], {}
53
  for _, r in token_df.iterrows():
54
  sid, lbl = r["sentence_id"], r["label"]
55
  if lbl == "O":
56
- iob_tags.append("O")
57
  prev[sid] = None
58
  else:
59
- prefix = "I-" if prev.get(sid) == lbl else "B-"
60
- iob_tags.append(prefix + lbl)
61
  prev[sid] = lbl
62
- out = token_df.copy()
63
- out["iob"] = iob_tags
64
- path = "ner_iob.csv"
65
- out.to_csv(path, index=False)
66
  return Path(path)
67
 
68
- # ───────────────────────── UI ───────────────────────────────────
69
  with gr.Blocks() as demo:
70
  gr.Markdown("# 🏷️ Label It! Mini-NER")
71
 
72
  gr.Markdown(
73
- "**Step 1** – Upload a CSV containing either a `text` column or `user` + `assistant` columns."
74
  )
75
 
76
  with gr.Row():
@@ -79,38 +76,46 @@ with gr.Blocks() as demo:
79
 
80
  status = gr.Textbox(label="Status", interactive=False)
81
 
82
- # Token table (hidden until data loaded)
83
  tok_table = gr.Dataframe(
84
  headers=["sentence_id", "token", "label"],
85
  datatype=["number", "str", "str"],
86
- row_count=0, col_count=3,
87
- visible=False
 
 
 
 
 
 
 
88
  )
89
 
90
- # Buttons row (hidden until loaded)
91
- with gr.Row(visible=False) as action_row:
92
- save_btn = gr.Button("💾 Save")
93
- dl_tok_btn= gr.Button("⬇︎ Download Tokens CSV")
94
- dl_iob_btn= gr.Button("⬇︎ Download IOB CSV")
95
 
96
- # Hidden download files (appear only after first export)
97
- dl_tokens_file = gr.File(label="Tokens CSV", visible=False)
98
- dl_iob_file = gr.File(label="IOB CSV", visible=False)
99
 
100
- # Bind events
101
- load_btn.click(load_data,
102
- inputs=csv_file,
103
- outputs=[tok_table, status, action_row, dl_tokens_file])
104
 
105
  save_btn.click(save_edits, inputs=tok_table, outputs=status)
106
 
107
- dl_tok_btn.click(lambda: make_tokens_csv(),
108
- outputs=dl_tokens_file)
109
- dl_iob_btn.click(lambda: make_iob_csv(),
110
- outputs=dl_iob_file)
 
111
 
112
  gr.Markdown(
113
- "**Step 2** – Edit the `label` column (`PER`, `ORG`, `LOC`, or `O`) → click **Save** → export."
 
114
  )
115
 
116
  demo.launch()
 
2
  import pandas as pd
3
  from pathlib import Path
4
 
5
+ LABEL_CHOICES = ["O", "PER", "ORG", "LOC", "EV"] # EV = Event
6
 
7
+ token_df = pd.DataFrame()
8
+
9
+ # ────────── helpers ─────────────────────────────────────────────
10
  def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
11
+ rows = []
 
12
  if "text" in df.columns:
13
  lines = df["text"].astype(str)
14
+ else:
15
+ # user + assistant fallback
16
  lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
17
  for sid, line in enumerate(lines):
18
  for tok in line.split():
19
+ rows.append({"sentence_id": sid, "token": tok, "label": "O"})
20
+ return pd.DataFrame(rows)
21
 
22
+ # ────────── callbacks ───────────────────────────────────────────
23
  def load_data(file):
24
  global token_df
25
  df = pd.read_csv(file.name)
26
 
27
  if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
28
+ return (None,
29
+ "❌ CSV must have a `text` column **or** `user`&`assistant` columns.",
30
  gr.update(visible=False), gr.update(visible=False))
31
 
32
  token_df = tokenize_df(df)
33
 
34
  return (
35
+ gr.update(value=token_df, visible=True),
 
 
 
36
  f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
37
+ gr.update(visible=True),
38
+ gr.update(visible=False)
39
  )
40
 
41
  def save_edits(table_data):
 
43
  token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
44
  return "💾 Edits saved."
45
 
46
+ def export_tokens():
47
  path = "raw_tokens.csv"
48
  token_df.to_csv(path, index=False)
49
  return Path(path)
50
 
51
+ def export_iob():
52
+ iob, prev = [], {}
53
  for _, r in token_df.iterrows():
54
  sid, lbl = r["sentence_id"], r["label"]
55
  if lbl == "O":
56
+ iob.append("O")
57
  prev[sid] = None
58
  else:
59
+ iob.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
 
60
  prev[sid] = lbl
61
+ out = token_df.copy(); out["iob"] = iob
62
+ path = "ner_iob.csv"; out.to_csv(path, index=False)
 
 
63
  return Path(path)
64
 
65
+ # ────────── UI ──────────────────────────────────────────────────
66
  with gr.Blocks() as demo:
67
  gr.Markdown("# 🏷️ Label It! Mini-NER")
68
 
69
  gr.Markdown(
70
+ "**Step 1** – Upload a CSV containing either a `text` column, or `user` & `assistant` columns."
71
  )
72
 
73
  with gr.Row():
 
76
 
77
  status = gr.Textbox(label="Status", interactive=False)
78
 
79
+ # Editable token table (hidden until load)
80
  tok_table = gr.Dataframe(
81
  headers=["sentence_id", "token", "label"],
82
  datatype=["number", "str", "str"],
83
+ column_config={
84
+ "label": gr.ColumnConfig(
85
+ label="label",
86
+ dtype="categorical",
87
+ choices=LABEL_CHOICES,
88
+ )
89
+ },
90
+ row_count=0,
91
+ visible=False,
92
  )
93
 
94
+ # Action buttons row (hidden until load)
95
+ with gr.Row(visible=False) as buttons_row:
96
+ save_btn = gr.Button("💾 Save")
97
+ download_tok = gr.Button("⬇︎ Tokens CSV")
98
+ download_iob = gr.Button("⬇︎ IOB CSV")
99
 
100
+ # File components that appear after export
101
+ file_tok = gr.File(label="Click to download", visible=False)
102
+ file_iob = gr.File(label="Click to download", visible=False)
103
 
104
+ # Bindings
105
+ load_btn.click(load_data, inputs=csv_file,
106
+ outputs=[tok_table, status, buttons_row, file_tok])
 
107
 
108
  save_btn.click(save_edits, inputs=tok_table, outputs=status)
109
 
110
+ download_tok.click(lambda: export_tokens(),
111
+ outputs=file_tok) # file appears for click
112
+
113
+ download_iob.click(lambda: export_iob(),
114
+ outputs=file_iob)
115
 
116
  gr.Markdown(
117
+ "**Step 2** – In the `label` dropdown choose `PER`, `ORG`, `LOC`, `EV`, or leave `O`."
118
+ "\nAfter saving, use the download buttons."
119
  )
120
 
121
  demo.launch()