Suzana committed on
Commit
11b95d7
·
verified ·
1 Parent(s): 47e0f7e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -32
app.py CHANGED
@@ -3,48 +3,53 @@ import pandas as pd
3
  from pathlib import Path
4
  from huggingface_hub import HfApi, Repository
5
 
 
6
  LABELS = {"PER", "ORG", "LOC", "EV", "O"}
7
- token_df = pd.DataFrame()
8
 
9
- # ───────────────── helpers ─────────────────
10
- def explode(df):
11
- """Return sentence-level token rows with default O label."""
 
 
12
  if "text" in df.columns:
13
  lines = df["text"].astype(str)
14
- else:
15
  lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
16
 
17
  rows = []
18
- for sid, line in enumerate(lines):
19
  for tok in line.split():
20
  rows.append({"sentence_id": sid, "token": tok, "label": "O"})
21
  return pd.DataFrame(rows)
22
 
23
- # ───────────────── callbacks ───────────────
24
  def load_csv(file):
25
  global token_df
 
26
  df = pd.read_csv(file.name)
27
- if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
28
- return None, "❌ Need `text` or `user`+`assistant` cols.", \
29
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
 
30
 
31
  token_df = explode(df)
 
32
  return (gr.update(value=token_df, visible=True, row_count=len(token_df)),
33
  f"βœ… {len(df)} rows β†’ {len(token_df)} tokens.",
34
- gr.update(visible=True), # action row
35
- gr.update(visible=False), # reset downloads
36
  gr.update(visible=False))
37
 
38
  def save_table(tbl):
39
  global token_df
40
  token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
41
- bad = token_df.loc[~token_df.label.isin(LABELS), "label"].unique()
42
- return "πŸ’Ύ Saved." if len(bad) == 0 else f"⚠️ Unknown: {', '.join(bad)}"
43
 
44
  def export_tokens():
45
  fname = "raw_tokens.csv"
46
  token_df.to_csv(fname, index=False)
47
- return gr.update(value=fname, visible=True) # <<< string path + visible
48
 
49
  def export_iob():
50
  iob, prev = [], {}
@@ -53,11 +58,11 @@ def export_iob():
53
  if lbl == "O":
54
  iob.append("O"); prev[sid] = None
55
  else:
56
- iob.append(("I-" if prev.get(sid)==lbl else "B-")+lbl)
57
  prev[sid] = lbl
58
  out = token_df.copy(); out["iob"] = iob
59
  fname = "ner_iob.csv"; out.to_csv(fname, index=False)
60
- return gr.update(value=fname, visible=True) # <<< same pattern
61
 
62
  def push_to_hub(repo_id, token):
63
  try:
@@ -68,44 +73,50 @@ def push_to_hub(repo_id, token):
68
  local.rmdir()
69
  repo = Repository(str(local), clone_from=repo_id,
70
  repo_type="dataset", use_auth_token=token)
71
- token_df.to_csv(local/"data.csv", index=False)
72
  repo.push_to_hub("Add annotated NER data")
73
  return f"πŸš€ https://huggingface.co/datasets/{repo_id}"
74
  except Exception as e:
75
  return f"❌ {e}"
76
 
77
- # ───────────────── UI ──────────────────────
78
  with gr.Blocks() as demo:
79
  gr.Markdown("# 🏷️ Label It! Mini-NER")
80
 
 
 
81
  with gr.Row():
82
- f_in, load_btn = gr.File(file_types=[".csv"]), gr.Button("Load")
 
83
 
84
  status = gr.Textbox(interactive=False)
85
- table = gr.Dataframe(headers=["sentence_id","token","label"],
86
- datatype=["number","str","str"], visible=False)
87
 
88
- with gr.Row(visible=False) as actions:
89
- save_btn = gr.Button("πŸ’Ύ Save")
90
- tok_btn = gr.Button("β¬‡οΈŽ Tokens CSV")
91
- iob_btn = gr.Button("β¬‡οΈŽ IOB CSV")
 
 
 
 
92
 
93
  file_tok = gr.File(visible=False)
94
  file_iob = gr.File(visible=False)
95
 
96
- with gr.Accordion("πŸ“¦ Push to Hub", open=False, visible=False) as push_acc:
97
  repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
98
- push_btn, push_out = gr.Button("Push"), gr.Textbox(interactive=False)
 
99
 
100
  # wiring
101
- load_btn.click(load_csv, f_in, [table, status, actions, file_tok, file_iob])
102
- load_btn.click(lambda: gr.update(visible=True), None, push_acc)
103
 
104
- save_btn.click(save_table, table, status)
105
  tok_btn.click(export_tokens, outputs=file_tok)
106
  iob_btn.click(export_iob, outputs=file_iob)
107
  push_btn.click(push_to_hub, [repo_in, token_in], push_out)
108
 
109
- gr.Markdown("Edit **label** (`PER`, `ORG`, `LOC`, `EV`, `O`) β†’ Save β†’ Download / Push.")
110
 
111
  demo.launch()
 
3
  from pathlib import Path
4
  from huggingface_hub import HfApi, Repository
5
 
6
# Allowed tags
LABELS = {"PER", "ORG", "LOC", "EV", "O"}

# Global store for the token table currently being annotated.
# It starts empty but already carries the three-column schema, so code that
# reads it before any CSV is loaded sees the expected columns instead of a
# column-less frame.
token_df = pd.DataFrame(columns=["sentence_id", "token", "label"])
10
+
11
+ # ───────────────────────── token explode ───────────────────────
12
def explode(df: pd.DataFrame) -> pd.DataFrame:
    """Split every input row into whitespace tokens, one output row per token.

    Args:
        df: Frame with either a ``text`` column, or both ``user`` and
            ``assistant`` columns. When ``text`` is present it wins.

    Returns:
        DataFrame with columns ``sentence_id`` (0-based position of the source
        row), ``token`` and ``label`` (always ``"O"``). The three columns are
        present even when the input yields no tokens.
    """
    if "text" in df.columns:
        lines = df["text"].astype(str)
    else:  # user / assistant dialogs
        # Vectorised concatenation rather than ``df.apply(..., axis=1)``:
        # on an empty frame ``apply`` hands back a DataFrame, and iterating
        # that would tokenise the column names as if they were sentences.
        lines = (
            "User: " + df["user"].astype(str)
            + " Assistant: " + df["assistant"].astype(str)
        )

    rows = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Explicit columns keep the schema stable when ``rows`` is empty.
    return pd.DataFrame(rows, columns=["sentence_id", "token", "label"])
24
 
25
+ # ───────────────────────── callbacks ───────────────────────────
26
def load_csv(file):
    """Read the uploaded CSV, tokenise it and populate the annotation table.

    Args:
        file: Value delivered by the ``gr.File`` input. ``None`` when nothing
            has been uploaded; otherwise an object exposing the path as
            ``.name`` (a plain path string is accepted as well).

    Returns:
        A 5-tuple matching the click handler's outputs: token table update,
        status message, button-row update, and the two download-file updates.
        On any failure the table value is ``None`` and the three trailing
        components are hidden.
    """
    global token_df

    hidden = gr.update(visible=False)

    def _fail(msg):
        # Same shape as the success tuple so Gradio can map it to the outputs.
        return None, msg, hidden, hidden, hidden

    # "Load" can be clicked before any file is chosen.
    if file is None:
        return _fail("❌ Please upload a CSV file first.")

    path = getattr(file, "name", file)
    try:
        df = pd.read_csv(path)
    except (OSError, ValueError) as err:
        # pandas' EmptyDataError / ParserError and UnicodeDecodeError are all
        # ValueError subclasses; OSError covers a missing or unreadable file.
        return _fail(f"❌ Could not read CSV: {err}")

    valid = ("text" in df.columns) or {"user", "assistant"}.issubset(df.columns)
    if not valid:
        return _fail(
            "❌ CSV needs a `text` column **or** both `user` and `assistant` columns."
        )

    token_df = explode(df)

    return (
        gr.update(value=token_df, visible=True, row_count=len(token_df)),
        f"✅ {len(df)} rows → {len(token_df)} tokens.",
        gr.update(visible=True),   # show buttons
        gr.update(visible=False),  # reset download links
        gr.update(visible=False),
    )
42
 
43
def save_table(tbl):
    """Store the edited table in the global ``token_df`` and validate its labels.

    Args:
        tbl: Current contents of the annotation table as delivered by the
            Dataframe component.

    Returns:
        ``"💾 Saved."`` when every label is in ``LABELS``; otherwise a warning
        listing each distinct unknown label. The table is stored either way.
    """
    global token_df
    token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])

    unknown = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
    if len(unknown) == 0:
        return "💾 Saved."
    # A cleared or numeric cell arrives as NaN / a number; str() keeps the
    # join from raising TypeError on non-string labels.
    return f"⚠️ Unknown label(s): {', '.join(map(str, unknown))}"
48
 
49
def export_tokens():
    """Dump the global token table to ``raw_tokens.csv`` and reveal its download link."""
    csv_path = "raw_tokens.csv"
    token_df.to_csv(csv_path, index=False)
    # Hand the path to the hidden File component and make it visible.
    return gr.update(visible=True, value=csv_path)
53
 
54
  def export_iob():
55
  iob, prev = [], {}
 
58
  if lbl == "O":
59
  iob.append("O"); prev[sid] = None
60
  else:
61
+ iob.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
62
  prev[sid] = lbl
63
  out = token_df.copy(); out["iob"] = iob
64
  fname = "ner_iob.csv"; out.to_csv(fname, index=False)
65
+ return gr.update(value=fname, visible=True)
66
 
67
  def push_to_hub(repo_id, token):
68
  try:
 
73
  local.rmdir()
74
  repo = Repository(str(local), clone_from=repo_id,
75
  repo_type="dataset", use_auth_token=token)
76
+ token_df.to_csv(local / "data.csv", index=False)
77
  repo.push_to_hub("Add annotated NER data")
78
  return f"πŸš€ https://huggingface.co/datasets/{repo_id}"
79
  except Exception as e:
80
  return f"❌ {e}"
81
 
82
+ # ───────────────────────── UI ──────────────────────────────────
83
with gr.Blocks() as demo:
    gr.Markdown("# 🏷️ Label It! Mini-NER")

    gr.Markdown("**Step 1** – upload CSV (`text` **or** `user`+`assistant`).")

    with gr.Row():
        csv_file = gr.File(file_types=[".csv"])
        load_btn = gr.Button("Load")

    status = gr.Textbox(interactive=False)

    # Editable token table; hidden until a CSV has been loaded.
    tok_table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        datatype=["number", "str", "str"],
        visible=False,
    )

    with gr.Row(visible=False) as buttons:
        save_btn = gr.Button("💾 Save")
        tok_btn = gr.Button("⬇︎ Tokens CSV")
        iob_btn = gr.Button("⬇︎ IOB CSV")

    # Download targets, revealed by the export callbacks.
    file_tok = gr.File(visible=False)
    file_iob = gr.File(visible=False)

    with gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False) as acc:
        repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
        push_btn = gr.Button("Push")
        push_out = gr.Textbox(interactive=False)

    # wiring
    load_event = load_btn.click(
        load_csv, csv_file, [tok_table, status, buttons, file_tok, file_iob]
    )
    # Reveal the push section only after load_csv has run and only when there
    # are tokens to push, rather than unconditionally on every click.
    load_event.then(lambda: gr.update(visible=not token_df.empty), None, acc)

    save_btn.click(save_table, tok_table, status)
    tok_btn.click(export_tokens, outputs=file_tok)
    iob_btn.click(export_iob, outputs=file_iob)
    push_btn.click(push_to_hub, [repo_in, token_in], push_out)

    gr.Markdown("**Step 2** – label tokens (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push.")

demo.launch()