Suzana committed on
Commit
4a75b2d
·
verified ·
1 Parent(s): 4455f2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -54
app.py CHANGED
@@ -3,100 +3,115 @@ import pandas as pd
3
  from pathlib import Path
4
  from huggingface_hub import HfApi, Repository
5
 
6
- # Allowed tags
7
- LABELS = {"PER", "ORG", "LOC", "EV", "O"}
8
 
9
- # Global token DataFrame
10
  token_df = pd.DataFrame()
11
 
12
- # ───────────────────────── helpers ──────────────────────────────
13
  def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
14
- """Return DataFrame(sentence_id, token, label[=O])"""
15
- rows = []
16
- if "text" in df.columns:
17
- lines = df["text"].astype(str)
18
- else: # dialog pair
19
- lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
20
 
 
21
  for sid, line in enumerate(lines):
22
  for tok in line.split():
23
  rows.append({"sentence_id": sid, "token": tok, "label": "O"})
24
  return pd.DataFrame(rows)
25
 
26
- # ───────────────────────── callbacks ────────────────────────────
 
27
  def load_csv(file):
28
  global token_df
29
  df = pd.read_csv(file.name)
 
30
  valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
31
  if not valid:
32
- return None, "❌ CSV must contain `text` OR `user`+`assistant` columns.", \
33
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
34
 
35
  token_df = explode_to_tokens(df)
36
 
37
  return (
38
- gr.update(value=token_df, visible=True, # show table
39
- row_count=len(token_df)),
40
- f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
41
- gr.update(visible=True), # show action row
42
- gr.update(visible=False), # hide token file
43
- gr.update(visible=False) # hide iob file
44
  )
45
 
46
- def save_table(table_data):
 
47
  global token_df
48
- token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
49
- if not set(token_df["label"]).issubset(LABELS):
50
- return "⚠️ Unknown label detected. Allowed: PER / ORG / LOC / EV / O"
 
51
  return "💾 Saved."
52
 
 
53
  def export_tokens():
54
  path = "raw_tokens.csv"
55
  token_df.to_csv(path, index=False)
56
  return gr.update(value=Path(path), visible=True)
57
 
 
58
  def export_iob():
59
- iob, prev = [], {}
60
  for _, r in token_df.iterrows():
61
  sid, lbl = r["sentence_id"], r["label"]
62
  if lbl == "O":
63
- iob.append("O"); prev[sid] = None
 
64
  else:
65
- tag = ("I-" if prev.get(sid) == lbl else "B-") + lbl
66
- iob.append(tag); prev[sid] = lbl
67
- out = token_df.copy(); out["iob"] = iob
68
- path = "ner_iob.csv"; out.to_csv(path, index=False)
 
 
 
69
  return gr.update(value=Path(path), visible=True)
70
 
 
71
  def push_to_hub(repo_id, token):
72
- global token_df
73
  try:
74
  api = HfApi()
75
  api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
76
 
77
- local_dir = Path(f"./{repo_id.replace('/','_')}")
78
- if local_dir.exists():
79
- for f in local_dir.iterdir(): f.unlink()
80
- local_dir.rmdir()
 
81
 
82
- repo = Repository(local_dir=str(local_dir),
83
  clone_from=repo_id,
84
  repo_type="dataset",
85
  use_auth_token=token)
86
- token_df.to_csv(local_dir / "data.csv", index=False)
87
- repo.push_to_hub(commit_message="Add annotated NER data")
88
- return f"🚀 Pushed to https://huggingface.co/datasets/{repo_id}"
 
89
  except Exception as e:
90
  return f"❌ Push failed: {e}"
91
 
92
- # ───────────────────────── UI ───────────────────────────────────
 
93
  with gr.Blocks() as demo:
94
  gr.Markdown("# 🏷️ Label It! Mini-NER")
95
 
96
- gr.Markdown("**Step 1** – upload a CSV with a `text` column **or** a `user`+`assistant` pair.")
 
 
97
 
98
  with gr.Row():
99
- csv_file = gr.File(file_types=[".csv"], label="📁 Upload CSV")
100
  load_btn = gr.Button("Load")
101
 
102
  status = gr.Textbox(label="Status", interactive=False)
@@ -105,39 +120,40 @@ with gr.Blocks() as demo:
105
  headers=["sentence_id", "token", "label"],
106
  datatype=["number", "str", "str"],
107
  row_count=0, col_count=3,
108
- visible=False
109
  )
110
 
111
- with gr.Row(visible=False) as action_row:
112
- save_btn = gr.Button("💾 Save")
113
  dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
114
  dl_iob_btn = gr.Button("⬇︎ IOB CSV")
115
 
116
- file_tok = gr.File(visible=False)
117
- file_iob = gr.File(visible=False)
118
 
119
  # Push accordion
120
- with gr.Accordion("📦 Push to Hugging Face Hub", open=False) as push_acc:
 
121
  repo_in = gr.Textbox(label="dataset repo (username/name)")
122
  token_in = gr.Textbox(label="HF Token", type="password")
123
  push_btn = gr.Button("Push")
124
  push_out = gr.Textbox(label="Push Status", interactive=False)
125
 
126
- # Hide accordion until data load
127
- push_acc.visible = False
128
-
129
  # ── wiring
130
- load_btn.click(load_csv, inputs=csv_file,
131
- outputs=[tok_table, status, action_row, file_tok, file_iob])
132
- load_btn.click(lambda: gr.update(visible=True), None, push_acc) # show accordion after load
 
133
 
134
  save_btn.click(save_table, inputs=tok_table, outputs=status)
 
135
  dl_tok_btn.click(export_tokens, outputs=file_tok)
136
  dl_iob_btn.click(export_iob, outputs=file_iob)
 
137
  push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
138
 
139
  gr.Markdown(
140
- "**Step 2** – edit the `label` column (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push."
141
  )
142
 
143
  demo.launch()
 
3
  from pathlib import Path
4
  from huggingface_hub import HfApi, Repository
5
 
6
+ # Allowed labels
7
+ LABEL_SET = {"PER", "ORG", "LOC", "EV", "O"}
8
 
9
+ # In-memory token store
10
  token_df = pd.DataFrame()
11
 
12
+ # ─────────────────────── helpers ──────────────────────────
13
  def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
14
+ """Return DataFrame(sentence_id, token, label='O')."""
15
+ lines = (
16
+ df["text"].astype(str)
17
+ if "text" in df.columns
18
+ else df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
19
+ )
20
 
21
+ rows = []
22
  for sid, line in enumerate(lines):
23
  for tok in line.split():
24
  rows.append({"sentence_id": sid, "token": tok, "label": "O"})
25
  return pd.DataFrame(rows)
26
 
27
+
28
+ # ────────────────────── callbacks ─────────────────────────
29
  def load_csv(file):
30
  global token_df
31
  df = pd.read_csv(file.name)
32
+
33
  valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
34
  if not valid:
35
+ msg = "❌ CSV must contain a `text` column **or** `user` & `assistant` columns."
36
+ return None, msg, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
37
 
38
  token_df = explode_to_tokens(df)
39
 
40
  return (
41
+ gr.update(value=token_df, visible=True, row_count=len(token_df)),
42
+ f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
43
+ gr.update(visible=True), # show buttons row
44
+ gr.update(visible=False), # hide download links (reset)
45
+ gr.update(visible=False),
 
46
  )
47
 
48
+
49
+ def save_table(tbl):
50
  global token_df
51
+ token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
52
+ bad = token_df.loc[~token_df["label"].isin(LABEL_SET), "label"].unique()
53
+ if len(bad):
54
+ return f"⚠️ Unknown label(s): {', '.join(map(str, bad))}"
55
  return "💾 Saved."
56
 
57
+
58
  def export_tokens():
59
  path = "raw_tokens.csv"
60
  token_df.to_csv(path, index=False)
61
  return gr.update(value=Path(path), visible=True)
62
 
63
+
64
  def export_iob():
65
+ iob_tags, prev = [], {}
66
  for _, r in token_df.iterrows():
67
  sid, lbl = r["sentence_id"], r["label"]
68
  if lbl == "O":
69
+ iob_tags.append("O")
70
+ prev[sid] = None
71
  else:
72
+ prefix = "I-" if prev.get(sid) == lbl else "B-"
73
+ iob_tags.append(prefix + lbl)
74
+ prev[sid] = lbl
75
+ out = token_df.copy()
76
+ out["iob"] = iob_tags
77
+ path = "ner_iob.csv"
78
+ out.to_csv(path, index=False)
79
  return gr.update(value=Path(path), visible=True)
80
 
81
+
82
  def push_to_hub(repo_id, token):
 
83
  try:
84
  api = HfApi()
85
  api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
86
 
87
+ local = Path(repo_id.replace("/", "_"))
88
+ if local.exists():
89
+ for f in local.iterdir():
90
+ f.unlink()
91
+ local.rmdir()
92
 
93
+ repo = Repository(local_dir=str(local),
94
  clone_from=repo_id,
95
  repo_type="dataset",
96
  use_auth_token=token)
97
+
98
+ token_df.to_csv(local / "data.csv", index=False)
99
+ repo.push_to_hub("Add annotated NER data")
100
+ return f"🚀 https://huggingface.co/datasets/{repo_id}"
101
  except Exception as e:
102
  return f"❌ Push failed: {e}"
103
 
104
+
105
+ # ──────────────────────── UI ──────────────────────────────
106
  with gr.Blocks() as demo:
107
  gr.Markdown("# 🏷️ Label It! Mini-NER")
108
 
109
+ gr.Markdown(
110
+ "**Step 1** – Upload a CSV containing a `text` column *or* `user` + `assistant` dialogue columns."
111
+ )
112
 
113
  with gr.Row():
114
+ csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
115
  load_btn = gr.Button("Load")
116
 
117
  status = gr.Textbox(label="Status", interactive=False)
 
120
  headers=["sentence_id", "token", "label"],
121
  datatype=["number", "str", "str"],
122
  row_count=0, col_count=3,
123
+ visible=False,
124
  )
125
 
126
+ with gr.Row(visible=False) as btn_row:
127
+ save_btn = gr.Button("💾 Save")
128
  dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
129
  dl_iob_btn = gr.Button("⬇︎ IOB CSV")
130
 
131
+ file_tok = gr.File(label="Tokens CSV", visible=False)
132
+ file_iob = gr.File(label="IOB CSV", visible=False)
133
 
134
  # Push accordion
135
+ push_acc = gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False)
136
+ with push_acc:
137
  repo_in = gr.Textbox(label="dataset repo (username/name)")
138
  token_in = gr.Textbox(label="HF Token", type="password")
139
  push_btn = gr.Button("Push")
140
  push_out = gr.Textbox(label="Push Status", interactive=False)
141
 
 
 
 
142
  # ── wiring
143
+ load_btn.click(load_csv,
144
+ inputs=csv_file,
145
+ outputs=[tok_table, status, btn_row, file_tok, file_iob])
146
+ load_btn.click(lambda: gr.update(visible=True), None, push_acc)
147
 
148
  save_btn.click(save_table, inputs=tok_table, outputs=status)
149
+
150
  dl_tok_btn.click(export_tokens, outputs=file_tok)
151
  dl_iob_btn.click(export_iob, outputs=file_iob)
152
+
153
  push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
154
 
155
  gr.Markdown(
156
+ "**Step 2** – Type `PER`, `ORG`, `LOC`, `EV`, or `O` in the `label` column, save, then download or push."
157
  )
158
 
159
  demo.launch()