Suzana committed on
Commit
47e0f7e
·
verified ·
1 Parent(s): 4a75b2d

Update app.py

Files changed (1)
  1. app.py +60 -108
app.py CHANGED
@@ -3,20 +3,16 @@ import pandas as pd
  from pathlib import Path
  from huggingface_hub import HfApi, Repository

- # Allowed labels
- LABEL_SET = {"PER", "ORG", "LOC", "EV", "O"}
-
- # In-memory token store
  token_df = pd.DataFrame()

- # ─────────────────────── helpers ──────────────────────────
- def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
-     """Return DataFrame(sentence_id, token, label='O')."""
-     lines = (
-         df["text"].astype(str)
-         if "text" in df.columns
-         else df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
-     )

      rows = []
      for sid, line in enumerate(lines):
@@ -24,136 +20,92 @@ def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
          rows.append({"sentence_id": sid, "token": tok, "label": "O"})
      return pd.DataFrame(rows)

-
- # ────────────────────── callbacks ─────────────────────────
  def load_csv(file):
      global token_df
      df = pd.read_csv(file.name)

-     valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
-     if not valid:
-         msg = "❌ CSV must contain a `text` column **or** `user` & `assistant` columns."
-         return None, msg, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
-
-     token_df = explode_to_tokens(df)
-
-     return (
-         gr.update(value=token_df, visible=True, row_count=len(token_df)),
-         f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
-         gr.update(visible=True),   # show buttons row
-         gr.update(visible=False),  # hide download links (reset)
-         gr.update(visible=False),
-     )
-

  def save_table(tbl):
      global token_df
      token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
-     bad = token_df.loc[~token_df["label"].isin(LABEL_SET), "label"].unique()
-     if len(bad):
-         return f"⚠️ Unknown label(s): {', '.join(map(str, bad))}"
-     return "💾 Saved."
-

  def export_tokens():
-     path = "raw_tokens.csv"
-     token_df.to_csv(path, index=False)
-     return gr.update(value=Path(path), visible=True)
-

  def export_iob():
-     iob_tags, prev = [], {}
      for _, r in token_df.iterrows():
          sid, lbl = r["sentence_id"], r["label"]
          if lbl == "O":
-             iob_tags.append("O")
-             prev[sid] = None
          else:
-             prefix = "I-" if prev.get(sid) == lbl else "B-"
-             iob_tags.append(prefix + lbl)
              prev[sid] = lbl
-     out = token_df.copy()
-     out["iob"] = iob_tags
-     path = "ner_iob.csv"
-     out.to_csv(path, index=False)
-     return gr.update(value=Path(path), visible=True)
-

  def push_to_hub(repo_id, token):
      try:
-         api = HfApi()
-         api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
-
          local = Path(repo_id.replace("/", "_"))
          if local.exists():
-             for f in local.iterdir():
-                 f.unlink()
              local.rmdir()
-
-         repo = Repository(local_dir=str(local),
-                           clone_from=repo_id,
-                           repo_type="dataset",
-                           use_auth_token=token)
-
-         token_df.to_csv(local / "data.csv", index=False)
          repo.push_to_hub("Add annotated NER data")
          return f"🚀 https://huggingface.co/datasets/{repo_id}"
      except Exception as e:
-         return f"❌ Push failed: {e}"
-

- # ──────────────────────── UI ──────────────────────────────
  with gr.Blocks() as demo:
      gr.Markdown("# 🏷️ Label It! Mini-NER")

-     gr.Markdown(
-         "**Step 1** – Upload a CSV containing a `text` column *or* `user` + `assistant` dialogue columns."
-     )
-
      with gr.Row():
-         csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
-         load_btn = gr.Button("Load")
-
-     status = gr.Textbox(label="Status", interactive=False)
-
-     tok_table = gr.Dataframe(
-         headers=["sentence_id", "token", "label"],
-         datatype=["number", "str", "str"],
-         row_count=0, col_count=3,
-         visible=False,
-     )
-
-     with gr.Row(visible=False) as btn_row:
-         save_btn = gr.Button("💾 Save")
-         dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
-         dl_iob_btn = gr.Button("⬇︎ IOB CSV")
-
-     file_tok = gr.File(label="Tokens CSV", visible=False)
-     file_iob = gr.File(label="IOB CSV", visible=False)
-
-     # Push accordion
-     push_acc = gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False)
-     with push_acc:
-         repo_in = gr.Textbox(label="dataset repo (username/name)")
-         token_in = gr.Textbox(label="HF Token", type="password")
-         push_btn = gr.Button("Push")
-         push_out = gr.Textbox(label="Push Status", interactive=False)
-
-     # ── wiring
-     load_btn.click(load_csv,
-                    inputs=csv_file,
-                    outputs=[tok_table, status, btn_row, file_tok, file_iob])
-     load_btn.click(lambda: gr.update(visible=True), None, push_acc)

-     save_btn.click(save_table, inputs=tok_table, outputs=status)

-     dl_tok_btn.click(export_tokens, outputs=file_tok)
-     dl_iob_btn.click(export_iob, outputs=file_iob)

-     push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)

-     gr.Markdown(
-         "**Step 2** – Type `PER`, `ORG`, `LOC`, `EV`, or `O` in the `label` column, save, then download or push."
-     )

  demo.launch()
 
  from pathlib import Path
  from huggingface_hub import HfApi, Repository

+ LABELS = {"PER", "ORG", "LOC", "EV", "O"}
  token_df = pd.DataFrame()

+ # ───────────────── helpers ─────────────────
+ def explode(df):
+     """Return sentence-level token rows with default O label."""
+     if "text" in df.columns:
+         lines = df["text"].astype(str)
+     else:
+         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)

      rows = []
      for sid, line in enumerate(lines):
          rows.append({"sentence_id": sid, "token": tok, "label": "O"})
      return pd.DataFrame(rows)

+ # ───────────────── callbacks ───────────────
  def load_csv(file):
      global token_df
      df = pd.read_csv(file.name)
+     if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
+         return None, "❌ Need `text` or `user`+`assistant` cols.", \
+                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

+     token_df = explode(df)
+     return (gr.update(value=token_df, visible=True, row_count=len(token_df)),
+             f"✅ {len(df)} rows → {len(token_df)} tokens.",
+             gr.update(visible=True),    # action row
+             gr.update(visible=False),   # reset downloads
+             gr.update(visible=False))

  def save_table(tbl):
      global token_df
      token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
+     bad = token_df.loc[~token_df.label.isin(LABELS), "label"].unique()
+     return "💾 Saved." if len(bad) == 0 else f"⚠️ Unknown: {', '.join(bad)}"

  def export_tokens():
+     fname = "raw_tokens.csv"
+     token_df.to_csv(fname, index=False)
+     return gr.update(value=fname, visible=True)   # <<< string path + visible

  def export_iob():
+     iob, prev = [], {}
      for _, r in token_df.iterrows():
          sid, lbl = r["sentence_id"], r["label"]
          if lbl == "O":
+             iob.append("O"); prev[sid] = None
          else:
+             iob.append(("I-" if prev.get(sid)==lbl else "B-")+lbl)
              prev[sid] = lbl
+     out = token_df.copy(); out["iob"] = iob
+     fname = "ner_iob.csv"; out.to_csv(fname, index=False)
+     return gr.update(value=fname, visible=True)   # <<< same pattern

  def push_to_hub(repo_id, token):
      try:
+         HfApi().create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
          local = Path(repo_id.replace("/", "_"))
          if local.exists():
+             for f in local.iterdir(): f.unlink()
              local.rmdir()
+         repo = Repository(str(local), clone_from=repo_id,
+                           repo_type="dataset", use_auth_token=token)
+         token_df.to_csv(local/"data.csv", index=False)
          repo.push_to_hub("Add annotated NER data")
          return f"🚀 https://huggingface.co/datasets/{repo_id}"
      except Exception as e:
+         return f"❌ {e}"

+ # ───────────────── UI ──────────────────────
  with gr.Blocks() as demo:
      gr.Markdown("# 🏷️ Label It! Mini-NER")

      with gr.Row():
+         f_in, load_btn = gr.File(file_types=[".csv"]), gr.Button("Load")

+     status = gr.Textbox(interactive=False)
+     table = gr.Dataframe(headers=["sentence_id","token","label"],
+                          datatype=["number","str","str"], visible=False)

+     with gr.Row(visible=False) as actions:
+         save_btn = gr.Button("💾 Save")
+         tok_btn = gr.Button("⬇︎ Tokens CSV")
+         iob_btn = gr.Button("⬇︎ IOB CSV")
+
+     file_tok = gr.File(visible=False)
+     file_iob = gr.File(visible=False)
+
+     with gr.Accordion("📦 Push to Hub", open=False, visible=False) as push_acc:
+         repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
+         push_btn, push_out = gr.Button("Push"), gr.Textbox(interactive=False)
+
+     # wiring
+     load_btn.click(load_csv, f_in, [table, status, actions, file_tok, file_iob])
+     load_btn.click(lambda: gr.update(visible=True), None, push_acc)

+     save_btn.click(save_table, table, status)
+     tok_btn.click(export_tokens, outputs=file_tok)
+     iob_btn.click(export_iob, outputs=file_iob)
+     push_btn.click(push_to_hub, [repo_in, token_in], push_out)

+     gr.Markdown("Edit **label** (`PER`, `ORG`, `LOC`, `EV`, `O`) → Save → Download / Push.")

  demo.launch()
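
For reference, a minimal standalone sketch of the tagging scheme the new export_iob implements: within each sentence, a label gets a B- prefix when it starts a span and I- when it continues one, while O resets the span. The sample DataFrame and the to_iob helper below are illustrative only (hypothetical names, not part of app.py).

import pandas as pd

# Hypothetical annotated tokens, shaped like the app's sentence_id/token/label table.
token_df = pd.DataFrame({
    "sentence_id": [0, 0, 0, 0],
    "token": ["Ada", "Lovelace", "visited", "London"],
    "label": ["PER", "PER", "O", "LOC"],
})

def to_iob(df):
    """Mirror export_iob: B- on a new span, I- on a continuation, tracked per sentence."""
    tags, prev = [], {}
    for _, r in df.iterrows():
        sid, lbl = r["sentence_id"], r["label"]
        if lbl == "O":
            tags.append("O")
            prev[sid] = None
        else:
            tags.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
            prev[sid] = lbl
    return tags

print(to_iob(token_df))  # ['B-PER', 'I-PER', 'O', 'B-LOC']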