Suzana committed on
Commit
4455f2c
·
verified ·
1 Parent(s): d1f4849

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -43
app.py CHANGED
@@ -1,71 +1,102 @@
1
  import gradio as gr
2
  import pandas as pd
3
  from pathlib import Path
 
4
 
5
- LABELS = {"PER", "ORG", "LOC", "EV", "O"} # allowed tags
6
- token_df = pd.DataFrame() # global
7
 
8
- # ───────────────── tokenization ────────────────────────────────
9
- def tokenize(df: pd.DataFrame) -> pd.DataFrame:
 
 
 
 
10
  rows = []
11
  if "text" in df.columns:
12
  lines = df["text"].astype(str)
13
- else:
14
  lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
 
15
  for sid, line in enumerate(lines):
16
  for tok in line.split():
17
  rows.append({"sentence_id": sid, "token": tok, "label": "O"})
18
  return pd.DataFrame(rows)
19
 
20
- # ───────────────── callbacks ───────────────────────────────────
21
  def load_csv(file):
22
  global token_df
23
  df = pd.read_csv(file.name)
24
- if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
25
- return None, "❌ CSV must have `text` OR `user`+`assistant` columns.", \
26
- gr.update(visible=False), gr.update(visible=False)
27
-
28
- token_df = tokenize(df)
29
- return gr.update(value=token_df, visible=True), \
30
- f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.", \
31
- gr.update(visible=True), gr.update(visible=False)
 
 
 
 
 
 
 
32
 
33
- def save_table(tbl):
34
  global token_df
35
- token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
36
- # simple validation
37
- bad = token_df[~token_df["label"].isin(LABELS)]
38
- if not bad.empty:
39
- return "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"
40
  return "💾 Saved."
41
 
42
- def to_tokens_csv():
43
  path = "raw_tokens.csv"
44
  token_df.to_csv(path, index=False)
45
- return Path(path)
46
 
47
- def to_iob_csv():
48
- # build IOB tags
49
  iob, prev = [], {}
50
  for _, r in token_df.iterrows():
51
  sid, lbl = r["sentence_id"], r["label"]
52
  if lbl == "O":
53
  iob.append("O"); prev[sid] = None
54
  else:
55
- prefix = "I-" if prev.get(sid) == lbl else "B-"
56
- iob.append(prefix + lbl)
57
- prev[sid] = lbl
58
  out = token_df.copy(); out["iob"] = iob
59
  path = "ner_iob.csv"; out.to_csv(path, index=False)
60
- return Path(path)
61
 
62
- # ───────────────── UI ──────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  with gr.Blocks() as demo:
64
  gr.Markdown("# 🏷️ Label It! Mini-NER")
65
- gr.Markdown("**Step 1** – upload a CSV containing a `text` column *or* `user`+`assistant` columns.")
 
66
 
67
  with gr.Row():
68
- csv_file = gr.File(file_types=[".csv"])
69
  load_btn = gr.Button("Load")
70
 
71
  status = gr.Textbox(label="Status", interactive=False)
@@ -73,30 +104,40 @@ with gr.Blocks() as demo:
73
  tok_table = gr.Dataframe(
74
  headers=["sentence_id", "token", "label"],
75
  datatype=["number", "str", "str"],
76
- row_count=0,
77
- col_count=3,
78
  visible=False
79
  )
80
 
81
- with gr.Row(visible=False) as btn_row:
82
- save_btn = gr.Button("💾 Save")
83
- dl_tok = gr.Button("⬇︎ Tokens CSV")
84
- dl_iob = gr.Button("⬇︎ IOB CSV")
85
 
86
  file_tok = gr.File(visible=False)
87
  file_iob = gr.File(visible=False)
88
 
89
- # bind
 
 
 
 
 
 
 
 
 
 
90
  load_btn.click(load_csv, inputs=csv_file,
91
- outputs=[tok_table, status, btn_row, file_tok])
 
92
 
93
  save_btn.click(save_table, inputs=tok_table, outputs=status)
94
-
95
- dl_tok.click(lambda: to_tokens_csv(), outputs=file_tok)
96
- dl_iob.click(lambda: to_iob_csv(), outputs=file_iob)
97
 
98
  gr.Markdown(
99
- "**Step 2** – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the **label** column → Save → Download."
100
  )
101
 
102
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
  from pathlib import Path
4
+ from huggingface_hub import HfApi, Repository
5
 
6
# Allowed entity tags. Immutable so no callback can accidentally mutate the set.
LABELS = frozenset({"PER", "ORG", "LOC", "EV", "O"})

# Global token table shared by the callbacks. It starts out empty but already
# carries the expected columns, so code that runs before a CSV is loaded still
# sees the `sentence_id` / `token` / `label` schema.
token_df = pd.DataFrame(columns=["sentence_id", "token", "label"])
11
+
12
+ # ───────────────────────── helpers ──────────────────────────────
13
def _cell_text(value) -> str:
    """Return *value* as text, mapping a missing cell (None / NaN / NA) to ''."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
    """Split every input row into whitespace-separated tokens.

    Args:
        df: Frame with either a ``text`` column, or a ``user`` and an
            ``assistant`` column (a dialog pair).

    Returns:
        DataFrame with columns ``sentence_id`` (0-based row position),
        ``token`` and ``label`` (always ``"O"``). The columns are present
        even when no tokens were produced.

    Raises:
        KeyError: if neither ``text`` nor both ``user``/``assistant`` exist.
    """
    if "text" in df.columns:
        lines = [_cell_text(value) for value in df["text"]]
    else:  # dialog pair
        # Zip the two columns instead of `df.apply(axis=1)`: on a zero-row
        # frame `apply` returns a DataFrame, and iterating that yields the
        # column names as if they were sentences.
        lines = [
            f"User: {_cell_text(user)} Assistant: {_cell_text(assistant)}"
            for user, assistant in zip(df["user"], df["assistant"])
        ]

    rows = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, line in enumerate(lines)
        for tok in line.split()
    ]
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["sentence_id", "token", "label"])
25
 
26
# ───────────────────────── callbacks ────────────────────────────
def load_csv(file):
    """Read the uploaded CSV, tokenize it and refresh the UI.

    Args:
        file: Value of the upload component. May be ``None`` (nothing
            uploaded), a plain path string, or an object exposing the path
            as ``.name``.

    Returns:
        A 5-tuple matching the outputs wired to the Load button: token-table
        update, status message, action-row update, token-file update and
        IOB-file update. On any failure the table value is ``None`` and the
        three dependent widgets are hidden.
    """
    global token_df

    def failure(message):
        # Same shape as the success tuple, with everything hidden.
        return (
            None,
            message,
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )

    if file is None:
        return failure("❌ Please upload a CSV file first.")

    # Accept both a plain path and a tempfile-like wrapper with `.name`.
    csv_path = getattr(file, "name", file)
    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError) as exc:
        # pandas parser / empty-data errors and decoding errors are all
        # ValueError subclasses; OSError covers unreadable files.
        return failure(f"❌ Could not read CSV: {exc}")

    valid = ("text" in df.columns) or {"user", "assistant"}.issubset(df.columns)
    if not valid:
        return failure("❌ CSV must contain `text` OR `user`+`assistant` columns.")

    token_df = explode_to_tokens(df)

    return (
        gr.update(value=token_df, visible=True,  # show table
                  row_count=len(token_df)),
        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
        gr.update(visible=True),   # show action row
        gr.update(visible=False),  # hide token file
        gr.update(visible=False),  # hide iob file
    )
45
 
46
def save_table(table_data):
    """Validate the edited table and, if valid, store it as the global token table.

    Args:
        table_data: Current contents of the token table; anything
            ``pd.DataFrame`` accepts that provides the three columns
            ``sentence_id``, ``token`` and ``label``.

    Returns:
        Status message: a warning when a label outside ``LABELS`` is present
        (the stored table is left untouched), otherwise a confirmation.
    """
    global token_df
    edited = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
    if not set(edited["label"]).issubset(LABELS):
        # Reject *before* committing, so later exports never contain a table
        # that the user was told is invalid.
        return "⚠️ Unknown label detected. Allowed: PER / ORG / LOC / EV / O"
    token_df = edited
    return "💾 Saved."
52
 
53
def export_tokens():
    """Dump the global token table to ``raw_tokens.csv`` and expose it for download.

    Returns:
        A component update that sets the written file as value and makes the
        target component visible.
    """
    csv_path = Path("raw_tokens.csv")
    token_df.to_csv(str(csv_path), index=False)
    return gr.update(value=csv_path, visible=True)
57
 
58
def export_iob():
    """Write the global token table plus an ``iob`` column to ``ner_iob.csv``.

    A token labelled ``O`` stays ``O``. Any other label gets the prefix
    ``I-`` when the previous token seen for the same ``sentence_id`` carried
    the same label, and ``B-`` otherwise.

    Returns:
        A component update that sets the written file as value and makes the
        target component visible.
    """
    tags = []
    last_label = {}  # sentence_id -> label of the previous token (None after "O")

    for row in token_df.itertuples(index=False):
        sentence, label = row.sentence_id, row.label
        if label != "O":
            prefix = "I-" if last_label.get(sentence) == label else "B-"
            tags.append(prefix + label)
            last_label[sentence] = label
            continue
        tags.append("O")
        last_label[sentence] = None

    annotated = token_df.copy()
    annotated["iob"] = tags
    annotated.to_csv("ner_iob.csv", index=False)
    return gr.update(value=Path("ner_iob.csv"), visible=True)
70
 
71
def push_to_hub(repo_id, token):
    """Upload the global token table as ``data.csv`` to a Hub dataset repo.

    The file is uploaded over HTTP straight from memory. No local git clone
    is created, so there is no working directory to clean up between pushes
    (the previous cleanup failed on the ``.git`` directory of an earlier
    clone, which broke every second push to the same repo).

    Args:
        repo_id: Target dataset repository, ``username/name``.
        token: Hugging Face access token; an empty value is passed on as
            ``None``.

    Returns:
        Status message: success with the dataset URL, or a message starting
        with ``❌ Push failed``. This callback never raises.
    """
    repo_id = (repo_id or "").strip()
    if not repo_id:
        return "❌ Push failed: please enter a dataset repo id (username/name)."
    if token_df.empty:
        return "❌ Push failed: there is no annotated data to upload yet."

    auth = token or None
    try:
        api = HfApi()
        api.create_repo(repo_id, token=auth, repo_type="dataset", exist_ok=True)
        api.upload_file(
            path_or_fileobj=token_df.to_csv(index=False).encode("utf-8"),
            path_in_repo="data.csv",
            repo_id=repo_id,
            repo_type="dataset",
            token=auth,
            commit_message="Add annotated NER data",
        )
    except Exception as e:  # UI boundary: report any failure to the user
        return f"❌ Push failed: {e}"
    return f"🚀 Pushed to https://huggingface.co/datasets/{repo_id}"
91
+
92
+ # ───────────────────────── UI ───────────────────────────────────
93
  with gr.Blocks() as demo:
94
  gr.Markdown("# 🏷️ Label It! Mini-NER")
95
+
96
+ gr.Markdown("**Step 1** – upload a CSV with a `text` column **or** a `user`+`assistant` pair.")
97
 
98
  with gr.Row():
99
+ csv_file = gr.File(file_types=[".csv"], label="📁 Upload CSV")
100
  load_btn = gr.Button("Load")
101
 
102
  status = gr.Textbox(label="Status", interactive=False)
 
104
  tok_table = gr.Dataframe(
105
  headers=["sentence_id", "token", "label"],
106
  datatype=["number", "str", "str"],
107
+ row_count=0, col_count=3,
 
108
  visible=False
109
  )
110
 
111
+ with gr.Row(visible=False) as action_row:
112
+ save_btn = gr.Button("💾 Save")
113
+ dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
114
+ dl_iob_btn = gr.Button("⬇︎ IOB CSV")
115
 
116
  file_tok = gr.File(visible=False)
117
  file_iob = gr.File(visible=False)
118
 
119
+ # Push accordion
120
+ with gr.Accordion("📦 Push to Hugging Face Hub", open=False) as push_acc:
121
+ repo_in = gr.Textbox(label="dataset repo (username/name)")
122
+ token_in = gr.Textbox(label="HF Token", type="password")
123
+ push_btn = gr.Button("Push")
124
+ push_out = gr.Textbox(label="Push Status", interactive=False)
125
+
126
+ # Hide accordion until data load
127
+ push_acc.visible = False
128
+
129
+ # ── wiring
130
  load_btn.click(load_csv, inputs=csv_file,
131
+ outputs=[tok_table, status, action_row, file_tok, file_iob])
132
+ load_btn.click(lambda: gr.update(visible=True), None, push_acc) # show accordion after load
133
 
134
  save_btn.click(save_table, inputs=tok_table, outputs=status)
135
+ dl_tok_btn.click(export_tokens, outputs=file_tok)
136
+ dl_iob_btn.click(export_iob, outputs=file_iob)
137
+ push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
138
 
139
  gr.Markdown(
140
+ "**Step 2** – edit the `label` column (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push."
141
  )
142
 
143
  demo.launch()