Spaces:

Suzana
/

labelit-mini-ner

Running

App Files Community

Suzana committed 30 days ago

Commit

4a75b2d

verified ·

1 Parent(s): 4455f2c

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -54

app.py CHANGED Viewed

@@ -3,100 +3,115 @@ import pandas as pd
 from pathlib import Path
 from huggingface_hub import HfApi, Repository
-# Allowed tags
-LABELS = {"PER", "ORG", "LOC", "EV", "O"}
-# Global token DataFrame
 token_df = pd.DataFrame()
-# ───────────────────────── helpers ──────────────────────────────
 def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
-    """Return DataFrame(sentence_id, token, label[=O])"""
-    rows = []
-    if "text" in df.columns:
-        lines = df["text"].astype(str)
-    else:  # dialog pair
-        lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
-# ───────────────────────── callbacks ────────────────────────────
 def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
     valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
     if not valid:
-        return None, "❌ CSV must contain `text` OR `user`+`assistant` columns.", \
-               gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     token_df = explode_to_tokens(df)
     return (
-        gr.update(value=token_df, visible=True,  # show table
-                  row_count=len(token_df)),
-        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
-        gr.update(visible=True),   # show action row
-        gr.update(visible=False),  # hide token file
-        gr.update(visible=False)   # hide iob file
     )
-def save_table(table_data):
     global token_df
-    token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
-    if not set(token_df["label"]).issubset(LABELS):
-        return "⚠️ Unknown label detected. Allowed: PER / ORG / LOC / EV / O"
     return "💾 Saved."
 def export_tokens():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
     return gr.update(value=Path(path), visible=True)
 def export_iob():
-    iob, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
-            iob.append("O"); prev[sid] = None
         else:
-            tag = ("I-" if prev.get(sid) == lbl else "B-") + lbl
-            iob.append(tag); prev[sid] = lbl
-    out = token_df.copy(); out["iob"] = iob
-    path = "ner_iob.csv"; out.to_csv(path, index=False)
     return gr.update(value=Path(path), visible=True)
 def push_to_hub(repo_id, token):
-    global token_df
     try:
         api = HfApi()
         api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
-        local_dir = Path(f"./{repo_id.replace('/','_')}")
-        if local_dir.exists():
-            for f in local_dir.iterdir(): f.unlink()
-            local_dir.rmdir()
-        repo = Repository(local_dir=str(local_dir),
                           clone_from=repo_id,
                           repo_type="dataset",
                           use_auth_token=token)
-        token_df.to_csv(local_dir / "data.csv", index=False)
-        repo.push_to_hub(commit_message="Add annotated NER data")
-        return f"🚀 Pushed to https://huggingface.co/datasets/{repo_id}"
     except Exception as e:
         return f"❌ Push failed: {e}"
-# ───────────────────────── UI ───────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
-    gr.Markdown("**Step 1** – upload a CSV with a `text` column **or** a `user`+`assistant` pair.")
     with gr.Row():
-        csv_file = gr.File(file_types=[".csv"], label="📁 Upload CSV")
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
@@ -105,39 +120,40 @@ with gr.Blocks() as demo:
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
         row_count=0, col_count=3,
-        visible=False
     )
-    with gr.Row(visible=False) as action_row:
-        save_btn   = gr.Button("💾 Save")
         dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
         dl_iob_btn = gr.Button("⬇︎ IOB CSV")
-    file_tok = gr.File(visible=False)
-    file_iob = gr.File(visible=False)
     # Push accordion
-    with gr.Accordion("📦 Push to Hugging Face Hub", open=False) as push_acc:
         repo_in  = gr.Textbox(label="dataset repo (username/name)")
         token_in = gr.Textbox(label="HF Token", type="password")
         push_btn = gr.Button("Push")
         push_out = gr.Textbox(label="Push Status", interactive=False)
-    # Hide accordion until data load
-    push_acc.visible = False
     # ── wiring
-    load_btn.click(load_csv, inputs=csv_file,
-                   outputs=[tok_table, status, action_row, file_tok, file_iob])
-    load_btn.click(lambda: gr.update(visible=True), None, push_acc)  # show accordion after load
     save_btn.click(save_table, inputs=tok_table, outputs=status)
     dl_tok_btn.click(export_tokens, outputs=file_tok)
     dl_iob_btn.click(export_iob,  outputs=file_iob)
     push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
     gr.Markdown(
-        "**Step 2** – edit the `label` column (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push."
     )
 demo.launch()

 from pathlib import Path
 from huggingface_hub import HfApi, Repository
+# Allowed labels
+LABEL_SET = {"PER", "ORG", "LOC", "EV", "O"}
+# In-memory token store
 token_df = pd.DataFrame()
+# ─────────────────────── helpers ──────────────────────────
 def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
+    """Return DataFrame(sentence_id, token, label='O')."""
+    lines = (
+        df["text"].astype(str)
+        if "text" in df.columns
+        else df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
+    )
+    rows = []
     for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
+# ────────────────────── callbacks ─────────────────────────
 def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
     valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
     if not valid:
+        msg = "❌ CSV must contain a `text` column **or** `user` & `assistant` columns."
+        return None, msg, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     token_df = explode_to_tokens(df)
     return (
+        gr.update(value=token_df, visible=True, row_count=len(token_df)),
+        f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
+        gr.update(visible=True),                 # show buttons row
+        gr.update(visible=False),                # hide download links (reset)
+        gr.update(visible=False),
     )
+def save_table(tbl):
     global token_df
+    token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
+    bad = token_df.loc[~token_df["label"].isin(LABEL_SET), "label"].unique()
+    if len(bad):
+        return f"⚠️ Unknown label(s): {', '.join(map(str, bad))}"
     return "💾 Saved."
 def export_tokens():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
     return gr.update(value=Path(path), visible=True)
 def export_iob():
+    iob_tags, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
+            iob_tags.append("O")
+            prev[sid] = None
         else:
+            prefix = "I-" if prev.get(sid) == lbl else "B-"
+            iob_tags.append(prefix + lbl)
+            prev[sid] = lbl
+    out = token_df.copy()
+    out["iob"] = iob_tags
+    path = "ner_iob.csv"
+    out.to_csv(path, index=False)
     return gr.update(value=Path(path), visible=True)
 def push_to_hub(repo_id, token):
     try:
         api = HfApi()
         api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
+        local = Path(repo_id.replace("/", "_"))
+        if local.exists():
+            for f in local.iterdir():
+                f.unlink()
+            local.rmdir()
+        repo = Repository(local_dir=str(local),
                           clone_from=repo_id,
                           repo_type="dataset",
                           use_auth_token=token)
+        token_df.to_csv(local / "data.csv", index=False)
+        repo.push_to_hub("Add annotated NER data")
+        return f"🚀 https://huggingface.co/datasets/{repo_id}"
     except Exception as e:
         return f"❌ Push failed: {e}"
+# ──────────────────────── UI ──────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
+    gr.Markdown(
+        "**Step 1** – Upload a CSV containing a `text` column *or* `user` + `assistant` dialogue columns."
+    )
     with gr.Row():
+        csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
         row_count=0, col_count=3,
+        visible=False,
     )
+    with gr.Row(visible=False) as btn_row:
+        save_btn = gr.Button("💾 Save")
         dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
         dl_iob_btn = gr.Button("⬇︎ IOB CSV")
+    file_tok = gr.File(label="Tokens CSV", visible=False)
+    file_iob = gr.File(label="IOB CSV", visible=False)
     # Push accordion
+    push_acc = gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False)
+    with push_acc:
         repo_in  = gr.Textbox(label="dataset repo (username/name)")
         token_in = gr.Textbox(label="HF Token", type="password")
         push_btn = gr.Button("Push")
         push_out = gr.Textbox(label="Push Status", interactive=False)
     # ── wiring
+    load_btn.click(load_csv,
+                   inputs=csv_file,
+                   outputs=[tok_table, status, btn_row, file_tok, file_iob])
+    load_btn.click(lambda: gr.update(visible=True), None, push_acc)
     save_btn.click(save_table, inputs=tok_table, outputs=status)
     dl_tok_btn.click(export_tokens, outputs=file_tok)
     dl_iob_btn.click(export_iob,  outputs=file_iob)
     push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
     gr.Markdown(
+        "**Step 2** – Type `PER`, `ORG`, `LOC`, `EV`, or `O` in the `label` column, save, then download or push."
     )
 demo.launch()