Suzana committed on
Commit
47e0f7e
·
verified ·
1 Parent(s): 4a75b2d

Update app.py

Files changed (1)
  1. app.py +60 -108
app.py CHANGED
@@ -3,20 +3,16 @@ import pandas as pd
  from pathlib import Path
  from huggingface_hub import HfApi, Repository

- # Allowed labels
- LABEL_SET = {"PER", "ORG", "LOC", "EV", "O"}
-
- # In-memory token store
  token_df = pd.DataFrame()

- # ─────────────────────── helpers ──────────────────────────
- def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
-     """Return DataFrame(sentence_id, token, label='O')."""
-     lines = (
-         df["text"].astype(str)
-         if "text" in df.columns
-         else df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
-     )

      rows = []
      for sid, line in enumerate(lines):
@@ -24,136 +20,92 @@ def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
          rows.append({"sentence_id": sid, "token": tok, "label": "O"})
      return pd.DataFrame(rows)

-
- # ────────────────────── callbacks ─────────────────────────
  def load_csv(file):
      global token_df
      df = pd.read_csv(file.name)

-     valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
-     if not valid:
-         msg = "❌ CSV must contain a `text` column **or** `user` & `assistant` columns."
-         return None, msg, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
-
-     token_df = explode_to_tokens(df)
-
-     return (
-         gr.update(value=token_df, visible=True, row_count=len(token_df)),
-         f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
-         gr.update(visible=True),   # show buttons row
-         gr.update(visible=False),  # hide download links (reset)
-         gr.update(visible=False),
-     )
-

  def save_table(tbl):
      global token_df
      token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
-     bad = token_df.loc[~token_df["label"].isin(LABEL_SET), "label"].unique()
-     if len(bad):
-         return f"⚠️ Unknown label(s): {', '.join(map(str, bad))}"
-     return "💾 Saved."
-

  def export_tokens():
-     path = "raw_tokens.csv"
-     token_df.to_csv(path, index=False)
-     return gr.update(value=Path(path), visible=True)
-

  def export_iob():
-     iob_tags, prev = [], {}
      for _, r in token_df.iterrows():
          sid, lbl = r["sentence_id"], r["label"]
          if lbl == "O":
-             iob_tags.append("O")
-             prev[sid] = None
          else:
-             prefix = "I-" if prev.get(sid) == lbl else "B-"
-             iob_tags.append(prefix + lbl)
              prev[sid] = lbl
-     out = token_df.copy()
-     out["iob"] = iob_tags
-     path = "ner_iob.csv"
-     out.to_csv(path, index=False)
-     return gr.update(value=Path(path), visible=True)
-

  def push_to_hub(repo_id, token):
      try:
-         api = HfApi()
-         api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
-
          local = Path(repo_id.replace("/", "_"))
          if local.exists():
-             for f in local.iterdir():
-                 f.unlink()
              local.rmdir()
-
-         repo = Repository(local_dir=str(local),
-                           clone_from=repo_id,
-                           repo_type="dataset",
-                           use_auth_token=token)
-
-         token_df.to_csv(local / "data.csv", index=False)
          repo.push_to_hub("Add annotated NER data")
          return f"🚀 https://huggingface.co/datasets/{repo_id}"
      except Exception as e:
-         return f"❌ Push failed: {e}"
-

- # ──────────────────────── UI ──────────────────────────────
  with gr.Blocks() as demo:
      gr.Markdown("# 🏷️ Label It! Mini-NER")

-     gr.Markdown(
-         "**Step 1** – Upload a CSV containing a `text` column *or* `user` + `assistant` dialogue columns."
-     )
-
      with gr.Row():
-         csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
-         load_btn = gr.Button("Load")
-
-     status = gr.Textbox(label="Status", interactive=False)
-
-     tok_table = gr.Dataframe(
-         headers=["sentence_id", "token", "label"],
-         datatype=["number", "str", "str"],
-         row_count=0, col_count=3,
-         visible=False,
-     )
-
-     with gr.Row(visible=False) as btn_row:
-         save_btn = gr.Button("💾 Save")
-         dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
-         dl_iob_btn = gr.Button("⬇︎ IOB CSV")
-
-     file_tok = gr.File(label="Tokens CSV", visible=False)
-     file_iob = gr.File(label="IOB CSV", visible=False)
-
-     # Push accordion
-     push_acc = gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False)
-     with push_acc:
-         repo_in = gr.Textbox(label="dataset repo (username/name)")
-         token_in = gr.Textbox(label="HF Token", type="password")
-         push_btn = gr.Button("Push")
-         push_out = gr.Textbox(label="Push Status", interactive=False)
-
-     # ── wiring
-     load_btn.click(load_csv,
-                    inputs=csv_file,
-                    outputs=[tok_table, status, btn_row, file_tok, file_iob])
-     load_btn.click(lambda: gr.update(visible=True), None, push_acc)

-     save_btn.click(save_table, inputs=tok_table, outputs=status)

-     dl_tok_btn.click(export_tokens, outputs=file_tok)
-     dl_iob_btn.click(export_iob, outputs=file_iob)

-     push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)

-     gr.Markdown(
-         "**Step 2** – Type `PER`, `ORG`, `LOC`, `EV`, or `O` in the `label` column, save, then download or push."
-     )

  demo.launch()
 
  from pathlib import Path
  from huggingface_hub import HfApi, Repository

+ LABELS = {"PER", "ORG", "LOC", "EV", "O"}
  token_df = pd.DataFrame()

+ # ───────────────── helpers ─────────────────
+ def explode(df):
+     """Return sentence-level token rows with default O label."""
+     if "text" in df.columns:
+         lines = df["text"].astype(str)
+     else:
+         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)

      rows = []
      for sid, line in enumerate(lines):
          rows.append({"sentence_id": sid, "token": tok, "label": "O"})
      return pd.DataFrame(rows)

+ # ───────────────── callbacks ───────────────
  def load_csv(file):
      global token_df
      df = pd.read_csv(file.name)
+     if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
+         return None, "❌ Need `text` or `user`+`assistant` cols.", \
+                gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

+     token_df = explode(df)
+     return (gr.update(value=token_df, visible=True, row_count=len(token_df)),
+             f"✅ {len(df)} rows → {len(token_df)} tokens.",
+             gr.update(visible=True),    # action row
+             gr.update(visible=False),   # reset downloads
+             gr.update(visible=False))

  def save_table(tbl):
      global token_df
      token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
+     bad = token_df.loc[~token_df.label.isin(LABELS), "label"].unique()
+     return "💾 Saved." if len(bad) == 0 else f"⚠️ Unknown: {', '.join(bad)}"

  def export_tokens():
+     fname = "raw_tokens.csv"
+     token_df.to_csv(fname, index=False)
+     return gr.update(value=fname, visible=True)   # <<< string path + visible

  def export_iob():
+     iob, prev = [], {}
      for _, r in token_df.iterrows():
          sid, lbl = r["sentence_id"], r["label"]
          if lbl == "O":
+             iob.append("O"); prev[sid] = None
          else:
+             iob.append(("I-" if prev.get(sid)==lbl else "B-")+lbl)
              prev[sid] = lbl
+     out = token_df.copy(); out["iob"] = iob
+     fname = "ner_iob.csv"; out.to_csv(fname, index=False)
+     return gr.update(value=fname, visible=True)   # <<< same pattern

  def push_to_hub(repo_id, token):
      try:
+         HfApi().create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
          local = Path(repo_id.replace("/", "_"))
          if local.exists():
+             for f in local.iterdir(): f.unlink()
              local.rmdir()
+         repo = Repository(str(local), clone_from=repo_id,
+                           repo_type="dataset", use_auth_token=token)
+         token_df.to_csv(local/"data.csv", index=False)
          repo.push_to_hub("Add annotated NER data")
          return f"🚀 https://huggingface.co/datasets/{repo_id}"
      except Exception as e:
+         return f"❌ {e}"

+ # ───────────────── UI ──────────────────────
  with gr.Blocks() as demo:
      gr.Markdown("# 🏷️ Label It! Mini-NER")

      with gr.Row():
+         f_in, load_btn = gr.File(file_types=[".csv"]), gr.Button("Load")

+     status = gr.Textbox(interactive=False)
+     table = gr.Dataframe(headers=["sentence_id","token","label"],
+                          datatype=["number","str","str"], visible=False)

+     with gr.Row(visible=False) as actions:
+         save_btn = gr.Button("💾 Save")
+         tok_btn = gr.Button("⬇︎ Tokens CSV")
+         iob_btn = gr.Button("⬇︎ IOB CSV")
+
+     file_tok = gr.File(visible=False)
+     file_iob = gr.File(visible=False)
+
+     with gr.Accordion("📦 Push to Hub", open=False, visible=False) as push_acc:
+         repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
+         push_btn, push_out = gr.Button("Push"), gr.Textbox(interactive=False)
+
+     # wiring
+     load_btn.click(load_csv, f_in, [table, status, actions, file_tok, file_iob])
+     load_btn.click(lambda: gr.update(visible=True), None, push_acc)

+     save_btn.click(save_table, table, status)
+     tok_btn.click(export_tokens, outputs=file_tok)
+     iob_btn.click(export_iob, outputs=file_iob)
+     push_btn.click(push_to_hub, [repo_in, token_in], push_out)

+     gr.Markdown("Edit **label** (`PER`, `ORG`, `LOC`, `EV`, `O`) → Save → Download / Push.")

  demo.launch()
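
For reference, a minimal standalone sketch of the tagging scheme the new export_iob implements: within each sentence, a label gets a B- prefix when it starts a span and I- when it continues one, while O resets the span. The sample DataFrame and the to_iob helper below are illustrative only (hypothetical names, not part of app.py).

import pandas as pd

# Hypothetical annotated tokens, shaped like the app's sentence_id/token/label table.
token_df = pd.DataFrame({
    "sentence_id": [0, 0, 0, 0],
    "token": ["Ada", "Lovelace", "visited", "London"],
    "label": ["PER", "PER", "O", "LOC"],
})

def to_iob(df):
    """Mirror export_iob: B- on a new span, I- on a continuation, tracked per sentence."""
    tags, prev = [], {}
    for _, r in df.iterrows():
        sid, lbl = r["sentence_id"], r["label"]
        if lbl == "O":
            tags.append("O")
            prev[sid] = None
        else:
            tags.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
            prev[sid] = lbl
    return tags

print(to_iob(token_df))  # ['B-PER', 'I-PER', 'O', 'B-LOC']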