Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Files Community

Suzana committed 10 days ago

Commit

11b95d7

verified ·

1 Parent(s): 47e0f7e

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -32

app.py CHANGED Viewed

@@ -3,48 +3,53 @@ import pandas as pd
 from pathlib import Path
 from huggingface_hub import HfApi, Repository
 LABELS = {"PER", "ORG", "LOC", "EV", "O"}
-token_df = pd.DataFrame()
-# ───────────────── helpers ─────────────────
-def explode(df):
-    """Return sentence-level token rows with default O label."""
     if "text" in df.columns:
         lines = df["text"].astype(str)
-    else:
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     rows = []
-    for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
-# ───────────────── callbacks ───────────────
 def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
-    if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
-        return None, "❌ Need `text` or `user`+`assistant` cols.", \
-               gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
     token_df = explode(df)
     return (gr.update(value=token_df, visible=True, row_count=len(token_df)),
             f"✅ {len(df)} rows → {len(token_df)} tokens.",
-            gr.update(visible=True),  # action row
-            gr.update(visible=False),  # reset downloads
             gr.update(visible=False))
 def save_table(tbl):
     global token_df
     token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
-    bad = token_df.loc[~token_df.label.isin(LABELS), "label"].unique()
-    return "💾 Saved." if len(bad) == 0 else f"⚠️ Unknown: {', '.join(bad)}"
 def export_tokens():
     fname = "raw_tokens.csv"
     token_df.to_csv(fname, index=False)
-    return gr.update(value=fname, visible=True)   # <<< string path + visible
 def export_iob():
     iob, prev = [], {}
@@ -53,11 +58,11 @@ def export_iob():
         if lbl == "O":
             iob.append("O"); prev[sid] = None
         else:
-            iob.append(("I-" if prev.get(sid)==lbl else "B-")+lbl)
             prev[sid] = lbl
     out = token_df.copy(); out["iob"] = iob
     fname = "ner_iob.csv"; out.to_csv(fname, index=False)
-    return gr.update(value=fname, visible=True)   # <<< same pattern
 def push_to_hub(repo_id, token):
     try:
@@ -68,44 +73,50 @@ def push_to_hub(repo_id, token):
             local.rmdir()
         repo = Repository(str(local), clone_from=repo_id,
                           repo_type="dataset", use_auth_token=token)
-        token_df.to_csv(local/"data.csv", index=False)
         repo.push_to_hub("Add annotated NER data")
         return f"🚀 https://huggingface.co/datasets/{repo_id}"
     except Exception as e:
         return f"❌ {e}"
-# ───────────────── UI ──────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
     with gr.Row():
-        f_in, load_btn = gr.File(file_types=[".csv"]), gr.Button("Load")
     status = gr.Textbox(interactive=False)
-    table  = gr.Dataframe(headers=["sentence_id","token","label"],
-                          datatype=["number","str","str"], visible=False)
-    with gr.Row(visible=False) as actions:
-        save_btn   = gr.Button("💾 Save")
-        tok_btn    = gr.Button("⬇︎ Tokens CSV")
-        iob_btn    = gr.Button("⬇︎ IOB CSV")
     file_tok = gr.File(visible=False)
     file_iob = gr.File(visible=False)
-    with gr.Accordion("📦 Push to Hub", open=False, visible=False) as push_acc:
         repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
-        push_btn, push_out = gr.Button("Push"), gr.Textbox(interactive=False)
     # wiring
-    load_btn.click(load_csv, f_in, [table, status, actions, file_tok, file_iob])
-    load_btn.click(lambda: gr.update(visible=True), None, push_acc)
-    save_btn.click(save_table, table, status)
     tok_btn.click(export_tokens, outputs=file_tok)
     iob_btn.click(export_iob,  outputs=file_iob)
     push_btn.click(push_to_hub, [repo_in, token_in], push_out)
-    gr.Markdown("Edit **label** (`PER`, `ORG`, `LOC`, `EV`, `O`) → Save → Download / Push.")
 demo.launch()

 from pathlib import Path
 from huggingface_hub import HfApi, Repository
+# Allowed tags
 LABELS = {"PER", "ORG", "LOC", "EV", "O"}
+token_df = pd.DataFrame()  # global store
+# ───────────────────────── token explode ───────────────────────
+def explode(df: pd.DataFrame) -> pd.DataFrame:
+    """Return DataFrame(sentence_id, token, label='O')."""
     if "text" in df.columns:
         lines = df["text"].astype(str)
+    else:  # user / assistant dialogs
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     rows = []
+    for sid, line in enumerate(lines, start=0):      # ensure unique 0,1,2,...
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
+# ───────────────────────── callbacks ───────────────────────────
 def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
+    valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
+    if not valid:
+        msg = "❌ CSV needs a `text` column **or** both `user` and `assistant` columns."
+        return None, msg, *(gr.update(visible=False),) * 3
     token_df = explode(df)
     return (gr.update(value=token_df, visible=True, row_count=len(token_df)),
             f"✅ {len(df)} rows → {len(token_df)} tokens.",
+            gr.update(visible=True),  # show buttons
+            gr.update(visible=False),  # reset download links
             gr.update(visible=False))
 def save_table(tbl):
     global token_df
     token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
+    bad = token_df.loc[~token_df["label"].isin(LABELS), "label"].unique()
+    return "💾 Saved." if bad.size == 0 else f"⚠️ Unknown label(s): {', '.join(bad)}"
 def export_tokens():
     fname = "raw_tokens.csv"
     token_df.to_csv(fname, index=False)
+    return gr.update(value=fname, visible=True)
 def export_iob():
     iob, prev = [], {}
         if lbl == "O":
             iob.append("O"); prev[sid] = None
         else:
+            iob.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
             prev[sid] = lbl
     out = token_df.copy(); out["iob"] = iob
     fname = "ner_iob.csv"; out.to_csv(fname, index=False)
+    return gr.update(value=fname, visible=True)
 def push_to_hub(repo_id, token):
     try:
             local.rmdir()
         repo = Repository(str(local), clone_from=repo_id,
                           repo_type="dataset", use_auth_token=token)
+        token_df.to_csv(local / "data.csv", index=False)
         repo.push_to_hub("Add annotated NER data")
         return f"🚀 https://huggingface.co/datasets/{repo_id}"
     except Exception as e:
         return f"❌ {e}"
+# ───────────────────────── UI ──────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
+    gr.Markdown("**Step 1** – upload CSV (`text` **or** `user`+`assistant`).")
     with gr.Row():
+        csv_file = gr.File(file_types=[".csv"])
+        load_btn = gr.Button("Load")
     status = gr.Textbox(interactive=False)
+    tok_table = gr.Dataframe(headers=["sentence_id", "token", "label"],
+                             datatype=["number", "str", "str"],
+                             visible=False)
+    with gr.Row(visible=False) as buttons:
+        save_btn = gr.Button("💾 Save")
+        tok_btn  = gr.Button("⬇︎ Tokens CSV")
+        iob_btn  = gr.Button("⬇︎ IOB CSV")
     file_tok = gr.File(visible=False)
     file_iob = gr.File(visible=False)
+    with gr.Accordion("📦 Push to Hugging Face Hub", open=False, visible=False) as acc:
         repo_in, token_in = gr.Textbox(label="repo"), gr.Textbox(label="token", type="password")
+        push_btn   = gr.Button("Push")
+        push_out   = gr.Textbox(interactive=False)
     # wiring
+    load_btn.click(load_csv, csv_file, [tok_table, status, buttons, file_tok, file_iob])
+    load_btn.click(lambda: gr.update(visible=True), None, acc)
+    save_btn.click(save_table, tok_table, status)
     tok_btn.click(export_tokens, outputs=file_tok)
     iob_btn.click(export_iob,  outputs=file_iob)
     push_btn.click(push_to_hub, [repo_in, token_in], push_out)
+    gr.Markdown("**Step 2** – label tokens (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push.")
 demo.launch()