Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Community

Suzana committed 12 days ago

Commit

4455f2c

verified ·

1 Parent(s): d1f4849

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -43

app.py CHANGED Viewed

@@ -1,71 +1,102 @@
 import gradio as gr
 import pandas as pd
 from pathlib import Path
-LABELS = {"PER", "ORG", "LOC", "EV", "O"}      # allowed tags
-token_df = pd.DataFrame()                      # global
-# ───────────────── tokenization ────────────────────────────────
-def tokenize(df: pd.DataFrame) -> pd.DataFrame:
     rows = []
     if "text" in df.columns:
         lines = df["text"].astype(str)
-    else:
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
-# ───────────────── callbacks ───────────────────────────────────
 def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
-    if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
-        return None, "❌ CSV must have `text` OR `user`+`assistant` columns.", \
-               gr.update(visible=False), gr.update(visible=False)
-    token_df = tokenize(df)
-    return gr.update(value=token_df, visible=True), \
-           f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.", \
-           gr.update(visible=True), gr.update(visible=False)
-def save_table(tbl):
     global token_df
-    token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
-    # simple validation
-    bad = token_df[~token_df["label"].isin(LABELS)]
-    if not bad.empty:
-        return "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"
     return "💾 Saved."
-def to_tokens_csv():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
-    return Path(path)
-def to_iob_csv():
-    # build IOB tags
     iob, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
             iob.append("O"); prev[sid] = None
         else:
-            prefix = "I-" if prev.get(sid) == lbl else "B-"
-            iob.append(prefix + lbl)
-            prev[sid] = lbl
     out = token_df.copy(); out["iob"] = iob
     path = "ner_iob.csv"; out.to_csv(path, index=False)
-    return Path(path)
-# ───────────────── UI ──────────────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
-    gr.Markdown("**Step 1** – upload a CSV containing a `text` column *or* `user`+`assistant` columns.")
     with gr.Row():
-        csv_file = gr.File(file_types=[".csv"])
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
@@ -73,30 +104,40 @@ with gr.Blocks() as demo:
     tok_table = gr.Dataframe(
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
-        row_count=0,
-        col_count=3,
         visible=False
     )
-    with gr.Row(visible=False) as btn_row:
-        save_btn = gr.Button("💾 Save")
-        dl_tok   = gr.Button("⬇︎ Tokens CSV")
-        dl_iob   = gr.Button("⬇︎ IOB CSV")
     file_tok = gr.File(visible=False)
     file_iob = gr.File(visible=False)
-    # bind
     load_btn.click(load_csv, inputs=csv_file,
-                   outputs=[tok_table, status, btn_row, file_tok])
     save_btn.click(save_table, inputs=tok_table, outputs=status)
-    dl_tok.click(lambda: to_tokens_csv(), outputs=file_tok)
-    dl_iob.click(lambda: to_iob_csv(),   outputs=file_iob)
     gr.Markdown(
-        "**Step 2** – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the **label** column → Save → Download."
     )
 demo.launch()

 import gradio as gr
 import pandas as pd
 from pathlib import Path
+from huggingface_hub import HfApi, Repository
+# Allowed tags
+LABELS = {"PER", "ORG", "LOC", "EV", "O"}
+# Global token DataFrame
+token_df = pd.DataFrame()
+# ───────────────────────── helpers ──────────────────────────────
+def explode_to_tokens(df: pd.DataFrame) -> pd.DataFrame:
+    """Return DataFrame(sentence_id, token, label[=O])"""
     rows = []
     if "text" in df.columns:
         lines = df["text"].astype(str)
+    else:  # dialog pair
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
+# ───────────────────────── callbacks ────────────────────────────
 def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
+    valid = ("text" in df.columns) or ({"user", "assistant"}.issubset(df.columns))
+    if not valid:
+        return None, "❌ CSV must contain `text` OR `user`+`assistant` columns.", \
+               gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
+    token_df = explode_to_tokens(df)
+    return (
+        gr.update(value=token_df, visible=True,  # show table
+                  row_count=len(token_df)),
+        f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.",
+        gr.update(visible=True),   # show action row
+        gr.update(visible=False),  # hide token file
+        gr.update(visible=False)   # hide iob file
+    )
+def save_table(table_data):
     global token_df
+    token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
+    if not set(token_df["label"]).issubset(LABELS):
+        return "⚠️ Unknown label detected. Allowed: PER / ORG / LOC / EV / O"
     return "💾 Saved."
+def export_tokens():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
+    return gr.update(value=Path(path), visible=True)
+def export_iob():
     iob, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
             iob.append("O"); prev[sid] = None
         else:
+            tag = ("I-" if prev.get(sid) == lbl else "B-") + lbl
+            iob.append(tag); prev[sid] = lbl
     out = token_df.copy(); out["iob"] = iob
     path = "ner_iob.csv"; out.to_csv(path, index=False)
+    return gr.update(value=Path(path), visible=True)
+def push_to_hub(repo_id, token):
+    global token_df
+    try:
+        api = HfApi()
+        api.create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)
+        local_dir = Path(f"./{repo_id.replace('/','_')}")
+        if local_dir.exists():
+            for f in local_dir.iterdir(): f.unlink()
+            local_dir.rmdir()
+        repo = Repository(local_dir=str(local_dir),
+                          clone_from=repo_id,
+                          repo_type="dataset",
+                          use_auth_token=token)
+        token_df.to_csv(local_dir / "data.csv", index=False)
+        repo.push_to_hub(commit_message="Add annotated NER data")
+        return f"🚀 Pushed to https://huggingface.co/datasets/{repo_id}"
+    except Exception as e:
+        return f"❌ Push failed: {e}"
+# ───────────────────────── UI ───────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
+    gr.Markdown("**Step 1** – upload a CSV with a `text` column **or** a `user`+`assistant` pair.")
     with gr.Row():
+        csv_file = gr.File(file_types=[".csv"], label="📁 Upload CSV")
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
     tok_table = gr.Dataframe(
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
+        row_count=0, col_count=3,
         visible=False
     )
+    with gr.Row(visible=False) as action_row:
+        save_btn   = gr.Button("💾 Save")
+        dl_tok_btn = gr.Button("⬇︎ Tokens CSV")
+        dl_iob_btn = gr.Button("⬇︎ IOB CSV")
     file_tok = gr.File(visible=False)
     file_iob = gr.File(visible=False)
+    # Push accordion
+    with gr.Accordion("📦 Push to Hugging Face Hub", open=False) as push_acc:
+        repo_in  = gr.Textbox(label="dataset repo (username/name)")
+        token_in = gr.Textbox(label="HF Token", type="password")
+        push_btn = gr.Button("Push")
+        push_out = gr.Textbox(label="Push Status", interactive=False)
+    # Hide accordion until data load
+    push_acc.visible = False
+    # ── wiring
     load_btn.click(load_csv, inputs=csv_file,
+                   outputs=[tok_table, status, action_row, file_tok, file_iob])
+    load_btn.click(lambda: gr.update(visible=True), None, push_acc)  # show accordion after load
     save_btn.click(save_table, inputs=tok_table, outputs=status)
+    dl_tok_btn.click(export_tokens, outputs=file_tok)
+    dl_iob_btn.click(export_iob,  outputs=file_iob)
+    push_btn.click(push_to_hub, inputs=[repo_in, token_in], outputs=push_out)
     gr.Markdown(
+        "**Step 2** – edit the `label` column (`PER`, `ORG`, `LOC`, `EV`, `O`) ➜ Save ➜ Download / Push."
     )
 demo.launch()