Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Files Community

Suzana committed 15 days ago

Commit

d1f4849

verified ·

1 Parent(s): 66bd7e3

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -63

app.py CHANGED Viewed

@@ -2,120 +2,101 @@ import gradio as gr
 import pandas as pd
 from pathlib import Path
-LABEL_CHOICES = ["O", "PER", "ORG", "LOC", "EV"]  # EV = Event
-token_df = pd.DataFrame()
-# ────────── helpers ─────────────────────────────────────────────
-def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
     rows = []
     if "text" in df.columns:
         lines = df["text"].astype(str)
     else:
-        # user + assistant fallback
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
-# ────────── callbacks ───────────────────────────────────────────
-def load_data(file):
     global token_df
     df = pd.read_csv(file.name)
     if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
-        return (None,
-                "❌ CSV must have a `text` column **or** `user`&`assistant` columns.",
-                gr.update(visible=False), gr.update(visible=False))
-    token_df = tokenize_df(df)
-    return (
-        gr.update(value=token_df, visible=True),
-        f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
-        gr.update(visible=True),
-        gr.update(visible=False)
-    )
-def save_edits(table_data):
     global token_df
-    token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
-    return "💾 Edits saved."
-def export_tokens():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
     return Path(path)
-def export_iob():
     iob, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
-            iob.append("O")
-            prev[sid] = None
         else:
-            iob.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
             prev[sid] = lbl
     out = token_df.copy(); out["iob"] = iob
     path = "ner_iob.csv"; out.to_csv(path, index=False)
     return Path(path)
-# ────────── UI ──────────────────────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
-    gr.Markdown(
-        "**Step 1** – Upload a CSV containing either a `text` column, or `user` & `assistant` columns."
-    )
     with gr.Row():
-        csv_file = gr.File(label="📁 Upload CSV", file_types=[".csv"])
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
-    # Editable token table (hidden until load)
     tok_table = gr.Dataframe(
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
-        column_config={
-            "label": gr.ColumnConfig(
-                label="label",
-                dtype="categorical",
-                choices=LABEL_CHOICES,
-            )
-        },
         row_count=0,
-        visible=False,
     )
-    # Action buttons row (hidden until load)
-    with gr.Row(visible=False) as buttons_row:
-        save_btn     = gr.Button("💾 Save")
-        download_tok = gr.Button("⬇︎ Tokens CSV")
-        download_iob = gr.Button("⬇︎ IOB CSV")
-    # File components that appear after export
-    file_tok = gr.File(label="Click to download", visible=False)
-    file_iob = gr.File(label="Click to download", visible=False)
-    # Bindings
-    load_btn.click(load_data, inputs=csv_file,
-                   outputs=[tok_table, status, buttons_row, file_tok])
-    save_btn.click(save_edits, inputs=tok_table, outputs=status)
-    download_tok.click(lambda: export_tokens(),
-                       outputs=file_tok)  # file appears for click
-    download_iob.click(lambda: export_iob(),
-                       outputs=file_iob)
     gr.Markdown(
-        "**Step 2** – In the `label` dropdown choose `PER`, `ORG`, `LOC`, `EV`, or leave `O`."
-        "\nAfter saving, use the download buttons."
     )
 demo.launch()

 import pandas as pd
 from pathlib import Path
+LABELS = {"PER", "ORG", "LOC", "EV", "O"}      # allowed tags
+token_df = pd.DataFrame()                      # global
+# ───────────────── tokenization ────────────────────────────────
+def tokenize(df: pd.DataFrame) -> pd.DataFrame:
     rows = []
     if "text" in df.columns:
         lines = df["text"].astype(str)
     else:
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
             rows.append({"sentence_id": sid, "token": tok, "label": "O"})
     return pd.DataFrame(rows)
+# ───────────────── callbacks ───────────────────────────────────
+def load_csv(file):
     global token_df
     df = pd.read_csv(file.name)
     if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
+        return None, "❌ CSV must have `text` OR `user`+`assistant` columns.", \
+               gr.update(visible=False), gr.update(visible=False)
+    token_df = tokenize(df)
+    return gr.update(value=token_df, visible=True), \
+           f"✅ Loaded {len(df)} rows – {len(token_df)} tokens.", \
+           gr.update(visible=True), gr.update(visible=False)
+def save_table(tbl):
     global token_df
+    token_df = pd.DataFrame(tbl, columns=["sentence_id", "token", "label"])
+    # simple validation
+    bad = token_df[~token_df["label"].isin(LABELS)]
+    if not bad.empty:
+        return "⚠️ Unknown labels found. Allowed: PER / ORG / LOC / EV / O"
+    return "💾 Saved."
+def to_tokens_csv():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
     return Path(path)
+def to_iob_csv():
+    # build IOB tags
     iob, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
+            iob.append("O"); prev[sid] = None
         else:
+            prefix = "I-" if prev.get(sid) == lbl else "B-"
+            iob.append(prefix + lbl)
             prev[sid] = lbl
     out = token_df.copy(); out["iob"] = iob
     path = "ner_iob.csv"; out.to_csv(path, index=False)
     return Path(path)
+# ───────────────── UI ──────────────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
+    gr.Markdown("**Step 1** – upload a CSV containing a `text` column *or* `user`+`assistant` columns.")
     with gr.Row():
+        csv_file = gr.File(file_types=[".csv"])
         load_btn = gr.Button("Load")
     status = gr.Textbox(label="Status", interactive=False)
     tok_table = gr.Dataframe(
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
         row_count=0,
+        col_count=3,
+        visible=False
     )
+    with gr.Row(visible=False) as btn_row:
+        save_btn = gr.Button("💾 Save")
+        dl_tok   = gr.Button("⬇︎ Tokens CSV")
+        dl_iob   = gr.Button("⬇︎ IOB CSV")
+    file_tok = gr.File(visible=False)
+    file_iob = gr.File(visible=False)
+    # bind
+    load_btn.click(load_csv, inputs=csv_file,
+                   outputs=[tok_table, status, btn_row, file_tok])
+    save_btn.click(save_table, inputs=tok_table, outputs=status)
+    dl_tok.click(lambda: to_tokens_csv(), outputs=file_tok)
+    dl_iob.click(lambda: to_iob_csv(),   outputs=file_iob)
     gr.Markdown(
+        "**Step 2** – type `PER`, `ORG`, `LOC`, `EV`, or `O` in the **label** column → Save → Download."
     )
 demo.launch()