Spaces:

Suzana
/

labelit-mini-ner

Sleeping

App Files Community

Suzana committed 14 days ago

Commit

66bd7e3

verified ·

1 Parent(s): efeca40

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -47

app.py CHANGED Viewed

@@ -2,40 +2,40 @@ import gradio as gr
 import pandas as pd
 from pathlib import Path
-token_df = pd.DataFrame()          # global store
-# ───────────────────────── helpers ──────────────────────────────
 def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
-    """Explode dataframe into token rows with default 'O' label."""
-    records = []
     if "text" in df.columns:
         lines = df["text"].astype(str)
-    else:  # user+assistant dialog
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
-            records.append({"sentence_id": sid, "token": tok, "label": "O"})
-    return pd.DataFrame(records)
-# ───────────────────────── callbacks ────────────────────────────
 def load_data(file):
     global token_df
     df = pd.read_csv(file.name)
     if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
-        return (None, "❌ CSV must have `text` **or** `user`+`assistant` columns.",
                 gr.update(visible=False), gr.update(visible=False))
     token_df = tokenize_df(df)
     return (
-        # Show table with correct row_count
-        gr.update(value=token_df.values.tolist(),          # list-of-lists
-                  row_count=len(token_df),
-                  visible=True),
         f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
-        gr.update(visible=True),     # show action row
-        gr.update(visible=False)     # hide downloads until first export
     )
 def save_edits(table_data):
@@ -43,34 +43,31 @@ def save_edits(table_data):
     token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
     return "💾 Edits saved."
-def make_tokens_csv():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
     return Path(path)
-def make_iob_csv():
-    iob_tags, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
-            iob_tags.append("O")
             prev[sid] = None
         else:
-            prefix = "I-" if prev.get(sid) == lbl else "B-"
-            iob_tags.append(prefix + lbl)
             prev[sid] = lbl
-    out = token_df.copy()
-    out["iob"] = iob_tags
-    path = "ner_iob.csv"
-    out.to_csv(path, index=False)
     return Path(path)
-# ───────────────────────── UI ───────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
     gr.Markdown(
-        "**Step 1** – Upload a CSV containing either a `text` column or `user` + `assistant` columns."
     )
     with gr.Row():
@@ -79,38 +76,46 @@ with gr.Blocks() as demo:
     status = gr.Textbox(label="Status", interactive=False)
-    # Token table (hidden until data loaded)
     tok_table = gr.Dataframe(
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
-        row_count=0, col_count=3,
-        visible=False
     )
-    # Buttons row (hidden until loaded)
-    with gr.Row(visible=False) as action_row:
-        save_btn  = gr.Button("💾 Save")
-        dl_tok_btn= gr.Button("⬇︎ Download Tokens CSV")
-        dl_iob_btn= gr.Button("⬇︎ Download IOB CSV")
-    # Hidden download files (appear only after first export)
-    dl_tokens_file = gr.File(label="Tokens CSV", visible=False)
-    dl_iob_file    = gr.File(label="IOB CSV",    visible=False)
-    # Bind events
-    load_btn.click(load_data,
-                   inputs=csv_file,
-                   outputs=[tok_table, status, action_row, dl_tokens_file])
     save_btn.click(save_edits, inputs=tok_table, outputs=status)
-    dl_tok_btn.click(lambda: make_tokens_csv(),
-                     outputs=dl_tokens_file)
-    dl_iob_btn.click(lambda: make_iob_csv(),
-                     outputs=dl_iob_file)
     gr.Markdown(
-        "**Step 2** – Edit the `label` column (`PER`, `ORG`, `LOC`, or `O`) → click **Save** → export."
     )
 demo.launch()

 import pandas as pd
 from pathlib import Path
+LABEL_CHOICES = ["O", "PER", "ORG", "LOC", "EV"]  # EV = Event
+token_df = pd.DataFrame()
+# ────────── helpers ─────────────────────────────────────────────
 def tokenize_df(df: pd.DataFrame) -> pd.DataFrame:
+    rows = []
     if "text" in df.columns:
         lines = df["text"].astype(str)
+    else:
+        # user + assistant fallback
         lines = df.apply(lambda r: f"User: {r['user']} Assistant: {r['assistant']}", axis=1)
     for sid, line in enumerate(lines):
         for tok in line.split():
+            rows.append({"sentence_id": sid, "token": tok, "label": "O"})
+    return pd.DataFrame(rows)
+# ────────── callbacks ───────────────────────────────────────────
 def load_data(file):
     global token_df
     df = pd.read_csv(file.name)
     if "text" not in df.columns and not {"user", "assistant"}.issubset(df.columns):
+        return (None,
+                "❌ CSV must have a `text` column **or** `user`&`assistant` columns.",
                 gr.update(visible=False), gr.update(visible=False))
     token_df = tokenize_df(df)
     return (
+        gr.update(value=token_df, visible=True),
         f"✅ Loaded {len(df)} rows → {len(token_df)} tokens.",
+        gr.update(visible=True),
+        gr.update(visible=False)
     )
 def save_edits(table_data):
     token_df = pd.DataFrame(table_data, columns=["sentence_id", "token", "label"])
     return "💾 Edits saved."
+def export_tokens():
     path = "raw_tokens.csv"
     token_df.to_csv(path, index=False)
     return Path(path)
+def export_iob():
+    iob, prev = [], {}
     for _, r in token_df.iterrows():
         sid, lbl = r["sentence_id"], r["label"]
         if lbl == "O":
+            iob.append("O")
             prev[sid] = None
         else:
+            iob.append(("I-" if prev.get(sid) == lbl else "B-") + lbl)
             prev[sid] = lbl
+    out = token_df.copy(); out["iob"] = iob
+    path = "ner_iob.csv"; out.to_csv(path, index=False)
     return Path(path)
+# ────────── UI ──────────────────────────────────────────────────
 with gr.Blocks() as demo:
     gr.Markdown("# 🏷️ Label It! Mini-NER")
     gr.Markdown(
+        "**Step 1** – Upload a CSV containing either a `text` column, or `user` & `assistant` columns."
     )
     with gr.Row():
     status = gr.Textbox(label="Status", interactive=False)
+    # Editable token table (hidden until load)
     tok_table = gr.Dataframe(
         headers=["sentence_id", "token", "label"],
         datatype=["number", "str", "str"],
+        column_config={
+            "label": gr.ColumnConfig(
+                label="label",
+                dtype="categorical",
+                choices=LABEL_CHOICES,
+            )
+        },
+        row_count=0,
+        visible=False,
     )
+    # Action buttons row (hidden until load)
+    with gr.Row(visible=False) as buttons_row:
+        save_btn     = gr.Button("💾 Save")
+        download_tok = gr.Button("⬇︎ Tokens CSV")
+        download_iob = gr.Button("⬇︎ IOB CSV")
+    # File components that appear after export
+    file_tok = gr.File(label="Click to download", visible=False)
+    file_iob = gr.File(label="Click to download", visible=False)
+    # Bindings
+    load_btn.click(load_data, inputs=csv_file,
+                   outputs=[tok_table, status, buttons_row, file_tok])
     save_btn.click(save_edits, inputs=tok_table, outputs=status)
+    download_tok.click(lambda: export_tokens(),
+                       outputs=file_tok)  # file appears for click
+    download_iob.click(lambda: export_iob(),
+                       outputs=file_iob)
     gr.Markdown(
+        "**Step 2** – In the `label` dropdown choose `PER`, `ORG`, `LOC`, `EV`, or leave `O`."
+        "\nAfter saving, use the download buttons."
     )
 demo.launch()