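"""Label It! Mini-NER: a small Gradio app for token-level NER annotation.

Upload a CSV with a `text` column (or use the built-in sample sentences),
edit the `label` column per token (PER / ORG / LOC, or leave O), then download
the tokens as a raw CSV or with IOB tags.
"""
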
import gradio as gr
import pandas as pd
from pathlib import Path

# Global token storage
token_df = pd.DataFrame()

# Generate generic sample sentences
def make_sample_data(n=100):
    people = ["Alice","Bob","Charlie","Diane","Eve"]
    orgs   = ["Acme Corp","Globex","Initech","Umbrella","Stark Inc"]
    locs   = ["Paris","New York","London","Tokyo","Sydney"]
    verbs  = ["visited","joined","founded","traveled to","met with"]
    rows = []
    for i in range(n):
        p = people[i % len(people)]
        v = verbs[i % len(verbs)]
        o = orgs[i % len(orgs)]
        l = locs[i % len(locs)]
        rows.append({"text": f"{p} {v} {o} in {l}."})
    return pd.DataFrame(rows)

def load_data(file):
    global token_df
    # Load user CSV or fallback to sample
    if file:
        # gr.File may pass a filepath string (Gradio 4.x) or a tempfile-like
        # object with a .name attribute (Gradio 3.x); handle both.
        path = file if isinstance(file, str) else file.name
        df = pd.read_csv(path)
    else:
        df = make_sample_data(100)
    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False)
        )
    # Tokenize into (sentence_id, token, label)
    records = []
    for sid, txt in enumerate(df["text"]):
        for tok in txt.split():
            records.append({"sentence_id": sid, "token": tok, "label": "O"})
    token_df = pd.DataFrame(records)
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True)
    )

def save_edits(table):
    global token_df
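    # The edited gr.Dataframe arrives as a pandas DataFrame (its default
    # type="pandas"), so it can be re-wrapped directly.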
    token_df = pd.DataFrame(table, columns=["sentence_id","token","label"])
    return "💾 Edits saved."

def download_tokens():
    token_df.to_csv("raw_tokens.csv", index=False)
    return Path("raw_tokens.csv")

def download_iob():
    # Convert to IOB
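    # IOB scheme: the first token of an entity span gets B-<label>, and
    # consecutive tokens with the same label in the same sentence get
    # I-<label>. `prev` tracks the previous token's label per sentence_id,
    # so spans never continue across sentence boundaries.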
    iob, prev = [], {}
    for _, r in token_df.iterrows():
        sid, lbl = r["sentence_id"], r["label"]
        if lbl == "O":
            iob.append("O")
            prev[sid] = None
        else:
            tag = ("I-" if prev.get(sid)==lbl else "B-") + lbl
            iob.append(tag)
            prev[sid] = lbl
    out = token_df.copy()
    out["iob"] = iob
    out.to_csv("ner_iob.csv", index=False)
    return Path("ner_iob.csv")

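# Build the UI: upload controls, status box, editable token table, action buttons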
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")

    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")

    status = gr.Textbox(label="Status", interactive=False)

    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,  # gr.Dataframe uses `interactive`, not `editable`
        visible=False,
        label="📝 Annotate Tokens"
    )

    with gr.Row(visible=False) as actions:
        save_btn  = gr.Button("💾 Save Edits")
        # gr.DownloadButton takes no fn/file_name arguments; it downloads the
        # file path currently set as its value. The click handlers bound below
        # write the CSV and update each button's value.
        dl_tokens = gr.DownloadButton("⬇️ Download Tokens CSV")
        dl_iob    = gr.DownloadButton("⬇️ Download IOB CSV")

    # Bind events
    load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions]
    )
    save_btn.click(
        save_edits,
        inputs=table,
        outputs=status
    )
    # Each download button (re)generates its CSV and refreshes the file it serves
    dl_tokens.click(download_tokens, outputs=dl_tokens)
    dl_iob.click(download_iob, outputs=dl_iob)

    gr.Markdown("""
    **Step 2:**  
    - Click into the **label** column and type one of:  
      `PER`, `ORG`, `LOC`, or leave as `O`.  
    - **Save Edits**, then download your token CSV or IOB-tagged CSV.
    """)

if __name__ == "__main__":
    app.launch()