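"""Label It! Mini-NER: a small Gradio app for token-level NER annotation.

Upload a CSV with a `text` column (or use the built-in sample sentences),
edit the `label` column per token (PER / ORG / LOC, or leave O), then download
the tokens as a raw CSV or with IOB tags.
"""
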
import gradio as gr
import pandas as pd
from pathlib import Path

# Global token storage
token_df = pd.DataFrame()

# Generate generic sample sentences
def make_sample_data(n=100):
    people = ["Alice","Bob","Charlie","Diane","Eve"]
    orgs   = ["Acme Corp","Globex","Initech","Umbrella","Stark Inc"]
    locs   = ["Paris","New York","London","Tokyo","Sydney"]
    verbs  = ["visited","joined","founded","traveled to","met with"]
    rows = []
    for i in range(n):
        p = people[i % len(people)]
        v = verbs[i % len(verbs)]
        o = orgs[i % len(orgs)]
        l = locs[i % len(locs)]
        rows.append({"text": f"{p} {v} {o} in {l}."})
    return pd.DataFrame(rows)

def load_data(file):
    global token_df
    # Load user CSV or fallback to sample
    if file:
        # gr.File may pass a filepath string (Gradio 4.x) or a tempfile-like
        # object with a .name attribute (Gradio 3.x); handle both.
        path = file if isinstance(file, str) else file.name
        df = pd.read_csv(path)
    else:
        df = make_sample_data(100)
    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False)
        )
    # Tokenize into (sentence_id, token, label)
    records = []
    for sid, txt in enumerate(df["text"]):
        for tok in txt.split():
            records.append({"sentence_id": sid, "token": tok, "label": "O"})
    token_df = pd.DataFrame(records)
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True)
    )

def save_edits(table):
    global token_df
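    # The edited gr.Dataframe arrives as a pandas DataFrame (its default
    # type="pandas"), so it can be re-wrapped directly.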
    token_df = pd.DataFrame(table, columns=["sentence_id","token","label"])
    return "💾 Edits saved."

def download_tokens():
    token_df.to_csv("raw_tokens.csv", index=False)
    return Path("raw_tokens.csv")

def download_iob():
    # Convert to IOB
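    # IOB scheme: the first token of an entity span gets B-<label>, and
    # consecutive tokens with the same label in the same sentence get
    # I-<label>. `prev` tracks the previous token's label per sentence_id,
    # so spans never continue across sentence boundaries.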
    iob, prev = [], {}
    for _, r in token_df.iterrows():
        sid, lbl = r["sentence_id"], r["label"]
        if lbl == "O":
            iob.append("O")
            prev[sid] = None
        else:
            tag = ("I-" if prev.get(sid)==lbl else "B-") + lbl
            iob.append(tag)
            prev[sid] = lbl
    out = token_df.copy()
    out["iob"] = iob
    out.to_csv("ner_iob.csv", index=False)
    return Path("ner_iob.csv")

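# Build the UI: upload controls, status box, editable token table, action buttons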
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")

    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")

    status = gr.Textbox(label="Status", interactive=False)

    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,  # gr.Dataframe uses `interactive`, not `editable`
        visible=False,
        label="📝 Annotate Tokens"
    )

    with gr.Row(visible=False) as actions:
        save_btn  = gr.Button("💾 Save Edits")
        # gr.DownloadButton takes no fn/file_name arguments; it downloads the
        # file path currently set as its value. The click handlers bound below
        # write the CSV and update each button's value.
        dl_tokens = gr.DownloadButton("⬇️ Download Tokens CSV")
        dl_iob    = gr.DownloadButton("⬇️ Download IOB CSV")

    # Bind events
    load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions]
    )
    save_btn.click(
        save_edits,
        inputs=table,
        outputs=status
    )
    # Each download button (re)generates its CSV and refreshes the file it serves
    dl_tokens.click(download_tokens, outputs=dl_tokens)
    dl_iob.click(download_iob, outputs=dl_iob)

    gr.Markdown("""
    **Step 2:**  
    - Click into the **label** column and type one of:  
      `PER`, `ORG`, `LOC`, or leave as `O`.  
    - **Save Edits**, then download your token CSV or IOB-tagged CSV.
    """)

if __name__ == "__main__":
    app.launch()