# NOTE(review): removed non-Python page-scrape residue (Spaces header, file-size
# line, and line-number gutter) that preceded the code and would break parsing.
import gradio as gr
import pandas as pd
from pathlib import Path
# Global token storage: one row per token with columns
# (sentence_id, token, label). Populated by load_data(), overwritten by
# save_edits(), and read by the download_* helpers.
token_df = pd.DataFrame()
def make_sample_data(n=100):
    """Generate *n* generic sample sentences.

    Cycles deterministically through fixed person/verb/org/location lists,
    so row i is fully determined by i. Returns a DataFrame with a single
    ``text`` column.
    """
    names = ["Alice", "Bob", "Charlie", "Diane", "Eve"]
    companies = ["Acme Corp", "Globex", "Initech", "Umbrella", "Stark Inc"]
    places = ["Paris", "New York", "London", "Tokyo", "Sydney"]
    actions = ["visited", "joined", "founded", "traveled to", "met with"]
    sentences = [
        {
            "text": f"{names[i % len(names)]} {actions[i % len(actions)]} "
                    f"{companies[i % len(companies)]} in {places[i % len(places)]}."
        }
        for i in range(n)
    ]
    return pd.DataFrame(sentences)
def load_data(file):
    """Load the user's CSV (or fall back to sample data) and tokenize it.

    Returns a 3-tuple of Gradio updates: (table update, status message,
    action-row visibility update). On a missing ``text`` column, the table
    and action row stay hidden and an error message is shown.
    """
    global token_df
    # NOTE(review): assumes the upload component yields an object with a
    # .name path attribute — confirm against the installed Gradio version.
    df = pd.read_csv(file.name) if file else make_sample_data(100)
    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False)
        )
    # Whitespace-tokenize each sentence into (sentence_id, token, label) rows,
    # with every token starting out unlabeled ("O").
    token_df = pd.DataFrame(
        [
            {"sentence_id": sid, "token": tok, "label": "O"}
            for sid, txt in enumerate(df["text"])
            for tok in txt.split()
        ]
    )
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True)
    )
def save_edits(table):
    """Persist the edited annotation grid back into the global token table."""
    global token_df
    columns = ["sentence_id", "token", "label"]
    token_df = pd.DataFrame(table, columns=columns)
    return "💾 Edits saved."
def download_tokens():
    """Write the current token table to raw_tokens.csv and return its path."""
    out_path = Path("raw_tokens.csv")
    token_df.to_csv(out_path, index=False)
    return out_path
def download_iob():
    """Convert the flat labels to IOB tags and write ner_iob.csv.

    A token gets ``B-<label>`` when it starts a span (previous token in the
    same sentence had a different label or was "O") and ``I-<label>`` when it
    continues one. "O" tokens stay "O" and break any open span.
    """
    tags = []
    last_label = {}  # sentence_id -> previous token's label (None after an "O")
    for _, row in token_df.iterrows():
        sid = row["sentence_id"]
        label = row["label"]
        if label == "O":
            tags.append("O")
            last_label[sid] = None
        else:
            prefix = "I-" if last_label.get(sid) == label else "B-"
            tags.append(prefix + label)
            last_label[sid] = label
    tagged = token_df.copy()
    tagged["iob"] = tags
    tagged.to_csv("ner_iob.csv", index=False)
    return Path("ner_iob.csv")
# UI layout and event wiring. Component declaration order determines on-screen
# order, so statements here must not be rearranged.
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")
    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")
    status = gr.Textbox(label="Status", interactive=False)
    # Token grid starts hidden; load_data() makes it visible on success.
    table = gr.Dataframe(
        headers=["sentence_id","token","label"],
        editable=True,
        visible=False,
        label="📝 Annotate Tokens"
    )
    # Save/download actions are likewise hidden until data is loaded.
    with gr.Row(visible=False) as actions:
        save_btn = gr.Button("💾 Save Edits")
        # NOTE(review): fn=/file_name= constructor kwargs on DownloadButton are
        # Gradio-version-sensitive — some versions wire the callback via
        # .click() with the button as output instead; confirm against the
        # installed version.
        dl_tokens = gr.DownloadButton(
            fn=download_tokens,
            file_name="raw_tokens.csv",
            label="⬇️ Download Tokens CSV"
        )
        dl_iob = gr.DownloadButton(
            fn=download_iob,
            file_name="ner_iob.csv",
            label="⬇️ Download IOB CSV"
        )
    # Bind events: load feeds the table/status/action-row, save feeds status.
    load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions]
    )
    save_btn.click(
        save_edits,
        inputs=table,
        outputs=status
    )
    gr.Markdown("""
**Step 2:**
- Click into the **label** column and type one of:
`PER`, `ORG`, `LOC`, or leave as `O`.
- **Save Edits**, then download your token CSV or IOB‐tagged CSV.
""")
app.launch()