# NOTE(review): the original lines here were HuggingFace Spaces page residue
# captured by the scraper ("Spaces: Sleeping", a file-size line, a commit-hash
# gutter, and a line-number gutter). They are not part of the program and would
# be syntax errors in Python, so they are preserved only as this note.
import gradio as gr
import pandas as pd
# In-memory token DataFrame shared across callbacks; populated by load_data
# with columns [sentence_id, token, label] and overwritten by save_edits.
token_df = pd.DataFrame()
def make_sample_data(n=100):
    """Build a sample corpus of *n* one-sentence rows in a `text` column.

    Sentences follow the template "{person} {verb} {org} in {loc}." with the
    constituent lists cycled independently, so the output is deterministic.
    """
    names = ["Alice", "Bob", "Charlie", "Diane", "Eve"]
    companies = ["Acme", "Globex", "Initech", "Umbrella", "Stark"]
    cities = ["Paris", "NYC", "London", "Tokyo", "Sydney"]
    actions = ["visited", "joined", "founded", "traveled to", "met with"]
    texts = [
        {
            "text": (
                f"{names[i % len(names)]} {actions[i % len(actions)]} "
                f"{companies[i % len(companies)]} in {cities[i % len(cities)]}."
            )
        }
        for i in range(n)
    ]
    return pd.DataFrame(texts)
def load_data(file):
    """Load the uploaded CSV (or generate sample data), tokenize, and show the UI.

    Args:
        file: Gradio file object with a ``.name`` path, or None/falsy to use
            100 generated sample sentences.

    Returns:
        A 3-tuple of gr.update objects/strings for (table, status, actions):
        on success the annotation table is populated and made visible; if the
        CSV lacks a `text` column, the table and actions stay hidden and the
        status explains the problem.
    """
    global token_df
    # Load uploaded or sample
    if file:
        df = pd.read_csv(file.name)
    else:
        df = make_sample_data(100)
    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "β CSV must contain a `text` column.",
            gr.update(visible=False)
        )
    # Tokenize: one row per whitespace-separated token, default label "O".
    records = [
        {"sentence_id": sid, "token": tok, "label": "O"}
        for sid, txt in enumerate(df["text"])
        for tok in txt.split()
    ]
    token_df = pd.DataFrame(records)
    # BUG FIX: the status f-string was previously broken across two source
    # lines mid-literal (a syntax error); rejoined into a single literal.
    return (
        gr.update(value=token_df, visible=True),
        f"β Loaded {len(df)} sentences β {len(token_df)} tokens.",
        gr.update(visible=True),
    )
def save_edits(table):
    """Replace the module-level token_df with the user-edited table contents."""
    global token_df
    columns = ["sentence_id", "token", "label"]
    token_df = pd.DataFrame(table, columns=columns)
    return "πΎ Edits saved."
def download_tokens():
    """Dump the current token table to raw_tokens.csv and return its path."""
    path = "raw_tokens.csv"
    token_df.to_csv(path, index=False)
    return path
def download_iob():
    """Derive IOB-style tags from the flat labels and write ner_iob.csv.

    A non-"O" label gets "I-" when it repeats the sentence's previous label,
    otherwise "B-"; any "O" resets that sentence's run.
    """
    tags = []
    last_seen = {}  # sentence_id -> most recent label (None after an "O")
    for _, row in token_df.iterrows():
        sid = row["sentence_id"]
        label = row["label"]
        if label == "O":
            tags.append("O")
            last_seen[sid] = None
        else:
            prefix = "I-" if last_seen.get(sid) == label else "B-"
            tags.append(prefix + label)
            last_seen[sid] = label
    tagged = token_df.copy()
    tagged["iob"] = tags
    tagged.to_csv("ner_iob.csv", index=False)
    return "ner_iob.csv"
# UI layout: upload/load row, status box, editable token table, and an
# action row (save + downloads) that load_data reveals on success.
with gr.Blocks() as app:
    gr.Markdown("# π·οΈ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column (or leave blank for sample).")
    with gr.Row():
        file_in = gr.File(label="π Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")
    status = gr.Textbox(label="Status", interactive=False)
    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,
        visible=False,
        label="π Annotate Tokens"
    )
    # Action buttons: Save + Downloads
    with gr.Row(visible=False) as actions:
        save_btn = gr.Button("πΎ Save Edits")
        dl_tokens = gr.DownloadButton(label="β¬οΈ Download Tokens CSV")
        dl_iob = gr.DownloadButton(label="β¬οΈ Download IOB CSV")
    load_btn.click(load_data, inputs=file_in, outputs=[table, status, actions])
    save_btn.click(save_edits, inputs=table, outputs=status)
    # BUG FIX: gr.DownloadButton has no `fn`/`file_name` constructor arguments,
    # so the original download callbacks were never invoked. Wire each button's
    # click event to its generator function; the returned file path becomes the
    # button's downloadable value.
    dl_tokens.click(download_tokens, outputs=dl_tokens)
    dl_iob.click(download_iob, outputs=dl_iob)
    gr.Markdown("""
**Step 2:**
β’ Click into the **label** column and type one of: `PER`, `ORG`, `LOC`, or leave as `O`.
β’ Press **Save Edits** to lock your annotations.
β’ Download your **Tokens CSV** or **IOB CSV** with the buttons above.
""")
app.launch()