Suzana committed on
Commit
f3b49b2
·
verified ·
1 Parent(s): 1736c22

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -0
app.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from pathlib import Path
4
+
5
# Global token storage: holds the current (sentence_id, token, label) table
# shared by the load/save/download handlers below.
token_df = pd.DataFrame()
7
+
8
# Build a small demo corpus of templated sentences.
def make_sample_data(n=100):
    """Return a DataFrame with one `text` column holding n synthetic sentences.

    Sentences cycle deterministically through fixed person / organization /
    location / verb lists, so the output is reproducible for any n.
    """
    names = ["Alice", "Bob", "Charlie", "Diane", "Eve"]
    companies = ["Acme Corp", "Globex", "Initech", "Umbrella", "Stark Inc"]
    cities = ["Paris", "New York", "London", "Tokyo", "Sydney"]
    actions = ["visited", "joined", "founded", "traveled to", "met with"]
    sentences = [
        {
            "text": (
                f"{names[i % len(names)]} {actions[i % len(actions)]} "
                f"{companies[i % len(companies)]} in {cities[i % len(cities)]}."
            )
        }
        for i in range(n)
    ]
    return pd.DataFrame(sentences)
22
+
23
def load_data(file):
    """Load sentences from an uploaded CSV (or sample data) and tokenize them.

    Parameters
    ----------
    file : gradio file wrapper or None
        Uploaded CSV; must contain a `text` column. When None, 100 generated
        sample sentences are used instead.

    Returns
    -------
    tuple
        gr.update values for (token table, status message, actions row).
    """
    global token_df
    # Load the user CSV, or fall back to the generated sample corpus.
    if file is not None:
        df = pd.read_csv(file.name)
    else:
        df = make_sample_data(100)
    if "text" not in df.columns:
        return (
            gr.update(visible=False),
            "❌ CSV must contain a `text` column.",
            gr.update(visible=False)
        )
    # Tokenize into (sentence_id, token, label); every token starts as "O".
    # Null rows are skipped and non-string values coerced so a malformed CSV
    # (e.g. empty cells read as NaN) cannot crash whitespace tokenization.
    records = []
    for sid, txt in enumerate(df["text"]):
        if pd.isna(txt):
            continue
        for tok in str(txt).split():
            records.append({"sentence_id": sid, "token": tok, "label": "O"})
    token_df = pd.DataFrame(records)
    return (
        gr.update(value=token_df, visible=True),
        f"✅ Loaded {len(df)} sentences → {len(token_df)} tokens.",
        gr.update(visible=True)
    )
47
+
48
def save_edits(table):
    """Persist the edited annotation table into the module-level token_df.

    The table arrives as row data from the Gradio Dataframe component and is
    rebuilt as a DataFrame with the canonical column names.
    """
    global token_df
    edited = pd.DataFrame(table, columns=["sentence_id", "token", "label"])
    token_df = edited
    return "💾 Edits saved."
52
+
53
def download_tokens():
    """Write the current token table to raw_tokens.csv and return its path."""
    out_path = Path("raw_tokens.csv")
    token_df.to_csv(out_path, index=False)
    return out_path
56
+
57
def download_iob():
    """Export token labels in IOB format to ner_iob.csv and return the path.

    A label equal to the previous token's label within the same sentence is a
    continuation ("I-"); otherwise it opens a new span ("B-"). "O" tokens stay
    "O" and reset the running span for that sentence.
    """
    tags = []
    last_label = {}
    for sid, lbl in zip(token_df["sentence_id"], token_df["label"]):
        if lbl == "O":
            tags.append("O")
            last_label[sid] = None
        else:
            prefix = "I-" if last_label.get(sid) == lbl else "B-"
            tags.append(prefix + lbl)
            last_label[sid] = lbl
    result = token_df.copy()
    result["iob"] = tags
    result.to_csv("ner_iob.csv", index=False)
    return Path("ner_iob.csv")
73
+
74
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as app:
    gr.Markdown("# 🏷️ Label It! Mini-NER")
    gr.Markdown("**Step 1:** Upload a CSV with a `text` column, or leave blank for sample sentences.")

    with gr.Row():
        file_in = gr.File(label="📁 Upload CSV", file_types=[".csv"])
        load_btn = gr.Button("Load Data")

    status = gr.Textbox(label="Status", interactive=False)

    # Editable token table. gr.Dataframe has no `editable` parameter —
    # cell editing is enabled with `interactive=True`.
    table = gr.Dataframe(
        headers=["sentence_id", "token", "label"],
        interactive=True,
        visible=False,
        label="📝 Annotate Tokens"
    )

    with gr.Row(visible=False) as actions:
        save_btn = gr.Button("💾 Save Edits")
        # gr.DownloadButton takes no `fn`/`file_name` constructor arguments;
        # the file-producing function is bound below via .click(..., outputs=btn).
        dl_tokens = gr.DownloadButton(label="⬇️ Download Tokens CSV")
        dl_iob = gr.DownloadButton(label="⬇️ Download IOB CSV")

    # Bind events
    load_btn.click(
        load_data,
        inputs=file_in,
        outputs=[table, status, actions]
    )
    save_btn.click(
        save_edits,
        inputs=table,
        outputs=status
    )
    # Clicking a download button runs the exporter; returning the file path
    # as the button's own output triggers the browser download.
    dl_tokens.click(download_tokens, outputs=dl_tokens)
    dl_iob.click(download_iob, outputs=dl_iob)

    gr.Markdown("""
**Step 2:**
- Click into the **label** column and type one of:
  `PER`, `ORG`, `LOC`, or leave as `O`.
- **Save Edits**, then download your token CSV or IOB‐tagged CSV.
""")

app.launch()