nam pham committed on
Commit
090dddd
·
1 Parent(s): 1422152

feat: create app

Files changed (6)
  1. .python-version +1 -0
  2. app.py +651 -0
  3. data/annotated_data.json +0 -0
  4. pyproject.toml +12 -0
  5. requirements.txt +4 -0
  6. uv.lock +0 -0
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
app.py ADDED
@@ -0,0 +1,651 @@
+ import gradio as gr
+ from huggingface_hub import HfApi
+ import os
+ import re
+ import json
+ import torch
+ import random
+ from typing import List, Dict, Union, Tuple
+ from gliner import GLiNER
+ from datasets import load_dataset
+
+ # Available models for annotation
+ AVAILABLE_MODELS = [
+     "BookingCare/gliner-multi-healthcare",
+     "knowledgator/gliner-multitask-large-v0.5",
+     "knowledgator/gliner-multitask-base-v0.5"
+ ]
+
+ # Dataset Viewer Classes and Functions
+ class DynamicDataset:
+     def __init__(
+         self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
+     ) -> None:
+         self.data = data
+         self.data_len = len(self.data)
+         self.current = -1
+         for example in self.data:
+             if "validated" not in example:
+                 example["validated"] = False
+
+     def next_example(self):
+         self.current += 1
+         if self.current > self.data_len - 1:
+             self.current = self.data_len - 1
+         elif self.current < 0:
+             self.current = 0
+
+     def previous_example(self):
+         self.current -= 1
+         if self.current > self.data_len - 1:
+             self.current = self.data_len - 1
+         elif self.current < 0:
+             self.current = 0
+
+     def example_by_id(self, id):
+         self.current = id
+         if self.current > self.data_len - 1:
+             self.current = self.data_len - 1
+         elif self.current < 0:
+             self.current = 0
+
+     def validate(self):
+         self.data[self.current]["validated"] = True
+
+     def load_current_example(self):
+         return self.data[self.current]
+
+ def tokenize_text(text):
+     """Tokenize the input text into a list of tokens."""
+     return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
+
+ def join_tokens(tokens):
+     # Join tokens with spaces, attaching punctuation directly to the preceding token
+     text = ""
+     for token in tokens:
+         if token in {",", ".", "!", "?", ":", ";", "..."}:
+             text = text.rstrip() + token
+         else:
+             text += " " + token
+     return text.strip()
+
+ def prepare_for_highlight(data):
+     tokens = data["tokenized_text"]
+     ner = data["ner"]
+
+     highlighted_text = []
+     current_entity = None
+     entity_tokens = []
+     normal_tokens = []
+
+     for idx, token in enumerate(tokens):
+         # Check if the current token is the start of a new entity
+         if current_entity is None or idx > current_entity[1]:
+             if entity_tokens:
+                 highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
+                 entity_tokens = []
+             current_entity = next((entity for entity in ner if entity[0] == idx), None)
+
+         # If the current token is part of an entity
+         if current_entity and current_entity[0] <= idx <= current_entity[1]:
+             if normal_tokens:
+                 highlighted_text.append((" ".join(normal_tokens), None))
+                 normal_tokens = []
+             entity_tokens.append(token + " ")
+         else:
+             if entity_tokens:
+                 highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
+                 entity_tokens = []
+             normal_tokens.append(token + " ")
+
+     # Append any remaining tokens
+     if entity_tokens:
+         highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
+     if normal_tokens:
+         highlighted_text.append((" ".join(normal_tokens), None))
+     # Clean up spaces before punctuation
+     cleaned_highlighted_text = []
+     for text, label in highlighted_text:
+         cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
+         cleaned_highlighted_text.append((cleaned_text, label))
+
+     return cleaned_highlighted_text
+
+ def extract_tokens_and_labels(data: List[Dict[str, Union[str, None]]]) -> Tuple[List[str], List[Tuple[int, int, str]]]:
+     tokens = []
+     ner = []
+
+     token_start_idx = 0
+
+     for entry in data:
+         chunk = entry['token']
+         label = entry['class_or_confidence']
+
+         # Tokenize the current text chunk
+         token_list = tokenize_text(chunk)
+
+         # Append tokens to the main tokens list
+         tokens.extend(token_list)
+
+         if label:
+             token_end_idx = token_start_idx + len(token_list) - 1
+             ner.append((token_start_idx, token_end_idx, label))
+
+         token_start_idx += len(token_list)
+
+     return tokens, ner
+
+ # Global variables for dataset viewer
+ dynamic_dataset = None
+
+ def update_example(data):
+     global dynamic_dataset
+     tokens, ner = extract_tokens_and_labels(data)
+     dynamic_dataset.data[dynamic_dataset.current]["tokenized_text"] = tokens
+     dynamic_dataset.data[dynamic_dataset.current]["ner"] = ner
+     return prepare_for_highlight(dynamic_dataset.load_current_example())
+
+ def validate_example():
+     global dynamic_dataset
+     dynamic_dataset.data[dynamic_dataset.current]["validated"] = True
+     return [("The example was validated!", None)]
+
+ def next_example():
+     global dynamic_dataset
+     dynamic_dataset.next_example()
+     return prepare_for_highlight(dynamic_dataset.load_current_example()), dynamic_dataset.current
+
+ def previous_example():
+     global dynamic_dataset
+     dynamic_dataset.previous_example()
+     return prepare_for_highlight(dynamic_dataset.load_current_example()), dynamic_dataset.current
+
+ def save_dataset(inp):
+     global dynamic_dataset
+     with open("data/annotated_data.json", "wt") as file:
+         json.dump(dynamic_dataset.data, file)
+     return [("The validated dataset was saved as data/annotated_data.json", None)]
+
+ def load_annotated_dataset():
+     # Named load_annotated_dataset so it does not shadow datasets.load_dataset imported above
+     global dynamic_dataset
+     try:
+         with open("data/annotated_data.json", 'rt') as dataset:
+             ANNOTATED_DATA = json.load(dataset)
+         dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
+         max_value = len(dynamic_dataset.data) - 1 if dynamic_dataset.data else 0
+         return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=0, maximum=max_value)
+     except Exception as e:
+         return [("Error loading dataset: " + str(e), None)], gr.update(value=0)
+
+ # Original annotation functions
+ def transform_data(data):
+     tokens = tokenize_text(data['text'])
+     spans = []
+
+     for entity in data['entities']:
+         entity_tokens = tokenize_text(entity['word'])
+         entity_length = len(entity_tokens)
+
+         # Find the start and end indices of each entity in the tokenized text
+         for i in range(len(tokens) - entity_length + 1):
+             if tokens[i:i + entity_length] == entity_tokens:
+                 spans.append([i, i + entity_length - 1, entity['entity']])
+                 break
+
+     return {"tokenized_text": tokens, "ner": spans, "validated": False}
+
+ def merge_entities(entities):
+     if not entities:
+         return []
+     merged = []
+     current = entities[0]
+     for next_entity in entities[1:]:
+         if next_entity['entity'] == current['entity'] and (next_entity['start'] == current['end'] + 1 or next_entity['start'] == current['end']):
+             current['word'] += ' ' + next_entity['word']
+             current['end'] = next_entity['end']
+         else:
+             merged.append(current)
+             current = next_entity
+     merged.append(current)
+     return merged
+
+ def annotate_text(model, text, labels: List[str], threshold: float, nested_ner: bool) -> Dict:
+     labels = [label.strip() for label in labels]
+     r = {
+         "text": text,
+         "entities": [
+             {
+                 "entity": entity["label"],
+                 "word": entity["text"],
+                 "start": entity["start"],
+                 "end": entity["end"],
+                 "score": 0,
+             }
+             for entity in model.predict_entities(
+                 text, labels, flat_ner=not nested_ner, threshold=threshold
+             )
+         ],
+     }
+     r["entities"] = merge_entities(r["entities"])
+     return transform_data(r)
+
+ class AutoAnnotator:
+     def __init__(
+         self, model: str = "knowledgator/gliner-multitask-large-v0.5",
+         device=torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
+     ) -> None:
+         self.model = GLiNER.from_pretrained(model).to(device)
+         self.annotated_data = []
+         self.stat = {
+             "total": None,
+             "current": -1
+         }
+
+     def auto_annotate(
+         self, data: List[str], labels: List[str],
+         prompt: Union[str, List[str]] = None, threshold: float = 0.5, nested_ner: bool = False
+     ) -> List[Dict]:
+         self.stat["total"] = len(data)
+         self.stat["current"] = -1  # Reset current progress
+         for text in data:
+             self.stat["current"] += 1
+             if isinstance(prompt, list):
+                 prompt_text = random.choice(prompt)
+             else:
+                 prompt_text = prompt
+             text = f"{prompt_text}\n{text}" if prompt_text else text
+
+             annotation = annotate_text(self.model, text, labels, threshold, nested_ner)
+
+             if not annotation["ner"]:  # If no entities were identified
+                 annotation = {"tokenized_text": tokenize_text(text), "ner": [], "validated": False}
+
+             self.annotated_data.append(annotation)
+         return self.annotated_data
+
+ # Global variables
+ annotator = None
+ sentences = []
+
+ def process_uploaded_file(file_obj):
+     global sentences
+     if file_obj is None:
+         return "Please upload a file first!"
+
+     try:
+         # Read the uploaded file
+         with open(file_obj.name, 'r', encoding='utf-8') as f:
+             sentences = [line.strip() for line in f if line.strip()]
+         return f"Successfully loaded {len(sentences)} sentences from file!"
+     except Exception as e:
+         return f"Error reading file: {str(e)}"
+
+ def annotate(model, labels, threshold, prompt):
+     global annotator
+     try:
+         if not sentences:
+             return "Please upload a file with text first!"
+
+         labels = [label.strip() for label in labels.split(",")]
+         annotator = AutoAnnotator(model)
+         annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
+
+         # Save annotated data
+         os.makedirs("data", exist_ok=True)
+         with open("data/annotated_data.json", "wt") as file:
+             json.dump(annotated_data, file, ensure_ascii=False)
+
+         # Upload to Hugging Face Hub
+         api = HfApi()
+         api.upload_file(
+             path_or_fileobj="data/annotated_data.json",
+             path_in_repo="annotated_data.json",
+             repo_id="YOUR_USERNAME/YOUR_REPO_NAME",  # Replace with your repo
+             repo_type="dataset"
+         )
+
+         return "Successfully annotated and saved to Hugging Face Hub!"
+     except Exception as e:
+         return f"Error during annotation: {str(e)}"
+
+ def convert_hf_dataset_to_ner_format(dataset):
+     """Convert Hugging Face dataset to NER format"""
+     converted_data = []
+     for item in dataset:
+         # Assumes the dataset has 'tokens' and 'ner_tags' fields with string BIO-style tags;
+         # adjust the field names (or map integer tag ids to strings) for other datasets
+         if 'tokens' in item and 'ner_tags' in item:
+             ner_spans = []
+             current_span = None
+
+             for i, (token, tag) in enumerate(zip(item['tokens'], item['ner_tags'])):
+                 if tag != 'O':  # Not outside an entity
+                     if current_span is None:
+                         current_span = [i, i, tag]
+                     elif tag == current_span[2]:
+                         current_span[1] = i
+                     else:
+                         ner_spans.append(current_span)
+                         current_span = [i, i, tag]
+                 elif current_span is not None:
+                     ner_spans.append(current_span)
+                     current_span = None
+
+             if current_span is not None:
+                 ner_spans.append(current_span)
+
+             converted_data.append({
+                 "tokenized_text": item['tokens'],
+                 "ner": ner_spans,
+                 "validated": False
+             })
+
+     return converted_data
+
+ def load_from_huggingface(dataset_name: str, split: str = "train"):
+     """Load dataset from Hugging Face Hub"""
+     try:
+         dataset = load_dataset(dataset_name, split=split)
+         converted_data = convert_hf_dataset_to_ner_format(dataset)
+
+         # Save the converted data
+         os.makedirs("data", exist_ok=True)
+         with open("data/annotated_data.json", "wt") as file:
+             json.dump(converted_data, file, ensure_ascii=False)
+
+         return f"Successfully loaded and converted dataset: {dataset_name}"
+     except Exception as e:
+         return f"Error loading dataset: {str(e)}"
+
+ def load_from_local_file(file_path: str, file_format: str = "json"):
+     """Load and convert data from local file in various formats"""
+     try:
+         if file_format == "json":
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 data = json.load(f)
+             if isinstance(data, list):
+                 # If data is already in the correct format
+                 if all("tokenized_text" in item and "ner" in item for item in data):
+                     return data
+                 # Convert from other JSON formats
+                 converted_data = []
+                 for item in data:
+                     if "tokens" in item and "ner_tags" in item:
+                         ner_spans = []
+                         current_span = None
+                         for i, (token, tag) in enumerate(zip(item["tokens"], item["ner_tags"])):
+                             if tag != "O":
+                                 if current_span is None:
+                                     current_span = [i, i, tag]
+                                 elif tag == current_span[2]:
+                                     current_span[1] = i
+                                 else:
+                                     ner_spans.append(current_span)
+                                     current_span = [i, i, tag]
+                             elif current_span is not None:
+                                 ner_spans.append(current_span)
+                                 current_span = None
+                         if current_span is not None:
+                             ner_spans.append(current_span)
+                         converted_data.append({
+                             "tokenized_text": item["tokens"],
+                             "ner": ner_spans,
+                             "validated": False
+                         })
+                 return converted_data
+             else:
+                 raise ValueError("JSON file must contain a list of examples")
+
+         elif file_format == "conll":
+             converted_data = []
+             current_example = {"tokens": [], "ner_tags": []}
+
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 for line in f:
+                     line = line.strip()
+                     if line:
+                         if line.startswith("#"):
+                             continue
+                         parts = line.split()
+                         if len(parts) >= 2:
+                             token, tag = parts[0], parts[-1]
+                             current_example["tokens"].append(token)
+                             current_example["ner_tags"].append(tag)
+                     elif current_example["tokens"]:
+                         # Convert the current example
+                         ner_spans = []
+                         current_span = None
+                         for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
+                             if tag != "O":
+                                 if current_span is None:
+                                     current_span = [i, i, tag]
+                                 elif tag == current_span[2]:
+                                     current_span[1] = i
+                                 else:
+                                     ner_spans.append(current_span)
+                                     current_span = [i, i, tag]
+                             elif current_span is not None:
+                                 ner_spans.append(current_span)
+                                 current_span = None
+                         if current_span is not None:
+                             ner_spans.append(current_span)
+
+                         converted_data.append({
+                             "tokenized_text": current_example["tokens"],
+                             "ner": ner_spans,
+                             "validated": False
+                         })
+                         current_example = {"tokens": [], "ner_tags": []}
+
+             # Handle the last example if present
+             if current_example["tokens"]:
+                 ner_spans = []
+                 current_span = None
+                 for i, (token, tag) in enumerate(zip(current_example["tokens"], current_example["ner_tags"])):
+                     if tag != "O":
+                         if current_span is None:
+                             current_span = [i, i, tag]
+                         elif tag == current_span[2]:
+                             current_span[1] = i
+                         else:
+                             ner_spans.append(current_span)
+                             current_span = [i, i, tag]
+                     elif current_span is not None:
+                         ner_spans.append(current_span)
+                         current_span = None
+                 if current_span is not None:
+                     ner_spans.append(current_span)
+
+                 converted_data.append({
+                     "tokenized_text": current_example["tokens"],
+                     "ner": ner_spans,
+                     "validated": False
+                 })
+
+             return converted_data
+
+         elif file_format == "txt":
+             # Simple text file with one sentence per line
+             converted_data = []
+             with open(file_path, 'r', encoding='utf-8') as f:
+                 for line in f:
+                     line = line.strip()
+                     if line:
+                         tokens = tokenize_text(line)
+                         converted_data.append({
+                             "tokenized_text": tokens,
+                             "ner": [],
+                             "validated": False
+                         })
+             return converted_data
+
+         else:
+             raise ValueError(f"Unsupported file format: {file_format}")
+
+     except Exception as e:
+         raise Exception(f"Error loading file: {str(e)}")
+
+ def process_local_file(file_obj, file_format):
+     """Process uploaded local file"""
+     if file_obj is None:
+         return "Please upload a file first!"
+
+     try:
+         # Load and convert the data
+         data = load_from_local_file(file_obj.name, file_format)
+
+         # Save the converted data
+         os.makedirs("data", exist_ok=True)
+         with open("data/annotated_data.json", "wt") as file:
+             json.dump(data, file, ensure_ascii=False)
+
+         return f"Successfully loaded and converted {len(data)} examples from {file_format} file!"
+     except Exception as e:
+         return f"Error processing file: {str(e)}"
+
508
+ with gr.Blocks() as demo:
509
+ gr.Markdown("# NER Annotation Tool")
510
+
511
+ with gr.Tabs():
512
+ with gr.TabItem("Auto Annotation"):
513
+ with gr.Row():
514
+ with gr.Column():
515
+ file_uploader = gr.File(label="Upload text file (one sentence per line)")
516
+ upload_status = gr.Textbox(label="Upload Status")
517
+ file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status])
518
+
519
+ with gr.Column():
520
+ model = gr.Dropdown(
521
+ label="Choose the model for annotation",
522
+ choices=AVAILABLE_MODELS,
523
+ value=AVAILABLE_MODELS[0]
524
+ )
525
+ labels = gr.Textbox(
526
+ label="Labels",
527
+ placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)",
528
+ scale=2
529
+ )
530
+ threshold = gr.Slider(
531
+ 0, 1,
532
+ value=0.3,
533
+ step=0.01,
534
+ label="Threshold",
535
+ info="Lower threshold increases entity predictions"
536
+ )
537
+ prompt = gr.Textbox(
538
+ label="Prompt",
539
+ placeholder="Enter your annotation prompt (optional)",
540
+ scale=2
541
+ )
542
+ annotate_btn = gr.Button("Annotate Data")
543
+ output_info = gr.Textbox(label="Processing Status")
544
+
545
+ annotate_btn.click(
546
+ fn=annotate,
547
+ inputs=[model, labels, threshold, prompt],
548
+ outputs=[output_info]
549
+ )
550
+
+         with gr.TabItem("Dataset Viewer"):
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         load_local_btn = gr.Button("Load Local Dataset")
+                         load_hf_btn = gr.Button("Load from Hugging Face")
+
+                     local_file = gr.File(label="Upload Local Dataset", visible=False)
+                     file_format = gr.Dropdown(
+                         choices=["json", "conll", "txt"],
+                         value="json",
+                         label="File Format",
+                         visible=False
+                     )
+                     local_status = gr.Textbox(label="Local File Status", visible=False)
+
+                     dataset_name = gr.Textbox(
+                         label="Hugging Face Dataset Name",
+                         placeholder="Enter dataset name (e.g., conll2003)",
+                         visible=False
+                     )
+                     dataset_split = gr.Dropdown(
+                         choices=["train", "validation", "test"],
+                         value="train",
+                         label="Dataset Split",
+                         visible=False
+                     )
+
+                     bar = gr.Slider(minimum=0, maximum=1, step=1, label="Progress", interactive=False)
+
+                     with gr.Row():
+                         previous_btn = gr.Button("Previous example")
+                         apply_btn = gr.Button("Apply changes")
+                         next_btn = gr.Button("Next example")
+
+                     validate_btn = gr.Button("Validate")
+                     save_btn = gr.Button("Save validated dataset")
+
+                     inp_box = gr.HighlightedText(value=None, interactive=True)
+
+             def toggle_local_inputs():
+                 return {
+                     local_file: gr.update(visible=True),
+                     file_format: gr.update(visible=True),
+                     local_status: gr.update(visible=True),
+                     dataset_name: gr.update(visible=False),
+                     dataset_split: gr.update(visible=False)
+                 }
+
+             def toggle_hf_inputs():
+                 return {
+                     local_file: gr.update(visible=False),
+                     file_format: gr.update(visible=False),
+                     local_status: gr.update(visible=False),
+                     dataset_name: gr.update(visible=True),
+                     dataset_split: gr.update(visible=True)
+                 }
+
+             load_local_btn.click(
+                 fn=toggle_local_inputs,
+                 inputs=None,
+                 outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+             )
+
+             load_hf_btn.click(
+                 fn=toggle_hf_inputs,
+                 inputs=None,
+                 outputs=[local_file, file_format, local_status, dataset_name, dataset_split]
+             )
+
+             def process_and_load_local(file_obj, fmt):
+                 status = process_local_file(file_obj, fmt)
+                 if "Successfully" in status:
+                     return load_annotated_dataset()
+                 return [(status, None)], gr.update(value=0)
+
+             local_file.change(
+                 fn=process_and_load_local,
+                 inputs=[local_file, file_format],
+                 outputs=[inp_box, bar]
+             )
+
+             def load_hf_dataset(name, split):
+                 status = load_from_huggingface(name, split)
+                 if "Successfully" in status:
+                     return load_annotated_dataset()
+                 return [(status, None)], gr.update(value=0)
+
+             load_hf_btn.click(
+                 fn=load_hf_dataset,
+                 inputs=[dataset_name, dataset_split],
+                 outputs=[inp_box, bar]
+             )
+
+             apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
+             save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
+             validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
+             next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar])
+             previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar])
+
+ demo.launch()
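
For reference, a record in data/annotated_data.json follows the shape that transform_data and the Dataset Viewer code above both use; the tokens and labels below are purely illustrative:

    # Illustrative record only; field names come from app.py above.
    # "ner" spans are [start_token_index, end_token_index, label], inclusive on both ends,
    # and "validated" is flipped to True by the viewer's Validate button.
    example_record = {
        "tokenized_text": ["John", "Smith", "visited", "Hanoi", "."],
        "ner": [[0, 1, "PERSON"], [3, 3, "LOC"]],
        "validated": False,
    }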
data/annotated_data.json ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,12 @@
+ [project]
+ name = "ner-annotation"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "datasets>=3.6.0",
+     "gliner>=0.2.20",
+     "gradio>=5.31.0",
+     "huggingface-hub>=0.32.1",
+ ]
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio==5.31.0
+ datasets>=3.6.0
+ gliner>=0.2.20
+ huggingface-hub>=0.32.1
uv.lock ADDED
The diff for this file is too large to render. See raw diff
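
A quick way to sanity-check the output after annotating and validating in the app (a hypothetical snippet, not part of this commit) is to load the saved file and count the reviewed examples:

    import json

    # Read the annotations written by app.py and keep only reviewed examples.
    with open("data/annotated_data.json", encoding="utf-8") as f:
        records = json.load(f)

    validated = [r for r in records if r.get("validated")]
    print(f"{len(validated)} of {len(records)} examples validated")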