"""Main application module for NER annotation tool."""
import os
import json
import gradio as gr
from src.ner_annotation.core.dataset import DynamicDataset, prepare_for_highlight
from src.ner_annotation.core.annotator import AutoAnnotator
from src.ner_annotation.utils.text_processing import extract_tokens_and_labels
from src.ner_annotation.utils.file_processing import process_uploaded_file
from src.ner_annotation.utils.huggingface import (
is_valid_repo_name,
upload_to_hf,
download_from_hf
)
# Available models for annotation
AVAILABLE_MODELS = [
"BookingCare/gliner-multi-healthcare",
"knowledgator/gliner-multitask-large-v0.5",
"knowledgator/gliner-multitask-base-v0.5"
]
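# All three are GLiNER-family zero-shot NER checkpoints on the Hugging Face
# Hub; the list can presumably be extended with any other model id that
# `AutoAnnotator` accepts.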
# Global state shared across the Gradio callbacks
dynamic_dataset = None
annotator = None
sentences = []
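# NOTE: `sentences` is expected to be populated by the upload handler
# (`process_uploaded_file`) before `annotate` runs; `annotate` guards
# against it being empty.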
def load_dataset():
"""Load the dataset and return the first example."""
global dynamic_dataset
try:
with open("data/annotated_data.json", 'rt') as dataset:
ANNOTATED_DATA = json.load(dataset)
dynamic_dataset = DynamicDataset(ANNOTATED_DATA)
max_value = len(dynamic_dataset.data) - 1 if dynamic_dataset.data else 0
return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=0, maximum=max_value)
except Exception as e:
return [("Error loading dataset: " + str(e), None)], gr.update(value=0, maximum=1)
def example_by_id(example_id):
    """Navigate to a specific example by ID."""
    global dynamic_dataset
    if dynamic_dataset is None:
        return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
    try:
        dynamic_dataset.example_by_id(int(example_id))
current = dynamic_dataset.current
max_value = len(dynamic_dataset.data) - 1
return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=current, maximum=max_value)
except Exception as e:
return [("Error navigating to example: " + str(e), None)], gr.update(value=0, maximum=1)
def next_example():
"""Move to the next example."""
global dynamic_dataset
if dynamic_dataset is None:
return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
try:
dynamic_dataset.next_example()
current = dynamic_dataset.current
max_value = len(dynamic_dataset.data) - 1
return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=current, maximum=max_value)
except Exception as e:
return [("Error navigating to next example: " + str(e), None)], gr.update(value=0, maximum=1)
def previous_example():
"""Move to the previous example."""
global dynamic_dataset
if dynamic_dataset is None:
return [("Please load a dataset first", None)], gr.update(value=0, maximum=1)
try:
dynamic_dataset.previous_example()
current = dynamic_dataset.current
max_value = len(dynamic_dataset.data) - 1
return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=current, maximum=max_value)
except Exception as e:
return [("Error navigating to previous example: " + str(e), None)], gr.update(value=0, maximum=1)
def update_example(data):
"""Update the current example with new annotations."""
global dynamic_dataset
if dynamic_dataset is None:
return [("Please load a dataset first", None)]
tokens, ner = extract_tokens_and_labels(data)
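    # Write the edited tokens and labels back onto the current record
    # before re-rendering the highlighted view.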
dynamic_dataset.data[dynamic_dataset.current]["tokenized_text"] = tokens
dynamic_dataset.data[dynamic_dataset.current]["ner"] = ner
return prepare_for_highlight(dynamic_dataset.load_current_example())
def validate_example():
"""Mark the current example as validated."""
global dynamic_dataset
if dynamic_dataset is None:
return [("Please load a dataset first", None)]
dynamic_dataset.data[dynamic_dataset.current]["validated"] = True
return [("The example was validated!", None)]
def save_dataset(_inp):
    """Save the current dataset to data/annotated_data.json."""
    global dynamic_dataset
    if dynamic_dataset is None:
        return [("Please load a dataset first", None)]
    # `_inp` is required by the Gradio click signature but is not used here.
    with open("data/annotated_data.json", "wt", encoding="utf-8") as file:
        json.dump(dynamic_dataset.data, file, ensure_ascii=False)
    return [("The dataset was saved as data/annotated_data.json", None)]
def annotate(model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private):
"""Annotate the uploaded text using the selected model."""
global annotator, sentences
try:
if not sentences:
return "Please upload a file with text first!"
if save_to_hub and not is_valid_repo_name(repo_name):
return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
        labels = [label.strip() for label in labels.split(",") if label.strip()]
annotator = AutoAnnotator(model)
annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold)
# Save annotated data locally
os.makedirs("data", exist_ok=True)
local_path = "data/annotated_data.json"
with open(local_path, "wt") as file:
json.dump(annotated_data, file, ensure_ascii=False)
status_messages = [f"Successfully annotated and saved locally to {local_path}"]
# Upload to Hugging Face Hub if requested
if save_to_hub:
try:
repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private)
status_messages.append(f"Successfully uploaded to Hugging Face Hub repository: {repo_id}")
except Exception as e:
status_messages.append(f"Error with Hugging Face Hub: {str(e)}")
return "\n".join(status_messages)
except Exception as e:
return f"Error during annotation: {str(e)}"
def load_from_huggingface(name):
"""Load a dataset from Hugging Face Hub."""
global dynamic_dataset
try:
        # Fetch `annotated_data.json` from the given Hub repo; the repo is
        # expected to contain a file with exactly that name.
        local_path = download_from_hf(name, "annotated_data.json")
        # Load the downloaded dataset
        with open(local_path, 'rt', encoding='utf-8') as dataset:
            data = json.load(dataset)
# Initialize the dataset
dynamic_dataset = DynamicDataset(data)
return "Successfully loaded dataset from Hugging Face Hub"
except Exception as e:
return f"Error loading dataset from Hugging Face Hub: {str(e)}"
def update_hf_dataset(repo_name, repo_type, is_private):
"""Upload the current dataset to Hugging Face Hub."""
global dynamic_dataset
if dynamic_dataset is None:
return "Please load a dataset first"
try:
if not is_valid_repo_name(repo_name):
return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)."
# Save dataset locally first
os.makedirs("data", exist_ok=True)
local_path = "data/annotated_data.json"
        with open(local_path, "wt", encoding="utf-8") as file:
json.dump(dynamic_dataset.data, file, ensure_ascii=False)
# Upload to Hugging Face Hub
repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private)
return f"Successfully uploaded to Hugging Face Hub repository: {repo_id}"
except Exception as e:
return f"Error uploading to Hugging Face Hub: {str(e)}"
def process_conll(content):
    """Convert CoNLL-format text to the tool's JSON structure."""
    sentences = []
    current_sentence = {"text": "", "tokenized_text": [], "ner": []}
    for line in content.split('\n'):
        # A blank line marks a sentence boundary in CoNLL format.
        if not line.strip():
            if current_sentence["tokenized_text"]:
                current_sentence["text"] = " ".join(current_sentence["tokenized_text"])
                sentences.append(current_sentence)
                current_sentence = {"text": "", "tokenized_text": [], "ner": []}
            continue
        parts = line.split()
        if len(parts) >= 2:
            # First column is the token, last column is the NER label.
            token, label = parts[0], parts[-1]
            current_sentence["tokenized_text"].append(token)
            current_sentence["ner"].append(label)
    # Flush the final sentence if the file does not end with a blank line.
    if current_sentence["tokenized_text"]:
        current_sentence["text"] = " ".join(current_sentence["tokenized_text"])
        sentences.append(current_sentence)
    return sentences
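# Example CoNLL input accepted by `process_conll` (token and label columns,
# blank line between sentences):
#
#   John  B-PER
#   lives O
#   in    O
#   Paris B-LOC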
def process_txt(content):
    """Convert plain text (one sentence per line) to the tool's JSON structure."""
    sentences = []
    for line in content.split('\n'):
        line = line.strip()
        if line:
            tokens = line.split()
            sentences.append({
                "text": line,
                "tokenized_text": tokens,
                # No annotations yet: start every token with the "O" (outside) tag.
                "ner": ["O"] * len(tokens)
            })
    return sentences
def process_local_file(file_obj, file_format):
    """Process a local file and save it as JSON."""
    try:
        if file_obj is None:
            return "No file uploaded"
        # Gradio file objects expose the path of the uploaded temp file via `.name`.
        file_path = file_obj.name
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        if file_format == "json":
            data = json.loads(content)
        elif file_format == "conll":
            data = process_conll(content)
        elif file_format == "txt":
            data = process_txt(content)
        else:
            return "Unsupported file format"
        os.makedirs("data", exist_ok=True)
        with open("data/annotated_data.json", "wt", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False)
return "Successfully processed and saved file"
except Exception as e:
return f"Error processing file: {str(e)}"
def create_interface():
"""Create and return the Gradio interface."""
with gr.Blocks() as demo:
gr.Markdown("# NER Annotation Tool")
with gr.Tabs():
with gr.TabItem("Auto Annotation"):
with gr.Row():
with gr.Column():
file_uploader = gr.File(label="Upload text file (one sentence per line)")
upload_status = gr.Textbox(label="Upload Status")
file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status])
with gr.Column():
model = gr.Dropdown(
label="Choose the model for annotation",
choices=AVAILABLE_MODELS,
value=AVAILABLE_MODELS[0]
)
labels = gr.Textbox(
label="Labels",
placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)",
scale=2
)
threshold = gr.Slider(
0, 1,
value=0.3,
step=0.01,
label="Threshold",
info="Lower threshold increases entity predictions"
)
prompt = gr.Textbox(
label="Prompt",
placeholder="Enter your annotation prompt (optional)",
scale=2
)
with gr.Group():
gr.Markdown("### Save Options")
save_to_hub = gr.Checkbox(
label="Save to Hugging Face Hub",
value=False
)
with gr.Group(visible=False) as hub_settings:
gr.Markdown("#### Hugging Face Hub Settings")
repo_name = gr.Textbox(
label="Repository Name",
placeholder="Enter repository name (e.g., my-ner-dataset)",
scale=2
)
repo_type = gr.Dropdown(
choices=["dataset", "model", "space"],
value="dataset",
label="Repository Type"
)
is_private = gr.Checkbox(
label="Private Repository",
value=False
)
annotate_btn = gr.Button("Annotate Data")
output_info = gr.Textbox(label="Processing Status")
# Add download buttons for annotated data
with gr.Row():
download_btn_annot = gr.Button("Download Annotated Data", visible=False)
download_file_annot = gr.File(label="Download", interactive=False, visible=False)
download_status = gr.Textbox(label="Download Status", visible=False)
                def toggle_hub_settings(show):
                    return {
                        hub_settings: gr.update(visible=show)
                    }
save_to_hub.change(
fn=toggle_hub_settings,
inputs=[save_to_hub],
outputs=[hub_settings]
)
                def show_download_buttons(status):
                    # Reveal the download controls only after `annotate` reports a
                    # successful local save (matched via its status-message prefix).
                    if status and status.startswith("Successfully annotated and saved locally"):
return gr.update(visible=True), gr.update(visible=True)
return gr.update(visible=False), gr.update(visible=False)
annotate_btn.click(
fn=annotate,
inputs=[
model, labels, threshold, prompt,
save_to_hub, repo_name, repo_type, is_private
],
outputs=[output_info]
)
output_info.change(
fn=show_download_buttons,
inputs=[output_info],
outputs=[download_btn_annot, download_status]
)
def handle_download_annot():
file_path = "data/annotated_data.json"
if os.path.exists(file_path):
return gr.update(value=file_path, visible=True)
return gr.update(visible=False)
download_btn_annot.click(
fn=handle_download_annot,
inputs=None,
outputs=[download_file_annot]
)
with gr.TabItem("Dataset Viewer"):
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("### Dataset Controls")
with gr.Group():
with gr.Row():
load_local_btn = gr.Button("Load Local Dataset", variant="primary")
load_hf_btn = gr.Button("Load from Hugging Face", variant="secondary")
with gr.Group() as local_inputs:
local_file = gr.File(label="Upload Local Dataset")
file_format = gr.Dropdown(
choices=["json", "conll", "txt"],
value="json",
label="File Format"
)
local_status = gr.Textbox(label="Status", interactive=False)
with gr.Group(visible=False) as hf_inputs:
with gr.Row():
dataset_name = gr.Textbox(
label="Dataset Name",
placeholder="Enter dataset name (e.g., conll2003)",
scale=4
)
with gr.Row():
gr.Column(scale=1)
load_dataset_btn = gr.Button("📥 Load Dataset", variant="primary")
gr.Column(scale=1)
with gr.Row():
gr.Markdown(
"💡 Tip: Enter a valid Hugging Face dataset name",
elem_classes=["text-sm", "text-gray-500"]
)
gr.Markdown("### Navigation")
with gr.Group():
bar = gr.Slider(
minimum=0,
maximum=1,
step=1,
label="Progress",
interactive=True,
info="Use slider to navigate through examples"
)
with gr.Row():
previous_btn = gr.Button("← Previous", variant="secondary")
next_btn = gr.Button("Next →", variant="secondary")
gr.Markdown("### Actions")
with gr.Group():
with gr.Row():
apply_btn = gr.Button("Apply Changes", variant="primary")
validate_btn = gr.Button("Validate", variant="secondary")
save_btn = gr.Button("Save Dataset", variant="primary")
gr.Markdown("### Hugging Face Upload")
with gr.Group():
with gr.Row():
show_hf_upload_btn = gr.Button("📤 Show Upload Options", variant="secondary", scale=1)
hide_hf_upload_btn = gr.Button("📥 Hide Upload Options", visible=False, variant="secondary", scale=1)
with gr.Group(visible=False) as hf_upload_group:
with gr.Row():
hf_repo_name = gr.Textbox(
label="Repository Name",
placeholder="Enter repository name (e.g., my-ner-dataset)",
scale=2
)
hf_repo_type = gr.Dropdown(
choices=["dataset", "model", "space"],
value="dataset",
label="Repository Type",
scale=1
)
with gr.Row():
hf_is_private = gr.Checkbox(
label="Private Repository",
value=False,
scale=1
)
upload_to_hf_btn = gr.Button("Upload to Hugging Face", variant="primary", scale=2)
hf_upload_status = gr.Textbox(
label="Upload Status",
interactive=False,
show_label=True
)
def toggle_upload_options(show: bool):
return {
hf_upload_group: gr.update(visible=show),
show_hf_upload_btn: gr.update(visible=not show),
hide_hf_upload_btn: gr.update(visible=show)
}
show_hf_upload_btn.click(
fn=lambda: toggle_upload_options(True),
inputs=None,
outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
)
hide_hf_upload_btn.click(
fn=lambda: toggle_upload_options(False),
inputs=None,
outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn]
)
with gr.Column(scale=2):
gr.Markdown("### Current Example")
inp_box = gr.HighlightedText(value=None, interactive=True)
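                        # The highlighted text is editable in place; the
                        # "Apply Changes" button reads it back via `update_example`.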
def toggle_local_inputs():
return {
local_inputs: gr.update(visible=True),
hf_inputs: gr.update(visible=False)
}
def toggle_hf_inputs():
return {
local_inputs: gr.update(visible=False),
hf_inputs: gr.update(visible=True)
}
load_local_btn.click(
fn=toggle_local_inputs,
inputs=None,
outputs=[local_inputs, hf_inputs]
)
load_hf_btn.click(
fn=toggle_hf_inputs,
inputs=None,
outputs=[local_inputs, hf_inputs]
)
                def process_and_load_local(file_obj, fmt):
                    status = process_local_file(file_obj, fmt)
if "Successfully" in status:
result = load_dataset()
return result[0], result[1], status
return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1), status
local_file.change(
fn=process_and_load_local,
inputs=[local_file, file_format],
outputs=[inp_box, bar, local_status]
)
def load_hf_dataset(name):
status = load_from_huggingface(name)
if "Successfully" in status:
return load_dataset()
return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1)
load_dataset_btn.click(
fn=load_hf_dataset,
inputs=[dataset_name],
outputs=[inp_box, bar]
)
apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box)
save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box)
validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box)
next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar])
previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar])
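                # `change` also fires when the navigation buttons update the
                # slider programmatically; re-selecting the current example is
                # harmless, so no guard is needed here.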
bar.change(
fn=example_by_id,
inputs=[bar],
outputs=[inp_box, bar],
api_name="example_by_id"
)
upload_to_hf_btn.click(
fn=update_hf_dataset,
inputs=[hf_repo_name, hf_repo_type, hf_is_private],
outputs=[hf_upload_status]
)
return demo
def main():
"""Run the application."""
demo = create_interface()
demo.launch()
if __name__ == "__main__":
main()