"""Main application module for NER annotation tool.""" import os import json import gradio as gr from typing import List, Dict, Union, Tuple from src.ner_annotation.core.dataset import DynamicDataset, prepare_for_highlight from src.ner_annotation.core.annotator import AutoAnnotator from src.ner_annotation.utils.text_processing import extract_tokens_and_labels from src.ner_annotation.utils.file_processing import process_uploaded_file, load_from_local_file from src.ner_annotation.utils.huggingface import ( is_valid_repo_name, upload_to_hf, download_from_hf ) # Available models for annotation AVAILABLE_MODELS = [ "BookingCare/gliner-multi-healthcare", "knowledgator/gliner-multitask-large-v0.5", "knowledgator/gliner-multitask-base-v0.5" ] # Global variables dynamic_dataset = None annotator = None sentences = [] def load_dataset(): """Load the dataset and return the first example.""" global dynamic_dataset try: with open("data/annotated_data.json", 'rt') as dataset: ANNOTATED_DATA = json.load(dataset) dynamic_dataset = DynamicDataset(ANNOTATED_DATA) max_value = len(dynamic_dataset.data) - 1 if dynamic_dataset.data else 0 return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=0, maximum=max_value) except Exception as e: return [("Error loading dataset: " + str(e), None)], gr.update(value=0, maximum=1) def example_by_id(id): """Navigate to a specific example by ID.""" global dynamic_dataset if dynamic_dataset is None: return [("Please load a dataset first", None)], gr.update(value=0, maximum=1) try: id = int(id) dynamic_dataset.example_by_id(id) current = dynamic_dataset.current max_value = len(dynamic_dataset.data) - 1 return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=current, maximum=max_value) except Exception as e: return [("Error navigating to example: " + str(e), None)], gr.update(value=0, maximum=1) def next_example(): """Move to the next example.""" global dynamic_dataset if dynamic_dataset is None: return [("Please load a dataset first", None)], gr.update(value=0, maximum=1) try: dynamic_dataset.next_example() current = dynamic_dataset.current max_value = len(dynamic_dataset.data) - 1 return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=current, maximum=max_value) except Exception as e: return [("Error navigating to next example: " + str(e), None)], gr.update(value=0, maximum=1) def previous_example(): """Move to the previous example.""" global dynamic_dataset if dynamic_dataset is None: return [("Please load a dataset first", None)], gr.update(value=0, maximum=1) try: dynamic_dataset.previous_example() current = dynamic_dataset.current max_value = len(dynamic_dataset.data) - 1 return prepare_for_highlight(dynamic_dataset.load_current_example()), gr.update(value=current, maximum=max_value) except Exception as e: return [("Error navigating to previous example: " + str(e), None)], gr.update(value=0, maximum=1) def update_example(data): """Update the current example with new annotations.""" global dynamic_dataset if dynamic_dataset is None: return [("Please load a dataset first", None)] tokens, ner = extract_tokens_and_labels(data) dynamic_dataset.data[dynamic_dataset.current]["tokenized_text"] = tokens dynamic_dataset.data[dynamic_dataset.current]["ner"] = ner return prepare_for_highlight(dynamic_dataset.load_current_example()) def validate_example(): """Mark the current example as validated.""" global dynamic_dataset if dynamic_dataset is None: return [("Please load a dataset first", None)] dynamic_dataset.data[dynamic_dataset.current]["validated"] = True return [("The example was validated!", None)] def save_dataset(inp): """Save the dataset to a file.""" global dynamic_dataset if dynamic_dataset is None: return [("Please load a dataset first", None)] with open("data/annotated_data.json", "wt") as file: json.dump(dynamic_dataset.data, file) return [("The validated dataset was saved as data/annotated_data.json", None)] def annotate(model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private): """Annotate the uploaded text using the selected model.""" global annotator, sentences try: if not sentences: return "Please upload a file with text first!" if save_to_hub and not is_valid_repo_name(repo_name): return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)." labels = [label.strip() for label in labels.split(",")] annotator = AutoAnnotator(model) annotated_data = annotator.auto_annotate(sentences, labels, prompt, threshold) # Save annotated data locally os.makedirs("data", exist_ok=True) local_path = "data/annotated_data.json" with open(local_path, "wt") as file: json.dump(annotated_data, file, ensure_ascii=False) status_messages = [f"Successfully annotated and saved locally to {local_path}"] # Upload to Hugging Face Hub if requested if save_to_hub: try: repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private) status_messages.append(f"Successfully uploaded to Hugging Face Hub repository: {repo_id}") except Exception as e: status_messages.append(f"Error with Hugging Face Hub: {str(e)}") return "\n".join(status_messages) except Exception as e: return f"Error during annotation: {str(e)}" def load_from_huggingface(name): """Load a dataset from Hugging Face Hub.""" global dynamic_dataset try: # Download dataset from Hugging Face Hub local_path = download_from_hf(name, "annotated_data.json") # Load the downloaded dataset with open(local_path, 'rt') as dataset: data = json.load(dataset) # Initialize the dataset dynamic_dataset = DynamicDataset(data) return "Successfully loaded dataset from Hugging Face Hub" except Exception as e: return f"Error loading dataset from Hugging Face Hub: {str(e)}" def update_hf_dataset(repo_name, repo_type, is_private): """Upload the current dataset to Hugging Face Hub.""" global dynamic_dataset if dynamic_dataset is None: return "Please load a dataset first" try: if not is_valid_repo_name(repo_name): return "Error: Invalid repo name. Only use letters, numbers, '-', '_', or '.' (no slashes or spaces)." # Save dataset locally first os.makedirs("data", exist_ok=True) local_path = "data/annotated_data.json" with open(local_path, "wt") as file: json.dump(dynamic_dataset.data, file, ensure_ascii=False) # Upload to Hugging Face Hub repo_id = upload_to_hf(local_path, repo_name, repo_type, is_private) return f"Successfully uploaded to Hugging Face Hub repository: {repo_id}" except Exception as e: return f"Error uploading to Hugging Face Hub: {str(e)}" def process_conll(content): """Convert CoNLL format to JSON.""" sentences = [] current_sentence = {"text": "", "tokenized_text": [], "ner": []} for line in content.split('\n'): if not line.strip(): if current_sentence["text"]: sentences.append(current_sentence) current_sentence = {"text": "", "tokenized_text": [], "ner": []} continue parts = line.split() if len(parts) >= 2: token, label = parts[0], parts[-1] current_sentence["tokenized_text"].append(token) current_sentence["ner"].append(label) current_sentence["text"] += token + " " if current_sentence["text"]: sentences.append(current_sentence) return sentences def process_txt(content): """Convert plain text to JSON format.""" sentences = [] for line in content.split('\n'): if line.strip(): sentences.append({ "text": line.strip(), "tokenized_text": line.strip().split(), "ner": ["O"] * len(line.strip().split()) }) return sentences def process_local_file(file_obj, format): """Process a local file and save it as JSON.""" try: if file_obj is None: return "No file uploaded" # Get the file content from the Gradio file object content = file_obj.name with open(content, 'r', encoding='utf-8') as f: content = f.read() if format == "json": data = json.loads(content) elif format == "conll": data = process_conll(content) elif format == "txt": data = process_txt(content) else: return "Unsupported file format" os.makedirs("data", exist_ok=True) with open("data/annotated_data.json", "wt") as f: json.dump(data, f, ensure_ascii=False) return "Successfully processed and saved file" except Exception as e: return f"Error processing file: {str(e)}" def create_interface(): """Create and return the Gradio interface.""" with gr.Blocks() as demo: gr.Markdown("# NER Annotation Tool") with gr.Tabs(): with gr.TabItem("Auto Annotation"): with gr.Row(): with gr.Column(): file_uploader = gr.File(label="Upload text file (one sentence per line)") upload_status = gr.Textbox(label="Upload Status") file_uploader.change(fn=process_uploaded_file, inputs=[file_uploader], outputs=[upload_status]) with gr.Column(): model = gr.Dropdown( label="Choose the model for annotation", choices=AVAILABLE_MODELS, value=AVAILABLE_MODELS[0] ) labels = gr.Textbox( label="Labels", placeholder="Enter comma-separated labels (e.g., PERSON,ORG,LOC)", scale=2 ) threshold = gr.Slider( 0, 1, value=0.3, step=0.01, label="Threshold", info="Lower threshold increases entity predictions" ) prompt = gr.Textbox( label="Prompt", placeholder="Enter your annotation prompt (optional)", scale=2 ) with gr.Group(): gr.Markdown("### Save Options") save_to_hub = gr.Checkbox( label="Save to Hugging Face Hub", value=False ) with gr.Group(visible=False) as hub_settings: gr.Markdown("#### Hugging Face Hub Settings") repo_name = gr.Textbox( label="Repository Name", placeholder="Enter repository name (e.g., my-ner-dataset)", scale=2 ) repo_type = gr.Dropdown( choices=["dataset", "model", "space"], value="dataset", label="Repository Type" ) is_private = gr.Checkbox( label="Private Repository", value=False ) annotate_btn = gr.Button("Annotate Data") output_info = gr.Textbox(label="Processing Status") # Add download buttons for annotated data with gr.Row(): download_btn_annot = gr.Button("Download Annotated Data", visible=False) download_file_annot = gr.File(label="Download", interactive=False, visible=False) download_status = gr.Textbox(label="Download Status", visible=False) def toggle_hub_settings(save_to_hub): return { hub_settings: gr.update(visible=save_to_hub) } save_to_hub.change( fn=toggle_hub_settings, inputs=[save_to_hub], outputs=[hub_settings] ) def show_download_buttons(status): if status and status.startswith("Successfully annotated and saved locally"): return gr.update(visible=True), gr.update(visible=True) return gr.update(visible=False), gr.update(visible=False) annotate_btn.click( fn=annotate, inputs=[ model, labels, threshold, prompt, save_to_hub, repo_name, repo_type, is_private ], outputs=[output_info] ) output_info.change( fn=show_download_buttons, inputs=[output_info], outputs=[download_btn_annot, download_status] ) def handle_download_annot(): file_path = "data/annotated_data.json" if os.path.exists(file_path): return gr.update(value=file_path, visible=True) return gr.update(visible=False) download_btn_annot.click( fn=handle_download_annot, inputs=None, outputs=[download_file_annot] ) with gr.TabItem("Dataset Viewer"): with gr.Row(): with gr.Column(scale=1): gr.Markdown("### Dataset Controls") with gr.Group(): with gr.Row(): load_local_btn = gr.Button("Load Local Dataset", variant="primary") load_hf_btn = gr.Button("Load from Hugging Face", variant="secondary") with gr.Group() as local_inputs: local_file = gr.File(label="Upload Local Dataset") file_format = gr.Dropdown( choices=["json", "conll", "txt"], value="json", label="File Format" ) local_status = gr.Textbox(label="Status", interactive=False) with gr.Group(visible=False) as hf_inputs: with gr.Row(): dataset_name = gr.Textbox( label="Dataset Name", placeholder="Enter dataset name (e.g., conll2003)", scale=4 ) with gr.Row(): gr.Column(scale=1) load_dataset_btn = gr.Button("📥 Load Dataset", variant="primary") gr.Column(scale=1) with gr.Row(): gr.Markdown( "💡 Tip: Enter a valid Hugging Face dataset name", elem_classes=["text-sm", "text-gray-500"] ) gr.Markdown("### Navigation") with gr.Group(): bar = gr.Slider( minimum=0, maximum=1, step=1, label="Progress", interactive=True, info="Use slider to navigate through examples" ) with gr.Row(): previous_btn = gr.Button("← Previous", variant="secondary") next_btn = gr.Button("Next →", variant="secondary") gr.Markdown("### Actions") with gr.Group(): with gr.Row(): apply_btn = gr.Button("Apply Changes", variant="primary") validate_btn = gr.Button("Validate", variant="secondary") save_btn = gr.Button("Save Dataset", variant="primary") gr.Markdown("### Hugging Face Upload") with gr.Group(): with gr.Row(): show_hf_upload_btn = gr.Button("📤 Show Upload Options", variant="secondary", scale=1) hide_hf_upload_btn = gr.Button("📥 Hide Upload Options", visible=False, variant="secondary", scale=1) with gr.Group(visible=False) as hf_upload_group: with gr.Row(): hf_repo_name = gr.Textbox( label="Repository Name", placeholder="Enter repository name (e.g., my-ner-dataset)", scale=2 ) hf_repo_type = gr.Dropdown( choices=["dataset", "model", "space"], value="dataset", label="Repository Type", scale=1 ) with gr.Row(): hf_is_private = gr.Checkbox( label="Private Repository", value=False, scale=1 ) upload_to_hf_btn = gr.Button("Upload to Hugging Face", variant="primary", scale=2) hf_upload_status = gr.Textbox( label="Upload Status", interactive=False, show_label=True ) def toggle_upload_options(show: bool): return { hf_upload_group: gr.update(visible=show), show_hf_upload_btn: gr.update(visible=not show), hide_hf_upload_btn: gr.update(visible=show) } show_hf_upload_btn.click( fn=lambda: toggle_upload_options(True), inputs=None, outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn] ) hide_hf_upload_btn.click( fn=lambda: toggle_upload_options(False), inputs=None, outputs=[hf_upload_group, show_hf_upload_btn, hide_hf_upload_btn] ) with gr.Column(scale=2): gr.Markdown("### Current Example") inp_box = gr.HighlightedText(value=None, interactive=True) def toggle_local_inputs(): return { local_inputs: gr.update(visible=True), hf_inputs: gr.update(visible=False) } def toggle_hf_inputs(): return { local_inputs: gr.update(visible=False), hf_inputs: gr.update(visible=True) } load_local_btn.click( fn=toggle_local_inputs, inputs=None, outputs=[local_inputs, hf_inputs] ) load_hf_btn.click( fn=toggle_hf_inputs, inputs=None, outputs=[local_inputs, hf_inputs] ) def process_and_load_local(file_obj, format): status = process_local_file(file_obj, format) if "Successfully" in status: result = load_dataset() return result[0], result[1], status return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1), status local_file.change( fn=process_and_load_local, inputs=[local_file, file_format], outputs=[inp_box, bar, local_status] ) def load_hf_dataset(name): status = load_from_huggingface(name) if "Successfully" in status: return load_dataset() return [("Error loading dataset: " + status, None)], gr.update(value=0, maximum=1) load_dataset_btn.click( fn=load_hf_dataset, inputs=[dataset_name], outputs=[inp_box, bar] ) apply_btn.click(fn=update_example, inputs=inp_box, outputs=inp_box) save_btn.click(fn=save_dataset, inputs=inp_box, outputs=inp_box) validate_btn.click(fn=validate_example, inputs=None, outputs=inp_box) next_btn.click(fn=next_example, inputs=None, outputs=[inp_box, bar]) previous_btn.click(fn=previous_example, inputs=None, outputs=[inp_box, bar]) bar.change( fn=example_by_id, inputs=[bar], outputs=[inp_box, bar], api_name="example_by_id" ) upload_to_hf_btn.click( fn=update_hf_dataset, inputs=[hf_repo_name, hf_repo_type, hf_is_private], outputs=[hf_upload_status] ) return demo def main(): """Run the application.""" demo = create_interface() demo.launch() if __name__ == "__main__": main()