"""File processing utilities for NER annotation."""

import json
import os
from typing import Dict, List, Optional, Union

import pandas as pd

from .text_processing import tokenize_text, process_text_for_gliner


def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.

    A file whose name ends with ``.csv`` is read with pandas from the path in
    ``file_obj.name`` and the non-null values of its ``'Nội dung'`` column
    are used as sentences. Any other file is read through ``file_obj.read()``
    and split into non-empty, stripped lines. Every sentence is then passed
    through ``process_text_for_gliner`` and the results are flattened into a
    single list.

    Args:
        file_obj: The uploaded file object. It must expose ``name`` and, for
            non-CSV files, ``read()``.

    Returns:
        List of processed sentences.

    Raises:
        ValueError: If ``file_obj`` is None.
        Exception: If reading or processing the file fails; the original
            error is attached as ``__cause__``.
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")

    try:
        if file_obj.name.endswith('.csv'):
            # CSV input: sentences live in the 'Nội dung' column.
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Plain-text input: one sentence per non-empty line.
            raw = file_obj.read()
            # A handle opened in text mode already yields str; only decode
            # when read() returned bytes.
            if isinstance(raw, (bytes, bytearray)):
                content = raw.decode('utf-8')
            else:
                content = raw
            sentences = [
                line.strip() for line in content.splitlines() if line.strip()
            ]

        # process_text_for_gliner returns several pieces per sentence, so
        # the results are flattened into one list.
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))
        return processed_sentences
    except Exception as e:
        # Keep the broad Exception type callers already handle, but chain
        # the cause so the real traceback is not lost.
        raise Exception(f"Error reading file: {str(e)}") from e


def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert data from local file in various formats.

    Args:
        file_path: Path to the file to load.
        file_format: Format of the file (``"json"``, ``"conll"`` or
            ``"txt"``).

    Returns:
        List of examples, each a dict with ``"tokenized_text"``, ``"ner"``
        and ``"validated"`` keys. A JSON list whose items all already contain
        ``"tokenized_text"`` and ``"ner"`` is returned unchanged.

    Raises:
        Exception: If loading fails for any reason, including an unsupported
            ``file_format`` or a JSON document that is not a list. The
            original error is attached as ``__cause__``.
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, list):
                raise ValueError("JSON file must contain a list of examples")
            # Data that is already in the target format is passed through.
            if all("tokenized_text" in item and "ner" in item for item in data):
                return data
            # Otherwise convert from the tokens/ner_tags JSON layout.
            return _convert_json_format(data)
        if file_format == "conll":
            return _load_conll_file(file_path)
        if file_format == "txt":
            return _load_txt_file(file_path)
        raise ValueError(f"Unsupported file format: {file_format}")
    except Exception as e:
        raise Exception(f"Error loading file: {str(e)}") from e


def _build_ner_spans(tokens: List[str], tags: List[str]) -> List[List[Union[int, str]]]:
    """Collapse per-token tags into ``[start, end, tag]`` spans.

    Consecutive tokens carrying the same non-``"O"`` tag are merged into one
    span; ``start`` and ``end`` are token indices and ``end`` is inclusive.
    Tokens and tags are paired with ``zip``, so any surplus in the longer
    sequence is ignored.

    NOTE(review): tags are compared verbatim, so prefixed schemes such as
    ``B-PER``/``I-PER`` produce separate spans per prefix - confirm whether
    inputs use such a scheme.

    Args:
        tokens: Tokens of one example.
        tags: Tag of each token; ``"O"`` marks a token outside any entity.

    Returns:
        List of ``[start, end, tag]`` spans in order of appearance.
    """
    spans = []
    current_span = None
    for i, (_, tag) in enumerate(zip(tokens, tags)):
        if tag == "O":
            # An outside token closes whatever span is open.
            if current_span is not None:
                spans.append(current_span)
                current_span = None
        elif current_span is None:
            current_span = [i, i, tag]
        elif tag == current_span[2]:
            # Same tag as the open span: extend it to this token.
            current_span[1] = i
        else:
            # Different tag: close the open span and start a new one.
            spans.append(current_span)
            current_span = [i, i, tag]
    if current_span is not None:
        spans.append(current_span)
    return spans


def _make_example(tokens: List[str], tags: List[str]) -> Dict:
    """Build one standard-format example from parallel tokens and tags."""
    return {
        "tokenized_text": tokens,
        "ner": _build_ner_spans(tokens, tags),
        "validated": False
    }


def _convert_json_format(data: List[Dict]) -> List[Dict]:
    """Convert JSON data from various formats to the standard format.

    Only items that contain both ``"tokens"`` and ``"ner_tags"`` are
    converted; every other item is skipped.

    Args:
        data: List of examples in various JSON formats.

    Returns:
        List of examples in the standard format.
    """
    return [
        _make_example(item["tokens"], item["ner_tags"])
        for item in data
        if "tokens" in item and "ner_tags" in item
    ]


def _load_conll_file(file_path: str) -> List[Dict]:
    """Load and convert data from a CoNLL format file.

    Blank lines separate examples. On each remaining line the first
    whitespace-separated field is taken as the token and the last field as
    its tag; lines with fewer than two fields are ignored.

    Args:
        file_path: Path to the CoNLL file.

    Returns:
        List of converted examples.
    """
    converted_data = []
    tokens: List[str] = []
    tags: List[str] = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line ends the current example, if it has tokens.
                if tokens:
                    converted_data.append(_make_example(tokens, tags))
                    tokens, tags = [], []
                continue
            # Lines starting with '#' are treated as comments; this also
            # skips a row whose token is a literal '#'.
            if line.startswith("#"):
                continue
            parts = line.split()
            if len(parts) >= 2:
                tokens.append(parts[0])
                tags.append(parts[-1])

    # The file may end without a trailing blank line.
    if tokens:
        converted_data.append(_make_example(tokens, tags))

    return converted_data


def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a text file.

    Each non-empty line becomes one example, tokenized with
    ``tokenize_text``, with no entity spans and ``validated`` set to False.

    Args:
        file_path: Path to the text file.

    Returns:
        List of converted examples.
    """
    converted_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            converted_data.append({
                "tokenized_text": tokenize_text(line),
                "ner": [],
                "validated": False
            })
    return converted_data