"""File processing utilities for NER annotation."""

import json
import pandas as pd
from typing import List, Dict, Optional
from .text_processing import tokenize_text, process_text_for_gliner

def process_uploaded_file(file_obj) -> List[str]:
    """Process an uploaded file into a list of sentences.
    
    Args:
        file_obj: The uploaded file object
        
    Returns:
        List of processed sentences
        
    Raises:
        ValueError: If no file was uploaded
        RuntimeError: If the file cannot be read or parsed
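
    Example (illustrative sketch; ``uploaded`` stands for a Gradio-style
    file object exposing a ``.name`` path, not a real fixture):
        >>> sentences = process_uploaded_file(uploaded)  # doctest: +SKIP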
    """
    if file_obj is None:
        raise ValueError("Please upload a file first!")
    
    try:
        if file_obj.name.endswith('.csv'):
            # Process CSV file; the "Nội dung" column ("Content" in
            # Vietnamese) holds the raw sentences.
            df = pd.read_csv(file_obj.name)
            sentences = df['Nội dung'].dropna().tolist()
        else:
            # Process plain-text file: one sentence per non-empty line.
            # Read via the path, consistent with the CSV branch above.
            with open(file_obj.name, 'r', encoding='utf-8') as f:
                content = f.read()
            sentences = [line.strip() for line in content.splitlines() if line.strip()]

        # Process each sentence and flatten the results into one list
        processed_sentences = []
        for sentence in sentences:
            processed_sentences.extend(process_text_for_gliner(sentence))

        return processed_sentences
    except Exception as e:
        # Chain the original exception so the traceback is preserved
        raise RuntimeError(f"Error reading file: {e}") from e

def load_from_local_file(
    file_path: str,
    file_format: str = "json"
) -> List[Dict]:
    """Load and convert data from local file in various formats.
    
    Args:
        file_path: Path to the file to load
        file_format: Format of the file (json, conll, or txt)
        
    Returns:
        List of converted examples
        
    Raises:
        ValueError: If the format is unsupported or the JSON structure is invalid
        RuntimeError: If the file cannot be read or converted
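
    Example (illustrative; the path is hypothetical):
        >>> examples = load_from_local_file("data/train.conll", "conll")  # doctest: +SKIP
        >>> sorted(examples[0])  # doctest: +SKIP
        ['ner', 'tokenized_text', 'validated']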
    """
    try:
        if file_format == "json":
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not isinstance(data, list):
                raise ValueError("JSON file must contain a list of examples")
            # If the data is already in the standard format, return it as-is
            if all("tokenized_text" in item and "ner" in item for item in data):
                return data
            # Otherwise convert from other JSON formats
            return _convert_json_format(data)

        elif file_format == "conll":
            return _load_conll_file(file_path)

        elif file_format == "txt":
            return _load_txt_file(file_path)

        else:
            raise ValueError(f"Unsupported file format: {file_format}")

    except ValueError:
        # Argument and format errors should surface unchanged
        raise
    except Exception as e:
        # Chain the original exception so the traceback is preserved
        raise RuntimeError(f"Error loading file: {e}") from e

def _bio_tags_to_spans(tags: List[str]) -> List[List]:
    """Convert a sequence of NER tags into [start, end, label] spans.

    Supports plain labels ("PER") as well as BIO-prefixed tags
    ("B-PER"/"I-PER"): prefixes are stripped, consecutive tokens with the
    same label are merged into a single span, and a "B-" tag always opens
    a new span.

    Example:
        >>> _bio_tags_to_spans(["B-PER", "I-PER", "O", "B-LOC"])
        [[0, 1, 'PER'], [3, 3, 'LOC']]
    """
    spans: List[List] = []
    current: Optional[List] = None
    for i, tag in enumerate(tags):
        if tag == "O":
            if current is not None:
                spans.append(current)
                current = None
            continue
        prefix, _, label = tag.partition("-")
        if not label:
            # Plain label without a BIO prefix (e.g. "PER")
            prefix, label = "I", tag
        if current is not None and label == current[2] and prefix != "B":
            current[1] = i  # extend the open span
        else:
            if current is not None:
                spans.append(current)
            current = [i, i, label]
    if current is not None:
        spans.append(current)
    return spans

def _convert_json_format(data: List[Dict]) -> List[Dict]:
    """Convert JSON data from various formats to the standard format.
    
    Args:
        data: List of examples in various JSON formats
        
    Returns:
        List of examples in the standard format
    """
    converted_data = []
    for item in data:
        # Currently only the "tokens"/"ner_tags" layout (as used by many
        # token-classification datasets) is recognised; other items are skipped.
        if "tokens" in item and "ner_tags" in item:
            converted_data.append({
                "tokenized_text": item["tokens"],
                "ner": _bio_tags_to_spans(item["ner_tags"]),
                "validated": False,
            })
    return converted_data
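
# Illustrative conversion (not executed): an input item such as
#   {"tokens": ["John", "lives", "in", "Hanoi"],
#    "ner_tags": ["B-PER", "O", "O", "B-LOC"]}
# becomes
#   {"tokenized_text": ["John", "lives", "in", "Hanoi"],
#    "ner": [[0, 0, "PER"], [3, 3, "LOC"]],
#    "validated": False}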

def _load_conll_file(file_path: str) -> List[Dict]:
    """Load and convert data from a CoNLL format file.

    Sentences are separated by blank lines; each non-comment line holds a
    token in its first column and the token's NER tag in its last column.
    
    Args:
        file_path: Path to the CoNLL file
        
    Returns:
        List of converted examples
    """
    converted_data = []
    tokens: List[str] = []
    tags: List[str] = []

    def flush() -> None:
        """Emit the sentence collected so far, if any."""
        if tokens:
            converted_data.append({
                "tokenized_text": list(tokens),
                "ner": _bio_tags_to_spans(tags),
                "validated": False,
            })
            tokens.clear()
            tags.clear()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                flush()  # a blank line ends the current sentence
                continue
            if line.startswith("#"):
                continue  # skip CoNLL comment/metadata lines
            parts = line.split()
            if len(parts) >= 2:
                tokens.append(parts[0])
                tags.append(parts[-1])

    flush()  # handle a final sentence with no trailing blank line
    return converted_data

def _load_txt_file(file_path: str) -> List[Dict]:
    """Load and convert data from a text file.
    
    Args:
        file_path: Path to the text file
        
    Returns:
        List of converted examples
    """
    converted_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                tokens = tokenize_text(line)
                converted_data.append({
                    "tokenized_text": tokens,
                    "ner": [],
                    "validated": False
                })
    return converted_data
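
if __name__ == "__main__":
    # Minimal smoke test (sketch): the default path and the format guess
    # below are hypothetical; point them at a real annotation file to try it.
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "data/examples.json"
    fmt = path.rsplit(".", 1)[-1] if "." in path else "json"
    examples = load_from_local_file(path, file_format=fmt)
    print(f"Loaded {len(examples)} examples from {path}")
    if examples:
        print("First example:", examples[0])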