"""Dataset management module for NER annotation."""

from typing import Dict, List, Optional, Tuple, Union
import json
import os
import re

class DynamicDataset:
    """A class to manage and navigate through annotated dataset examples."""
    
    def __init__(
            self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.
        
        Args:
            data: List of examples, each containing tokenized text and NER annotations
        """
        self.data = data
        self.data_len = len(self.data)
        self.current = -1
        for example in self.data:
            if "validated" not in example:
                example["validated"] = False

    def _clamp_index(self, index: int) -> int:
        """Clamp an index to the valid range [0, data_len - 1]."""
        return max(0, min(index, self.data_len - 1))

    def next_example(self) -> None:
        """Move to the next example in the dataset."""
        self.current = self._clamp_index(self.current + 1)

    def previous_example(self) -> None:
        """Move to the previous example in the dataset."""
        self.current = self._clamp_index(self.current - 1)

    def example_by_id(self, example_id: int) -> None:
        """Navigate to a specific example by its index.
        
        Args:
            example_id: The index of the example to navigate to
        """
        self.current = self._clamp_index(example_id)

    def validate(self) -> None:
        """Mark the current example as validated."""
        # Guard against calls before any navigation (current may still be -1,
        # which would otherwise silently validate the *last* example).
        if 0 <= self.current < self.data_len:
            self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Get the current example.
        
        Returns:
            The current example data
        """
        return self.data[self.current]
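
# Illustrative usage sketch (not part of the module API); the example data is
# hypothetical but follows the [start, end, label] span format consumed by
# prepare_for_highlight below:
#
#     dataset = DynamicDataset([
#         {"tokenized_text": ["Alice", "works", "at", "Acme"],
#          "ner": [[0, 0, "PERSON"], [3, 3, "ORG"]]},
#     ])
#     dataset.next_example()                    # advance from -1 to index 0
#     dataset.validate()                        # mark the example as validated
#     example = dataset.load_current_example()  # -> the dict above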

def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.
    
    Args:
        text: The input text to tokenize
        
    Returns:
        List of tokens
    """
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
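
# For instance (illustrative), hyphenated and underscored words stay whole
# while other punctuation becomes single-character tokens:
#
#     tokenize_text("Dr. Smith-Jones arrived!")
#     # -> ['Dr', '.', 'Smith-Jones', 'arrived', '!']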

def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.
    
    Args:
        tokens: List of tokens to join
        
    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        # Attach punctuation to the preceding token; "…" is included to match
        # the punctuation class handled in prepare_for_highlight below.
        if token in {",", ".", "!", "?", ":", ";", "...", "…"}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
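
# For instance (illustrative), punctuation reattaches without a leading space:
#
#     join_tokens(['Hello', ',', 'world', '!'])  # -> 'Hello, world!'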

def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
    """Prepare text for highlighting with NER annotations.
    
    Args:
        data: Dictionary with a "tokenized_text" list and a "ner" list of
            [start, end, label] spans, where start and end are inclusive
            token indices
        
    Returns:
        List of tuples containing text segments and their entity labels
        (None for unlabeled segments)
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]

    highlighted_text = []
    current_entity = None
    entity_tokens = []
    normal_tokens = []

    for idx, token in enumerate(tokens):
        # Once past the end of the active entity span, flush its tokens and
        # look for a new entity starting at this index.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens) + " ", current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)

        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens) + " ", None))
                normal_tokens = []
            entity_tokens.append(token)
        else:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens) + " ", current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token)

    # Flush whatever remains after the last token.
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens) + " ", current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens) + " ", None))

    # Remove the space that joining leaves before punctuation tokens.
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r'\s+(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))

    return cleaned_highlighted_text
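
# For instance (illustrative), with inclusive [start, end, label] spans:
#
#     prepare_for_highlight({
#         "tokenized_text": ["Alice", "works", "at", "Acme", "."],
#         "ner": [[0, 0, "PERSON"], [3, 3, "ORG"]],
#     })
#     # -> [('Alice ', 'PERSON'), ('works at ', None), ('Acme ', 'ORG'), ('. ', None)]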

def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.
    
    Args:
        data: The dataset to save
        filepath: Path to save the dataset
    """
    # os.path.dirname is "" for bare filenames, and os.makedirs("") raises.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)

def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.
    
    Args:
        filepath: Path to the dataset file
        
    Returns:
        The loaded dataset
    """
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)
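
if __name__ == "__main__":
    # Minimal end-to-end sketch; the sentence, spans, and file path here are
    # illustrative, not fixtures shipped with the module.
    examples = [
        {
            "tokenized_text": tokenize_text("Alice works at Acme."),
            "ner": [[0, 0, "PERSON"], [3, 3, "ORG"]],  # [start, end, label]
        }
    ]
    dataset = DynamicDataset(examples)
    dataset.next_example()  # move onto the first example
    dataset.validate()      # flag it as reviewed
    print(prepare_for_highlight(dataset.load_current_example()))

    save_dataset(dataset.data, "annotations/demo.json")
    print(load_dataset("annotations/demo.json"))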