"""Dataset management module for NER annotation.""" from typing import List, Dict, Union, Tuple import json import os import re class DynamicDataset: """A class to manage and navigate through annotated dataset examples.""" def __init__( self, data: List[Dict[str, Union[List[Union[int, str]], bool]]] ) -> None: """Initialize the dataset with examples. Args: data: List of examples, each containing tokenized text and NER annotations """ self.data = data self.data_len = len(self.data) self.current = -1 for example in self.data: if "validated" not in example: example["validated"] = False def next_example(self) -> None: """Move to the next example in the dataset.""" self.current += 1 if self.current > self.data_len - 1: self.current = self.data_len - 1 elif self.current < 0: self.current = 0 def previous_example(self) -> None: """Move to the previous example in the dataset.""" self.current -= 1 if self.current > self.data_len - 1: self.current = self.data_len - 1 elif self.current < 0: self.current = 0 def example_by_id(self, id: int) -> None: """Navigate to a specific example by its ID. Args: id: The index of the example to navigate to """ self.current = id if self.current > self.data_len - 1: self.current = self.data_len - 1 elif self.current < 0: self.current = 0 def validate(self) -> None: """Mark the current example as validated.""" self.data[self.current]["validated"] = True def load_current_example(self) -> Dict: """Get the current example. Returns: The current example data """ return self.data[self.current] def tokenize_text(text: str) -> List[str]: """Tokenize the input text into a list of tokens. Args: text: The input text to tokenize Returns: List of tokens """ return re.findall(r'\w+(?:[-_]\w+)*|\S', text) def join_tokens(tokens: List[str]) -> str: """Join tokens with proper spacing. Args: tokens: List of tokens to join Returns: Joined text string """ text = "" for token in tokens: if token in {",", ".", "!", "?", ":", ";", "..."}: text = text.rstrip() + token else: text += " " + token return text.strip() def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]: """Prepare text for highlighting with NER annotations. Args: data: Dictionary containing tokenized text and NER annotations Returns: List of tuples containing text segments and their entity labels """ tokens = data["tokenized_text"] ner = data["ner"] highlighted_text = [] current_entity = None entity_tokens = [] normal_tokens = [] for idx, token in enumerate(tokens): if current_entity is None or idx > current_entity[1]: if entity_tokens: highlighted_text.append((" ".join(entity_tokens), current_entity[2])) entity_tokens = [] current_entity = next((entity for entity in ner if entity[0] == idx), None) if current_entity and current_entity[0] <= idx <= current_entity[1]: if normal_tokens: highlighted_text.append((" ".join(normal_tokens), None)) normal_tokens = [] entity_tokens.append(token + " ") else: if entity_tokens: highlighted_text.append((" ".join(entity_tokens), current_entity[2])) entity_tokens = [] normal_tokens.append(token + " ") if entity_tokens: highlighted_text.append((" ".join(entity_tokens), current_entity[2])) if normal_tokens: highlighted_text.append((" ".join(normal_tokens), None)) cleaned_highlighted_text = [] for text, label in highlighted_text: cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text) cleaned_highlighted_text.append((cleaned_text, label)) return cleaned_highlighted_text def save_dataset(data: List[Dict], filepath: str) -> None: """Save the dataset to a JSON file. Args: data: The dataset to save filepath: Path to save the dataset """ os.makedirs(os.path.dirname(filepath), exist_ok=True) with open(filepath, "wt") as file: json.dump(data, file, ensure_ascii=False) def load_dataset(filepath: str) -> List[Dict]: """Load a dataset from a JSON file. Args: filepath: Path to the dataset file Returns: The loaded dataset """ with open(filepath, "rt") as file: return json.load(file)