# nam pham — feat: improve ui/ux (commit a33a001)
"""Dataset management module for NER annotation."""
from typing import List, Dict, Union, Tuple
import json
import os
import re
class DynamicDataset:
    """A class to manage and navigate through annotated dataset examples.

    Maintains a cursor (``current``) into ``data``; all navigation methods
    clamp the cursor into the valid index range instead of raising.
    """

    def __init__(
        self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.

        Args:
            data: List of examples, each containing tokenized text and NER
                annotations. A ``"validated"`` flag is added to each example
                that lacks one.
        """
        self.data = data
        self.data_len = len(self.data)
        # Start before the first example so the first next_example() lands on 0.
        self.current = -1
        for example in self.data:
            example.setdefault("validated", False)

    def _clamped(self, index: int) -> int:
        """Clamp an index into the valid range [0, data_len - 1].

        Mirrors the original check order: the upper bound is applied first,
        then the lower bound.
        """
        if index > self.data_len - 1:
            return self.data_len - 1
        if index < 0:
            return 0
        return index

    def next_example(self) -> None:
        """Move to the next example in the dataset (clamped at the end)."""
        self.current = self._clamped(self.current + 1)

    def previous_example(self) -> None:
        """Move to the previous example in the dataset (clamped at the start)."""
        self.current = self._clamped(self.current - 1)

    def example_by_id(self, id: int) -> None:
        """Navigate to a specific example by its ID.

        Args:
            id: The index of the example to navigate to; out-of-range values
                are clamped into the valid range.
        """
        self.current = self._clamped(id)

    def validate(self) -> None:
        """Mark the current example as validated."""
        self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Get the current example.

        Returns:
            The current example data.
        """
        return self.data[self.current]
# Compiled once at import time instead of on every call: matches word runs
# (optionally joined by hyphens/underscores) or any single non-space character.
_TOKEN_PATTERN = re.compile(r'\w+(?:[-_]\w+)*|\S')


def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.

    Args:
        text: The input text to tokenize

    Returns:
        List of tokens: word runs (hyphen/underscore-joined words kept whole)
        and individual punctuation characters.
    """
    return _TOKEN_PATTERN.findall(text)
def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Punctuation tokens are attached directly to the preceding text (no space
    before them); all other tokens are separated by single spaces.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    no_space_before = {",", ".", "!", "?", ":", ";", "..."}
    pieces: List[str] = []
    for tok in tokens:
        if tok in no_space_before and pieces:
            # Glue punctuation onto the previous piece, dropping any
            # trailing whitespace it carried.
            pieces[-1] = pieces[-1].rstrip() + tok
        else:
            pieces.append(tok)
    return " ".join(pieces).strip()
def prepare_for_highlight(data: Dict) -> List[Tuple[str, Union[str, None]]]:
    """Prepare text for highlighting with NER annotations.

    Walks the token list once, grouping consecutive tokens into entity runs
    and plain-text runs based on the spans in ``data["ner"]``.

    Args:
        data: Dictionary containing tokenized text (``"tokenized_text"``) and
            NER annotations (``"ner"``). Each ner entry is indexed here as
            (start_idx, end_idx, label) with an INCLUSIVE end index.

    Returns:
        List of tuples containing text segments and their entity labels;
        the label is None for segments outside any entity.
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]
    highlighted_text = []  # accumulated (segment, label-or-None) pairs
    current_entity = None  # ner span covering the current position, if any
    entity_tokens = []     # tokens collected for the in-progress entity run
    normal_tokens = []     # tokens collected for the in-progress plain run
    for idx, token in enumerate(tokens):
        # Past the end of the active entity (or none active yet): flush the
        # finished entity run, then check whether a new entity STARTS here.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            # Only spans starting exactly at idx are found; presumably ner
            # spans do not overlap — TODO confirm upstream guarantees this.
            current_entity = next((entity for entity in ner if entity[0] == idx), None)
        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            # Entering/continuing an entity: flush any pending plain run first.
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens), None))
                normal_tokens = []
            # NOTE(review): each token gets a trailing space AND runs are
            # joined with " ", so words end up double-spaced; the cleanup
            # below only removes spaces before punctuation — confirm intended.
            entity_tokens.append(token + " ")
        else:
            # Outside any entity: flush a dangling entity run, then extend
            # the plain-text run.
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token + " ")
    # Flush whichever run is still open after the last token.
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens), None))
    # Strip the space the joins above leave before punctuation marks.
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))
    return cleaned_highlighted_text
def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.

    Creates the parent directory if it does not exist.

    Args:
        data: The dataset to save
        filepath: Path to save the dataset
    """
    directory = os.path.dirname(filepath)
    # os.makedirs("") raises FileNotFoundError, so only create directories
    # when the path actually has a directory component.
    if directory:
        os.makedirs(directory, exist_ok=True)
    # Explicit UTF-8 so the ensure_ascii=False output is portable — the
    # platform default encoding (e.g. on Windows) may not handle non-ASCII.
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)
def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.

    Args:
        filepath: Path to the dataset file

    Returns:
        The loaded dataset
    """
    # Explicit UTF-8 to match save_dataset, so non-ASCII text round-trips
    # correctly regardless of the platform's default encoding.
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)