Spaces:
Running
Running
"""Dataset management module for NER annotation.""" | |
from typing import List, Dict, Union, Tuple | |
import json | |
import os | |
import re | |
class DynamicDataset:
    """Manage and navigate through annotated dataset examples.

    Keeps a cursor (``current``) into ``data``; every navigation method
    clamps the cursor into the valid index range so it never goes out of
    bounds on a non-empty dataset.
    """

    def __init__(
        self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.

        Args:
            data: List of examples, each containing tokenized text and NER
                annotations. Examples missing a "validated" key get one,
                defaulted to False.
        """
        self.data = data
        self.data_len = len(self.data)
        # Cursor starts before the first example; the first next_example()
        # call lands on index 0.
        self.current = -1
        for example in self.data:
            example.setdefault("validated", False)

    def _clamp(self, index: int) -> int:
        """Clamp *index* into [0, data_len - 1] (shared by all navigation)."""
        return max(0, min(index, self.data_len - 1))

    def next_example(self) -> None:
        """Move to the next example, staying within bounds."""
        self.current = self._clamp(self.current + 1)

    def previous_example(self) -> None:
        """Move to the previous example, staying within bounds."""
        self.current = self._clamp(self.current - 1)

    def example_by_id(self, id: int) -> None:
        """Navigate to a specific example by its index.

        Args:
            id: The index of the example to navigate to; out-of-range values
                are clamped. (Parameter name shadows the builtin but is kept
                for backward compatibility with keyword callers.)
        """
        self.current = self._clamp(id)

    def validate(self) -> None:
        """Mark the current example as validated."""
        self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Get the current example.

        Returns:
            The current example data.
        """
        return self.data[self.current]
def tokenize_text(text: str) -> List[str]:
    """Split *text* into word and punctuation tokens.

    Words may contain internal hyphens or underscores (e.g. "state-of-the-art"
    stays a single token); any other non-whitespace character becomes its own
    one-character token.

    Args:
        text: The input text to tokenize

    Returns:
        List of tokens
    """
    word_or_symbol = re.compile(r'\w+(?:[-_]\w+)*|\S')
    return word_or_symbol.findall(text)
def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Ordinary tokens are separated by single spaces; punctuation tokens are
    glued directly onto the preceding text with no space before them.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    no_space_before = {",", ".", "!", "?", ":", ";", "..."}
    pieces: List[str] = []
    for token in tokens:
        if token in no_space_before and pieces:
            # Attach punctuation to the previous piece, dropping any
            # trailing whitespace it carried.
            pieces[-1] = pieces[-1].rstrip() + token
        else:
            pieces.append(token)
    return " ".join(pieces).strip()
def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]:
    """Prepare text for highlighting with NER annotations.

    Walks the token list once, grouping tokens that fall inside an NER span
    and emitting (text, label) pairs; tokens outside every span are grouped
    into (text, None) pairs, preserving original order.

    Args:
        data: Dictionary with "tokenized_text" (list of tokens) and "ner"
            (list of [start, end, label] spans; start/end are token indices,
            and end is treated as inclusive per the `<=` check below).

    Returns:
        List of (text segment, entity label or None) tuples.
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]
    highlighted_text = []
    current_entity = None  # the [start, end, label] span being accumulated, if any
    entity_tokens = []  # tokens collected for the open entity span
    normal_tokens = []  # tokens collected outside any entity span
    for idx, token in enumerate(tokens):
        # No open span, or we walked past its (inclusive) end: flush the
        # finished span, then look for a span starting exactly at this token.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)
        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            # Entering/continuing an entity: emit any pending plain text first
            # so segments come out in document order.
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens), None))
                normal_tokens = []
            # NOTE(review): each token gets a trailing space AND segments are
            # joined with " ", so multi-token segments contain double spaces —
            # presumably tolerated by the downstream highlighter; confirm.
            entity_tokens.append(token + " ")
        else:
            # Defensive flush; entity_tokens appears to always be empty here
            # because the branch above already flushed it — kept as-is.
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token + " ")
    # Flush whichever segment was still open when the tokens ran out.
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens), None))
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        # Remove the space the joins above left hanging before punctuation.
        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))
    return cleaned_highlighted_text
def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.

    Args:
        data: The dataset to save
        filepath: Path to save the dataset; missing parent directories are
            created automatically.
    """
    parent_dir = os.path.dirname(filepath)
    # dirname() is "" for a bare filename; os.makedirs("") would raise
    # FileNotFoundError, so only create directories when one is present.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # Explicit UTF-8: with ensure_ascii=False the file contains raw non-ASCII
    # characters, which would break under a non-UTF-8 locale default encoding.
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)
def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.

    Args:
        filepath: Path to the dataset file

    Returns:
        The loaded dataset

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 so non-ASCII annotations load correctly regardless of
    # the platform's default locale encoding.
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)