"""Dataset management module for NER annotation."""
from typing import List, Dict, Union, Tuple
import json
import os
import re


class DynamicDataset:
    """A class to manage and navigate through annotated dataset examples."""

    def __init__(
        self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.

        Args:
            data: List of examples, each containing tokenized text and NER annotations
        """
        self.data = data
        self.data_len = len(self.data)
        self.current = -1
        # Examples start out unvalidated unless the flag is already present.
        for example in self.data:
            if "validated" not in example:
                example["validated"] = False

    def next_example(self) -> None:
        """Move to the next example in the dataset."""
        self.current += 1
        self._clamp_current()

    def previous_example(self) -> None:
        """Move to the previous example in the dataset."""
        self.current -= 1
        self._clamp_current()

    def example_by_id(self, id: int) -> None:
        """Navigate to a specific example by its ID.

        Args:
            id: The index of the example to navigate to
        """
        self.current = id
        self._clamp_current()

    def _clamp_current(self) -> None:
        """Clamp the cursor to the valid index range [0, data_len - 1]."""
        if self.current > self.data_len - 1:
            self.current = self.data_len - 1
        elif self.current < 0:
            self.current = 0

    def validate(self) -> None:
        """Mark the current example as validated."""
        self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Get the current example.

        Returns:
            The current example data
        """
        return self.data[self.current]
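

# Usage sketch (illustrative data):
#   ds = DynamicDataset([{"tokenized_text": ["Hi"], "ner": []}])
#   ds.next_example()          # cursor moves from -1 onto the first example
#   ds.load_current_example()  # -> {'tokenized_text': ['Hi'], 'ner': [], 'validated': False}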


def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.

    Args:
        text: The input text to tokenize

    Returns:
        List of tokens
    """
    # Words (including hyphen/underscore compounds) or single non-space characters.
    return re.findall(r"\w+(?:[-_]\w+)*|\S", text)
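# Example:
#   tokenize_text("A state-of-the-art model, really!")
#   -> ['A', 'state-of-the-art', 'model', ',', 'really', '!']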


def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        # Attach punctuation directly to the preceding token.
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
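# Example: join_tokens(['Hello', ',', 'world', '!']) -> 'Hello, world!'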


def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]:
    """Prepare text for highlighting with NER annotations.

    Args:
        data: Dictionary containing tokenized text and NER annotations

    Returns:
        List of tuples containing text segments and their entity labels
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]

    highlighted_text = []
    current_entity = None
    entity_tokens = []
    normal_tokens = []

    for idx, token in enumerate(tokens):
        # Once past the end of the running entity span, flush it and look for
        # an entity that starts at the current token.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append(("".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)

        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            # Entering an entity span: flush any buffered plain text first.
            if normal_tokens:
                highlighted_text.append(("".join(normal_tokens), None))
                normal_tokens = []
            entity_tokens.append(token + " ")
        else:
            if entity_tokens:
                highlighted_text.append(("".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token + " ")

    # Flush whatever span is still open at the end of the text.
    if entity_tokens:
        highlighted_text.append(("".join(entity_tokens), current_entity[2]))
    if normal_tokens:
        highlighted_text.append(("".join(normal_tokens), None))

    # Tokens already carry trailing spaces, so drop the space that would
    # otherwise precede punctuation inside each segment.
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r"\s(?=[,\.!?…:;])", "", text)
        cleaned_highlighted_text.append((cleaned_text, label))

    return cleaned_highlighted_text
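# Example (labels are illustrative; ner spans are [start, end, label] with
# inclusive token indices):
#   prepare_for_highlight({
#       "tokenized_text": ["Alice", "flew", "to", "Paris"],
#       "ner": [[0, 0, "person"], [3, 3, "location"]],
#   })
#   -> [('Alice ', 'person'), ('flew to ', None), ('Paris ', 'location')]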


def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.

    Args:
        data: The dataset to save
        filepath: Path to save the dataset
    """
    # Only create parent directories when the path actually contains any;
    # os.makedirs("") would raise FileNotFoundError.
    dirname = os.path.dirname(filepath)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)


def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.

    Args:
        filepath: Path to the dataset file

    Returns:
        The loaded dataset
    """
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)
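

if __name__ == "__main__":
    # Minimal smoke-test sketch; the example sentence and entity labels below
    # are illustrative, not part of any real dataset.
    dataset = DynamicDataset(
        [
            {
                "tokenized_text": tokenize_text("Alice flew to Paris."),
                "ner": [[0, 0, "person"], [3, 3, "location"]],
            }
        ]
    )
    dataset.next_example()  # move the cursor from -1 onto the first example
    example = dataset.load_current_example()
    print(join_tokens(example["tokenized_text"]))  # Alice flew to Paris.
    print(prepare_for_highlight(example))

    dataset.validate()
    assert example["validated"] is True

    # Round-trip the dataset through a temporary JSON file.
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "dataset.json")
        save_dataset(dataset.data, path)
        assert load_dataset(path) == dataset.data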