"""Dataset management module for NER annotation."""

from typing import Dict, List, Optional, Tuple, Union
import json
import os
import re

class DynamicDataset:
    """A class to manage and navigate through annotated dataset examples."""
    
    def __init__(
            self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.
        
        Args:
            data: List of examples, each containing tokenized text and NER annotations
        """
        self.data = data
        self.data_len = len(self.data)
        self.current = -1
        for example in self.data:
            if "validated" not in example:
                example["validated"] = False

    def _clamp_index(self, index: int) -> int:
        """Clamp an index to the valid range [0, data_len - 1]."""
        return max(0, min(index, self.data_len - 1))

    def next_example(self) -> None:
        """Move to the next example in the dataset."""
        self.current = self._clamp_index(self.current + 1)

    def previous_example(self) -> None:
        """Move to the previous example in the dataset."""
        self.current = self._clamp_index(self.current - 1)

    def example_by_id(self, example_id: int) -> None:
        """Navigate to a specific example by its index.
        
        Args:
            example_id: The index of the example to navigate to
        """
        self.current = self._clamp_index(example_id)

    def validate(self) -> None:
        """Mark the current example as validated."""
        # Guard against calls before any navigation (current may still be -1,
        # which would otherwise silently validate the *last* example).
        if 0 <= self.current < self.data_len:
            self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Get the current example.
        
        Returns:
            The current example data
        """
        return self.data[self.current]
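
# Illustrative usage sketch (not part of the module API); the example data is
# hypothetical but follows the [start, end, label] span format consumed by
# prepare_for_highlight below:
#
#     dataset = DynamicDataset([
#         {"tokenized_text": ["Alice", "works", "at", "Acme"],
#          "ner": [[0, 0, "PERSON"], [3, 3, "ORG"]]},
#     ])
#     dataset.next_example()                    # advance from -1 to index 0
#     dataset.validate()                        # mark the example as validated
#     example = dataset.load_current_example()  # -> the dict above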

def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.
    
    Args:
        text: The input text to tokenize
        
    Returns:
        List of tokens
    """
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
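
# For instance (illustrative), hyphenated and underscored words stay whole
# while other punctuation becomes single-character tokens:
#
#     tokenize_text("Dr. Smith-Jones arrived!")
#     # -> ['Dr', '.', 'Smith-Jones', 'arrived', '!']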

def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.
    
    Args:
        tokens: List of tokens to join
        
    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        # Attach punctuation to the preceding token; "…" is included to match
        # the punctuation class handled in prepare_for_highlight below.
        if token in {",", ".", "!", "?", ":", ";", "...", "…"}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
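
# For instance (illustrative), punctuation reattaches without a leading space:
#
#     join_tokens(['Hello', ',', 'world', '!'])  # -> 'Hello, world!'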

def prepare_for_highlight(data: Dict) -> List[Tuple[str, Optional[str]]]:
    """Prepare text for highlighting with NER annotations.
    
    Args:
        data: Dictionary with a "tokenized_text" list and a "ner" list of
            [start, end, label] spans, where start and end are inclusive
            token indices
        
    Returns:
        List of tuples containing text segments and their entity labels
        (None for unlabeled segments)
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]

    highlighted_text = []
    current_entity = None
    entity_tokens = []
    normal_tokens = []

    for idx, token in enumerate(tokens):
        # Once past the end of the active entity span, flush its tokens and
        # look for a new entity starting at this index.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens) + " ", current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)

        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens) + " ", None))
                normal_tokens = []
            entity_tokens.append(token)
        else:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens) + " ", current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token)

    # Flush whatever remains after the last token.
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens) + " ", current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens) + " ", None))

    # Remove the space that joining leaves before punctuation tokens.
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r'\s+(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))

    return cleaned_highlighted_text
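
# For instance (illustrative), with inclusive [start, end, label] spans:
#
#     prepare_for_highlight({
#         "tokenized_text": ["Alice", "works", "at", "Acme", "."],
#         "ner": [[0, 0, "PERSON"], [3, 3, "ORG"]],
#     })
#     # -> [('Alice ', 'PERSON'), ('works at ', None), ('Acme ', 'ORG'), ('. ', None)]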

def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.
    
    Args:
        data: The dataset to save
        filepath: Path to save the dataset
    """
    # os.path.dirname is "" for bare filenames, and os.makedirs("") raises.
    directory = os.path.dirname(filepath)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)

def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.
    
    Args:
        filepath: Path to the dataset file
        
    Returns:
        The loaded dataset
    """
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)
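
if __name__ == "__main__":
    # Minimal end-to-end sketch; the sentence, spans, and file path here are
    # illustrative, not fixtures shipped with the module.
    examples = [
        {
            "tokenized_text": tokenize_text("Alice works at Acme."),
            "ner": [[0, 0, "PERSON"], [3, 3, "ORG"]],  # [start, end, label]
        }
    ]
    dataset = DynamicDataset(examples)
    dataset.next_example()  # move onto the first example
    dataset.validate()      # flag it as reviewed
    print(prepare_for_highlight(dataset.load_current_example()))

    save_dataset(dataset.data, "annotations/demo.json")
    print(load_dataset("annotations/demo.json"))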