"""Text processing utilities for NER annotation."""

import re
from typing import List, Dict, Union, Tuple

def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.
    
    Args:
        text: The input text to tokenize
        
    Returns:
        List of tokens
    """
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.
    
    Args:
        tokens: List of tokens to join
        
    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
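        # Attach punctuation marks directly to the preceding token
        # instead of inserting a space before them.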
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()

def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into overlapping chunks.
    
    Preserves sentence boundaries and context when possible.
    
    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between consecutive chunks
            when an over-long sentence has to be split
        
    Returns:
        List of text chunks suitable for GLiNER
    """
    # Keep the overlap strictly smaller than max_tokens so the sliding
    # window over an over-long sentence always advances
    overlap = max(0, min(overlap, max_tokens - 1))

    # First split into sentences to preserve natural boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        # If a single sentence is too long, split it
        if sentence_length > max_tokens:
            # If we have accumulated tokens, add them as a chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split the long sentence into smaller chunks
            start = 0
            while start < sentence_length:
                end = min(start + max_tokens, sentence_length)
                chunk_tokens = sentence_tokens[start:end]
                chunks.append(" ".join(chunk_tokens))
                start = end - overlap if end < sentence_length else end

        # If adding this sentence would exceed max_tokens, start a new chunk
        elif current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.
    
    Args:
        data: List of dicts, each with a 'token' text and a
            'class_or_confidence' label (None for unlabelled text)

    Returns:
        Tuple of (tokens, ner_spans), where each span is an inclusive
        (start_token_idx, end_token_idx, label) triple
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(text_chunk)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        # Record a span only for labelled chunks that produced tokens
        if label and token_list:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner
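

# Minimal usage sketch (illustrative only: the sample text, labels and
# parameter values below are invented for demonstration).
if __name__ == "__main__":
    sample = "Barack Obama visited Paris. He met Emmanuel Macron!"

    # Tokenize and rebuild the text; join_tokens keeps punctuation attached
    tokens = tokenize_text(sample)
    print(tokens)
    # ['Barack', 'Obama', 'visited', 'Paris', '.', 'He', 'met',
    #  'Emmanuel', 'Macron', '!']
    print(join_tokens(tokens))
    # Barack Obama visited Paris. He met Emmanuel Macron!

    # Chunk a longer text with a small window so the splitting is visible
    long_text = " ".join([sample] * 20)   # 40 sentences, 200 tokens in total
    chunks = process_text_for_gliner(long_text, max_tokens=40, overlap=8)
    print(len(chunks), "chunks")          # 5 chunks of 40 tokens each

    # Annotation entries in the dict format expected by
    # extract_tokens_and_labels (label None means unlabelled text)
    annotated = [
        {"token": "Barack Obama", "class_or_confidence": "PERSON"},
        {"token": " visited ", "class_or_confidence": None},
        {"token": "Paris", "class_or_confidence": "LOC"},
        {"token": ".", "class_or_confidence": None},
    ]
    print(extract_tokens_and_labels(annotated))
    # (['Barack', 'Obama', 'visited', 'Paris', '.'],
    #  [(0, 1, 'PERSON'), (3, 3, 'LOC')])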