"""Text processing utilities for NER annotation.""" | |
import re | |
from typing import List, Dict, Union, Tuple | |
def tokenize_text(text: str) -> List[str]: | |
"""Tokenize the input text into a list of tokens. | |
Args: | |
text: The input text to tokenize | |
Returns: | |
List of tokens | |
""" | |
return re.findall(r'\w+(?:[-_]\w+)*|\S', text) | |
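
# A quick illustration (worked out by hand, not part of the module's API):
# hyphenated words survive as single tokens, while punctuation is split off
# into single-character tokens.
#
#   tokenize_text("State-of-the-art models, e.g. GLiNER!")
#   -> ['State-of-the-art', 'models', ',', 'e', '.', 'g', '.', 'GLiNER', '!']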


def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        # Attach punctuation directly to the preceding token; everything
        # else is separated by a single space.
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
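
# Example round trip (illustrative only): punctuation reattaches to the
# preceding token, so simple sentences survive tokenize -> join. The '...'
# entry only matters for tokens from other sources, since tokenize_text()
# always splits an ellipsis into three separate '.' tokens.
#
#   join_tokens(['Hello', ',', 'world', '!'])  -> 'Hello, world!'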


def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into overlapping chunks.

    Preserves sentence boundaries and context when possible.

    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks suitable for GLiNER
    """
    # First split into sentences to preserve natural boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        # If a single sentence is too long, split it
        if sentence_length > max_tokens:
            # If we have accumulated tokens, add them as a chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split the long sentence into overlapping windows. Advancing by
            # at least one token guards against an infinite loop when
            # overlap >= max_tokens.
            start = 0
            while start < sentence_length:
                end = min(start + max_tokens, sentence_length)
                chunks.append(" ".join(sentence_tokens[start:end]))
                start = max(end - overlap, start + 1) if end < sentence_length else end

        # If adding this sentence would exceed max_tokens, start a new chunk
        elif current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
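
# Sketch of the intended call pattern (illustrative; 'model' and the label
# set are assumptions, not part of this module). The overlap gives entities
# near a chunk boundary a second chance to be seen with full context:
#
#   chunks = process_text_for_gliner(long_text, max_tokens=256, overlap=32)
#   for chunk in chunks:
#       entities = model.predict_entities(chunk, ["person", "location"])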


def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.

    Args:
        data: List of token-label pairs

    Returns:
        Tuple of (tokens, ner_spans), where each span is an inclusive
        (start, end, label) triple over token indices.
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(text_chunk)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        # Record an inclusive token span for labelled, non-empty chunks;
        # whitespace-only chunks yield no tokens and so produce no span.
        if label and token_list:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner
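
# Worked example with a hypothetical annotation payload (the 'token' /
# 'class_or_confidence' keys mirror the format this function expects):
#
#   data = [
#       {'token': 'Barack Obama', 'class_or_confidence': 'PERSON'},
#       {'token': ' visited ', 'class_or_confidence': None},
#       {'token': 'Paris', 'class_or_confidence': 'LOC'},
#   ]
#   extract_tokens_and_labels(data)
#   -> (['Barack', 'Obama', 'visited', 'Paris'],
#       [(0, 1, 'PERSON'), (3, 3, 'LOC')])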