"""Text processing utilities for NER annotation."""

import re
from typing import List, Dict, Union, Tuple

# A token is either a run of word characters (optionally joined by "-" or
# "_"), or any single non-whitespace character such as a punctuation mark.
_TOKEN_RE = re.compile(r'\w+(?:[-_]\w+)*|\S')

# Sentence boundary: whitespace that directly follows ".", "!" or "?".
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])\s+')

# Tokens that are glued to the preceding text without a separating space.
_ATTACH_TO_PREVIOUS = frozenset({",", ".", "!", "?", ":", ";", "..."})


def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.

    Words (including hyphen/underscore-joined compounds such as
    ``state-of-the-art``) become one token each; every other
    non-whitespace character becomes its own token. Whitespace is dropped.

    Args:
        text: The input text to tokenize

    Returns:
        List of tokens
    """
    return _TOKEN_RE.findall(text)


def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Tokens are separated by a single space, except that the punctuation
    tokens ``, . ! ? : ; ...`` are attached directly to the text before them.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string, without leading or trailing whitespace
    """
    text = ""
    for token in tokens:
        if token in _ATTACH_TO_PREVIOUS:
            # Drop the separator that precedes the punctuation mark.
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()


def _split_long_sentence(
    tokens: List[str],
    max_tokens: int,
    overlap: int
) -> List[str]:
    """Cut one over-long token list into space-joined overlapping windows.

    Callers must ensure ``0 <= overlap < max_tokens`` so that every window
    starts strictly after the previous one.
    """
    windows = []
    total = len(tokens)
    start = 0
    while start < total:
        end = min(start + max_tokens, total)
        windows.append(" ".join(tokens[start:end]))
        # Step back by `overlap` tokens, except after the last window.
        start = end - overlap if end < total else end
    return windows


def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into chunks.

    The text is split into sentences, and whole sentences are packed into
    chunks of at most ``max_tokens`` tokens. A single sentence longer than
    ``max_tokens`` is cut into windows that share ``overlap`` tokens with
    their predecessor. Chunks built from whole sentences do not overlap.
    Each chunk is the space-joined token sequence, so punctuation is
    separated from words by a space.

    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk (must be >= 1)
        overlap: Number of tokens shared between consecutive windows of an
            over-long sentence. Values outside ``[0, max_tokens - 1]`` are
            clamped into that range so that splitting always advances and
            never skips tokens.

    Returns:
        List of text chunks; empty if the text contains no tokens

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1
    """
    if max_tokens < 1:
        raise ValueError(
            "max_tokens must be at least 1, got {!r}".format(max_tokens)
        )
    # An overlap >= max_tokens would keep the window from ever advancing
    # (endless loop); a negative one would silently drop tokens.
    overlap = max(0, min(overlap, max_tokens - 1))

    chunks = []
    current_chunk = []
    current_length = 0

    # Split into sentences first to preserve natural boundaries.
    for sentence in _SENTENCE_BOUNDARY_RE.split(text):
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        if sentence_length > max_tokens:
            # Flush what has been collected so far, then window the
            # over-long sentence on its own.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0
            chunks.extend(
                _split_long_sentence(sentence_tokens, max_tokens, overlap)
            )
        elif current_length + sentence_length > max_tokens:
            # The sentence does not fit any more: close the current chunk
            # and start a new one with this sentence.
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.

    Each entry's ``'token'`` text is tokenized and appended to one flat
    token list. For an entry with a truthy ``'class_or_confidence'`` value,
    a span covering that entry's tokens is recorded.

    Args:
        data: List of dicts with a ``'token'`` text and a
            ``'class_or_confidence'`` label (falsy means unlabeled)

    Returns:
        Tuple of (tokens, ner_spans), where each span is
        ``(start_index, end_index, label)`` with an inclusive end index
        into ``tokens``
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        label = entry['class_or_confidence']
        token_list = tokenize_text(entry['token'])
        tokens.extend(token_list)

        # A labeled entry that yields no tokens (empty or whitespace-only
        # text) has nothing to annotate; recording it would produce a span
        # whose end lies before its start.
        if label and token_list:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner