"""Text processing utilities for NER annotation."""
import re
from typing import List, Dict, Union, Tuple
def tokenize_text(text: str) -> List[str]:
"""Tokenize the input text into a list of tokens.
Args:
text: The input text to tokenize
Returns:
List of tokens
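    Example:
        Hyphen/underscore-joined words stay intact; other punctuation
        becomes separate single-character tokens:

        >>> tokenize_text("Hello, state-of-the-art world!")
        ['Hello', ',', 'state-of-the-art', 'world', '!']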
"""
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)


def join_tokens(tokens: List[str]) -> str:
"""Join tokens with proper spacing.
Args:
tokens: List of tokens to join
Returns:
Joined text string
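    Example:
        Roughly the inverse of tokenize_text for simple cases; punctuation
        re-attaches to the preceding token:

        >>> join_tokens(['Hello', ',', 'world', '!'])
        'Hello, world!'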
"""
text = ""
for token in tokens:
if token in {",", ".", "!", "?", ":", ";", "..."}:
text = text.rstrip() + token
else:
text += " " + token
    return text.strip()


def process_text_for_gliner(
text: str,
max_tokens: int = 256,
overlap: int = 32
) -> List[str]:
"""Process text for GLiNER by splitting long texts into overlapping chunks.
Preserves sentence boundaries and context when possible.
Args:
text: The input text to process
max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between consecutive chunks
            when a single sentence has to be split across chunks
Returns:
List of text chunks suitable for GLiNER
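    Example:
        Illustrative call with a deliberately small max_tokens to show
        chunking; note that chunks are re-joined with plain spaces:

        >>> process_text_for_gliner("One two three. Four five six.", max_tokens=4)
        ['One two three .', 'Four five six .']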
"""
# First split into sentences to preserve natural boundaries
sentences = re.split(r'(?<=[.!?])\s+', text)
chunks = []
current_chunk = []
current_length = 0
for sentence in sentences:
# Tokenize the sentence
sentence_tokens = tokenize_text(sentence)
sentence_length = len(sentence_tokens)
# If a single sentence is too long, split it
if sentence_length > max_tokens:
# If we have accumulated tokens, add them as a chunk
if current_chunk:
chunks.append(" ".join(current_chunk))
current_chunk = []
current_length = 0
# Split the long sentence into smaller chunks
start = 0
while start < sentence_length:
end = min(start + max_tokens, sentence_length)
chunk_tokens = sentence_tokens[start:end]
chunks.append(" ".join(chunk_tokens))
start = end - overlap if end < sentence_length else end
# If adding this sentence would exceed max_tokens, start a new chunk
elif current_length + sentence_length > max_tokens:
chunks.append(" ".join(current_chunk))
current_chunk = sentence_tokens
current_length = sentence_length
else:
current_chunk.extend(sentence_tokens)
current_length += sentence_length
# Add any remaining tokens as the final chunk
if current_chunk:
chunks.append(" ".join(current_chunk))
    return chunks


def extract_tokens_and_labels(
data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
"""Extract tokens and NER labels from annotation data.
Args:
        data: List of dicts, each with a 'token' text string and a
            'class_or_confidence' label (None for unlabeled text)
    Returns:
        Tuple of (tokens, ner_spans), where each span is an inclusive
        (start_token_idx, end_token_idx, label) triple
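    Example:
        Illustrative annotation data; the 'person' and 'location' labels
        are arbitrary placeholders:

        >>> extract_tokens_and_labels([
        ...     {'token': 'Barack Obama', 'class_or_confidence': 'person'},
        ...     {'token': ' visited ', 'class_or_confidence': None},
        ...     {'token': 'Hanoi', 'class_or_confidence': 'location'},
        ... ])
        (['Barack', 'Obama', 'visited', 'Hanoi'], [(0, 1, 'person'), (3, 3, 'location')])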
"""
tokens = []
ner = []
token_start_idx = 0
for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']
        # Tokenize the current text chunk so multi-word entries map to token spans
        token_list = tokenize_text(text_chunk)
# Append tokens to the main tokens list
tokens.extend(token_list)
if label:
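            # Span indices are inclusive: end points at the last token of this entry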
token_end_idx = token_start_idx + len(token_list) - 1
ner.append((token_start_idx, token_end_idx, label))
token_start_idx += len(token_list)
return tokens, ner
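# Minimal sanity check: run the doctest examples embedded in the docstrings.
if __name__ == "__main__":
    import doctest
    doctest.testmod()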