"""Text processing utilities for NER annotation."""
import re
from typing import List, Dict, Union, Tuple
def tokenize_text(text: str) -> List[str]:
"""Tokenize the input text into a list of tokens.
Args:
text: The input text to tokenize
Returns:
List of tokens
"""
return re.findall(r'\w+(?:[-_]\w+)*|\S', text)
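
# Illustrative usage (example added for clarity, not part of the original file):
# hyphen- or underscore-joined words stay intact, while any other
# non-whitespace character such as punctuation becomes its own token.
#   tokenize_text("Hello, world! GLiNER-based NER.")
#   -> ['Hello', ',', 'world', '!', 'GLiNER-based', 'NER', '.']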


def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.

    Args:
        tokens: List of tokens to join

    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()
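
# Illustrative usage (example added for clarity, not part of the original file):
# common punctuation marks are re-attached to the preceding token.
#   join_tokens(['Hello', ',', 'world', '!'])  ->  'Hello, world!'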


def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into overlapping chunks.

    Preserves sentence boundaries and context when possible.

    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of text chunks suitable for GLiNER
    """
    # First split into sentences to preserve natural boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        # If a single sentence is too long, split it
        if sentence_length > max_tokens:
            # If we have accumulated tokens, add them as a chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split the long sentence into smaller chunks
            start = 0
            while start < sentence_length:
                end = min(start + max_tokens, sentence_length)
                chunk_tokens = sentence_tokens[start:end]
                chunks.append(" ".join(chunk_tokens))
                start = end - overlap if end < sentence_length else end

        # If adding this sentence would exceed max_tokens, start a new chunk
        elif current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
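
# Illustrative usage (example added for clarity; the small max_tokens/overlap
# values are assumptions chosen for demonstration, not recommended settings):
#   process_text_for_gliner(
#       "one two three four five. six seven eight nine ten.",
#       max_tokens=8,
#       overlap=2,
#   )
#   -> ['one two three four five .', 'six seven eight nine ten .']
# Note that chunks are rebuilt with " ".join, so punctuation is space-separated;
# join_tokens can restore normal spacing if needed.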


def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.

    Args:
        data: List of token-label pairs

    Returns:
        Tuple of (tokens, ner_spans)
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(text_chunk)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        if label:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner
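

# Minimal demo sketch (added for illustration; the sample data is hypothetical
# and simply assumes the 'token' / 'class_or_confidence' annotation format that
# extract_tokens_and_labels above already expects).
if __name__ == "__main__":
    sample = [
        {"token": "Barack Obama", "class_or_confidence": "person"},
        {"token": " visited ", "class_or_confidence": None},
        {"token": "Paris", "class_or_confidence": "location"},
        {"token": ".", "class_or_confidence": None},
    ]
    toks, spans = extract_tokens_and_labels(sample)
    print(toks)               # ['Barack', 'Obama', 'visited', 'Paris', '.']
    print(spans)              # [(0, 1, 'person'), (3, 3, 'location')]
    print(join_tokens(toks))  # 'Barack Obama visited Paris.'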