"""Text processing utilities for NER annotation."""

import re
from typing import List, Dict, Union, Tuple

def tokenize_text(text: str) -> List[str]:
    """Tokenize the input text into a list of tokens.
    
    Args:
        text: The input text to tokenize
        
    Returns:
        List of tokens
    """
    return re.findall(r'\w+(?:[-_]\w+)*|\S', text)

def join_tokens(tokens: List[str]) -> str:
    """Join tokens with proper spacing.
    
    Args:
        tokens: List of tokens to join
        
    Returns:
        Joined text string
    """
    text = ""
    for token in tokens:
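        # Attach punctuation marks directly to the preceding token
        # instead of inserting a space before them.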
        if token in {",", ".", "!", "?", ":", ";", "..."}:
            text = text.rstrip() + token
        else:
            text += " " + token
    return text.strip()

def process_text_for_gliner(
    text: str,
    max_tokens: int = 256,
    overlap: int = 32
) -> List[str]:
    """Process text for GLiNER by splitting long texts into overlapping chunks.
    
    Preserves sentence boundaries and context when possible.
    
    Args:
        text: The input text to process
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between consecutive chunks
            when an over-long sentence has to be split
        
    Returns:
        List of text chunks suitable for GLiNER
    """
    # Keep the overlap strictly smaller than max_tokens so the sliding
    # window over an over-long sentence always advances
    overlap = max(0, min(overlap, max_tokens - 1))

    # First split into sentences to preserve natural boundaries
    sentences = re.split(r'(?<=[.!?])\s+', text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        # Tokenize the sentence
        sentence_tokens = tokenize_text(sentence)
        sentence_length = len(sentence_tokens)

        # If a single sentence is too long, split it
        if sentence_length > max_tokens:
            # If we have accumulated tokens, add them as a chunk
            if current_chunk:
                chunks.append(" ".join(current_chunk))
                current_chunk = []
                current_length = 0

            # Split the long sentence into smaller chunks
            start = 0
            while start < sentence_length:
                end = min(start + max_tokens, sentence_length)
                chunk_tokens = sentence_tokens[start:end]
                chunks.append(" ".join(chunk_tokens))
                start = end - overlap if end < sentence_length else end

        # If adding this sentence would exceed max_tokens, start a new chunk
        elif current_length + sentence_length > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = sentence_tokens
            current_length = sentence_length
        else:
            current_chunk.extend(sentence_tokens)
            current_length += sentence_length

    # Add any remaining tokens as the final chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def extract_tokens_and_labels(
    data: List[Dict[str, Union[str, None]]]
) -> Tuple[List[str], List[Tuple[int, int, str]]]:
    """Extract tokens and NER labels from annotation data.
    
    Args:
        data: List of dicts, each with a 'token' text and a
            'class_or_confidence' label (None for unlabelled text)

    Returns:
        Tuple of (tokens, ner_spans), where each span is an inclusive
        (start_token_idx, end_token_idx, label) triple
    """
    tokens = []
    ner = []
    token_start_idx = 0

    for entry in data:
        text_chunk = entry['token']
        label = entry['class_or_confidence']

        # Tokenize the current text chunk
        token_list = tokenize_text(text_chunk)

        # Append tokens to the main tokens list
        tokens.extend(token_list)

        # Record a span only for labelled chunks that produced tokens
        if label and token_list:
            token_end_idx = token_start_idx + len(token_list) - 1
            ner.append((token_start_idx, token_end_idx, label))

        token_start_idx += len(token_list)

    return tokens, ner
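

# Minimal usage sketch (illustrative only: the sample text, labels and
# parameter values below are invented for demonstration).
if __name__ == "__main__":
    sample = "Barack Obama visited Paris. He met Emmanuel Macron!"

    # Tokenize and rebuild the text; join_tokens keeps punctuation attached
    tokens = tokenize_text(sample)
    print(tokens)
    # ['Barack', 'Obama', 'visited', 'Paris', '.', 'He', 'met',
    #  'Emmanuel', 'Macron', '!']
    print(join_tokens(tokens))
    # Barack Obama visited Paris. He met Emmanuel Macron!

    # Chunk a longer text with a small window so the splitting is visible
    long_text = " ".join([sample] * 20)   # 40 sentences, 200 tokens in total
    chunks = process_text_for_gliner(long_text, max_tokens=40, overlap=8)
    print(len(chunks), "chunks")          # 5 chunks of 40 tokens each

    # Annotation entries in the dict format expected by
    # extract_tokens_and_labels (label None means unlabelled text)
    annotated = [
        {"token": "Barack Obama", "class_or_confidence": "PERSON"},
        {"token": " visited ", "class_or_confidence": None},
        {"token": "Paris", "class_or_confidence": "LOC"},
        {"token": ".", "class_or_confidence": None},
    ]
    print(extract_tokens_and_labels(annotated))
    # (['Barack', 'Obama', 'visited', 'Paris', '.'],
    #  [(0, 1, 'PERSON'), (3, 3, 'LOC')])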