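"""
Tibetan text tokenization utilities built on botok, with a simple in-memory
cache so identical inputs are only tokenized once.
"""
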
import hashlib
import logging
from typing import Dict, List

# Configure logging
logger = logging.getLogger(__name__)

# Cache for tokenization results: a simple in-memory dict keyed by text hash.
_tokenization_cache: Dict[str, List[str]] = {}

# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000

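# Hypothetical convenience helper (not required by the functions below): clear
# the cache, e.g. between test runs or to free memory in a long-lived process.
def clear_tokenization_cache() -> None:
    """Remove all cached tokenization results."""
    _tokenization_cache.clear()
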
try:
from botok import WordTokenizer
# Initialize the tokenizer once at the module level
BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
    # botok might not be installed, though it is a core dependency for this app.
    BOTOK_TOKENIZER = None
    logger.error("botok library not found. Tokenization will fail.")
    # Optionally, raise here if the app cannot start without botok:
    # raise ImportError("botok is required for tokenization. Please install it.")


def _get_text_hash(text: str) -> str:
    """
    Generate a hash of the input text for use as a cache key.

    MD5 is used only as a fast, stable cache key here, not for security.

    Args:
        text: The input text to hash.

    Returns:
        The hexadecimal MD5 digest of the UTF-8 encoded input text.
    """
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def tokenize_texts(texts: List[str]) -> List[List[str]]:
    """
    Tokenize a list of raw Tibetan texts using botok, with caching for performance.

    An in-memory cache of previously tokenized texts avoids redundant processing
    of identical content; MD5 hashes of the input texts serve as the cache keys.

    Args:
        texts: List of raw text strings to tokenize.

    Returns:
        List of tokenized texts (each a list of token strings), in input order.

    Raises:
        RuntimeError: If the botok tokenizer failed to initialize.
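
    Example (illustrative; the exact segmentation depends on botok's model):
        >>> tokenize_texts(["བཀྲ་ཤིས་བདེ་ལེགས།", ""])  # doctest: +SKIP
        [['བཀྲ་ཤིས་', 'བདེ་ལེགས', '།'], []]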
"""
if BOTOK_TOKENIZER is None:
# This case should ideally be handled more gracefully,
# perhaps by preventing analysis if the tokenizer failed to load.
raise RuntimeError(
"Botok tokenizer failed to initialize. Cannot tokenize texts."
)
tokenized_texts_list = []
# Process each text
for text_content in texts:
# Skip empty texts
if not text_content.strip():
tokenized_texts_list.append([])
continue
# Generate hash for cache lookup
text_hash = _get_text_hash(text_content)
# Check if we have this text in cache
if text_hash in _tokenization_cache:
# Cache hit - use cached tokens
tokens = _tokenization_cache[text_hash]
logger.debug(f"Cache hit for text hash {text_hash[:8]}...")
else:
# Cache miss - tokenize and store in cache
try:
tokens = [
w.text for w in BOTOK_TOKENIZER.tokenize(text_content) if w.text.strip()
]
# Store in cache if not empty
if tokens:
                    # If the cache is full, evict one entry: dicts preserve
                    # insertion order in Python 3.7+, so popping the first key
                    # removes the oldest entry (simple FIFO, not true LRU).
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
_tokenization_cache[text_hash] = tokens
logger.debug(f"Added tokens to cache with hash {text_hash[:8]}...")
            except Exception as e:
                # Fail soft: log the error and emit an empty token list for
                # this text so the output stays aligned with the input.
                logger.error(f"Error tokenizing text: {e}")
                tokens = []
tokenized_texts_list.append(tokens)
return tokenized_texts_list
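

# Minimal usage sketch: run this module directly to see the cache at work.
# The sample strings below are arbitrary Tibetan text chosen for illustration.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    samples = ["བཀྲ་ཤིས་བདེ་ལེགས།", ""]
    print(tokenize_texts(samples))  # first call tokenizes and fills the cache
    print(tokenize_texts(samples))  # second call should log a cache hit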