import hashlib
import logging
from typing import Dict, List

# Configure logging
logger = logging.getLogger(__name__)

# Initialize a cache for tokenization results
# Using a simple in-memory dictionary with text hash as key
_tokenization_cache: Dict[str, List[str]] = {}

# Maximum cache size (number of entries)
MAX_CACHE_SIZE = 1000
try:
    from botok import WordTokenizer

    # Initialize the tokenizer once at the module level
    BOTOK_TOKENIZER = WordTokenizer()
except ImportError:
    # Handle the case where botok might not be installed,
    # though it's a core dependency for this app.
    BOTOK_TOKENIZER = None
    logger.error("botok library not found. Tokenization will fail.")
    # Optionally, raise an error here if botok is absolutely critical for the app to even start:
    # raise ImportError("botok is required for tokenization. Please install it.")
def _get_text_hash(text: str) -> str:
    """
    Generate a hash for the input text to use as a cache key.

    Args:
        text: The input text to hash.

    Returns:
        A string representation of the MD5 hash of the input text.
    """
    return hashlib.md5(text.encode('utf-8')).hexdigest()
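
# Illustrative note: tokenize_texts below hashes the text together with the
# tokenization mode, so the same passage tokenized in "word" and "syllable"
# mode occupies two separate cache entries. A minimal sketch (the sample
# string is an arbitrary example, not part of this module):
#
#     key_word = _get_text_hash("བཀྲ་ཤིས་བདེ་ལེགས།" + "_word")
#     key_syl = _get_text_hash("བཀྲ་ཤིས་བདེ་ལེགས།" + "_syllable")
#     assert key_word != key_syl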
def tokenize_texts(texts: List[str], mode: str = "syllable") -> List[List[str]]:
    """
    Tokenizes a list of raw Tibetan texts using botok, with caching for performance.

    This function maintains an in-memory cache of previously tokenized texts to avoid
    redundant processing of the same content. The cache uses MD5 hashes of the input
    texts (combined with the tokenization mode) as keys.

    Args:
        texts: List of raw text strings to tokenize.
        mode: Tokenization granularity, either "word" or "syllable".
            Invalid values fall back to "syllable".

    Returns:
        List of tokenized texts (each as a list of tokens).

    Raises:
        RuntimeError: If the botok tokenizer failed to initialize.
    """
    if BOTOK_TOKENIZER is None:
        # This case should ideally be handled more gracefully,
        # perhaps by preventing analysis if the tokenizer failed to load.
        raise RuntimeError(
            "Botok tokenizer failed to initialize. Cannot tokenize texts."
        )

    tokenized_texts_list = []

    if mode not in ["word", "syllable"]:
        logger.warning(f"Invalid tokenization mode: '{mode}'. Defaulting to 'syllable'.")
        mode = "syllable"
    # Process each text
    for text_content in texts:
        # Skip empty texts
        if not text_content.strip():
            tokenized_texts_list.append([])
            continue

        # Generate hash for cache lookup
        cache_key_string = text_content + f"_{mode}"  # Include mode in string for hashing
        text_hash = _get_text_hash(cache_key_string)

        # Check if we have this text in cache
        if text_hash in _tokenization_cache:
            # Cache hit - use cached tokens
            tokens = _tokenization_cache[text_hash]
            logger.debug(f"Cache hit for text hash {text_hash[:8]}... (mode: {mode})")
        else:
            # Cache miss - tokenize and store in cache
            try:
                current_tokens = []
                if BOTOK_TOKENIZER:
                    raw_botok_items = list(BOTOK_TOKENIZER.tokenize(text_content))

                    if mode == "word":
                        for item_idx, w in enumerate(raw_botok_items):
                            if hasattr(w, 'text') and isinstance(w.text, str):
                                token_text = w.text.strip()
                                if token_text:  # Ensure token is not empty or just whitespace
                                    current_tokens.append(token_text)
                            # Optionally log if w.text is not a string or missing, for debugging:
                            # elif w.text is not None:
                            #     logger.debug(f"Token item {item_idx} has non-string text {type(w.text)} for hash {text_hash[:8]}. Skipping word.")
                            # else:
                            #     logger.debug(f"Token item {item_idx} missing text attribute for hash {text_hash[:8]}. Skipping word.")
                        logger.debug(
                            f"WORD TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                            f"{current_tokens[:30]}"
                        )
                    elif mode == "syllable":
                        # Original syllable extraction logic: prefer botok's per-word
                        # syllable list, falling back to the word text itself.
                        for item_idx, w in enumerate(raw_botok_items):
                            if hasattr(w, 'syls') and w.syls:
                                for syl_idx, syl_item in enumerate(w.syls):
                                    syllable_to_process = None
                                    if isinstance(syl_item, str):
                                        syllable_to_process = syl_item
                                    elif isinstance(syl_item, list):
                                        try:
                                            syllable_to_process = "".join(syl_item)
                                        except TypeError:
                                            logger.warning(
                                                f"Syllable item in w.syls was a list, but could not be joined (non-string elements?): {syl_item} "
                                                f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                                f"for hash {text_hash[:8]}. Skipping this syllable."
                                            )
                                            continue

                                    if syllable_to_process is not None:
                                        stripped_syl = syllable_to_process.strip()
                                        if stripped_syl:
                                            current_tokens.append(stripped_syl)
                                    elif syl_item is not None:
                                        logger.warning(
                                            f"Unexpected type for syllable item (neither str nor list): {type(syl_item)} ('{str(syl_item)[:100]}') "
                                            f"from word item {item_idx} (text: {getattr(w, 'text', 'N/A')}), syl_idx {syl_idx} "
                                            f"for hash {text_hash[:8]}. Skipping this syllable."
                                        )
                            elif hasattr(w, 'text') and w.text:  # Fallback if no 'syls' but in syllable mode
                                if isinstance(w.text, str):
                                    token_text = w.text.strip()
                                    if token_text:
                                        current_tokens.append(token_text)  # Treat as a single syllable/token
                                elif w.text is not None:
                                    logger.warning(
                                        f"Unexpected type for w.text (in syllable mode fallback): {type(w.text)} ('{str(w.text)[:100]}') "
                                        f"for item {item_idx} (POS: {getattr(w, 'pos', 'N/A')}) "
                                        f"for hash {text_hash[:8]}. Skipping this token."
                                    )
                        logger.debug(
                            f"SYLLABLE TOKENS FORMED for hash {text_hash[:8]} (mode: {mode}, first 30): "
                            f"{current_tokens[:30]}"
                        )

                    tokens = current_tokens
                else:
                    logger.error(f"BOTOK_TOKENIZER is None for text hash {text_hash[:8]}, cannot tokenize (mode: {mode}).")
                    tokens = []
                # Store in cache if not empty
                if tokens:
                    # If the cache is full, evict the first inserted entry (FIFO,
                    # since dicts preserve insertion order in Python 3.7+).
                    if len(_tokenization_cache) >= MAX_CACHE_SIZE:
                        _tokenization_cache.pop(next(iter(_tokenization_cache)))
                    _tokenization_cache[text_hash] = tokens
                    logger.debug(f"Added tokens to cache with hash {text_hash[:8]}... (mode: {mode})")
            except Exception as e:
                logger.error(f"Error tokenizing text (mode: {mode}): {e}")
                tokens = []

        tokenized_texts_list.append(tokens)

    return tokenized_texts_list
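
if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; the sample sentence is an
    # arbitrary Tibetan greeting, and it assumes botok is installed).
    sample_texts = ["བཀྲ་ཤིས་བདེ་ལེགས།"]

    # Syllable-level tokens (the default mode).
    print(tokenize_texts(sample_texts))

    # Word-level tokens; a repeated call with the same text and mode is
    # served from the in-memory cache rather than re-tokenized.
    print(tokenize_texts(sample_texts, mode="word"))
    print(tokenize_texts(sample_texts, mode="word"))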