ttm-webapp-hf / pipeline /tibetan_stopwords.py
daniel-wojahn's picture
Refactoring of the tokenization pipeline, adjusted fasttext implementation
3011301 verified
import logging
logger = logging.getLogger(__name__)


def get_stopwords(use_lite: bool = False) -> set:
    """Return the Tibetan stopwords defined in a sibling stopword module.

    The stopwords are imported lazily (inside the function) from
    ``stopwords_lite_bo.py`` or ``stopwords_bo.py``, both of which are
    expected to expose a module-level ``STOPWORDS`` collection.

    Args:
        use_lite: If True, load the smaller, less aggressive list from
            ``stopwords_lite_bo.py``. Otherwise load the full list from
            ``stopwords_bo.py``.

    Returns:
        The ``STOPWORDS`` object of the selected module. If loading fails
        for any reason, the failure is logged and an empty set is returned
        instead of raising.
    """
    # Resolved up front so every log message below (success and failure
    # paths alike) can name the module that was being loaded.
    source_module_name = "stopwords_lite_bo" if use_lite else "stopwords_bo"

    stopwords_set = set()
    try:
        if use_lite:
            from .stopwords_lite_bo import STOPWORDS
        else:
            from .stopwords_bo import STOPWORDS
        stopwords_set = STOPWORDS
        logger.info(
            "Successfully loaded %d stopwords from %s.py",
            len(stopwords_set),
            source_module_name,
        )
    except ImportError:
        # Also covers a missing STOPWORDS name: ``from m import NAME``
        # raises ImportError when NAME is not defined in the module.
        logger.error(
            "Failed to import STOPWORDS from %s.py. "
            "Ensure the file exists in the 'pipeline' directory, is a Python module (ends in .py), "
            "and is importable (e.g., no syntax errors).",
            source_module_name,
        )
    except AttributeError:
        # Kept for backward compatibility; can only fire if code executed
        # while importing the module raises AttributeError itself.
        logger.error(
            "Variable 'STOPWORDS' (all caps) not found in %s.py. "
            "Please ensure the stopword set is defined with this name within the module.",
            source_module_name,
        )
    except Exception as e:
        # Best-effort loader: callers get an empty set rather than a crash.
        logger.error(
            "An unexpected error occurred while loading stopwords from %s.py: %s",
            source_module_name,
            e,
        )
    return stopwords_set