Spaces:

daniel-wojahn
/

ttm-webapp-hf

Running

ttm-webapp-hf / pipeline /tibetan_stopwords.py

Refactoring of the tokenization pipeline, adjusted fasttext implementation

3011301 verified 19 days ago

1.62 kB

	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	def get_stopwords(use_lite: bool = False) -> set:
	    """
	    Return a set of Tibetan stopwords imported from a sibling module.

	    Failures are logged through the module-level logger rather than raised,
	    so callers always receive a set.

	    Args:
	        use_lite (bool): If True, load the smaller, less aggressive list from
	            stopwords_lite_bo.py. Otherwise load the full list from
	            stopwords_bo.py.

	    Returns:
	        set: The STOPWORDS object of the selected module, or an empty set
	            if the module could not be loaded.
	    """
	    # Bind the module name *before* the try block: every log message below
	    # refers to it, including the ones emitted from the except handlers.
	    # Previously this name was never assigned, so each logging call raised
	    # NameError and the function could never return.
	    source_module_name = ".stopwords_lite_bo" if use_lite else ".stopwords_bo"
	    module_file = f"{source_module_name.lstrip('.')}.py"

	    stopwords_set = set()
	    try:
	        # Static relative imports are kept (instead of importlib) so the
	        # dependency on both sibling modules stays visible to tooling.
	        if use_lite:
	            from .stopwords_lite_bo import STOPWORDS
	        else:
	            from .stopwords_bo import STOPWORDS
	        stopwords_set = STOPWORDS

	        logger.info(f"Successfully loaded {len(stopwords_set)} stopwords from {module_file}")
	    except ImportError:
	        # Note: with `from module import NAME`, a missing NAME also surfaces
	        # as ImportError, so this branch covers a missing STOPWORDS as well.
	        logger.error(
	            f"Failed to import STOPWORDS from {module_file}. "
	            f"Ensure the file exists in the 'pipeline' directory, is a Python module (ends in .py), "
	            f"and is importable (e.g., no syntax errors)."
	        )
	    except AttributeError:
	        logger.error(
	            f"Variable 'STOPWORDS' (all caps) not found in {module_file}. "
	            f"Please ensure the stopword set is defined with this name within the module."
	        )
	    except Exception as e:
	        # Deliberate best-effort boundary: report the problem and fall back
	        # to whatever stopwords_set currently holds.
	        logger.error(f"An unexpected error occurred while loading stopwords from {module_file}: {e}")

	    return stopwords_set