# ttm-webapp-hf/pipeline/stopwords_bo.py
# -*- coding: utf-8 -*-
"""Module for Tibetan stopwords.
This file centralizes the Tibetan stopwords list used in the Tibetan Text Metrics application.
Sources for these stopwords are acknowledged in the main README.md of the webapp.
"""
# Initial set of stopwords with clear categories
PARTICLES_INITIAL = [
"ཏུ", "གི", "ཀྱི", "གིས", "ཀྱིས", "ཡིས", "ཀྱང", "སྟེ", "ཏེ", "ནོ", "ཏོ",
"ཅིང", "ཅིག", "ཅེས", "ཞེས", "གྱིས", "ན",
]
MARKERS_AND_PUNCTUATION = ["༈", "།", "༎", "༑"]
ORDINAL_NUMBERS = [
"དང་པོ", "གཉིས་པ", "གསུམ་པ", "བཞི་པ", "ལྔ་པ", "དྲུག་པ", "བདུན་པ", "བརྒྱད་པ", "དགུ་པ", "བཅུ་པ",
"བཅུ་གཅིག་པ", "བཅུ་གཉིས་པ", "བཅུ་གསུམ་པ", "བཅུ་བཞི་པ", "བཅོ་ལྔ་པ",
"བཅུ་དྲུག་པ", "བཅུ་བདུན་པ", "བཅོ་བརྒྱད་པ", "བཅུ་དགུ་པ", "ཉི་ཤུ་པ",
]
# Additional stopwords from the comprehensive list, categorized for readability
MORE_PARTICLES_SUFFIXES = [
"འི་", "དུ་", "གིས་", "ཏེ", "གི་", "ཡི་", "ཀྱི་", "པས་", "ཀྱིས་", "ཡི", "ལ", "ནི་", "ར", "དུ",
"ལས", "གྱིས་", "ས", "ཏེ་", "གྱི་", "དེ", "ཀ་", "སྟེ", "སྟེ་", "ངམ", "ཏོ", "དོ", "དམ་",
"གྱིན་", "ན", "འམ་", "ཀྱིན་", "ལོ", "ཀྱིས", "བས་", "ཤིག", "གིས", "ཀི་", "ཡིས་", "གྱི", "གི",
"བམ་", "ཤིག་", "ནམ", "མིན་", "ནམ་", "ངམ་", "རུ་", "ཤས་", "ཏུ", "ཡིས", "གིན་", "གམ་",
"གྱིས", "ཅང་", "སམ་", "ཞིག", "འང", "རུ", "དང", "ཡ", "འག", "སམ", "ཀ", "འམ", "མམ",
"དམ", "ཀྱི", "ལམ", "ནོ་", "སོ་", "རམ་", "བོ་", "ཨང་", "ཕྱི", "ཏོ་", "གེ", "གོ", "རོ་", "བོ",
"པས", "འི", "རམ", "བས", "མཾ་", "པོ", "ག་", "ག", "གམ", "བམ", "མོ་", "མམ་", "ཏམ་", "ངོ",
"ཏམ", "གིང་", "ཀྱང" # ཀྱང also in PARTICLES_INITIAL, set() will handle duplicates
]
PRONOUNS_DEMONSTRATIVES = ["འདི", "གཞན་", "དེ་", "རང་", "སུ་"]
VERBS_AUXILIARIES = ["ཡིན་", "མི་", "ལགས་པ", "ཡིན་པ", "ལགས་", "མིན་", "ཡིན་པ་", "མིན", "ཡིན་བ", "ཡིན་ལུགས་"]
ADVERBS_QUALIFIERS_INTENSIFIERS = [
"སོགས་", "ཙམ་", "ཡང་", "ཉིད་", "ཞིང་", "རུང་", "ན་རེ", "འང་", "ཁོ་ན་", "འཕྲལ་", "བར་",
"ཅུང་ཟད་", "ཙམ་པ་", "ཤ་སྟག་"
]
QUANTIFIERS_DETERMINERS_COLLECTIVES = [
"རྣམས་", "ཀུན་", "སྙེད་", "བཅས་", "ཡོངས་", "མཐའ་དག་", "དག་", "ཚུ", "ཚང་མ", "ཐམས་ཅད་",
"ཅིག་", "སྣ་ཚོགས་", "སྙེད་པ", "རེ་རེ་", "འགའ་", "སྤྱི", "དུ་མ", "མ", "ཁོ་ན", "ཚོ", "ལ་ལ་",
"སྙེད་པ་", "འབའ་", "སྙེད", "གྲང་", "ཁ་", "ངེ་", "ཅོག་", "རིལ་", "ཉུང་ཤས་", "ཚ་"
]
CONNECTORS_CONJUNCTIONS = ["དང་", "ཅིང་", "ཤིང་"]
INTERJECTIONS_EXCLAMATIONS = ["ཨེ་", "འོ་"]
# Combine all categorized lists
_ALL_STOPWORDS_CATEGORIZED = (
    PARTICLES_INITIAL +
    MARKERS_AND_PUNCTUATION +
    ORDINAL_NUMBERS +
    MORE_PARTICLES_SUFFIXES +
    PRONOUNS_DEMONSTRATIVES +
    VERBS_AUXILIARIES +
    ADVERBS_QUALIFIERS_INTENSIFIERS +
    QUANTIFIERS_DETERMINERS_COLLECTIVES +
    CONNECTORS_CONJUNCTIONS +
    INTERJECTIONS_EXCLAMATIONS
)
# Final flat list of unique stopwords for TfidfVectorizer (as a list)
TIBETAN_STOPWORDS = list(set(_ALL_STOPWORDS_CATEGORIZED))
# Final set of unique stopwords for efficient Jaccard/LCS filtering (as a set)
TIBETAN_STOPWORDS_SET = set(TIBETAN_STOPWORDS)
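

if __name__ == "__main__":
    # Minimal usage sketch, not part of the pipeline itself. Assumptions: the
    # documents have already been word-segmented (e.g. with a Tibetan tokenizer
    # such as botok) and the tokens re-joined with spaces, so `str.split` can
    # recover them; `sample_docs` below is a hypothetical stand-in for real
    # corpus texts.
    from sklearn.feature_extraction.text import TfidfVectorizer

    sample_docs = [
        "བླ་མ དང་ སངས་རྒྱས ལ ཕྱག་འཚལ",
        "སངས་རྒྱས ཀྱི བསྟན་པ ནི ཟབ་མོ ཡིན",
    ]

    # TfidfVectorizer takes the stopwords as a list; token_pattern=None silences
    # the warning about passing both a tokenizer and the default token pattern.
    vectorizer = TfidfVectorizer(
        tokenizer=str.split, token_pattern=None, stop_words=TIBETAN_STOPWORDS
    )
    tfidf = vectorizer.fit_transform(sample_docs)
    print("TF-IDF matrix shape:", tfidf.shape)

    # For Jaccard/LCS-style comparisons, the set gives an O(1)-per-token
    # membership test when filtering stopwords out of token lists.
    tokens_a = sample_docs[0].split()
    tokens_b = sample_docs[1].split()
    content_a = {t for t in tokens_a if t not in TIBETAN_STOPWORDS_SET}
    content_b = {t for t in tokens_b if t not in TIBETAN_STOPWORDS_SET}
    jaccard = len(content_a & content_b) / len(content_a | content_b)
    print(f"Jaccard similarity (stopwords removed): {jaccard:.3f}")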