import logging
from typing import List, Any, Optional

import numpy as np  # Needed for the Optional[np.ndarray] return annotation

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Model ID for the Facebook official pre-trained FastText model
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
# Custom model loading (e.g. "fasttext-tibetan") is handled directly in process.py.
def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
    """
    Loads the Facebook official pre-trained FastText model for Tibetan.

    Args:
        model_id (str): The model ID. Must be 'facebook-fasttext-pretrained'
            (DEFAULT_MODEL_NAME).

    Returns:
        Tuple[Optional[Any], Optional[str]]: A tuple containing the loaded
        FastText model and its type ("fasttext"), or (None, None) if loading
        fails or model_id is unsupported.
    """
    logger.info(
        "Attempting to load FastText model via semantic_embedding.get_model_and_device: %s",
        model_id,
    )
    if model_id == DEFAULT_MODEL_NAME:
        try:
            # Imported here to avoid circular-import issues, since
            # fasttext_embedding may also import from semantic_embedding.
            from .fasttext_embedding import load_facebook_official_tibetan_model

            model = load_facebook_official_tibetan_model()
            if model:
                logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
                try:
                    logger.info(f"Model dimensions: {model.get_dimension()}")
                    # Basic validity check via a method expected on models
                    # returned by fasttext.load_model.
                    if hasattr(model, "get_word_vector"):
                        logger.info("Model has 'get_word_vector' method (expected for the fasttext Python API).")
                except Exception as diag_e:
                    logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
                return model, "fasttext"
            else:
                # load_facebook_official_tibetan_model() returned None without raising.
                logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
                return None, None
        except Exception as e:
            logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
            return None, None
    else:
        logger.error(f"Unsupported model_id '{model_id}': only '{DEFAULT_MODEL_NAME}' is supported by this function.")
        return None, None
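
# Minimal usage sketch (an illustrative addition, not part of the original
# module): how a caller might consume get_model_and_device and handle the
# (None, None) failure case it documents. The helper name _example_load_model
# is hypothetical.
def _example_load_model() -> None:
    model, model_type = get_model_and_device()
    if model is None:
        logger.warning("FastText model unavailable; semantic similarity is disabled.")
    else:
        # get_dimension() is the same accessor used in the diagnostics above.
        logger.info("Loaded %s model (dimension %d).", model_type, model.get_dimension())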
def generate_embeddings(
    texts: List[str],
    model: Any,
    tokenize_fn=None,
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    corpus_token_freq=None,
    doc_freq_map=None,
    total_docs_in_corpus=0,
) -> Optional[np.ndarray]:
    """
    Generates FastText embeddings for a list of texts.

    Args:
        texts (List[str]): A list of texts to embed.
        model: The loaded FastText model.
        tokenize_fn: Optional tokenization function (if different from the
            default botok-based one).
        use_stopwords (bool): Whether to filter out stopwords.
        use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
        corpus_token_freq: Corpus-wide term frequencies for TF-IDF-weighted
            FastText.
        doc_freq_map: Document frequency map for TF-IDF-weighted FastText.
        total_docs_in_corpus: Total number of documents in the corpus for
            TF-IDF-weighted FastText.

    Returns:
        Optional[np.ndarray]: A numpy array containing the embeddings, or
        None if generation fails.
    """
    if not texts:
        logger.warning("No texts provided to generate_embeddings. Returning None.")
        return None

    logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
    try:
        from .fasttext_embedding import get_batch_embeddings

        stopwords_set = None
        if use_stopwords:
            if use_lite_stopwords:
                from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
                stopwords_set = TIBETAN_STOPWORDS_LITE_SET
            else:
                from .stopwords_bo import TIBETAN_STOPWORDS_SET
                stopwords_set = TIBETAN_STOPWORDS_SET

        embeddings = get_batch_embeddings(
            texts,
            model,
            tokenize_fn=tokenize_fn,
            use_stopwords=use_stopwords,
            stopwords_set=stopwords_set,
            corpus_token_freq=corpus_token_freq,
            doc_freq_map=doc_freq_map,
            total_docs_in_corpus=total_docs_in_corpus,
        )
        if embeddings is None:
            logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
            return None

        logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
        return embeddings
    except ImportError:
        logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
        return None
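
# Minimal usage sketch (an illustrative addition): embed two texts with
# generate_embeddings and score them with cosine similarity via numpy. The
# helper name _example_cosine_similarity is hypothetical; it relies only on
# this module's API and the row-per-text np.ndarray shape logged above.
def _example_cosine_similarity(text_a: str, text_b: str) -> Optional[float]:
    model, _ = get_model_and_device()
    if model is None:
        return None
    embeddings = generate_embeddings([text_a, text_b], model)
    if embeddings is None:
        return None
    vec_a, vec_b = embeddings[0], embeddings[1]
    denom = float(np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
    if denom == 0.0:
        return 0.0  # Degenerate case: at least one all-zero vector.
    return float(np.dot(vec_a, vec_b) / denom)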
def train_fasttext_model(corpus_texts: List[str], **kwargs):
    """
    Train a FastText model on the provided corpus texts.

    Args:
        corpus_texts: List of texts to use for training.
        **kwargs: Additional training parameters (dim, epoch, etc.).

    Returns:
        The trained FastText model object.
    """
    try:
        from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft

        corpus_path = prepare_corpus_file(corpus_texts)
        model = train_ft(corpus_path=corpus_path, **kwargs)
        return model
    except ImportError:
        logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
        # Re-raise to signal a critical failure when training components are missing.
        raise
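
# Minimal usage sketch (an illustrative addition): train a model on an
# in-memory corpus and then embed that corpus with it. The dim/epoch keyword
# arguments mirror the "(dim, epoch, etc.)" hint in the docstring; the exact
# kwargs accepted are defined by .fasttext_embedding.train_fasttext_model, so
# treat these particular values as assumptions.
def _example_train_and_embed(corpus_texts: List[str]) -> Optional[np.ndarray]:
    model = train_fasttext_model(corpus_texts, dim=100, epoch=5)
    if model is None:
        return None
    return generate_embeddings(corpus_texts, model)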