import logging
from typing import List, Any, Optional

import numpy as np  # Used for the Optional[np.ndarray] return type hint

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Define the model ID for the Facebook FastText pretrained model
DEFAULT_MODEL_NAME = "facebook-fasttext-pretrained"
# FASTTEXT_MODEL_ID = "fasttext-tibetan"  # Removed: custom model loading is handled directly in process.py


def get_model_and_device(model_id: str = DEFAULT_MODEL_NAME):
    """
    Loads the Facebook official pre-trained FastText model for Tibetan.

    Args:
        model_id (str): The model ID. Must be 'facebook-fasttext-pretrained' (DEFAULT_MODEL_NAME).

    Returns:
        Tuple[Optional[Any], Optional[str]]:
            A tuple containing the loaded FastText model and its type ("fasttext"),
            or (None, None) if loading fails or model_id is unsupported.
    """
    logger.info("Attempting to load FastText model via semantic_embedding.get_model_and_device: %s", model_id)
    if model_id == DEFAULT_MODEL_NAME:  # DEFAULT_MODEL_NAME is "facebook-fasttext-pretrained"
        try:
            # Imported here to minimize issues if fasttext_embedding also imports from semantic_embedding
            from .fasttext_embedding import load_facebook_official_tibetan_model

            model = load_facebook_official_tibetan_model()
            if model:
                logger.info(f"FastText model object received in get_model_and_device. Type: {type(model)}.")
                try:
                    logger.info(f"Model dimensions: {model.get_dimension()}")
                    # Basic check for model validity via an expected attribute/method
                    if hasattr(model, "get_word_vector"):
                        logger.info("Model has 'get_word_vector' method (Python API expected for fasttext.load_model results).")
                except Exception as diag_e:
                    logger.error(f"Error during diagnostic check of FastText model '{model_id}': {diag_e}", exc_info=True)
                return model, "fasttext"
            else:
                # This branch means load_facebook_official_tibetan_model() returned None without raising an error.
                logger.error(f"Model loading for '{model_id}' via load_facebook_official_tibetan_model() returned None unexpectedly.")
                return None, None
        except Exception as e:
            logger.error(f"Failed to load or initialize FastText model '{model_id}': {e}. Semantic similarity will not be available.", exc_info=True)
            return None, None
    else:
        logger.error(f"Unsupported model_id for get_model_and_device in semantic_embedding.py: '{model_id}'. Only '{DEFAULT_MODEL_NAME}' is supported by this function.")
        return None, None
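

# Illustrative usage sketch (kept as a comment so importing this module has no
# side effects). Assumes the pretrained Tibetan vectors are available to
# load_facebook_official_tibetan_model():
#
#     model, model_type = get_model_and_device()
#     if model is not None:
#         assert model_type == "fasttext"
#         print(model.get_dimension())           # embedding dimensionality
#         vec = model.get_word_vector("བོད")      # vector for a single token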


def generate_embeddings(
    texts: List[str],
    model: Any,
    tokenize_fn=None,
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    corpus_token_freq=None,
    doc_freq_map=None,
    total_docs_in_corpus=0,
) -> Optional[np.ndarray]:
    """
    Generates FastText embeddings for a list of texts.

    Args:
        texts (List[str]): A list of texts to embed.
        model: The loaded FastText model.
        tokenize_fn: Optional tokenization function for FastText (if different from the default botok-based one).
        use_stopwords (bool): Whether to filter out stopwords for FastText embeddings.
        use_lite_stopwords (bool): Whether to use the 'lite' stopwords list.
        corpus_token_freq: Corpus-wide term frequencies for TF-IDF-weighted FastText.
        doc_freq_map: Document frequency map for TF-IDF-weighted FastText.
        total_docs_in_corpus: Total number of documents in the corpus for TF-IDF-weighted FastText.

    Returns:
        Optional[np.ndarray]: A numpy array containing the embeddings. Returns None if generation fails.
    """
    if not texts:
        logger.warning("No texts provided to generate_embeddings. Returning None.")
        return None

    logger.info(f"Generating FastText embeddings for {len(texts)} texts...")
    try:
        from .fasttext_embedding import get_batch_embeddings

        # Select the stopword list to filter with, if stopword filtering is enabled.
        stopwords_set = None
        if use_stopwords:
            if use_lite_stopwords:
                from .stopwords_lite_bo import TIBETAN_STOPWORDS_LITE_SET
                stopwords_set = TIBETAN_STOPWORDS_LITE_SET
            else:
                from .stopwords_bo import TIBETAN_STOPWORDS_SET
                stopwords_set = TIBETAN_STOPWORDS_SET

        embeddings = get_batch_embeddings(
            texts,
            model,
            tokenize_fn=tokenize_fn,
            use_stopwords=use_stopwords,
            stopwords_set=stopwords_set,
            corpus_token_freq=corpus_token_freq,
            doc_freq_map=doc_freq_map,
            total_docs_in_corpus=total_docs_in_corpus,
        )
        if embeddings is None:
            logger.error(f"get_batch_embeddings returned None for {len(texts)} texts. First few: {texts[:2]}")
            return None

        logger.info("FastText embeddings generated with shape: %s", str(embeddings.shape))
        return embeddings
    except ImportError:
        logger.error("Required FastText modules not found. Please ensure 'fasttext' and its dependencies are correctly installed.")
        return None
    except Exception as e:
        logger.error(f"An unexpected error occurred during FastText embedding generation: {e}", exc_info=True)
        return None
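

# Illustrative usage sketch (as a comment, so nothing runs at import time). The
# TF-IDF-related arguments are optional; per the docstring above, supplying
# corpus statistics enables TF-IDF-weighted FastText inside get_batch_embeddings:
#
#     texts = ["བཀྲ་ཤིས་བདེ་ལེགས།", "བོད་སྐད་ཡིག"]
#     embeddings = generate_embeddings(texts, model, use_stopwords=True)
#     if embeddings is not None:
#         print(embeddings.shape)  # expected: (len(texts), model.get_dimension())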


def train_fasttext_model(corpus_texts: List[str], **kwargs):
    """
    Train a FastText model on the provided corpus texts.

    Args:
        corpus_texts: List of texts to use for training.
        **kwargs: Additional training parameters (dim, epoch, etc.).

    Returns:
        The trained FastText model object. Note that only the model is returned,
        not the path to a model file.
    """
    try:
        from .fasttext_embedding import prepare_corpus_file, train_fasttext_model as train_ft

        # Write the corpus to a file in the format fasttext expects, then train on it.
        corpus_path = prepare_corpus_file(corpus_texts)
        model = train_ft(corpus_path=corpus_path, **kwargs)
        return model
    except ImportError:
        logger.error("FastText module not found. Please install it with 'pip install fasttext'.")
        raise  # Re-raise to signal a critical failure when training components are missing.
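

if __name__ == "__main__":
    # Minimal smoke test (illustrative): load the pretrained model and embed two
    # short Tibetan strings. Assumes the model files used by
    # load_facebook_official_tibetan_model() are present locally.
    demo_model, demo_model_type = get_model_and_device()
    if demo_model is not None:
        demo_embeddings = generate_embeddings(
            ["བཀྲ་ཤིས་བདེ་ལེགས།", "བོད་སྐད་ཡིག"], demo_model
        )
        if demo_embeddings is not None:
            logger.info("Smoke test embeddings shape: %s", str(demo_embeddings.shape))
    else:
        logger.error("Smoke test could not load the FastText model.")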