Spaces:
Sleeping
Sleeping
Commit
·
7ed74d5
1
Parent(s):
59185b1
fixed hf model
Browse files
- pipeline/fasttext_embedding.py +18 -12
pipeline/fasttext_embedding.py
CHANGED
@@ -377,18 +377,24 @@ def get_batch_embeddings(
|
|
377 |
logger.warning(f"Pre-tokenized list `tokenize_fn` is shorter than the list of texts. Index {i} is out of bounds for `tokenize_fn` with length {len(tokenize_fn)}. Defaulting to None for this text.")
|
378 |
# If tokenize_fn is None or other, tokens_or_tokenizer_for_current_text remains None (get_text_embedding handles default).
|
379 |
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
392 |
|
393 |
return np.array(embeddings)
|
394 |
|
|
|
377 |
logger.warning(f"Pre-tokenized list `tokenize_fn` is shorter than the list of texts. Index {i} is out of bounds for `tokenize_fn` with length {len(tokenize_fn)}. Defaulting to None for this text.")
|
378 |
# If tokenize_fn is None or other, tokens_or_tokenizer_for_current_text remains None (get_text_embedding handles default).
|
379 |
|
380 |
+
try:
|
381 |
+
embedding = get_text_embedding(
|
382 |
+
text_content, # Use renamed variable
|
383 |
+
model,
|
384 |
+
tokenize_fn=tokens_or_tokenizer_for_current_text, # Pass the correctly determined function or token list
|
385 |
+
use_stopwords=use_stopwords,
|
386 |
+
stopwords_set=stopwords_set,
|
387 |
+
use_tfidf_weighting=use_tfidf_weighting,
|
388 |
+
corpus_token_freq=corpus_token_freq,
|
389 |
+
doc_freq_map=doc_freq_map,
|
390 |
+
total_docs_in_corpus=total_docs_in_corpus
|
391 |
+
)
|
392 |
+
embeddings.append(embedding)
|
393 |
+
except Exception as e:
|
394 |
+
source_module_name = "fasttext_embedding.py"
|
395 |
+
logger.error(f"Error generating FastText embeddings in {source_module_name}: {e}", exc_info=True)
|
396 |
+
# Append a zero vector or handle as per desired error strategy
|
397 |
+
embeddings.append(np.zeros(model.get_dimension()))
|
398 |
|
399 |
return np.array(embeddings)
|
400 |
|