Spaces:

daniel-wojahn
/

ttm-webapp-hf

Sleeping

daniel-wojahn committed 5 days ago

Commit

7ed74d5

1 Parent(s): 59185b1

fixed hf model

Files changed (1) hide show

pipeline/fasttext_embedding.py CHANGED Viewed

@@ -377,18 +377,24 @@ def get_batch_embeddings(
                 logger.warning(f"Pre-tokenized list `tokenize_fn` is shorter than the list of texts. Index {i} is out of bounds for `tokenize_fn` with length {len(tokenize_fn)}. Defaulting to None for this text.")
         # If tokenize_fn is None or other, tokens_or_tokenizer_for_current_text remains None (get_text_embedding handles default).
-        embedding = get_text_embedding(
-            text_content,  # Use renamed variable
-            model,
-            tokenize_fn=tokens_or_tokenizer_for_current_text,  # Pass the correctly determined function or token list
-            use_stopwords=use_stopwords,
-            stopwords_set=stopwords_set,
-            use_tfidf_weighting=use_tfidf_weighting,
-            corpus_token_freq=corpus_token_freq,
-            doc_freq_map=doc_freq_map,
-            total_docs_in_corpus=total_docs_in_corpus
-        )
-        embeddings.append(embedding)
     return np.array(embeddings)

                 logger.warning(f"Pre-tokenized list `tokenize_fn` is shorter than the list of texts. Index {i} is out of bounds for `tokenize_fn` with length {len(tokenize_fn)}. Defaulting to None for this text.")
         # If tokenize_fn is None or other, tokens_or_tokenizer_for_current_text remains None (get_text_embedding handles default).
+        try:
+            embedding = get_text_embedding(
+                text_content,  # Use renamed variable
+                model,
+                tokenize_fn=tokens_or_tokenizer_for_current_text,  # Pass the correctly determined function or token list
+                use_stopwords=use_stopwords,
+                stopwords_set=stopwords_set,
+                use_tfidf_weighting=use_tfidf_weighting,
+                corpus_token_freq=corpus_token_freq,
+                doc_freq_map=doc_freq_map,
+                total_docs_in_corpus=total_docs_in_corpus
+            )
+            embeddings.append(embedding)
+        except Exception as e:
+            source_module_name = "fasttext_embedding.py"
+            logger.error(f"Error generating FastText embeddings in {source_module_name}: {e}", exc_info=True)
+            # Append a zero vector or handle as per desired error strategy
+            embeddings.append(np.zeros(model.get_dimension()))
     return np.array(embeddings)