daniel-wojahn committed on
Commit 7ed74d5 · 1 Parent(s): 59185b1

fixed hf model

Files changed (1)
  1. pipeline/fasttext_embedding.py +18 -12
pipeline/fasttext_embedding.py CHANGED
@@ -377,18 +377,24 @@ def get_batch_embeddings(
             logger.warning(f"Pre-tokenized list `tokenize_fn` is shorter than the list of texts. Index {i} is out of bounds for `tokenize_fn` with length {len(tokenize_fn)}. Defaulting to None for this text.")
         # If tokenize_fn is None or other, tokens_or_tokenizer_for_current_text remains None (get_text_embedding handles default).
 
-        embedding = get_text_embedding(
-            text_content,  # Use renamed variable
-            model,
-            tokenize_fn=tokens_or_tokenizer_for_current_text,  # Pass the correctly determined function or token list
-            use_stopwords=use_stopwords,
-            stopwords_set=stopwords_set,
-            use_tfidf_weighting=use_tfidf_weighting,
-            corpus_token_freq=corpus_token_freq,
-            doc_freq_map=doc_freq_map,
-            total_docs_in_corpus=total_docs_in_corpus
-        )
-        embeddings.append(embedding)
+        try:
+            embedding = get_text_embedding(
+                text_content,  # Use renamed variable
+                model,
+                tokenize_fn=tokens_or_tokenizer_for_current_text,  # Pass the correctly determined function or token list
+                use_stopwords=use_stopwords,
+                stopwords_set=stopwords_set,
+                use_tfidf_weighting=use_tfidf_weighting,
+                corpus_token_freq=corpus_token_freq,
+                doc_freq_map=doc_freq_map,
+                total_docs_in_corpus=total_docs_in_corpus
+            )
+            embeddings.append(embedding)
+        except Exception as e:
+            source_module_name = "fasttext_embedding.py"
+            logger.error(f"Error generating FastText embeddings in {source_module_name}: {e}", exc_info=True)
+            # Append a zero vector or handle as per desired error strategy
+            embeddings.append(np.zeros(model.get_dimension()))
 
     return np.array(embeddings)
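
The new try/except makes a single failing text non-fatal: the batch still returns an (n_texts, dim) array, with a zero row marking each failure. Below is a minimal, self-contained sketch of the same pattern; `embed_batch` and `toy_embed` are hypothetical names for illustration and are not part of this repository, which passes a fasttext model rather than a bare embedding callable.

import logging

import numpy as np

logger = logging.getLogger(__name__)


def embed_batch(texts, embed_fn, dim):
    """Embed each text; substitute a zero vector when embed_fn raises.

    Mirrors the per-text try/except added in this commit: one bad input
    no longer aborts the whole batch.
    """
    rows = []
    for text in texts:
        try:
            rows.append(np.asarray(embed_fn(text), dtype=np.float32))
        except Exception as exc:  # broad catch, as in the patch
            logger.error("Embedding failed for %r: %s", text, exc, exc_info=True)
            rows.append(np.zeros(dim, dtype=np.float32))  # sentinel row
    return np.vstack(rows)


if __name__ == "__main__":
    def toy_embed(text):
        # Stand-in for get_text_embedding; fails on empty input.
        if not text:
            raise ValueError("empty text")
        return [float(len(text)), 1.0, 0.0]

    embs = embed_batch(["hello", "", "world"], toy_embed, dim=3)
    ok = ~np.all(embs == 0.0, axis=1)  # mask out the sentinel rows
    print(embs[ok])

One design consequence of the zero-vector sentinel: a zero row has zero norm, so any downstream cosine-similarity step must guard against division by zero or mask the sentinel rows out first, as in the example above.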