Commit b44d470
Parent(s): b2ce320

bugs fixed

Files changed:
- app.py +27 -10
- pipeline/fast_lcs.pyx +29 -7
- pipeline/fasttext_embedding.py +54 -85
- pipeline/hf_embedding.py +21 -6
- pipeline/metrics.py +36 -20
- pipeline/process.py +10 -3
app.py CHANGED

@@ -5,6 +5,7 @@ from pipeline.visualize import generate_visualizations, generate_word_count_char
 from pipeline.llm_service import get_interpretation
 import logging
 import pandas as pd
+from datetime import datetime
 
 from dotenv import load_dotenv
 
@@ -14,8 +15,6 @@ load_dotenv()
 from theme import tibetan_theme
 
 logger = logging.getLogger(__name__)
-
-# Main interface logic
 def main_interface():
     with gr.Blocks(
         theme=tibetan_theme,
@@ -24,8 +23,9 @@ def main_interface():
     ) as demo:
         gr.Markdown(
             """# Tibetan Text Metrics Web App
-<span style='font-size:18px;'>A user-friendly web application for analyzing textual similarities and variations in Tibetan manuscripts, providing a graphical interface to the core functionalities of the [Tibetan Text Metrics (TTM)](https://github.com/daniel-wojahn/tibetan-text-metrics) project. Powered by
+<span style='font-size:18px;'>A user-friendly web application for analyzing textual similarities and variations in Tibetan manuscripts, providing a graphical interface to the core functionalities of the [Tibetan Text Metrics (TTM)](https://github.com/daniel-wojahn/tibetan-text-metrics) project. Powered by advanced language models via OpenRouter for in-depth text analysis.</span>
             """,
+
             elem_classes="gr-markdown",
         )
 
@@ -75,6 +75,21 @@ def main_interface():
                 info="Select the embedding model to use for semantic similarity analysis."
             )
 
+            with gr.Accordion("Advanced Options", open=False):
+                batch_size_slider = gr.Slider(
+                    minimum=1,
+                    maximum=64,
+                    value=8,
+                    step=1,
+                    label="Batch Size (for Hugging Face models)",
+                    info="Adjust based on your hardware (VRAM). Lower this if you encounter memory issues."
+                )
+                progress_bar_checkbox = gr.Checkbox(
+                    label="Show Embedding Progress Bar",
+                    value=False,
+                    info="Display a progress bar during embedding generation. Useful for large datasets."
+                )
+
             stopwords_dropdown = gr.Dropdown(
                 label="Stopword Filtering",
                 choices=[
@@ -258,7 +273,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
         warning_box = gr.Markdown(visible=False)
 
-        def run_pipeline(files, enable_semantic, model_name, stopwords_option
+        def run_pipeline(files, enable_semantic, model_name, stopwords_option, batch_size, show_progress, progress=gr.Progress()):
            """Run the text analysis pipeline on the uploaded files.
 
            Args:
@@ -389,12 +404,15 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
                internal_model_id = "facebook-fasttext-pretrained"
 
            df_results, word_counts_df_data, warning_raw = process_texts(
-                text_data,
-
-
+                text_data,
+                filenames,
+                enable_semantic=enable_semantic_bool,
+                model_name=internal_model_id,
                use_stopwords=use_stopwords,
                use_lite_stopwords=use_lite_stopwords,
-                progress_callback=progress_tracker
+                progress_callback=progress_tracker,
+                batch_size=batch_size,
+                show_progress_bar=show_progress
            )
 
            if df_results.empty:
@@ -493,7 +511,6 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
            progress(1.0, desc="Analysis complete!")
 
            # Add a timestamp to the interpretation
-            from datetime import datetime
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
            interpretation = f"{interpretation}\n\n<small>Analysis generated on {timestamp}</small>"
            return interpretation
@@ -503,7 +520,7 @@ Each segment is represented as a vector of these TF-IDF scores, and the cosine s
 
        process_btn.click(
            fn=run_pipeline,
-            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown],
+            inputs=[file_input, semantic_toggle_radio, model_dropdown, stopwords_dropdown, batch_size_slider, progress_bar_checkbox],
            outputs=[
                csv_output,
                metrics_preview,
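The UI change above is a standard Gradio pattern: components declared inside an Accordion are handed to the click handler via inputs=[...]. The following is a minimal, self-contained sketch of that pattern only; the handler body is a placeholder, not the app's real run_pipeline:

# Sketch of the Gradio wiring pattern added in app.py above (placeholder handler).
import gradio as gr

def run_pipeline(enable_semantic, batch_size, show_progress, progress=gr.Progress()):
    # The real handler calls process_texts(); here we just echo the inputs.
    progress(1.0, desc="Done")
    return f"semantic={enable_semantic}, batch_size={int(batch_size)}, progress_bar={show_progress}"

with gr.Blocks() as demo:
    semantic_toggle = gr.Checkbox(label="Enable semantic similarity", value=True)
    with gr.Accordion("Advanced Options", open=False):
        batch_size_slider = gr.Slider(
            minimum=1, maximum=64, value=8, step=1,
            label="Batch Size (for Hugging Face models)",
        )
        progress_bar_checkbox = gr.Checkbox(label="Show Embedding Progress Bar", value=False)
    run_btn = gr.Button("Run")
    result_box = gr.Textbox(label="Result")
    run_btn.click(
        fn=run_pipeline,
        inputs=[semantic_toggle, batch_size_slider, progress_bar_checkbox],
        outputs=[result_box],
    )

if __name__ == "__main__":
    demo.launch()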
pipeline/fast_lcs.pyx CHANGED

@@ -1,4 +1,3 @@
-# fast_lcs.pyx
 import numpy as np
 
 cimport cython
@@ -8,16 +7,39 @@ cimport numpy as np
 @cython.boundscheck(False)
 @cython.wraparound(False)
 def compute_lcs_fast(list words1, list words2):
+    """
+    Computes the Longest Common Subsequence (LCS) of two lists of words.
+
+    This implementation is memory-optimized and uses O(min(m, n)) space, where
+    m and n are the lengths of the word lists.
+
+    Args:
+        words1 (list): The first list of words.
+        words2 (list): The second list of words.
+
+    Returns:
+        int: The length of the Longest Common Subsequence.
+    """
     cdef int m = len(words1)
     cdef int n = len(words2)
-
+
+    # Ensure words2 is the shorter sequence to optimize memory usage
+    if m < n:
+        return compute_lcs_fast(words2, words1)
+
+    # We only need two rows for the DP table
+    cdef np.ndarray[np.int32_t, ndim=1] prev_row = np.zeros(n + 1, dtype=np.int32)
+    cdef np.ndarray[np.int32_t, ndim=1] curr_row = np.zeros(n + 1, dtype=np.int32)
     cdef int i, j
-
+
     for i in range(1, m + 1):
         for j in range(1, n + 1):
             if words1[i - 1] == words2[j - 1]:
-
+                curr_row[j] = prev_row[j - 1] + 1
             else:
-
-
-
+                curr_row[j] = max(prev_row[j], curr_row[j - 1])
+
+        # Copy current row to previous row for the next iteration
+        prev_row = curr_row.copy()
+
+    return int(prev_row[n])
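The heart of this change is the classic two-row dynamic-programming formulation of LCS length, which needs O(min(m, n)) memory instead of a full m-by-n table. Below is a pure-Python illustration of the same recurrence; the committed version is compiled Cython over typed NumPy buffers, and this sketch additionally swaps the two row buffers instead of copying:

# Pure-Python sketch of the two-row LCS length computation (illustrative only).
def lcs_length(words1, words2):
    # Keep the second sequence as the shorter one so the rows stay small.
    if len(words1) < len(words2):
        words1, words2 = words2, words1
    m, n = len(words1), len(words2)
    prev_row = [0] * (n + 1)
    curr_row = [0] * (n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if words1[i - 1] == words2[j - 1]:
                curr_row[j] = prev_row[j - 1] + 1
            else:
                curr_row[j] = max(prev_row[j], curr_row[j - 1])
        # Reuse the buffers rather than allocating or copying a new row.
        prev_row, curr_row = curr_row, prev_row
    return prev_row[n]

# Classic textbook example: LCS("ABCBDAB", "BDCABA") has length 4 (e.g. "BCBA").
assert lcs_length(list("ABCBDAB"), list("BDCABA")) == 4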
pipeline/fasttext_embedding.py CHANGED

@@ -66,13 +66,13 @@ def train_fasttext_model(
     Args:
         corpus_path: Path to the corpus file
         model_path: Path where to save the trained model
-        dim: Embedding dimension (default:
-        epoch: Number of training epochs (default:
-        min_count: Minimum count of words (default:
+        dim: Embedding dimension (default: 100)
+        epoch: Number of training epochs (default: 5)
+        min_count: Minimum count of words (default: 5)
         window: Size of context window (default: 5)
         minn: Minimum length of char n-gram (default: 3)
         maxn: Maximum length of char n-gram (default: 6)
-        neg: Number of negatives in negative sampling (default:
+        neg: Number of negatives in negative sampling (default: 5)
         model_type: FastText model type ('skipgram' or 'cbow')
 
     Returns:
@@ -83,56 +83,48 @@ def train_fasttext_model(
     logger.info("Training FastText model with %s, dim=%d, epoch=%d, window=%d, minn=%d, maxn=%d...",
                 model_type, dim, epoch, window, minn, maxn)
 
-
-
+    processed_corpus_path = corpus_path + ".processed"
+    corpus_to_train = corpus_path
+    model = None
+
     try:
-
-
-
-
-
-
-
-
-
-
+        # Preprocess the corpus to a temporary file
+        with open(corpus_path, 'r', encoding='utf-8') as f_in, open(processed_corpus_path, 'w', encoding='utf-8') as f_out:
+            content = f_in.read()
+            processed_content = content.replace('་', '་ ')
+            f_out.write(processed_content)
+        logger.info("Corpus preprocessed to temporary file for Tibetan syllable segmentation.")
+        corpus_to_train = processed_corpus_path
+
+        # Train the model with optimized parameters
+        if model_type == "skipgram":
+            model = fasttext.train_unsupervised(
+                corpus_to_train,
+                model="skipgram",
+                dim=dim, epoch=epoch, minCount=min_count, wordNgrams=1,
+                minn=minn, maxn=maxn, neg=neg, window=window
+            )
+        else: # cbow
+            model = fasttext.train_unsupervised(
+                corpus_to_train,
+                model="cbow",
+                dim=dim, epoch=epoch, minCount=min_count, wordNgrams=1,
+                minn=minn, maxn=maxn, neg=neg, window=window
+            )
 
-
+        model.save_model(model_path)
+        logger.info("FastText model trained and saved to %s", model_path)
+
     except Exception as e:
-        logger.
-
-
-
-
-
-
-
-
-            minCount=min_count,
-            wordNgrams=1,
-            minn=minn,
-            maxn=maxn,
-            neg=neg,
-            window=window
-        )
-    else: # cbow
-        model = fasttext.train_unsupervised(
-            corpus_path,
-            model="cbow",
-            dim=dim,
-            epoch=epoch,
-            minCount=min_count,
-            wordNgrams=1,
-            minn=minn,
-            maxn=maxn,
-            neg=neg,
-            window=window
-        )
-
-    # Save the model
-    model.save_model(model_path)
-    logger.info("FastText model trained and saved to %s", model_path)
-
+        logger.error(f"An error occurred during model training: {e}", exc_info=True)
+        # Re-raise the exception after logging and cleanup
+        raise
+    finally:
+        # Clean up the temporary processed file
+        if os.path.exists(processed_corpus_path):
+            os.remove(processed_corpus_path)
+            logger.info(f"Cleaned up temporary file: {processed_corpus_path}")
+
     return model
 
 
@@ -200,6 +192,17 @@ def load_fasttext_model(model_path: str = DEFAULT_MODEL_PATH) -> Optional[fastte
         return None
 
 
+def _remove_stopwords_from_tokens(tokens: List[str], stopwords_set: Set[str]) -> List[str]:
+    """
+    Removes stopwords from a list of tokens using a list comprehension for efficiency.
+    Handles Tibetan punctuation by checking both the token itself and the token after
+    stripping trailing '།' or '༔'.
+    """
+    if not stopwords_set:
+        return tokens
+    return [token for token in tokens if token not in stopwords_set and token.rstrip('།༔') not in stopwords_set]
+
+
 def get_text_embedding(
     text: str,
     model: fasttext.FastText._FastText,
@@ -248,40 +251,6 @@ def get_text_embedding(
     if use_stopwords and stopwords_set:
         logger.debug(f"Original tokens before stopword check (first 20): {tokens[:20]}")
         original_token_count = len(tokens)
-
-        def _remove_stopwords_from_tokens(tokens: List[str], stopwords_set: Set[str]) -> List[str]:
-            """
-            Removes stopwords from a list of tokens.
-            Handles Tibetan punctuation by checking both the token itself and the token after
-            stripping trailing '།' or '༔'.
-            """
-            cleaned_tokens = []
-            removed_count = 0
-            for token in tokens:
-                # 1. Check if the original token itself is a stopword (e.g., standalone '།')
-                if token in stopwords_set:
-                    removed_count += 1
-                    continue  # Skip this token
-
-                # 2. If not a direct stopword, check if it becomes one after stripping trailing punctuation
-                # This handles cases like "གྲུབ་པའི་།" where "གྲུབ་པའི་" is the stopword.
-                token_for_check = token
-                punctuation_was_stripped = False
-                if token.endswith(('།', '༔')):
-                    stripped_token = token.rstrip('།༔')
-                    if stripped_token != token:  # Check if stripping actually changed the token
-                        token_for_check = stripped_token
-                        punctuation_was_stripped = True
-
-                if punctuation_was_stripped and token_for_check in stopwords_set:
-                    removed_count += 1
-                    continue  # Skip this token
-
-                # 3. If neither the original token nor its base form is a stopword, keep it.
-                cleaned_tokens.append(token)
-
-            return cleaned_tokens
-
         tokens = _remove_stopwords_from_tokens(tokens, stopwords_set)
         removed_count = original_token_count - len(tokens)
         logger.debug(f"Tokens after stopword removal (removed {removed_count}): {tokens[:20]}")
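Two details of this rewrite are easy to miss: the training corpus is pre-segmented by inserting a space after every tsek ('་') so whitespace tokenization yields syllables, and the new module-level stopword filter also drops tokens that only differ from a stopword by a trailing '།' or '༔'. A standalone illustration of both behaviours follows; the stopword set here is a made-up example, not the project's real list:

# Standalone sketch of the two text-handling steps from the diff above.
def segment_tibetan_syllables(text: str) -> str:
    """Insert a space after each tsek so whitespace tokenization yields syllables."""
    return text.replace('་', '་ ')

def remove_stopwords(tokens, stopwords_set):
    """Drop tokens that are stopwords as-is or after stripping trailing '།'/'༔'."""
    if not stopwords_set:
        return tokens
    return [t for t in tokens
            if t not in stopwords_set and t.rstrip('།༔') not in stopwords_set]

sample = "བཀྲ་ཤིས་བདེ་ལེགས།"
print(segment_tibetan_syllables(sample))                      # 'བཀྲ་ ཤིས་ བདེ་ ལེགས།'
print(remove_stopwords(["ནི", "བདེ་ལེགས", "ནི།"], {"ནི"}))      # ['བདེ་ལེགས']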
pipeline/hf_embedding.py CHANGED

@@ -33,26 +33,41 @@ def get_model(model_id: str) -> Tuple[Optional[SentenceTransformer], Optional[st
         logger.error(f"Failed to load SentenceTransformer model '{model_id}': {e}", exc_info=True)
         return None, None
 
-def generate_embeddings(
+def generate_embeddings(
+    texts: List[str],
+    model: SentenceTransformer,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
+) -> np.ndarray:
     """
     Generates embeddings for a list of texts using a SentenceTransformer model.
 
     Args:
         texts (list[str]): A list of texts to embed.
         model (SentenceTransformer): The loaded SentenceTransformer model.
+        batch_size (int): The batch size for encoding.
+        show_progress_bar (bool): Whether to display a progress bar.
 
     Returns:
-
+        np.ndarray: A numpy array containing the embeddings. Returns an empty array of the correct shape on failure.
     """
     if not texts or not isinstance(model, SentenceTransformer):
-        logger.warning("Invalid input for generating embeddings.
-
+        logger.warning("Invalid input for generating embeddings. Returning empty array.")
+        # Return a correctly shaped empty array
+        embedding_dim = model.get_sentence_embedding_dimension() if isinstance(model, SentenceTransformer) else 768  # Fallback
+        return np.zeros((len(texts), embedding_dim))
 
     logger.info(f"Generating embeddings for {len(texts)} texts with {type(model).__name__}...")
     try:
-        embeddings = model.encode(
+        embeddings = model.encode(
+            texts,
+            batch_size=batch_size,
+            convert_to_numpy=True,
+            show_progress_bar=show_progress_bar
+        )
         logger.info(f"Embeddings generated with shape: {embeddings.shape}")
         return embeddings
     except Exception as e:
         logger.error(f"An unexpected error occurred during embedding generation: {e}", exc_info=True)
-
+        embedding_dim = model.get_sentence_embedding_dimension()
+        return np.zeros((len(texts), embedding_dim))

pipeline/metrics.py CHANGED

@@ -55,16 +55,18 @@ def compute_normalized_lcs(words1: List[str], words2: List[str]) -> float:
 def compute_semantic_similarity(
     text1_segment: str,
     text2_segment: str,
-    tokens1: List[str],
-    tokens2: List[str],
-    model,
-    model_type: str = "fasttext",
+    tokens1: List[str],
+    tokens2: List[str],
+    model,
+    model_type: str = "fasttext",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
     fasttext_tokenize_fn=None,
     term_freq_corpus=None,
     doc_freq_map=None,
-    total_docs_in_corpus=0
+    total_docs_in_corpus=0,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> float:
     """Computes semantic similarity using a FastText model."""
     if model_type != "fasttext":
@@ -92,7 +94,9 @@ def compute_semantic_similarity(
         tokenize_fn_param,
         term_freq_corpus_param,
         doc_freq_map_param,
-        total_docs_in_corpus_param
+        total_docs_in_corpus_param,
+        batch_size_param: int,
+        show_progress_bar_param: bool
     ) -> Union[np.ndarray, None]:
         """Helper to get a single embedding for a text using FastText."""
         if not raw_text_segment.strip():
@@ -109,7 +113,12 @@
                 use_lite_stopwords=use_lite_stopwords_param
             )
         elif model_type == "sentence-transformer":
-            embedding = generate_hf_embeddings(
+            embedding = generate_hf_embeddings(
+                texts=[raw_text_segment],
+                model=model_obj,
+                batch_size=batch_size_param,
+                show_progress_bar=show_progress_bar_param
+            )
 
         if embedding is None or embedding.size == 0:
             logger.error(
@@ -120,12 +129,12 @@
 
     try:
         # Pass all relevant parameters to _get_aggregated_embedding
-        emb1 = _get_aggregated_embedding(text1_segment, tokens1, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus)
-        emb2 = _get_aggregated_embedding(text2_segment, tokens2, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus)
+        emb1 = _get_aggregated_embedding(text1_segment, tokens1, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus, batch_size, show_progress_bar)
+        emb2 = _get_aggregated_embedding(text2_segment, tokens2, model, use_stopwords, use_lite_stopwords, fasttext_tokenize_fn, term_freq_corpus, doc_freq_map, total_docs_in_corpus, batch_size, show_progress_bar)
 
         if emb1 is None or emb2 is None or emb1.size == 0 or emb2.size == 0:
             logger.error(
-                "Failed to obtain one or both
+                "Failed to obtain one or both embeddings for semantic similarity."
             )
             return np.nan
 
@@ -135,16 +144,16 @@
 
         # Handle cases where embeddings are all zeros
         if np.all(emb1 == 0) and np.all(emb2 == 0):
-            logger.info("Both
+            logger.info("Both embeddings are zero. Semantic similarity is 0.0.")
             return 0.0
         if np.all(emb1 == 0) or np.all(emb2 == 0):
-            logger.info("One of the
+            logger.info("One of the embeddings is zero. Semantic similarity is 0.0.")
             return 0.0
 
         # Handle NaN or Inf in embeddings
        if np.isnan(emb1).any() or np.isinf(emb1).any() or \
           np.isnan(emb2).any() or np.isinf(emb2).any():
-            logger.warning("NaN or Inf found in
+            logger.warning("NaN or Inf found in embeddings. Semantic similarity set to 0.0.")
             return 0.0
 
         # Ensure embeddings are 2D for cosine_similarity: [1, dim]
@@ -159,17 +168,22 @@
         safe_text1 = str(text1_segment)[:100] if text1_segment is not None else "N/A"
         safe_text2 = str(text2_segment)[:100] if text2_segment is not None else "N/A"
         logger.error(
-            f"Error during
+            f"Error during semantic similarity calculation:\nText1: {safe_text1}...\nText2: {safe_text2}...\nError: {e}"
         )
-        logger.exception("Traceback for
+        logger.exception("Traceback for semantic similarity calculation error:")
         return np.nan
 
 
 def compute_all_metrics(
-    texts: Dict[str, str],
-
+    texts: Dict[str, str],
+    model=None,
+    enable_semantic: bool = True,
+    model_type: str = "fasttext",
+    use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
-    fasttext_tokenize_fn=None
+    fasttext_tokenize_fn=None,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> pd.DataFrame:
     """
     Computes all selected similarity metrics between pairs of texts.
@@ -318,11 +332,13 @@
         if enable_semantic:
             # Pass raw texts and their pre-computed botok tokens
             semantic_sim = compute_semantic_similarity(
-                texts[f1], texts[f2], words1_raw, words2_raw, model, model_type, use_stopwords, use_lite_stopwords,
+                texts[f1], texts[f2], words1_raw, words2_raw, model, model_type, use_stopwords, use_lite_stopwords,
                 fasttext_tokenize_fn=fasttext_tokenize_fn,
                 term_freq_corpus=term_freq_corpus_for_fasttext if model_type == "fasttext" else None,
                 doc_freq_map=document_frequency_map_for_fasttext if model_type == "fasttext" else None,
-                total_docs_in_corpus=total_num_documents_for_fasttext if model_type == "fasttext" else 0
+                total_docs_in_corpus=total_num_documents_for_fasttext if model_type == "fasttext" else 0,
+                batch_size=batch_size,
+                show_progress_bar=show_progress_bar
             )
         else:
             semantic_sim = np.nan

pipeline/process.py CHANGED

@@ -55,7 +55,9 @@ def process_texts(
     model_name: str = "facebook-fasttext-pretrained",
     use_stopwords: bool = True,
     use_lite_stopwords: bool = False,
-    progress_callback = None
+    progress_callback = None,
+    batch_size: int = 32,
+    show_progress_bar: bool = False
 ) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
     """
     Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
@@ -279,12 +281,17 @@
                 logger.info("Using botok word-level tokenization for FastText model.")
 
             pair_metrics = compute_all_metrics(
-                {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
+                texts={seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
+                token_lists={seg1: segment_tokens[seg1], seg2: segment_tokens[seg2]},
+                metrics_to_compute=["jaccard", "lcs", "tfidf"],
                 model=model,
                 enable_semantic=enable_semantic,
                 model_type=model_type,
                 use_stopwords=use_stopwords,
-                use_lite_stopwords=use_lite_stopwords
+                use_lite_stopwords=use_lite_stopwords,
+                fasttext_tokenize_fn=tokenizer_for_fasttext,
+                batch_size=batch_size,
+                show_progress_bar=show_progress_bar
            )
 
            # Rename 'Text Pair' to show file stems and chapter number
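To make the new batch_size and show_progress_bar parameters concrete, here is a short sketch of the batched encoding that generate_embeddings() in pipeline/hf_embedding.py wraps. The model id below is an arbitrary public example, not necessarily the one the app resolves:

# Sketch: batched encoding with sentence-transformers (example model id).
import numpy as np
from sentence_transformers import SentenceTransformer

texts = ["བཀྲ་ཤིས་བདེ་ལེགས།", "ཆོས་ཀྱི་རྒྱལ་པོ།"]
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

embeddings = model.encode(
    texts,
    batch_size=8,              # smaller batches need less VRAM
    convert_to_numpy=True,
    show_progress_bar=True,    # useful for large corpora
)
print(embeddings.shape)        # (2, model.get_sentence_embedding_dimension())
assert isinstance(embeddings, np.ndarray)

And a compact restatement of the defensive pattern that pipeline/metrics.py applies before trusting an embedding pair: zero-norm and non-finite vectors are mapped to 0.0 instead of letting cosine similarity produce NaN. This is illustrative only, not the module's exact code path:

# Sketch of the guarded cosine-similarity used around compute_semantic_similarity().
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def safe_cosine(emb1: np.ndarray, emb2: np.ndarray) -> float:
    # Zero vectors have no direction; define the similarity as 0.0 instead of NaN.
    if np.all(emb1 == 0) or np.all(emb2 == 0):
        return 0.0
    # NaN/Inf would poison the result; treat them as "no reliable signal".
    if not (np.isfinite(emb1).all() and np.isfinite(emb2).all()):
        return 0.0
    # cosine_similarity expects 2D inputs of shape [1, dim].
    return float(cosine_similarity(emb1.reshape(1, -1), emb2.reshape(1, -1))[0, 0])

print(safe_cosine(np.array([1.0, 0.0]), np.array([1.0, 1.0])))  # ~0.7071
print(safe_cosine(np.zeros(2), np.array([1.0, 1.0])))           # 0.0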