Spaces:

daniel-wojahn
/

ttm-webapp-hf

Running

File size: 19,537 Bytes

import pandas as pd
from typing import Dict, List, Tuple
from .metrics import compute_all_metrics
from .semantic_embedding import get_model_and_device
from .fasttext_embedding import load_fasttext_model # Added for custom fasttext
from .tokenize import tokenize_texts
import logging
from itertools import combinations
import re

# Identifier of the custom Tibetan FastText model. The model-loading logic in
# this module compares the requested model name against this value to decide
# whether to load the model through load_fasttext_model.
# NOTE(review): the same ID is presumably defined in other modules as well
# (e.g. semantic_embedding); keep the values in sync — not verifiable from here.
FASTTEXT_MODEL_ID = "fasttext-tibetan" # Ensure this matches the ID used elsewhere


def get_botok_tokens_for_single_text(text: str, mode: str = "syllable") -> list[str]:
    """
    Tokenize one string by delegating to the batch-oriented tokenize_texts.

    tokenize_texts works on a list of texts and returns one token list per
    text; this adapter wraps a single string in a list and unwraps the single
    result, so it can be used wherever a "one string in, token list out"
    callable is expected.

    Args:
        text (str): The text to tokenize.
        mode (str, optional): Forwarded unchanged to tokenize_texts
            ('syllable' or 'word'). Defaults to "syllable".

    Returns:
        list[str]: The tokens of ``text``; an empty list when ``text`` is
        empty/whitespace-only or when tokenize_texts yields nothing for it.
    """
    # Blank input never reaches the tokenizer.
    if text.strip() == "":
        return []

    batches = tokenize_texts([text], mode=mode)
    first_batch = batches[0] if batches else None
    return first_batch if first_batch else []

def clean_tibetan_text_for_fasttext(text: str) -> str:
    """
    Clean Tibetan text with a fixed sequence of regex substitutions.

    The substitutions are applied in this order:
      1. Page/line markers of the form ``ln<1-3 digits>[a|b]`` or
         ``p<1-3 digits>[a|b]`` (case-insensitive), together with the
         whitespace around them, are replaced by a single space.
      2. Two ``།`` marks (shad) that are adjacent or separated only by
         whitespace are collapsed into one. Matches do not overlap, so each
         pair is collapsed once per call.
      3. Every run of whitespace becomes a single space.
    Finally, leading and trailing whitespace is stripped.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text.
    """
    # (pattern, replacement) pairs; order matters because step 1 introduces
    # spaces that step 3 normalizes.
    substitutions = (
        (r"\s*(?:[lL][nN]|[pP])\d{1,3}[abAB]?\s*", " "),
        (r"།\s*།", "།"),
        (r'\s+', ' '),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()



# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def _report_progress(progress_callback, fraction: float, desc: str) -> None:
    """Call the optional progress callback; a failing callback is logged, never raised."""
    if progress_callback is None:
        return
    try:
        progress_callback(fraction, desc=desc)
    except Exception as e:
        # Progress reporting is cosmetic; processing must continue regardless.
        logger.warning(f"Progress callback error (non-critical): {e}")


def _load_semantic_model(model_name: str, progress_callback=None):
    """Load the embedding model for *model_name*.

    Returns a ``(model, model_type, warning)`` tuple. On any failure (load
    returned a falsy model, unsupported name, unexpected exception) ``model``
    and ``model_type`` are None and ``warning`` holds a user-facing message;
    on success ``warning`` is the empty string.
    """
    try:
        logger.info("Using model: %s", model_name)

        if model_name == FASTTEXT_MODEL_ID:
            label = "Custom FastText"
            logger.info(f"Attempting to load custom FastText model: {model_name}")
            _report_progress(
                progress_callback, 0.25, f"Loading custom FastText model: {model_name}..."
            )
            model = load_fasttext_model(model_id=model_name)
            model_type = "fasttext"
        elif model_name == "facebook-fasttext-pretrained":
            label = "Facebook FastText"
            logger.info(f"Attempting to load Facebook FastText model: {model_name}")
            _report_progress(
                progress_callback, 0.25, f"Loading Facebook FastText model: {model_name}..."
            )
            model, model_type = get_model_and_device(model_id=model_name)
        else:
            warning = (
                f"Unsupported model_name: '{model_name}'. Semantic similarity will be disabled. "
                f"Supported models are '{FASTTEXT_MODEL_ID}' and 'facebook-fasttext-pretrained'."
            )
            logger.warning(warning)
            _report_progress(
                progress_callback, 0.3, "Unsupported model, continuing without semantic similarity."
            )
            return None, None, warning

        if not model:
            warning = f"{label} model ('{model_name}') failed to load. Semantic similarity will be disabled."
            logger.warning(warning)
            return None, None, warning

        logger.info(f"{label} model '{model_name}' (type: {model_type}) loaded successfully.")
        _report_progress(progress_callback, 0.3, f"{label} model '{model_name}' loaded.")
        return model, model_type, ""

    except Exception as e:
        # Boundary catch-all: a broken model must degrade to "no semantic
        # similarity", not abort the whole analysis.
        warning = (
            f"An unexpected error occurred while attempting to load model '{model_name}': {e}. "
            "Semantic similarity will be disabled."
        )
        logger.error(warning, exc_info=True)
        _report_progress(
            progress_callback, 0.3, "Error loading model, continuing without semantic similarity."
        )
        return None, None, warning


def _segment_files(text_data, filenames, chapter_marker: str, progress_callback=None):
    """Split every file on *chapter_marker* and clean each resulting segment.

    Returns ``(segment_texts, fallback)`` where ``segment_texts`` maps
    ``"<filename>|chapter <n>"`` to the cleaned segment text (insertion order
    follows *filenames*, then chapter order) and ``fallback`` is True when at
    least one non-empty file contained no chapter marker.
    """
    segment_texts: Dict[str, str] = {}
    fallback = False
    total_files = len(filenames)

    for i, fname in enumerate(filenames):
        if total_files > 1:
            _report_progress(
                progress_callback,
                0.35 + (0.05 * (i / total_files)),
                f"Segmenting file {i+1}/{total_files}: {fname}",
            )

        content = text_data[fname]

        if not content.strip():
            logger.warning(f"File '{fname}' is empty or contains only whitespace.")
            continue

        if chapter_marker not in content:
            # No marker: the whole file is a single "chapter 1" segment.
            segment_texts[f"{fname}|chapter 1"] = clean_tibetan_text_for_fasttext(content.strip())
            fallback = True
            continue

        segments = [seg.strip() for seg in content.split(chapter_marker) if seg.strip()]
        if not segments:
            logger.warning(f"File '{fname}' contains chapter markers but no valid text segments.")
            continue

        for chapter_num, seg in enumerate(segments, start=1):
            segment_texts[f"{fname}|chapter {chapter_num}"] = clean_tibetan_text_for_fasttext(seg)

    return segment_texts, fallback


def _fasttext_word_tokenizer(text_segment: str) -> List[str]:
    """Clean a segment and tokenize it at word level for the FastText models."""
    cleaned_segment = clean_tibetan_text_for_fasttext(text_segment)
    return get_botok_tokens_for_single_text(cleaned_segment, mode="word")


def _count_segment_words(segment_texts, progress_callback=None) -> pd.DataFrame:
    """Build the per-segment token-count table, sorted by filename and chapter."""
    rows = []
    total_segments = len(segment_texts)

    for i, (seg_id, text_content) in enumerate(segment_texts.items()):
        _report_progress(
            progress_callback,
            0.75 + (0.15 * (i / total_segments)),
            f"Counting words in segment {i+1}/{total_segments}",
        )

        # Split from the right: the "|chapter N" suffix is always appended by
        # this module, whereas the filename itself may contain "|".
        fname, chapter_info = seg_id.rsplit("|", 1)
        chapter_num = int(chapter_info.replace("chapter ", ""))

        try:
            # Count = number of tokens tokenize_texts returns with its default mode.
            tokenized_segments = tokenize_texts([text_content])
            if tokenized_segments and tokenized_segments[0]:
                word_count = len(tokenized_segments[0])
            else:
                word_count = 0
        except Exception as e:
            logger.error(f"Error calculating word count for segment {seg_id}: {e}")
            # Keep a row with 0 so every segment is represented in the table.
            word_count = 0

        rows.append(
            {
                "Filename": fname.replace(".txt", ""),
                "ChapterNumber": chapter_num,
                "SegmentID": seg_id,
                "WordCount": word_count,
            }
        )

    word_counts_df = pd.DataFrame(rows)
    if not word_counts_df.empty:
        word_counts_df = word_counts_df.sort_values(
            by=["Filename", "ChapterNumber"]
        ).reset_index(drop=True)
    return word_counts_df


def process_texts(
    text_data: Dict[str, str],
    filenames: List[str],
    enable_semantic: bool = True,
    model_name: str = "buddhist-nlp/buddhist-sentence-similarity",
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    progress_callback=None,
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
    """
    Segment uploaded texts by chapter marker and compute metrics between
    same-numbered chapters of every pair of files.

    Files are split on the marker "༈"; a file without the marker is treated as
    a single chapter. For each pair of files, chapter i of one file is compared
    with chapter i of the other; chapters beyond the shorter file's chapter
    count are not compared.

    Args:
        text_data (Dict[str, str]): Maps filenames to their content.
        filenames (List[str]): Filenames to process, in order. Each must be a
            key of ``text_data``.
        enable_semantic (bool, optional): Whether to compute semantic
            similarity. Requires loading an embedding model. Defaults to True.
        model_name (str, optional): Embedding model identifier. Only the value
            of ``FASTTEXT_MODEL_ID`` and "facebook-fasttext-pretrained" are
            supported; any other value -- including the default
            "buddhist-nlp/buddhist-sentence-similarity" -- disables semantic
            similarity and adds a warning to the returned warning string.
        use_stopwords (bool, optional): Forwarded to compute_all_metrics.
            Defaults to True.
        use_lite_stopwords (bool, optional): Forwarded to compute_all_metrics.
            Defaults to False.
        progress_callback (callable, optional): Called as
            ``progress_callback(fraction, desc=...)`` with a fraction between
            0 and 1. Exceptions raised by the callback are logged and ignored.
            Defaults to None.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, str]:
            - metrics_df: Concatenation of the frames returned by
              compute_all_metrics for each compared chapter pair, with
              'Text Pair' set to "<file1> vs <file2>" and 'Chapter' set to the
              1-based chapter index. Empty if nothing could be computed.
            - word_counts_df: One row per segment with columns 'Filename',
              'ChapterNumber', 'SegmentID', 'WordCount', sorted by filename
              and chapter number.
            - warning: Warnings collected during processing (model loading
              problems, missing chapter markers, no metrics), or "".
        When no valid segment exists, or fewer than two files yield segments,
        both DataFrames are empty and ``warning`` explains why.

    Raises:
        KeyError: If a name in ``filenames`` is missing from ``text_data``.
        Errors during model loading, metric computation and word counting are
        caught, logged and reflected in the results instead of being raised.
    """
    model, model_type = None, None
    model_warning = ""

    _report_progress(progress_callback, 0.25, "Preparing for text analysis...")

    # Load semantic model if enabled
    if enable_semantic:
        logger.info("Semantic similarity enabled. Loading embedding model...")
        model, model_type, model_warning = _load_semantic_model(model_name, progress_callback)
        if model is None:
            enable_semantic = False
    else:
        logger.info("Semantic similarity disabled. Skipping model loading.")
        _report_progress(progress_callback, 0.3, "Processing text segments")

    # Detect chapter marker and segment texts
    _report_progress(progress_callback, 0.35, "Segmenting texts by chapters...")
    chapter_marker = "༈"
    segment_texts, fallback = _segment_files(
        text_data, filenames, chapter_marker, progress_callback
    )

    # Combine the model warning with the missing-marker warning, if any
    warning = model_warning
    if fallback:
        chapter_warning = (
            "No chapter marker found in one or more files. "
            "Each file will be treated as a single segment. "
            "For best results, add a unique marker (e.g., ༈) to separate chapters or sections."
        )
        warning = warning + " " + chapter_warning if warning else chapter_warning

    if not segment_texts:
        logger.error("No valid text segments found in any of the uploaded files.")
        return pd.DataFrame(), pd.DataFrame(), "No valid text segments found in the uploaded files. Please check your files and try again."

    # Group chapters by filename (preserving order)
    _report_progress(progress_callback, 0.4, "Organizing text segments...")
    file_to_chapters: Dict[str, List[str]] = {}
    for seg_id in segment_texts:
        # rsplit: only the trailing "|chapter N" is ours; filenames may contain "|".
        fname = seg_id.rsplit("|", 1)[0]
        file_to_chapters.setdefault(fname, []).append(seg_id)

    _report_progress(progress_callback, 0.45, "Computing similarity metrics...")
    files = list(file_to_chapters)

    if len(files) < 2:
        logger.warning("Need at least two files to compute similarity metrics.")
        return pd.DataFrame(), pd.DataFrame(), "Need at least two files to compute similarity metrics."

    file_pairs = list(combinations(files, 2))
    total_comparisons = sum(
        min(len(file_to_chapters[file1]), len(file_to_chapters[file2]))
        for file1, file2 in file_pairs
    )

    # The FastText tokenizer is the same for every comparison, so pick it once.
    tokenizer_for_fasttext = None
    if model_type == "fasttext":
        tokenizer_for_fasttext = _fasttext_word_tokenizer
        logger.info("Using botok word-level tokenization for FastText model.")

    def metrics_progress(done: int) -> float:
        """Map the number of finished comparisons onto the 0.45-0.70 range."""
        if total_comparisons <= 0:
            return 0.45
        return 0.45 + (0.25 * (done / total_comparisons))

    results = []
    comparison_count = 0
    for file1, file2 in file_pairs:
        chaps1 = file_to_chapters[file1]
        chaps2 = file_to_chapters[file2]

        # Report the current fraction so progress never moves backwards.
        _report_progress(
            progress_callback,
            metrics_progress(comparison_count),
            f"Comparing {file1} with {file2}...",
        )

        # zip stops at the shorter chapter list, i.e. min(len(chaps1), len(chaps2)).
        for idx, (seg1, seg2) in enumerate(zip(chaps1, chaps2)):
            comparison_count += 1
            _report_progress(
                progress_callback,
                metrics_progress(comparison_count),
                f"Computing metrics for chapter {idx+1} ({comparison_count}/{total_comparisons})",
            )

            try:
                pair_metrics = compute_all_metrics(
                    {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                    model=model,
                    enable_semantic=enable_semantic,
                    model_type=model_type,
                    use_stopwords=use_stopwords,
                    use_lite_stopwords=use_lite_stopwords,
                    fasttext_tokenize_fn=tokenizer_for_fasttext,
                )

                # Label the rows with the file names and 1-based chapter number
                pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
                pair_metrics.loc[:, "Chapter"] = idx + 1
                results.append(pair_metrics)

            except Exception as e:
                # One failing chapter pair must not abort the other comparisons.
                logger.error(f"Error computing metrics for {seg1} vs {seg2}: {e}")
                continue

    # Create the metrics DataFrame
    if results:
        metrics_df = pd.concat(results, ignore_index=True)
    else:
        metrics_df = pd.DataFrame()
        no_metrics_warning = "No valid metrics could be computed. Please check your files and try again."
        warning = warning + " " + no_metrics_warning if warning else no_metrics_warning

    # Calculate word counts
    _report_progress(progress_callback, 0.75, "Calculating word counts...")
    word_counts_df = _count_segment_words(segment_texts, progress_callback)

    _report_progress(progress_callback, 0.95, "Analysis complete!")

    return metrics_df, word_counts_df, warning