# Hugging Face Space: daniel-wojahn/ttm-webapp-hf
# Space status when this file was captured: Sleeping
# File size: 16,630 Bytes

import pandas as pd
from typing import Dict, List, Tuple
from .metrics import compute_all_metrics
from .semantic_embedding import get_model_and_device, train_fasttext_model, FASTTEXT_MODEL_ID
from .tokenize import tokenize_texts
import logging
from itertools import combinations

logger = logging.getLogger(__name__)

# Character that separates chapters (segments) inside an uploaded text.
_CHAPTER_MARKER = "༈"

# model_type handed to compute_all_metrics when no model loader reported one
# (semantic analysis disabled, or the loader raised before returning).
_DEFAULT_MODEL_TYPE = "sentence_transformer"


def _report_progress(progress_callback, fraction, desc):
    """Send one progress update; a failing callback is logged, never raised.

    Args:
        progress_callback: Callable invoked as ``progress_callback(fraction, desc=...)``,
            or None to disable progress reporting.
        fraction: Progress value passed through as the first positional argument.
        desc: Human-readable description passed through as ``desc``.
    """
    if progress_callback is None:
        return
    try:
        progress_callback(fraction, desc=desc)
    except Exception as e:
        # Progress reporting is cosmetic; processing must continue regardless.
        logger.warning("Progress callback error (non-critical): %s", e)


def _load_semantic_model(text_data: Dict[str, str], model_name: str, progress_callback):
    """Load the embedding model for semantic similarity without ever raising.

    For ``FASTTEXT_MODEL_ID`` the model is requested via ``get_model_and_device``;
    if that yields ``None`` as model, a fallback is trained on all uploaded texts
    with ``train_fasttext_model``. Any other model name is loaded directly via
    ``get_model_and_device``.

    Returns:
        Tuple ``(model, device, model_type, warning)``. On failure ``warning`` is a
        user-facing message and the other values keep whatever was assigned
        before the error (``None``/default if nothing was loaded).
    """
    model, device, model_type = None, None, _DEFAULT_MODEL_TYPE
    warning = ""

    logger.info("Semantic similarity enabled. Loading embedding model...")
    try:
        logger.info("Using model: %s", model_name)

        if model_name == FASTTEXT_MODEL_ID:
            _report_progress(
                progress_callback, 0.25, "Loading official Facebook FastText Tibetan model..."
            )
            model, device, model_type = get_model_and_device(model_id=model_name)

            if model is None:
                _report_progress(
                    progress_callback,
                    0.25,
                    "Official model unavailable. Training fallback FastText model...",
                )
                # Train on everything that was uploaded, with fixed parameters.
                model = train_fasttext_model(list(text_data.values()), dim=100, epoch=5)
                _report_progress(
                    progress_callback, 0.3, "Fallback FastText model trained successfully"
                )
            else:
                _report_progress(
                    progress_callback,
                    0.3,
                    "Official Facebook FastText Tibetan model loaded successfully",
                )
        else:
            model, device, model_type = get_model_and_device(model_id=model_name)
            logger.info("Model %s loaded successfully on %s.", model_name, device)
            _report_progress(progress_callback, 0.3, "Model loaded successfully")

    except Exception as e:
        error_msg = str(e)
        logger.error(
            "Failed to load sentence transformer model: %s. Semantic similarity will not be available.",
            error_msg,
        )

        # Translate the most common failures into a user-friendly message.
        if "is not a valid model identifier" in error_msg:
            warning = f"The model '{model_name}' could not be found on Hugging Face. Semantic similarity will not be available."
        elif "CUDA out of memory" in error_msg:
            warning = "Not enough GPU memory to load the semantic model. Try using a smaller model or disable semantic similarity."
        else:
            warning = f"Failed to load semantic model: {error_msg}. Semantic similarity will not be available."

        _report_progress(progress_callback, 0.3, "Continuing without semantic model")

    return model, device, model_type, warning


def _segment_files(
    text_data: Dict[str, str], filenames: List[str], progress_callback
) -> Tuple[Dict[str, str], Dict[str, List[str]], bool]:
    """Split every file into chapters at the chapter marker.

    Files that are missing from ``text_data``, empty, or that contain only
    markers/whitespace are logged and skipped. A file without any marker becomes
    a single chapter.

    Returns:
        Tuple ``(segment_texts, segments_by_file, found_unmarked_file)`` where
        ``segment_texts`` maps segment id (``"<filename>|chapter <n>"``) to the
        chapter text, ``segments_by_file`` maps each filename to its ordered
        segment ids, and ``found_unmarked_file`` is True if at least one file
        had no chapter marker.
    """
    segment_texts: Dict[str, str] = {}
    segments_by_file: Dict[str, List[str]] = {}
    found_unmarked_file = False
    total_files = len(filenames)

    for i, fname in enumerate(filenames):
        if total_files > 1:
            _report_progress(
                progress_callback,
                0.35 + (0.05 * (i / total_files)),
                f"Segmenting file {i+1}/{total_files}: {fname}",
            )

        content = text_data.get(fname)
        if content is None:
            # A listed file without content is treated like an empty upload
            # instead of aborting the whole analysis.
            logger.warning("File '%s' has no uploaded content; skipping.", fname)
            continue
        if not content.strip():
            logger.warning("File '%s' is empty or contains only whitespace.", fname)
            continue

        if _CHAPTER_MARKER in content:
            chapters = [
                seg.strip() for seg in content.split(_CHAPTER_MARKER) if seg.strip()
            ]
            if not chapters:
                logger.warning(
                    "File '%s' contains chapter markers but no valid text segments.", fname
                )
                continue
        else:
            # No chapter markers found, treat the entire file as one segment.
            chapters = [content.strip()]
            found_unmarked_file = True

        seg_ids = []
        for number, chapter in enumerate(chapters, start=1):
            seg_id = f"{fname}|chapter {number}"
            segment_texts[seg_id] = chapter
            seg_ids.append(seg_id)
        # Assign (not extend) so a filename listed twice is not duplicated.
        # The filename is kept as the key rather than re-parsed from the
        # segment id, so names containing "|" are handled correctly.
        segments_by_file[fname] = seg_ids

    return segment_texts, segments_by_file, found_unmarked_file


def _compute_pairwise_metrics(
    segment_texts: Dict[str, str],
    segments_by_file: Dict[str, List[str]],
    st_model,
    st_device,
    model_type,
    enable_semantic: bool,
    use_stopwords: bool,
    use_lite_stopwords: bool,
    progress_callback,
) -> List[pd.DataFrame]:
    """Compare equally-numbered chapters for every pair of files.

    For each file pair only the chapters present in both files (by position) are
    compared. A failing comparison is logged and skipped.

    Returns:
        One DataFrame per successful comparison, as returned by
        ``compute_all_metrics`` with 'Text Pair' and 'Chapter' set.
    """
    # zip() stops at the shorter chapter list, i.e. min(len(a), len(b)) pairs.
    plan = [
        (file1, file2, list(zip(segments_by_file[file1], segments_by_file[file2])))
        for file1, file2 in combinations(list(segments_by_file), 2)
    ]
    total_comparisons = sum(len(chapter_pairs) for _, _, chapter_pairs in plan)

    results = []
    comparison_count = 0
    for file1, file2, chapter_pairs in plan:
        _report_progress(progress_callback, 0.45, f"Comparing {file1} with {file2}...")

        for chapter_num, (seg1, seg2) in enumerate(chapter_pairs, start=1):
            comparison_count += 1
            _report_progress(
                progress_callback,
                0.45 + (0.25 * (comparison_count / total_comparisons)),
                f"Computing metrics for chapter {chapter_num} ({comparison_count}/{total_comparisons})",
            )

            try:
                pair_metrics = compute_all_metrics(
                    {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                    model=st_model,
                    device=st_device,
                    enable_semantic=enable_semantic,
                    model_type=model_type,
                    use_stopwords=use_stopwords,
                    use_lite_stopwords=use_lite_stopwords,
                )

                # Label the rows with the two filenames and the chapter number.
                pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
                pair_metrics.loc[:, "Chapter"] = chapter_num
                results.append(pair_metrics)
            except Exception as e:
                # Keep going with the other comparisons instead of failing completely.
                logger.error("Error computing metrics for %s vs %s: %s", seg1, seg2, e)

    return results


def _count_words(
    segment_texts: Dict[str, str],
    segments_by_file: Dict[str, List[str]],
    progress_callback,
) -> pd.DataFrame:
    """Build the per-segment word count table.

    Each segment is tokenized with ``tokenize_texts``; the word count is the
    length of the first returned token list. If tokenization fails, the segment
    is still listed with a word count of 0.

    Returns:
        DataFrame with columns 'Filename', 'ChapterNumber', 'SegmentID' and
        'WordCount', sorted by filename and chapter number (empty DataFrame if
        there are no segments).
    """
    ordered_segments = [
        (fname, chapter_num, seg_id)
        for fname, seg_ids in segments_by_file.items()
        for chapter_num, seg_id in enumerate(seg_ids, start=1)
    ]
    total_segments = len(ordered_segments)

    rows = []
    for i, (fname, chapter_num, seg_id) in enumerate(ordered_segments):
        _report_progress(
            progress_callback,
            0.75 + (0.15 * (i / total_segments)),
            f"Counting words in segment {i+1}/{total_segments}",
        )

        try:
            tokenized = tokenize_texts([segment_texts[seg_id]])  # list of token lists
            word_count = len(tokenized[0]) if tokenized and tokenized[0] else 0
        except Exception as e:
            logger.error("Error calculating word count for segment %s: %s", seg_id, e)
            # Keep the row (with 0 words) so every segment stays represented.
            word_count = 0

        rows.append(
            {
                "Filename": fname.replace(".txt", ""),
                "ChapterNumber": chapter_num,
                "SegmentID": seg_id,
                "WordCount": word_count,
            }
        )

    word_counts_df = pd.DataFrame(rows)
    if not word_counts_df.empty:
        word_counts_df = word_counts_df.sort_values(
            by=["Filename", "ChapterNumber"]
        ).reset_index(drop=True)
    return word_counts_df


def process_texts(
    text_data: Dict[str, str], 
    filenames: List[str], 
    enable_semantic: bool = True,
    model_name: str = "buddhist-nlp/buddhist-sentence-similarity",
    use_stopwords: bool = True,
    use_lite_stopwords: bool = False,
    progress_callback = None
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
    """
    Segment uploaded texts by chapter marker and compare chapters across files.

    Every file is split at the chapter marker (``༈``); a file without a marker is
    treated as one chapter. For each pair of files, chapters with the same
    position are compared via ``compute_all_metrics``. Finally a word count per
    chapter is computed with ``tokenize_texts``.

    Args:
        text_data (Dict[str, str]): A dictionary mapping filenames to their content.
        filenames (List[str]): The filenames to process, in order. Names without
            content in ``text_data`` are logged and skipped.
        enable_semantic (bool, optional): Whether to load an embedding model and
            request semantic similarity from ``compute_all_metrics``. Defaults to True.
        model_name (str, optional): Identifier of the model to load. If it equals
            ``FASTTEXT_MODEL_ID`` the FastText path (with trained fallback) is used.
            Defaults to "buddhist-nlp/buddhist-sentence-similarity".
        use_stopwords (bool, optional): Forwarded to ``compute_all_metrics``.
            Defaults to True.
        use_lite_stopwords (bool, optional): Forwarded to ``compute_all_metrics``.
            Defaults to False.
        progress_callback (callable, optional): Called as
            ``progress_callback(fraction, desc=...)`` with a fraction between 0 and 1.
            Exceptions raised by the callback are logged and ignored. Defaults to None.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, str]:
            - metrics_df: Concatenation of the DataFrames returned by
              ``compute_all_metrics``, each labelled with 'Text Pair'
              ("<file1> vs <file2>") and 'Chapter'. Empty if nothing could be computed.
            - word_counts_df: DataFrame with columns 'Filename', 'ChapterNumber',
              'SegmentID', 'WordCount', one row per chapter.
            - warning: Space-separated warnings collected during processing
              (model loading problems, missing chapter markers, no metrics), or "".

        If no valid segment exists, or fewer than two files yield segments, two
        empty DataFrames and an explanatory message are returned.

    Note:
        Failures while loading the model, computing a chapter pair's metrics or
        tokenizing a segment are caught, logged, and reflected in the results
        rather than raised.
    """
    _report_progress(progress_callback, 0.25, "Preparing for text analysis...")

    # Defaults used when semantic analysis is disabled or the model cannot be loaded.
    st_model, st_device, model_type = None, None, _DEFAULT_MODEL_TYPE
    model_warning = ""

    if enable_semantic:
        st_model, st_device, model_type, model_warning = _load_semantic_model(
            text_data, model_name, progress_callback
        )
    else:
        logger.info("Semantic similarity disabled. Skipping model loading.")
        _report_progress(progress_callback, 0.3, "Processing text segments")

    # Detect chapter marker and segment texts
    _report_progress(progress_callback, 0.35, "Segmenting texts by chapters...")
    segment_texts, segments_by_file, found_unmarked_file = _segment_files(
        text_data, filenames, progress_callback
    )

    warnings = [model_warning]
    if found_unmarked_file:
        warnings.append(
            "No chapter marker found in one or more files. "
            "Each file will be treated as a single segment. "
            "For best results, add a unique marker (e.g., ༈) to separate chapters or sections."
        )

    if not segment_texts:
        logger.error("No valid text segments found in any of the uploaded files.")
        return pd.DataFrame(), pd.DataFrame(), "No valid text segments found in the uploaded files. Please check your files and try again."

    _report_progress(progress_callback, 0.4, "Organizing text segments...")
    _report_progress(progress_callback, 0.45, "Computing similarity metrics...")

    if len(segments_by_file) < 2:
        logger.warning("Need at least two files to compute similarity metrics.")
        return pd.DataFrame(), pd.DataFrame(), "Need at least two files to compute similarity metrics."

    results = _compute_pairwise_metrics(
        segment_texts,
        segments_by_file,
        st_model,
        st_device,
        model_type,
        enable_semantic,
        use_stopwords,
        use_lite_stopwords,
        progress_callback,
    )

    if results:
        metrics_df = pd.concat(results, ignore_index=True)
    else:
        metrics_df = pd.DataFrame()
        warnings.append("No valid metrics could be computed. Please check your files and try again.")

    _report_progress(progress_callback, 0.75, "Calculating word counts...")
    word_counts_df = _count_words(segment_texts, segments_by_file, progress_callback)

    _report_progress(progress_callback, 0.95, "Analysis complete!")

    # Join only the non-empty parts so the message never starts with a stray space.
    warning = " ".join(part for part in warnings if part)
    return metrics_df, word_counts_df, warning