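"""Segments uploaded Tibetan texts by chapter marker and computes pairwise
similarity metrics between corresponding chapters of different files."""
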
import logging
from itertools import combinations
from typing import Dict, List, Tuple

import pandas as pd

from .metrics import compute_all_metrics
from .semantic_embedding import get_sentence_transformer_model_and_device
from .tokenize import tokenize_texts

logger = logging.getLogger(__name__)


def process_texts(
    text_data: Dict[str, str], filenames: List[str], enable_semantic: bool = True
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
    """
    Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
    Args:
        text_data (Dict[str, str]): A dictionary mapping filenames to their content.
        filenames (List[str]): A list of filenames that were uploaded.
    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, str]:
            - metrics_df: DataFrame with similarity metrics between corresponding chapters of file pairs.
            - word_counts_df: DataFrame with word counts for each segment (chapter) in each file.
            - warning: A string containing any warnings generated during processing (e.g., missing chapter markers).
    """
    st_model, st_device = None, None
    if enable_semantic:
        logger.info(
            "Semantic similarity enabled. Loading sentence transformer model..."
        )
        try:
            st_model, st_device = get_sentence_transformer_model_and_device()
            logger.info(
                f"Sentence transformer model loaded successfully on {st_device}."
            )
        except Exception as e:
            logger.error(
                f"Failed to load sentence transformer model: {e}. "
                "Semantic similarity will not be available."
            )
            # The failure is logged but not surfaced in the UI; st_model and
            # st_device remain None, so semantic metrics are simply skipped.
    else:
        logger.info("Semantic similarity disabled. Skipping model loading.")

    # Segment each text on the Tibetan section marker (sbrul shad, ༈);
    # a file without the marker is treated as a single chapter.
    chapter_marker = "༈"
    fallback = False
    segment_texts = {}
    for fname in filenames:
        content = text_data[fname]
        if chapter_marker in content:
            segments = [
                seg.strip() for seg in content.split(chapter_marker) if seg.strip()
            ]
            for idx, seg in enumerate(segments):
                seg_id = f"{fname}|chapter {idx+1}"
                segment_texts[seg_id] = seg
        else:
            seg_id = f"{fname}|chapter 1"
            segment_texts[seg_id] = content.strip()
            fallback = True
    warning = ""
    if fallback:
        warning = (
            "No chapter marker found in one or more files. "
            "Each file will be treated as a single segment. "
            "For best results, add a unique marker (e.g., ༈) to separate chapters or sections."
        )
    # Group chapters by filename (preserving order)
    file_to_chapters = {}
    for seg_id in segment_texts:
        fname = seg_id.split("|")[0]
        file_to_chapters.setdefault(fname, []).append(seg_id)
    # For each pair of files, compare corresponding chapters (by index)
    results = []
    files = list(file_to_chapters.keys())
    for file1, file2 in combinations(files, 2):
        chaps1 = file_to_chapters[file1]
        chaps2 = file_to_chapters[file2]
        min_chaps = min(len(chaps1), len(chaps2))
        for idx in range(min_chaps):
            seg1 = chaps1[idx]
            seg2 = chaps2[idx]
            # Compute metrics on just this pair of chapter segments
            pair_metrics = compute_all_metrics(
                {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                model=st_model,
                device=st_device,
                enable_semantic=enable_semantic,
            )
            # Label this row of metrics with the file pair and chapter number
            pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
            pair_metrics.loc[:, "Chapter"] = idx + 1
            results.append(pair_metrics)
    if results:
        metrics_df = pd.concat(results, ignore_index=True)
    else:
        metrics_df = pd.DataFrame()

    # Calculate word counts
    word_counts_data = []
    for seg_id, text_content in segment_texts.items():
        fname, chapter_info = seg_id.split("|", 1)
        chapter_num = int(chapter_info.replace("chapter ", ""))
        # Use botok for accurate word count for raw Tibetan text
        tokenized_segments = tokenize_texts([text_content])  # Returns a list of lists
        if tokenized_segments and tokenized_segments[0]:
            word_count = len(tokenized_segments[0])
        else:
            word_count = 0
        word_counts_data.append(
            {
                "Filename": fname.replace(".txt", ""),
                "ChapterNumber": chapter_num,
                "SegmentID": seg_id,
                "WordCount": word_count,
            }
        )
    word_counts_df = pd.DataFrame(word_counts_data)
    if not word_counts_df.empty:
        word_counts_df = word_counts_df.sort_values(
            by=["Filename", "ChapterNumber"]
        ).reset_index(drop=True)

    return metrics_df, word_counts_df, warning
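

# Example usage (a minimal sketch; the filenames and contents here are
# hypothetical, and because this module uses relative imports it must be
# called from within the package, e.g. from the app's upload handler):
#
#     texts = {
#         "witness_a.txt": "chapter one text ༈ chapter two text",
#         "witness_b.txt": "chapter one variant ༈ chapter two variant",
#     }
#     metrics_df, word_counts_df, warning = process_texts(
#         texts, list(texts.keys()), enable_semantic=False
#     )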