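"""Chapter-level comparison of uploaded Tibetan texts.

Splits each uploaded file on the chapter marker ༈ and computes similarity
metrics and per-chapter word counts between corresponding chapters of every
file pair.
"""
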
import pandas as pd
from typing import Dict, List, Tuple
from .metrics import compute_all_metrics
from .semantic_embedding import get_sentence_transformer_model_and_device
from .tokenize import tokenize_texts
import logging
from itertools import combinations

logger = logging.getLogger(__name__)


def process_texts(
    text_data: Dict[str, str], filenames: List[str], enable_semantic: bool = True
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
    """
    Process uploaded texts, segment them by chapter marker, and compute metrics
    between corresponding chapters of different files.

    Args:
        text_data (Dict[str, str]): A dictionary mapping filenames to their content.
        filenames (List[str]): A list of filenames that were uploaded.
        enable_semantic (bool, optional): Whether to compute semantic similarity
            using a sentence transformer model. Defaults to True.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, str]:
            - metrics_df: DataFrame with similarity metrics between corresponding chapters of file pairs.
            - word_counts_df: DataFrame with word counts for each segment (chapter) in each file.
            - warning: A string containing any warnings generated during processing
              (e.g., missing chapter markers).
    """
    st_model, st_device = None, None
    if enable_semantic:
        logger.info(
            "Semantic similarity enabled. Loading sentence transformer model..."
        )
        try:
            st_model, st_device = get_sentence_transformer_model_and_device()
            logger.info(
                f"Sentence transformer model loaded successfully on {st_device}."
            )
        except Exception as e:
            logger.error(
                f"Failed to load sentence transformer model: {e}. Semantic similarity will not be available."
            )
            # Model-loading failures are only logged for now; a UI warning
            # could be surfaced here later if desired.
    else:
        logger.info("Semantic similarity disabled. Skipping model loading.")
    # Detect chapter marker
    chapter_marker = "༈"
    fallback = False
    segment_texts = {}
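    # Segment IDs follow the "<filename>|chapter <n>" pattern; the word-count
    # step below parses this format, so keep the two in sync.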
    for fname in filenames:
        content = text_data[fname]
        if chapter_marker in content:
            segments = [
                seg.strip() for seg in content.split(chapter_marker) if seg.strip()
            ]
            for idx, seg in enumerate(segments):
                seg_id = f"{fname}|chapter {idx+1}"
                segment_texts[seg_id] = seg
        else:
            seg_id = f"{fname}|chapter 1"
            segment_texts[seg_id] = content.strip()
            fallback = True
warning = "" | |
if fallback: | |
warning = ( | |
"No chapter marker found in one or more files. " | |
"Each file will be treated as a single segment. " | |
"For best results, add a unique marker (e.g., ༈) to separate chapters or sections." | |
) | |
    # Group chapters by filename (preserving order)
    file_to_chapters = {}
    for seg_id in segment_texts:
        fname = seg_id.split("|")[0]
        file_to_chapters.setdefault(fname, []).append(seg_id)

    # For each pair of files, compare corresponding chapters (by index)
    results = []
    files = list(file_to_chapters.keys())
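    # Chapters are aligned purely by position: if two files have different
    # chapter counts, the trailing chapters of the longer file are skipped.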
    for file1, file2 in combinations(files, 2):
        chaps1 = file_to_chapters[file1]
        chaps2 = file_to_chapters[file2]
        min_chaps = min(len(chaps1), len(chaps2))
        for idx in range(min_chaps):
            seg1 = chaps1[idx]
            seg2 = chaps2[idx]
            # Compute metrics for just this pair of chapter segments
            pair_metrics = compute_all_metrics(
                {seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
                model=st_model,
                device=st_device,
                enable_semantic=enable_semantic,
            )
            # Label the pair with the two filenames and the chapter number
            pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
            pair_metrics.loc[:, "Chapter"] = idx + 1
            results.append(pair_metrics)
    if results:
        metrics_df = pd.concat(results, ignore_index=True)
    else:
        metrics_df = pd.DataFrame()
    # Calculate word counts
    word_counts_data = []
    for seg_id, text_content in segment_texts.items():
        fname, chapter_info = seg_id.split("|", 1)
        chapter_num = int(chapter_info.replace("chapter ", ""))
        # Use botok for accurate word counts on raw Tibetan text
        tokenized_segments = tokenize_texts([text_content])  # Returns a list of lists
        if tokenized_segments and tokenized_segments[0]:
            word_count = len(tokenized_segments[0])
        else:
            word_count = 0
        word_counts_data.append(
            {
                "Filename": fname.replace(".txt", ""),
                "ChapterNumber": chapter_num,
                "SegmentID": seg_id,
                "WordCount": word_count,
            }
        )
    word_counts_df = pd.DataFrame(word_counts_data)
    if not word_counts_df.empty:
        word_counts_df = word_counts_df.sort_values(
            by=["Filename", "ChapterNumber"]
        ).reset_index(drop=True)

    return metrics_df, word_counts_df, warning
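

# --- Illustrative usage sketch ---
# Assumptions: the file names and Tibetan snippets below are hypothetical, and
# the module must be run with `python -m <package>.<this_module>` so the
# relative imports resolve (the package path is not fixed here). Semantic
# similarity is disabled so the example does not load the sentence transformer
# model.
if __name__ == "__main__":
    sample_texts = {
        "witness_a.txt": "༈ བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་དབྱིངས་སུ་བཞུགས།",
        "witness_b.txt": "༈ བཀྲ་ཤིས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། ༈ ཆོས་དབྱིངས་སུ་བཞུགས།",
    }
    metrics, word_counts, warn = process_texts(
        sample_texts, list(sample_texts.keys()), enable_semantic=False
    )
    print(metrics[["Text Pair", "Chapter"]].to_string(index=False))
    print(word_counts.to_string(index=False))
    if warn:
        print(warn)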