# pipeline/process.py (ttm-webapp-hf)
import logging
from itertools import combinations
from typing import Dict, List, Tuple

import pandas as pd

from .metrics import compute_all_metrics
from .semantic_embedding import get_sentence_transformer_model_and_device
from .tokenize import tokenize_texts

logger = logging.getLogger(__name__)


def process_texts(
text_data: Dict[str, str], filenames: List[str], enable_semantic: bool = True
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
"""
Processes uploaded texts, segments them by chapter marker, and computes metrics between chapters of different files.
Args:
text_data (Dict[str, str]): A dictionary mapping filenames to their content.
filenames (List[str]): A list of filenames that were uploaded.
Returns:
Tuple[pd.DataFrame, pd.DataFrame, str]:
- metrics_df: DataFrame with similarity metrics between corresponding chapters of file pairs.
- word_counts_df: DataFrame with word counts for each segment (chapter) in each file.
- warning: A string containing any warnings generated during processing (e.g., missing chapter markers).
"""
st_model, st_device = None, None
if enable_semantic:
logger.info(
"Semantic similarity enabled. Loading sentence transformer model..."
)
try:
st_model, st_device = get_sentence_transformer_model_and_device()
logger.info(
f"Sentence transformer model loaded successfully on {st_device}."
)
except Exception as e:
logger.error(
f"Failed to load sentence transformer model: {e}. Semantic similarity will not be available."
)
            # The failure is only logged for now; a UI-facing warning could be
            # added later if desired. st_model stays None, so semantic metrics
            # are simply skipped downstream.
else:
logger.info("Semantic similarity disabled. Skipping model loading.")
# Detect chapter marker
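    # ༈ (sbrul shad, U+0F08) is conventionally used in Tibetan texts to mark the
    # start of a new section, so it serves here as the chapter delimiter.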
chapter_marker = "༈"
fallback = False
segment_texts = {}
for fname in filenames:
content = text_data[fname]
if chapter_marker in content:
segments = [
seg.strip() for seg in content.split(chapter_marker) if seg.strip()
]
for idx, seg in enumerate(segments):
seg_id = f"{fname}|chapter {idx+1}"
segment_texts[seg_id] = seg
else:
seg_id = f"{fname}|chapter 1"
segment_texts[seg_id] = content.strip()
fallback = True
warning = ""
if fallback:
warning = (
"No chapter marker found in one or more files. "
"Each file will be treated as a single segment. "
"For best results, add a unique marker (e.g., ༈) to separate chapters or sections."
)
# Group chapters by filename (preserving order)
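    # Dicts preserve insertion order (Python 3.7+), so chapters keep the order
    # in which they appear in each file.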
file_to_chapters = {}
for seg_id in segment_texts:
fname = seg_id.split("|")[0]
file_to_chapters.setdefault(fname, []).append(seg_id)
# For each pair of files, compare corresponding chapters (by index)
results = []
files = list(file_to_chapters.keys())
for file1, file2 in combinations(files, 2):
chaps1 = file_to_chapters[file1]
chaps2 = file_to_chapters[file2]
min_chaps = min(len(chaps1), len(chaps2))
for idx in range(min_chaps):
seg1 = chaps1[idx]
seg2 = chaps2[idx]
# Compute metrics for this chapter pair
# Use compute_all_metrics on just these two segments
pair_metrics = compute_all_metrics(
{seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
model=st_model,
device=st_device,
enable_semantic=enable_semantic,
)
            # Label this pair's rows with the two source files and the chapter number
pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
pair_metrics.loc[:, "Chapter"] = idx + 1
results.append(pair_metrics)
if results:
metrics_df = pd.concat(results, ignore_index=True)
else:
metrics_df = pd.DataFrame()
# Calculate word counts
word_counts_data = []
for seg_id, text_content in segment_texts.items():
fname, chapter_info = seg_id.split("|", 1)
chapter_num = int(chapter_info.replace("chapter ", ""))
# Use botok for accurate word count for raw Tibetan text
tokenized_segments = tokenize_texts([text_content]) # Returns a list of lists
if tokenized_segments and tokenized_segments[0]:
word_count = len(tokenized_segments[0])
else:
word_count = 0
word_counts_data.append(
{
"Filename": fname.replace(".txt", ""),
"ChapterNumber": chapter_num,
"SegmentID": seg_id,
"WordCount": word_count,
}
)
word_counts_df = pd.DataFrame(word_counts_data)
if not word_counts_df.empty:
word_counts_df = word_counts_df.sort_values(
by=["Filename", "ChapterNumber"]
).reset_index(drop=True)
return metrics_df, word_counts_df, warning
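

# A minimal usage sketch (hypothetical inputs; run as `python -m pipeline.process`
# from the project root so the relative imports resolve). Semantic similarity is
# disabled so the example runs without loading the sentence transformer model.
if __name__ == "__main__":
    sample_texts = {
        "witness_a.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་རྒྱལ་པོ།",
        "witness_b.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་བློ་གྲོས།",
    }
    metrics, word_counts, warn = process_texts(
        sample_texts, list(sample_texts.keys()), enable_semantic=False
    )
    print(metrics)
    print(word_counts)
    if warn:
        print(f"Warning: {warn}")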