ttm-webapp-hf / pipeline / process.py
import pandas as pd
from typing import Dict, List, Tuple
from .metrics import compute_all_metrics
from .semantic_embedding import get_model_and_device, train_fasttext_model, FASTTEXT_MODEL_ID
from .tokenize import tokenize_texts
import logging
from itertools import combinations
logger = logging.getLogger(__name__)
def process_texts(
text_data: Dict[str, str],
filenames: List[str],
enable_semantic: bool = True,
model_name: str = "buddhist-nlp/buddhist-sentence-similarity",
use_stopwords: bool = True,
use_lite_stopwords: bool = False,
progress_callback=None
) -> Tuple[pd.DataFrame, pd.DataFrame, str]:
"""
Processes uploaded texts, segments them by the chapter marker, and computes similarity metrics between corresponding chapters of each file pair.
Args:
text_data (Dict[str, str]): A dictionary mapping filenames to their content.
filenames (List[str]): A list of filenames that were uploaded.
enable_semantic (bool, optional): Whether to compute semantic similarity metrics.
Requires loading a sentence transformer model, which can be time-consuming. Defaults to True.
model_name (str, optional): The name of the sentence transformer model to use for semantic similarity.
Must be a valid model identifier on Hugging Face. Defaults to "buddhist-nlp/buddhist-sentence-similarity".
use_stopwords (bool, optional): Whether to use stopwords in the metrics calculation. Defaults to True.
use_lite_stopwords (bool, optional): Whether to use the lite stopwords list (common particles only)
instead of the comprehensive list. Only applies if use_stopwords is True. Defaults to False.
progress_callback (callable, optional): A callback function for reporting progress updates.
Should accept a float between 0 and 1 and a description string. Defaults to None.
Returns:
Tuple[pd.DataFrame, pd.DataFrame, str]:
- metrics_df: DataFrame with similarity metrics between corresponding chapters of file pairs.
Contains columns: 'Text Pair', 'Chapter', 'Jaccard Similarity (%)', 'Normalized LCS',
'Semantic Similarity' (if enable_semantic=True), and 'TF-IDF Cosine Sim'.
- word_counts_df: DataFrame with word counts for each segment (chapter) in each file.
Contains columns: 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
- warning: A string containing any warnings generated during processing (e.g., missing chapter markers).
Raises:
RuntimeError: If the botok tokenizer fails to initialize.
ValueError: If the input files cannot be processed or if metrics computation fails.
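Example:
Illustrative call (hypothetical filenames and placeholder contents; semantic
similarity is disabled here so no model needs to be downloaded):
>>> texts = {
...     "witness_a.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་རྒྱལ་པོ།",
...     "witness_b.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ སངས་རྒྱས་ཀྱི་བསྟན་པ།",
... }
>>> metrics_df, word_counts_df, warning = process_texts(
...     texts, list(texts.keys()), enable_semantic=False
... )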
"""
# Initialize model, device, and model-type defaults
st_model, st_device, model_type = None, None, "sentence_transformer"
model_warning = ""
# Update progress if callback provided
if progress_callback is not None:
try:
progress_callback(0.25, desc="Preparing for text analysis...")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
# Continue processing even if progress reporting fails
# Load semantic model if enabled
if enable_semantic:
logger.info("Semantic similarity enabled. Loading embedding model...")
try:
logger.info("Using model: %s", model_name)
# Check if this is a FastText model request
if model_name == FASTTEXT_MODEL_ID:
# Try to load the official Facebook FastText Tibetan model first
if progress_callback is not None:
try:
progress_callback(0.25, desc="Loading official Facebook FastText Tibetan model...")
except Exception as e:
logger.warning("Progress callback error (non-critical): %s", str(e))
st_model, st_device, model_type = get_model_and_device(model_id=model_name)
# If model is None, we need to train a fallback model
if st_model is None:
if progress_callback is not None:
try:
progress_callback(0.25, desc="Official model unavailable. Training fallback FastText model...")
except Exception as e:
logger.warning("Progress callback error (non-critical): %s", str(e))
# Collect all text data for training
all_texts = list(text_data.values())
# Train the model with standard parameters for stability
st_model = train_fasttext_model(all_texts, dim=100, epoch=5)
if progress_callback is not None:
try:
progress_callback(0.3, desc="Fallback FastText model trained successfully")
except Exception as e:
logger.warning("Progress callback error (non-critical): %s", str(e))
else:
if progress_callback is not None:
try:
progress_callback(0.3, desc="Official Facebook FastText Tibetan model loaded successfully")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
else:
# For sentence transformers
st_model, st_device, model_type = get_model_and_device(model_id=model_name)
logger.info(f"Model {model_name} loaded successfully on {st_device}.")
if progress_callback is not None:
try:
progress_callback(0.3, desc="Model loaded successfully")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
except Exception as e:
error_msg = str(e)
logger.error(f"Failed to load sentence transformer model: {error_msg}. Semantic similarity will not be available.")
# Create a user-friendly warning message
if "is not a valid model identifier" in error_msg:
model_warning = f"The model '{model_name}' could not be found on Hugging Face. Semantic similarity will not be available."
elif "CUDA out of memory" in error_msg:
model_warning = "Not enough GPU memory to load the semantic model. Try using a smaller model or disable semantic similarity."
else:
model_warning = f"Failed to load semantic model: {error_msg}. Semantic similarity will not be available."
if progress_callback is not None:
try:
progress_callback(0.3, desc="Continuing without semantic model")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
else:
logger.info("Semantic similarity disabled. Skipping model loading.")
if progress_callback is not None:
try:
progress_callback(0.3, desc="Processing text segments")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
# Detect chapter marker and segment texts
if progress_callback is not None:
try:
progress_callback(0.35, desc="Segmenting texts by chapters...")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
chapter_marker = "༈"
fallback = False
segment_texts = {}
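# Maps segment IDs of the form "<filename>|chapter <n>" to the segment text, e.g.
# {"witness_a.txt|chapter 1": "...", "witness_a.txt|chapter 2": "..."}
# (filenames shown are illustrative).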
# Process each file
for i, fname in enumerate(filenames):
if progress_callback is not None and len(filenames) > 1:
try:
progress_callback(0.35 + (0.05 * (i / len(filenames))),
desc=f"Segmenting file {i+1}/{len(filenames)}: {fname}")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
content = text_data[fname]
# Check if content is empty
if not content.strip():
logger.warning(f"File '{fname}' is empty or contains only whitespace.")
continue
# Split by chapter marker if present
if chapter_marker in content:
segments = [
seg.strip() for seg in content.split(chapter_marker) if seg.strip()
]
# Check if we have valid segments after splitting
if not segments:
logger.warning(f"File '{fname}' contains chapter markers but no valid text segments.")
continue
for idx, seg in enumerate(segments):
seg_id = f"{fname}|chapter {idx+1}"
segment_texts[seg_id] = seg
else:
# No chapter markers found, treat entire file as one segment
seg_id = f"{fname}|chapter 1"
segment_texts[seg_id] = content.strip()
fallback = True
# Generate warning if no chapter markers found
warning = model_warning # Include any model warnings
if fallback:
chapter_warning = (
"No chapter marker found in one or more files. "
"Each file will be treated as a single segment. "
"For best results, add a unique marker (e.g., ༈) to separate chapters or sections."
)
warning = warning + " " + chapter_warning if warning else chapter_warning
# Check if we have any valid segments
if not segment_texts:
logger.error("No valid text segments found in any of the uploaded files.")
return pd.DataFrame(), pd.DataFrame(), "No valid text segments found in the uploaded files. Please check your files and try again."
# Group chapters by filename (preserving order)
if progress_callback is not None:
try:
progress_callback(0.4, desc="Organizing text segments...")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
file_to_chapters = {}
for seg_id in segment_texts:
fname = seg_id.split("|")[0]
file_to_chapters.setdefault(fname, []).append(seg_id)
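# Resulting shape (illustrative filenames):
# {"witness_a.txt": ["witness_a.txt|chapter 1", "witness_a.txt|chapter 2"],
#  "witness_b.txt": ["witness_b.txt|chapter 1", ...]}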
# For each pair of files, compare corresponding chapters (by index)
if progress_callback is not None:
try:
progress_callback(0.45, desc="Computing similarity metrics...")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
results = []
files = list(file_to_chapters.keys())
# Check if we have at least two files to compare
if len(files) < 2:
logger.warning("Need at least two files to compute similarity metrics.")
return pd.DataFrame(), pd.DataFrame(), "Need at least two files to compute similarity metrics."
# Track total number of comparisons for progress reporting
total_comparisons = 0
for file1, file2 in combinations(files, 2):
chaps1 = file_to_chapters[file1]
chaps2 = file_to_chapters[file2]
total_comparisons += min(len(chaps1), len(chaps2))
# Process each file pair
comparison_count = 0
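# The comparison stage spans roughly 0.45-0.70 of the progress bar; each chapter-pair
# comparison advances it by 0.25 / total_comparisons.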
for file1, file2 in combinations(files, 2):
chaps1 = file_to_chapters[file1]
chaps2 = file_to_chapters[file2]
min_chaps = min(len(chaps1), len(chaps2))
if progress_callback is not None:
try:
progress_callback(0.45, desc=f"Comparing {file1} with {file2}...")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
for idx in range(min_chaps):
seg1 = chaps1[idx]
seg2 = chaps2[idx]
# Update progress
comparison_count += 1
if progress_callback is not None and total_comparisons > 0:
try:
progress_percentage = 0.45 + (0.25 * (comparison_count / total_comparisons))
progress_callback(progress_percentage,
desc=f"Computing metrics for chapter {idx+1} ({comparison_count}/{total_comparisons})")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
try:
# Compute metrics for this chapter pair
pair_metrics = compute_all_metrics(
{seg1: segment_texts[seg1], seg2: segment_texts[seg2]},
model=st_model,
device=st_device,
enable_semantic=enable_semantic,
model_type=model_type,
use_stopwords=use_stopwords,
use_lite_stopwords=use_lite_stopwords
)
# Label the pair with both filenames and record the chapter number
pair_metrics.loc[:, "Text Pair"] = f"{file1} vs {file2}"
pair_metrics.loc[:, "Chapter"] = idx + 1
results.append(pair_metrics)
except Exception as e:
logger.error(f"Error computing metrics for {seg1} vs {seg2}: {e}")
# Continue with other comparisons instead of failing completely
continue
# Create the metrics DataFrame
if results:
metrics_df = pd.concat(results, ignore_index=True)
else:
metrics_df = pd.DataFrame()
warning = (warning + " " if warning else "") + "No valid metrics could be computed. Please check your files and try again."
# Calculate word counts
if progress_callback is not None:
try:
progress_callback(0.75, desc="Calculating word counts...")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
word_counts_data = []
# Process each segment
for i, (seg_id, text_content) in enumerate(segment_texts.items()):
# Update progress
if progress_callback is not None and len(segment_texts) > 0:
try:
progress_percentage = 0.75 + (0.15 * (i / len(segment_texts)))
progress_callback(progress_percentage, desc=f"Counting words in segment {i+1}/{len(segment_texts)}")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
fname, chapter_info = seg_id.split("|", 1)
chapter_num = int(chapter_info.replace("chapter ", ""))
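# e.g. "witness_a.txt|chapter 3" -> fname="witness_a.txt", chapter_num=3 (illustrative filename)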
try:
# Use botok for an accurate word count of the raw Tibetan text
tokenized_segments = tokenize_texts([text_content]) # Returns a list of lists
if tokenized_segments and tokenized_segments[0]:
word_count = len(tokenized_segments[0])
else:
word_count = 0
word_counts_data.append(
{
"Filename": fname.replace(".txt", ""),
"ChapterNumber": chapter_num,
"SegmentID": seg_id,
"WordCount": word_count,
}
)
except Exception as e:
logger.error(f"Error calculating word count for segment {seg_id}: {e}")
# Add entry with 0 word count to maintain consistency
word_counts_data.append(
{
"Filename": fname.replace(".txt", ""),
"ChapterNumber": chapter_num,
"SegmentID": seg_id,
"WordCount": 0,
}
)
# Create and sort the word counts DataFrame
word_counts_df = pd.DataFrame(word_counts_data)
if not word_counts_df.empty:
word_counts_df = word_counts_df.sort_values(
by=["Filename", "ChapterNumber"]
).reset_index(drop=True)
if progress_callback is not None:
try:
progress_callback(0.95, desc="Analysis complete!")
except Exception as e:
logger.warning(f"Progress callback error (non-critical): {e}")
return metrics_df, word_counts_df, warning
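

# Minimal smoke-test sketch (an assumption, not part of the original pipeline): runs the
# full comparison on two small in-memory texts with semantic similarity disabled, so no
# model is downloaded. Filenames and contents are illustrative placeholders; botok must
# still be installed for tokenization.
if __name__ == "__main__":
    sample_texts = {
        "witness_a.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ ཆོས་ཀྱི་རྒྱལ་པོ།",
        "witness_b.txt": "བཀྲ་ཤིས་བདེ་ལེགས། ༈ སངས་རྒྱས་ཀྱི་བསྟན་པ།",
    }
    metrics, counts, warn = process_texts(
        sample_texts,
        list(sample_texts.keys()),
        enable_semantic=False,
    )
    print(metrics)
    print(counts)
    if warn:
        print(f"Warnings: {warn}")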