Spaces:

daniel-wojahn
/

ttm-webapp-hf

Running

App Files Files Community

ttm-webapp-hf / pipeline /visualize.py

daniel-wojahn

Refactoring of the tokenization pipeline, adjusted fasttext implementation

3011301 verified 17 days ago

raw

history blame contribute delete

6.7 kB

	import plotly.graph_objects as go
	import pandas as pd
	import plotly.express as px # For color palettes
	import numpy as np # Ensure numpy is imported, in case pivot_table uses it for aggfunc


	def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: "dict | None" = None):
	    """
	    Generate heatmap visualizations for all metrics.

	    A column is treated as a metric when it is not 'Text Pair' or 'Chapter'
	    and its dtype is not object, or when its name contains
	    'Pattern Similarity' (whatever its dtype). Each metric is pivoted into a
	    Chapter x Text Pair grid and drawn as one heatmap.

	    Args:
	        metrics_df: DataFrame with similarity metrics (segment-level). Must
	            contain 'Chapter' and 'Text Pair' columns for any metric that
	            has plottable values.
	        descriptive_titles: optional {metric_name: plot title} mapping;
	            metrics missing from it use the column name as title.
	    Returns:
	        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
	            value is None when a metric has no numeric, non-NaN values.
	    Raises:
	        ValueError: raised by DataFrame.pivot when a (Chapter, Text Pair)
	            combination occurs more than once.
	    """
	    key_cols = ("Text Pair", "Chapter")

	    # Non-object columns first, then any 'Pattern Similarity' column that was
	    # skipped because of its dtype (this keeps the original ordering).
	    metric_cols = [
	        col
	        for col in metrics_df.columns
	        if col not in key_cols and metrics_df[col].dtype != object
	    ]
	    metric_cols += [
	        col
	        for col in metrics_df.columns
	        if "Pattern Similarity" in str(col) and col not in metric_cols
	    ]

	    heatmaps = {}
	    for metric in metric_cols:
	        # Object-dtype metrics may hold strings/None; force them to floats so
	        # formatting and nanmin/nanmax below cannot fail. Unparseable entries
	        # become NaN and are rendered as empty cells.
	        values = pd.to_numeric(metrics_df[metric], errors="coerce").astype(float)
	        if values.isnull().all():
	            heatmaps[metric] = None
	            continue

	        grid_source = pd.DataFrame(
	            {
	                "Chapter": metrics_df["Chapter"],
	                "Text Pair": metrics_df["Text Pair"],
	                "value": values,
	            }
	        )
	        pivot = grid_source.pivot(index="Chapter", columns="Text Pair", values="value")
	        # Ascending rows + the reversed Y axis set below put the first chapter
	        # at the top. With numeric chapters the row order is irrelevant (the
	        # axis is positioned by value).
	        # NOTE(review): for non-numeric chapter labels this relies on plotly
	        # ordering categories by first appearance - confirm.
	        pivot = pivot.sort_index()

	        # Pivoting can still yield nothing plottable (e.g. missing keys).
	        if pivot.empty or pivot.isnull().all().all():
	            heatmaps[metric] = None
	            continue

	        z_values = pivot.to_numpy()
	        column_labels = [str(col).replace(".txt", "") for col in pivot.columns]

	        # Cell annotations: two decimals, blank for missing values.
	        cell_text = [
	            [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
	            for row in z_values
	        ]

	        # Same 'Reds' scale and direction for every metric:
	        # darker red = higher value (more similarity).
	        fig = go.Figure(
	            data=go.Heatmap(
	                z=z_values,
	                x=column_labels,
	                y=pivot.index,
	                colorscale="Reds",
	                reversescale=False,
	                zmin=float(np.nanmin(z_values)),
	                zmax=float(np.nanmax(z_values)),
	                text=cell_text,
	                texttemplate="%{text}",
	                hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
	                colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
	            )
	        )

	        plot_title = (
	            descriptive_titles.get(metric, metric) if descriptive_titles else metric
	        )
	        fig.update_layout(
	            title=plot_title,
	            xaxis_title="Text Pair",
	            yaxis_title="Chapter",
	            autosize=False,
	            width=1350,
	            height=1200,
	            font=dict(size=16),
	            margin=dict(l=140, b=80, t=60),
	        )
	        fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
	        fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")

	        # On a numeric axis plotly would pick its own tick spacing; force one
	        # tick per chapter present in the data.
	        if pd.api.types.is_numeric_dtype(pivot.index):
	            chapters = list(pivot.index)
	            fig.update_yaxes(
	                tickmode="array",
	                tickvals=chapters,
	                ticktext=[str(ch) for ch in chapters],
	            )

	        heatmaps[metric] = fig

	    return heatmaps


	def generate_word_count_chart(word_counts_df: pd.DataFrame):
	    """
	    Generates a grouped bar chart of word counts per segment (file/chapter).

	    One bar trace is added per distinct 'Filename' (sorted), with the
	    chapter number on the X axis and the word count on the Y axis.

	    Args:
	        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
	            Only 'Filename', 'ChapterNumber' and 'WordCount' are read here.
	    Returns:
	        plotly Figure for the bar chart, or None if input is None or empty.
	    """
	    if word_counts_df is None or word_counts_df.empty:
	        return None

	    fig = go.Figure()

	    # One colour per file, cycling through Plotly's default qualitative palette.
	    unique_files = sorted(word_counts_df["Filename"].unique())
	    colors = px.colors.qualitative.Plotly

	    for i, filename in enumerate(unique_files):
	        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
	            "ChapterNumber"
	        )
	        fig.add_trace(
	            go.Bar(
	                x=file_df["ChapterNumber"],
	                y=file_df["WordCount"],
	                name=filename,
	                marker_color=colors[i % len(colors)],
	                text=file_df["WordCount"],
	                textposition="auto",
	                customdata=file_df[["Filename"]],  # Filename for the hovertemplate
	                hovertemplate="<b>File</b>: %{customdata[0]}<br>"
	                + "<b>Chapter</b>: %{x}<br>"
	                + "<b>Word Count</b>: %{y}<extra></extra>",
	            )
	        )

	    # All chapter numbers present in any file, in ascending order.
	    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique().tolist())

	    fig.update_layout(
	        title_text="Word Counts per Chapter (Grouped by File)",
	        xaxis_title="Chapter Number",
	        yaxis_title="Word Count",
	        barmode="group",
	        font=dict(size=14),
	        legend_title_text="Filename",
	        xaxis=dict(
	            type="category",  # Treat chapter numbers as categories
	            # A category axis is otherwise ordered by first appearance across
	            # traces, which scrambles chapters when files cover different
	            # chapter sets; pin the order explicitly.
	            categoryorder="array",
	            categoryarray=all_chapter_numbers,
	            automargin=True,  # Automatically adjust margin for x-axis labels/title
	        ),
	        yaxis=dict(
	            rangemode="tozero",  # Ensure y-axis starts at 0 and includes max value
	            automargin=True,  # Automatically adjust margin for y-axis labels/title
	        ),
	        autosize=True,  # Keep for responsiveness in Gradio
	        margin=dict(l=80, r=50, b=100, t=50, pad=4),
	    )
	    # Ensure x-axis ticks are shown for all chapter numbers present
	    fig.update_xaxes(
	        tickmode="array",
	        tickvals=all_chapter_numbers,
	        ticktext=[str(ch) for ch in all_chapter_numbers],
	    )

	    return fig