ttm-webapp-hf / pipeline /visualize.py
daniel-wojahn's picture
Refactoring of the tokenization pipeline, adjusted fasttext implementation
3011301 verified
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px # For color palettes
import numpy as np # Ensure numpy is imported, in case pivot_table uses it for aggfunc
def _build_metric_heatmap(pivot: pd.DataFrame, metric: str, plot_title: str):
    """Render one Chapter x Text Pair pivot table as a Plotly heatmap figure.

    Args:
        pivot: Numeric table indexed by chapter with one column per text pair.
            Must contain at least one non-NaN value (needed for zmin/zmax).
        metric: Metric name; used as the colorbar title.
        plot_title: Title displayed above the plot.

    Returns:
        A plotly ``Figure`` holding a single ``Heatmap`` trace.
    """
    values = pivot.values
    # Strip the ".txt" suffix from the text-pair labels shown on the x-axis.
    column_labels = [str(col).replace(".txt", "") for col in pivot.columns]
    # Per-cell annotations; missing values are rendered as empty cells.
    cell_text = [
        [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
        for row in values
    ]

    fig = go.Figure(
        data=go.Heatmap(
            z=values,
            x=column_labels,
            y=pivot.index,
            # Same colorscale and direction for every metric.
            colorscale="Reds",
            reversescale=False,
            # Scale the colors to the observed (non-NaN) value range.
            zmin=float(np.nanmin(values)),
            zmax=float(np.nanmax(values)),
            text=cell_text,
            texttemplate="%{text}",
            hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
            colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
        )
    )
    fig.update_layout(
        title=plot_title,
        xaxis_title="Text Pair",
        yaxis_title="Chapter",
        autosize=False,
        width=1350,
        height=1200,
        font=dict(size=16),
        margin=dict(l=140, b=80, t=60),
    )
    fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
    fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
    # With a numeric chapter index, label every chapter explicitly instead of
    # relying on automatically chosen tick positions.
    if pd.api.types.is_numeric_dtype(pivot.index):
        fig.update_yaxes(
            tickmode="array",
            tickvals=pivot.index,
            ticktext=[str(i) for i in pivot.index],
        )
    return fig


def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None):
    """
    Generate heatmap visualizations for all metrics.

    Args:
        metrics_df: DataFrame with similarity metrics (segment-level). The
            'Text Pair' and 'Chapter' columns are used as the heatmap axes;
            every other non-object column, plus any column whose name contains
            "Pattern Similarity", is treated as a metric.
        descriptive_titles: Optional mapping of metric name to plot title.
            Metrics without an entry (or all metrics, when the mapping is
            empty or None) use the metric name as the title.

    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None for a metric that has no numeric data to plot.
    """
    # Identify all numeric metric columns (exclude 'Text Pair' and 'Chapter').
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in ("Text Pair", "Chapter") and metrics_df[col].dtype != object
    ]
    # "Pattern Similarity" columns are included even when stored as object dtype.
    for col in metrics_df.columns:
        if (
            isinstance(col, str)
            and "Pattern Similarity" in col
            and col not in metric_cols
        ):
            metric_cols.append(col)

    heatmaps = {}
    for metric in metric_cols:
        # Coerce to numbers so that object-dtype metric columns (strings, None)
        # cannot break the numeric formatting and min/max computations below;
        # unparseable entries become NaN.
        numeric_values = pd.to_numeric(metrics_df[metric], errors="coerce")
        if numeric_values.isnull().all():
            heatmaps[metric] = None
            continue

        frame = metrics_df[["Chapter", "Text Pair"]].copy()
        frame[metric] = numeric_values
        pivot = frame.pivot(index="Chapter", columns="Text Pair", values=metric)
        # Rows are kept in descending chapter order and the y-axis is reversed
        # when the figure is built; the intent is to show Chapter 1 at the top.
        pivot = pivot.sort_index(ascending=False)

        # Nothing plottable can still result after pivoting.
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )
        heatmaps[metric] = _build_metric_heatmap(pivot, metric, plot_title)
    return heatmaps
def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generates a grouped bar chart of word counts per segment (file/chapter).

    One bar trace is added per file, with one bar per chapter of that file.

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
            Only 'Filename', 'ChapterNumber' and 'WordCount' are read here.

    Returns:
        plotly Figure for the bar chart, or None if the input is None or empty.
    """
    if word_counts_df is None or word_counts_df.empty:
        return None

    # Every chapter number present in any file, in ascending order.
    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
    unique_files = sorted(word_counts_df["Filename"].unique())
    colors = px.colors.qualitative.Plotly  # Default Plotly color sequence

    fig = go.Figure()
    for i, filename in enumerate(unique_files):
        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=file_df["ChapterNumber"],
                y=file_df["WordCount"],
                name=filename,
                # Cycle through the palette when there are more files than colors.
                marker_color=colors[i % len(colors)],
                text=file_df["WordCount"],
                textposition="auto",
                customdata=file_df[["Filename"]],  # Pass Filename for hovertemplate
                hovertemplate="<b>File</b>: %{customdata[0]}<br>"
                + "<b>Chapter</b>: %{x}<br>"
                + "<b>Word Count</b>: %{y}<extra></extra>",
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            # Pin the category order to the sorted chapter numbers. Without
            # this, Plotly orders categories by first appearance across the
            # traces, which scrambles the axis when files do not all contain
            # the same chapters.
            categoryorder="array",
            categoryarray=all_chapter_numbers,
            automargin=True,  # Automatically adjust margin for x-axis labels/title
        ),
        yaxis=dict(
            rangemode="tozero",  # Ensure y-axis starts at 0 and includes max value
            automargin=True,  # Automatically adjust margin for y-axis labels/title
        ),
        autosize=True,  # Keep for responsiveness in Gradio
        margin=dict(l=80, r=50, b=100, t=50, pad=4),  # Base margins
    )
    # Ensure x-axis ticks are shown for all chapter numbers present
    fig.update_xaxes(
        tickmode="array",
        tickvals=all_chapter_numbers,
        ticktext=[str(ch) for ch in all_chapter_numbers],
    )
    return fig