Spaces:
Running
Running
import plotly.graph_objects as go | |
import pandas as pd | |
import plotly.express as px # For color palettes | |
import numpy as np # Ensure numpy is imported, in case pivot_table uses it for aggfunc | |
def _select_metric_columns(metrics_df: pd.DataFrame) -> list:
    """
    Return the names of the columns to plot, in plotting order.
    Numeric columns (other than the 'Text Pair' / 'Chapter' identifiers) come
    first, in DataFrame order. Columns whose name contains "Pattern Similarity"
    are appended afterwards even when stored with a non-numeric dtype.
    """
    id_columns = ("Text Pair", "Chapter")
    # Use a real numeric check: comparing the dtype against `object` also lets
    # string-extension, datetime and categorical columns through, and those
    # cannot be formatted or colour-scaled as numbers.
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in id_columns and pd.api.types.is_numeric_dtype(metrics_df[col])
    ]
    for col in metrics_df.columns:
        # str() keeps the substring test safe for non-string column labels.
        if "Pattern Similarity" in str(col) and col not in metric_cols:
            metric_cols.append(col)
    return metric_cols


def _build_heatmap(pivot: pd.DataFrame, metric, plot_title):
    """
    Build one plotly heatmap Figure from a Chapter x Text Pair pivot table.
    """
    z_values = pivot.to_numpy(dtype=float)
    # Per-cell labels: two decimals, blank for missing values.
    cell_text = [
        ["" if np.isnan(val) else f"{val:.2f}" for val in row] for row in z_values
    ]
    # Strip the ".txt" suffix from the column labels shown on the x-axis.
    x_labels = [str(col).replace(".txt", "") for col in pivot.columns]

    fig = go.Figure(
        data=go.Heatmap(
            z=z_values,
            x=x_labels,
            y=pivot.index,
            # Same colour scale and direction for every metric.
            colorscale="Reds",
            reversescale=False,
            zmin=float(np.nanmin(z_values)),
            zmax=float(np.nanmax(z_values)),
            text=cell_text,
            texttemplate="%{text}",
            hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
            colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
        )
    )
    fig.update_layout(
        title=plot_title,
        xaxis_title="Text Pair",
        yaxis_title="Chapter",
        autosize=False,
        width=1350,
        height=1200,
        font=dict(size=16),
        margin=dict(l=140, b=80, t=60),
    )
    fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
    # NOTE(review): the pivot rows are sorted in descending chapter order AND
    # the axis is drawn reversed (both kept from the previous implementation).
    # For a non-numeric Chapter index these two inversions may cancel out;
    # confirm the intended top-to-bottom order.
    fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
    # Show a tick for every chapter value when the chapter index is numeric.
    if pd.api.types.is_numeric_dtype(pivot.index):
        fig.update_yaxes(
            tickmode="array",
            tickvals=pivot.index,
            ticktext=[str(i) for i in pivot.index],
        )
    return fig


def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None):
    """
    Generate heatmap visualizations for all metrics.
    Args:
        metrics_df: DataFrame with similarity metrics (segment-level). Must
            contain 'Text Pair' and 'Chapter' columns; every other numeric
            column, plus any column whose name contains "Pattern Similarity",
            is treated as a metric.
        descriptive_titles: optional mapping {metric_name: plot title}. A
            metric missing from the mapping uses its own name as the title.
    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None when the metric has no plottable (non-null, numeric)
            values.
    Raises:
        KeyError: if 'Chapter' or 'Text Pair' is missing and a metric has
            values to plot.
        ValueError: raised by DataFrame.pivot when the same
            (Chapter, Text Pair) combination occurs more than once.
    """
    heatmaps = {}
    for metric in _select_metric_columns(metrics_df):
        # Coerce to float so object-typed metric columns (None entries or
        # placeholder strings) become NaN instead of breaking the number
        # formatting and the colour-range computation.
        numeric_values = pd.to_numeric(metrics_df[metric], errors="coerce").astype(
            float
        )
        if numeric_values.isnull().all():
            heatmaps[metric] = None
            continue

        frame = metrics_df[["Chapter", "Text Pair"]].copy()
        frame[metric] = numeric_values
        pivot = frame.pivot(index="Chapter", columns="Text Pair", values=metric)
        pivot = pivot.sort_index(ascending=False)
        # Nothing to draw if the pivot came out empty or entirely NaN.
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )
        heatmaps[metric] = _build_heatmap(pivot, metric, plot_title)
    return heatmaps
def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generates a grouped bar chart for word counts per segment (file/chapter).
    One bar trace is added per distinct 'Filename', with bars placed by
    'ChapterNumber' and sized by 'WordCount'.
    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID', 'WordCount'.
    Returns:
        plotly Figure for the bar chart, or None if input is None or empty.
    """
    if word_counts_df is None or word_counts_df.empty:
        return None

    # Sorted list of every chapter present. It fixes both the category order
    # and the tick labels of the x-axis.
    all_chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique().tolist())

    fig = go.Figure()
    # Assign colors based on Filename, cycling through Plotly's default
    # qualitative palette when there are more files than colors.
    colors = px.colors.qualitative.Plotly
    for i, filename in enumerate(sorted(word_counts_df["Filename"].unique())):
        file_df = word_counts_df[word_counts_df["Filename"] == filename].sort_values(
            "ChapterNumber"
        )
        fig.add_trace(
            go.Bar(
                x=file_df["ChapterNumber"],
                y=file_df["WordCount"],
                name=filename,
                marker_color=colors[i % len(colors)],
                text=file_df["WordCount"],
                textposition="auto",
                customdata=file_df[["Filename"]],  # Pass Filename for hovertemplate
                hovertemplate="<b>File</b>: %{customdata[0]}<br>"
                + "<b>Chapter</b>: %{x}<br>"
                + "<b>Word Count</b>: %{y}<extra></extra>",
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            # Pin the category order explicitly. The default order is the
            # order of first appearance across traces, which puts chapters
            # out of sequence when files cover different chapter sets.
            categoryorder="array",
            categoryarray=all_chapter_numbers,
            automargin=True,  # Automatically adjust margin for x-axis labels/title
        ),
        yaxis=dict(
            rangemode="tozero",  # Ensure y-axis starts at 0 and includes max value
            automargin=True,  # Automatically adjust margin for y-axis labels/title
        ),
        autosize=True,  # Keep for responsiveness in Gradio
        margin=dict(l=80, r=50, b=100, t=50, pad=4),
    )
    # Ensure x-axis ticks are shown for all chapter numbers present
    fig.update_xaxes(
        tickmode="array",
        tickvals=all_chapter_numbers,
        ticktext=[str(ch) for ch in all_chapter_numbers],
    )
    return fig