Spaces:
Running
Running
File size: 6,704 Bytes
4bf5701 b4c92f5 4bf5701 b4c92f5 4bf5701 b4c92f5 4bf5701 b4c92f5 4bf5701 3011301 4bf5701 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 |
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px # For color palettes
import numpy as np # Ensure numpy is imported, in case pivot_table uses it for aggfunc
def _select_metric_columns(metrics_df: pd.DataFrame) -> list:
    """Return the names of the columns to plot, in plotting order.

    Every column other than 'Text Pair' and 'Chapter' whose dtype is not
    ``object`` comes first (in DataFrame order); any column whose name contains
    'Pattern Similarity' and was not already selected is appended afterwards.
    """
    metric_cols = [
        col
        for col in metrics_df.columns
        if col not in ("Text Pair", "Chapter") and metrics_df[col].dtype != object
    ]
    for col in metrics_df.columns:
        if "Pattern Similarity" in col and col not in metric_cols:
            metric_cols.append(col)
    return metric_cols


def _build_heatmap_figure(pivot: pd.DataFrame, metric, title):
    """Render a Chapter x Text Pair pivot of numeric values as a plotly heatmap."""
    z_values = pivot.values
    # Cell annotations: two decimals, blank for missing chapter/pair combinations.
    cell_text = [
        [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
        for row in z_values
    ]
    fig = go.Figure(
        data=go.Heatmap(
            z=z_values,
            x=[col.replace(".txt", "") for col in pivot.columns],
            y=pivot.index,
            # One colour scale and one direction for every metric, so that a
            # darker red always means a higher value.
            colorscale="Reds",
            reversescale=False,
            zmin=float(np.nanmin(z_values)),
            zmax=float(np.nanmax(z_values)),
            text=cell_text,
            texttemplate="%{text}",
            hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}<extra></extra>",
            colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
        )
    )
    fig.update_layout(
        title=title,
        xaxis_title="Text Pair",
        yaxis_title="Chapter",
        autosize=False,
        width=1350,
        height=1200,
        font=dict(size=16),
        margin=dict(l=140, b=80, t=60),
    )
    fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
    # The reversed axis is what puts the first chapter at the top of the plot.
    fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")
    # On a numeric axis plotly would choose its own tick spacing; force one
    # tick per chapter present in the data.
    if pd.api.types.is_numeric_dtype(pivot.index):
        fig.update_yaxes(
            tickmode="array",
            tickvals=pivot.index,
            ticktext=[str(i) for i in pivot.index],
        )
    return fig


def generate_visualizations(metrics_df: pd.DataFrame, descriptive_titles: dict = None):
    """
    Generate heatmap visualizations for all metrics.

    Args:
        metrics_df: DataFrame with similarity metrics (segment-level). Needs
            'Text Pair' and 'Chapter' columns as soon as at least one metric
            has a non-missing value; the remaining columns are treated as
            metrics (see ``_select_metric_columns``).
        descriptive_titles: Optional mapping {metric_name: plot title}. Metrics
            without an entry (or a missing/empty mapping) use the metric name
            as the title.
    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None for a metric that has no numeric data to plot.
    Raises:
        ValueError: propagated from ``DataFrame.pivot`` when the same
            (Chapter, Text Pair) combination occurs more than once.
    """
    heatmaps = {}
    for metric in _select_metric_columns(metrics_df):
        values = metrics_df[metric]
        coerced = values.dtype == object
        if coerced:
            # 'Pattern Similarity' columns are plotted even when stored as
            # object dtype; anything non-numeric in them becomes NaN so the
            # number formatting and min/max below cannot fail on it.
            values = pd.to_numeric(values, errors="coerce")

        if values.isnull().all():
            heatmaps[metric] = None
            continue

        # assign() returns a new frame, so the caller's DataFrame is not modified.
        source_df = metrics_df.assign(**{metric: values}) if coerced else metrics_df
        pivot = source_df.pivot(index="Chapter", columns="Text Pair", values=metric)
        # Ascending order + the reversed y-axis set in _build_heatmap_figure
        # puts the first chapter at the top for numeric and for categorical
        # chapter labels alike (a descending sort would cancel the axis
        # reversal on a categorical axis).
        pivot = pivot.sort_index()

        # Nothing to draw if the pivot ended up empty or entirely NaN.
        if pivot.empty or pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        plot_title = (
            descriptive_titles.get(metric, metric) if descriptive_titles else metric
        )
        heatmaps[metric] = _build_heatmap_figure(pivot, metric, plot_title)
    return heatmaps
def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Build a grouped bar chart of word counts per chapter, one bar series per file.

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID',
            'WordCount'. Only 'Filename', 'ChapterNumber' and 'WordCount' are
            read here.
    Returns:
        plotly Figure with one Bar trace per distinct filename, or None if the
        input DataFrame is empty.
    """
    # Guard clause: nothing to plot.
    if word_counts_df.empty:
        return None

    chart = go.Figure()

    # One trace per file, coloured from plotly's default qualitative palette
    # (wrapping around when there are more files than colours).
    filenames = sorted(word_counts_df["Filename"].unique())
    palette = px.colors.qualitative.Plotly
    hover_text = (
        "<b>File</b>: %{customdata[0]}<br>"
        "<b>Chapter</b>: %{x}<br>"
        "<b>Word Count</b>: %{y}<extra></extra>"
    )

    for position, fname in enumerate(filenames):
        belongs_to_file = word_counts_df["Filename"] == fname
        rows = word_counts_df[belongs_to_file].sort_values("ChapterNumber")
        bar = go.Bar(
            x=rows["ChapterNumber"],
            y=rows["WordCount"],
            name=fname,
            marker_color=palette[position % len(palette)],
            text=rows["WordCount"],
            textposition="auto",
            # The filename travels as customdata so the hover template can show it.
            customdata=rows[["Filename"]],
            hovertemplate=hover_text,
        )
        chart.add_trace(bar)

    chart.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font={"size": 14},
        legend_title_text="Filename",
        # Chapter numbers are treated as categories, not a continuous range.
        xaxis={"type": "category", "automargin": True},
        # The y-axis always starts at zero.
        yaxis={"rangemode": "tozero", "automargin": True},
        autosize=True,
        margin={"l": 80, "r": 50, "b": 100, "t": 50, "pad": 4},
    )

    # Show a tick for every chapter number that occurs in the data.
    chapters = sorted(word_counts_df["ChapterNumber"].unique())
    chart.update_xaxes(
        tickmode="array",
        tickvals=chapters,
        ticktext=[str(chapter) for chapter in chapters],
    )
    return chart
|