from typing import Optional

import numpy as np  # Used for NaN-aware min/max of the heatmap values
import pandas as pd
import plotly.express as px  # For color palettes
import plotly.graph_objects as go

# Columns that identify a row of the metrics table rather than carry a metric.
_ID_COLUMNS = ("Text Pair", "Chapter")


def _metric_columns(metrics_df: pd.DataFrame) -> list:
    """Return the columns of *metrics_df* that should each get a heatmap.

    Every non-object column other than the identifier columns is a metric.
    Columns whose name contains "Pattern Similarity" are included even when
    pandas stored them with object dtype.
    """
    selected = [
        col
        for col in metrics_df.columns
        if col not in _ID_COLUMNS and metrics_df[col].dtype != object
    ]
    for col in metrics_df.columns:
        # Column labels are not guaranteed to be strings; only strings can
        # match the "Pattern Similarity" naming rule.
        if (
            isinstance(col, str)
            and "Pattern Similarity" in col
            and col not in selected
        ):
            selected.append(col)
    return selected


def _build_heatmap(pivot: pd.DataFrame, metric, title) -> go.Figure:
    """Build one annotated heatmap from a numeric Chapter x Text Pair table.

    Args:
        pivot: Numeric table indexed by chapter with one column per text
            pair. Must contain at least one non-NaN value.
        metric: Metric name, used as the colorbar title.
        title: Title shown above the plot.

    Returns:
        The configured plotly Figure.
    """
    values = pivot.to_numpy(dtype=float)
    pair_labels = [str(col).replace(".txt", "") for col in pivot.columns]

    # Cell annotations: two decimals, blank for missing values.
    cell_text = [
        [f"{val:.2f}" if pd.notnull(val) else "" for val in row]
        for row in values
    ]

    fig = go.Figure(
        data=go.Heatmap(
            z=values,
            x=pair_labels,
            y=pivot.index,
            # One colormap and direction for every metric so that a darker
            # red always corresponds to a higher value.
            colorscale="Reds",
            reversescale=False,
            zmin=float(np.nanmin(values)),
            zmax=float(np.nanmax(values)),
            text=cell_text,
            texttemplate="%{text}",
            # Plotly hover text needs <br> for line breaks.
            hovertemplate="Chapter %{y}<br>Text Pair: %{x}<br>Value: %{z:.2f}",
            colorbar=dict(title=metric, thickness=20, tickfont=dict(size=14)),
        )
    )
    fig.update_layout(
        title=title,
        xaxis_title="Text Pair",
        yaxis_title="Chapter",
        autosize=False,
        width=1350,
        height=1200,
        font=dict(size=16),
        margin=dict(l=140, b=80, t=60),
    )
    fig.update_xaxes(tickangle=30, tickfont=dict(size=16))
    # Reversed axis: on a numeric chapter axis the lowest chapter is drawn at
    # the top of the plot.
    fig.update_yaxes(tickfont=dict(size=16), autorange="reversed")

    # On a numeric axis plotly would pick its own tick spacing; pin one tick
    # per chapter so every chapter number is labelled.
    if pd.api.types.is_numeric_dtype(pivot.index):
        fig.update_yaxes(
            tickmode="array",
            tickvals=pivot.index.tolist(),
            ticktext=[str(i) for i in pivot.index],
        )
    return fig


def generate_visualizations(
    metrics_df: pd.DataFrame, descriptive_titles: Optional[dict] = None
):
    """
    Generate heatmap visualizations for all metrics.

    Args:
        metrics_df: DataFrame with similarity metrics (segment-level). Must
            contain 'Chapter' and 'Text Pair' columns; every other numeric
            column (plus any column whose name contains "Pattern Similarity")
            is treated as a metric.
        descriptive_titles: Optional mapping of metric name to the plot title
            to use for it. Metrics without an entry use their own name.

    Returns:
        heatmaps: dict of {metric_name: plotly Figure} for each metric. The
            value is None for a metric that has no plottable (non-NaN,
            numeric) values.

    Raises:
        ValueError: Propagated from DataFrame.pivot when a
            (Chapter, Text Pair) combination occurs more than once.
    """
    titles = descriptive_titles or {}
    heatmaps = {}

    for metric in _metric_columns(metrics_df):
        # Nothing to plot when the metric was never computed.
        if metrics_df[metric].isnull().all():
            heatmaps[metric] = None
            continue

        pivot = metrics_df.pivot(index="Chapter", columns="Text Pair", values=metric)
        pivot = pivot.sort_index(ascending=False)
        if pivot.empty:
            heatmaps[metric] = None
            continue

        # Object-dtype metric columns may hold non-numeric entries; turn those
        # into NaN so formatting and min/max below cannot fail on them.
        pivot = pivot.apply(pd.to_numeric, errors="coerce")
        if pivot.isnull().all().all():
            heatmaps[metric] = None
            continue

        heatmaps[metric] = _build_heatmap(pivot, metric, titles.get(metric, metric))

    return heatmaps


def generate_word_count_chart(word_counts_df: pd.DataFrame):
    """
    Generates a bar chart for word counts per segment (file/chapter).

    Args:
        word_counts_df: DataFrame with 'Filename', 'ChapterNumber', 'SegmentID',
            'WordCount'. Only 'Filename', 'ChapterNumber' and 'WordCount' are
            read here.

    Returns:
        plotly Figure for the bar chart, or None if input is None or empty.
    """
    if word_counts_df is None or word_counts_df.empty:
        return None

    fig = go.Figure()
    palette = px.colors.qualitative.Plotly  # Default Plotly color sequence

    # One bar trace per file; colors cycle if there are more files than colors.
    for position, filename in enumerate(sorted(word_counts_df["Filename"].unique())):
        file_rows = word_counts_df[word_counts_df["Filename"] == filename]
        file_rows = file_rows.sort_values("ChapterNumber")
        fig.add_trace(
            go.Bar(
                x=file_rows["ChapterNumber"],
                y=file_rows["WordCount"],
                name=filename,
                marker_color=palette[position % len(palette)],
                text=file_rows["WordCount"],
                textposition="auto",
                customdata=file_rows[["Filename"]],  # Filename for the hover text
                # Plotly hover text needs <br> for line breaks.
                hovertemplate=(
                    "File: %{customdata[0]}<br>"
                    "Chapter: %{x}<br>"
                    "Word Count: %{y}"
                ),
            )
        )

    fig.update_layout(
        title_text="Word Counts per Chapter (Grouped by File)",
        xaxis_title="Chapter Number",
        yaxis_title="Word Count",
        barmode="group",
        font=dict(size=14),
        legend_title_text="Filename",
        xaxis=dict(
            type="category",  # Treat chapter numbers as categories
            automargin=True,  # Automatically adjust margin for x-axis labels/title
        ),
        yaxis=dict(
            rangemode="tozero",  # Ensure y-axis starts at 0
            automargin=True,  # Automatically adjust margin for y-axis labels/title
        ),
        autosize=True,
        margin=dict(l=80, r=50, b=100, t=50, pad=4),
    )

    # Ensure x-axis ticks are shown for all chapter numbers present.
    chapter_numbers = sorted(word_counts_df["ChapterNumber"].unique())
    fig.update_xaxes(
        tickmode="array",
        tickvals=chapter_numbers,
        ticktext=[str(ch) for ch in chapter_numbers],
    )

    return fig