import gradio as gr
import nltk
import os
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import graphviz
from typing import Tuple, Optional

from visuals.score_card import render_score_card  # Updated import
from visuals.layout import (
    render_page_header,
    render_core_reference,
    render_pipeline,
    render_pipeline_graph,
    render_pipeline_warning,
    render_strategy_alignment,
)  # Updated import

# Ensure NLTK data is downloaded
try:
    nltk.download("punkt", quiet=True)
except Exception as e:
    print(f"Error downloading NLTK data: {e}")

# Load SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")


def calculate_ttr(text: str) -> float:
    """Calculates Type-Token Ratio (TTR) for lexical diversity."""
    if not text:
        return 0.0
    words = text.split()
    unique_words = set(words)
    return len(unique_words) / len(words) if words else 0.0


def calculate_similarity(text1: str, text2: str) -> float:
    """Calculates cosine similarity between two texts."""
    embeddings = model.encode([text1, text2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]


def calculate_mad_score(ttr: float, similarity: float) -> float:
    """Calculates the MAD score: 0.3 * (1 - TTR) + 0.7 * similarity."""
    return 0.3 * (1 - ttr) + 0.7 * similarity


def get_risk_level(mad_score: float) -> str:
    """Determines the risk level based on the MAD score."""
    if mad_score > 0.7:
        return "High"
    elif 0.4 <= mad_score <= 0.7:
        return "Medium"
    else:
        return "Low"
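
# Illustration only, not part of the app's interface: a minimal sketch of how
# the TTR -> similarity -> MAD score chain behaves on toy strings. The sample
# texts and the helper name _demo_mad_pipeline are hypothetical additions;
# remove or ignore in production.
def _demo_mad_pipeline() -> None:
    """Runs the full metric chain on two toy strings and prints the result."""
    real = "the quick brown fox jumps over the lazy dog"
    synthetic = "the fox jumps the fox jumps the fox jumps"
    ttr = calculate_ttr(synthetic)  # low TTR: 3 unique tokens out of 9
    sim = calculate_similarity(real, synthetic)  # embedding cosine similarity
    score = calculate_mad_score(ttr, sim)  # 0.3 * (1 - ttr) + 0.7 * sim
    print(
        f"TTR={ttr:.2f} similarity={sim:.2f} "
        f"MAD={score:.2f} risk={get_risk_level(score)}"
    )
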
def process_data(file_obj, model_col: str, train_col: str, data_source: str) -> Tuple[
    Optional[str],
    Optional[bytes],
    Optional[str],
    Optional[str],
    Optional[float],
    Optional[float],
    Optional[float],
]:
    """Processes the uploaded file and calculates metrics."""
    try:
        if not file_obj:
            return "Error: No file uploaded.", None, None, None, None, None, None
        global uploaded_df
        df = uploaded_df.get("data")
        if df is None:
            return "Error: File not yet processed.", None, None, None, None, None, None
        if model_col not in df.columns or train_col not in df.columns:
            return (
                "Error: Selected columns not found in the file.",
                None,
                None,
                None,
                None,
                None,
                None,
            )
        output_text = " ".join(df[model_col].astype(str))
        train_text = " ".join(df[train_col].astype(str))
        ttr_output = calculate_ttr(output_text)
        ttr_train = calculate_ttr(train_text)
        similarity = calculate_similarity(output_text, train_text)
        mad_score = calculate_mad_score(ttr_output, similarity)
        risk_level = get_risk_level(mad_score)
        summary, details, explanation = render_score_card(
            ttr_output, ttr_train, similarity, mad_score, risk_level
        )
        evaluation_markdown = summary + details + explanation
        return (
            None,
            render_pipeline_graph(data_source),
            df.head().to_markdown(index=False, numalign="left", stralign="left"),
            evaluation_markdown,
            ttr_output,
            ttr_train,
            similarity,
        )
    except Exception as e:
        return f"An error occurred: {str(e)}", None, None, None, None, None, None


# Store uploaded DataFrame globally for later access
uploaded_df = {}


def update_dropdowns(file_obj) -> Tuple[gr.Dropdown, gr.Dropdown, str]:
    global uploaded_df
    if not file_obj:
        uploaded_df["data"] = None  # Clear cached file
        return (
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
            "No file uploaded.",
        )
    # Read the file and extract columns. Gradio may pass either a filepath
    # string or a tempfile-like object with a .name attribute.
    try:
        file_name = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", "")
        if file_name.endswith(".csv"):
            df = pd.read_csv(file_name)
        elif file_name.endswith(".json"):
            df = pd.read_json(file_name)
        else:
            return (
                gr.update(choices=[], value=None),
                gr.update(choices=[], value=None),
                "Invalid file type.",
            )
        uploaded_df["data"] = df
        columns = df.columns.tolist()
        preview = df.head().to_markdown(index=False, numalign="left", stralign="left")
        return (
            gr.update(choices=columns, value=None),
            gr.update(choices=columns, value=None),
            preview,
        )
    except Exception as e:
        return (
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
            f"Error reading file: {e}",
        )


def clear_all_fields():
    global uploaded_df
    uploaded_df.clear()  # Clear stored DataFrame
    return (
        None,  # file_input
        gr.update(choices=[], value=None),  # model_col_input
        gr.update(choices=[], value=None),  # train_col_input
        "",  # file_preview
        "",  # output_markdown
        "",  # warning_output
        None,  # ttr_output_metric
        None,  # ttr_train_metric
        None,  # similarity_metric
        render_pipeline_graph("Synthetic Generated Data"),  # pipeline_output default
    )


def main_interface():
    css = """
    .gradio-container {
        background: linear-gradient(-45deg, #e0f7fa, #e1f5fe, #f1f8e9, #fff3e0);
        background-size: 400% 400%;
        animation: oceanWaves 20s ease infinite;
    }
    @keyframes oceanWaves {
        0% { background-position: 0% 50%; }
        50% { background-position: 100% 50%; }
        100% { background-position: 0% 50%; }
    }
    """
    with gr.Blocks(css=css, title="MADGuard AI Explorer") as interface:
        gr.HTML(render_page_header())
        gr.Markdown(
            """
> 🧠 **MADGuard AI Explorer** helps AI engineers, researchers, and MLOps teams simulate feedback loops in RAG pipelines and detect **Model Autophagy Disorder (MAD)**, where models start learning from their own outputs, leading to degraded performance.

- Compare **real vs. synthetic input effects**
- Visualize the data flow
- Upload your `.csv` or `.json` data
- Get immediate MAD risk diagnostics based on lexical diversity and semantic similarity
"""
        )
        with gr.Accordion("📚 Research Reference", open=False):
            gr.HTML(render_core_reference())
        gr.HTML(
            """