import gradio as gr
import graphviz
import pandas as pd
from typing import Tuple
import tempfile
import os
def render_page_header() -> str:
    """Return the static page-header text for the app.

    The title and tagline are returned as one multi-line string that the
    caller renders directly.
    """
    header_text = """
MADGuard AI Explorer
Robust Diagnostic Mode for RAG Pipeline Feedback Loops
"""
    return header_text
def render_core_reference() -> str:
    """Return the research-reference section describing the MAD paper.

    Fixes mis-encoded characters from the original (mojibake `đ`/`â`):
    the paper icon is restored to an emoji and the separators to em dashes.
    """
    return """
📄 arXiv:2307.01850
Self-consuming LLMs: How and When Models Feed Themselves — Santurkar et al., 2023
This paper introduces and explores Model Autophagy Disorder (MAD) — showing that large language models trained on their own outputs tend to lose performance and accumulate error over time.
The paper proposes detection strategies that MADGuard implements, including:
- Lexical diversity analysis
- Embedding-based similarity checks
- Warnings for training loop risks
"MADGuard AI Explorer is inspired by key findings from this research, aligning with early warnings and pipeline hygiene practices recommended in their work."
📄 Read Full Paper (arXiv)
"""
def render_pipeline(default: str = "Real User Inputs") -> Tuple[gr.Radio, str]:
    """Build the input-source selector for the pipeline view.

    Args:
        default: Initial choice for the radio control; one of
            "Real User Inputs" or "Synthetic Generated Data".

    Returns:
        A tuple of (the Gradio radio component, a description string the
        caller can show alongside it).
    """
    choices = ["Real User Inputs", "Synthetic Generated Data"]
    with gr.Row():
        source = gr.Radio(
            choices,
            label="Select input source:",
            value=default,
            # Removed 'help' parameter to avoid TypeError with Gradio 4.44.0
        )
    description = """âšī¸ Real User Inputs reflect human queries. Synthetic Generated Data simulates model-generated text being reused for retraining."""
    return source, description
def render_pipeline_graph(source: str) -> str:
    """Render the RAG pipeline as a PNG and return the image file path.

    Args:
        source: The selected input source; when it equals
            "Synthetic Generated Data" the response feeds the retraining
            set (the risky feedback loop), otherwise the embedding store.

    Returns:
        Absolute path of the rendered PNG file (ends with a single ".png").

    Fixes vs. original: the temp file was created with a ".png" suffix and
    graphviz appends the format extension on render, so the returned path
    ended in ".png.png". The temp handle is now also closed before graphviz
    writes, which matters on platforms that forbid concurrent writers.
    """
    dot = graphviz.Digraph(
        graph_attr={"rankdir": "LR", "bgcolor": "transparent"},
        node_attr={
            "style": "filled",
            "fillcolor": "#fefefe",
            "color": "#888888",
            "fontname": "Helvetica",
            "fontsize": "12",
        },
        edge_attr={"color": "#999999"},
    )
    dot.edge("User Query", "Retriever")
    dot.edge("Retriever", "LLM")
    dot.edge("LLM", "Response")
    dot.edge(
        "Response",
        "Retraining Set" if source == "Synthetic Generated Data" else "Embedding Store",
    )
    # Reserve a unique base path; the file is closed before graphviz uses it.
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
        output_path = tmp_file.name
    # graphviz writes the DOT source to `output_path` and the image to
    # `output_path + ".png"`; cleanup=True removes the intermediate source.
    dot.render(filename=output_path, format="png", cleanup=True)
    return output_path + ".png"
def render_pipeline_warning(source: str) -> str:
    """Return a risk/health message for the chosen data source.

    Args:
        source: The selected input source label.

    Returns:
        A warning string when synthetic data feeds retraining (loop risk),
        otherwise a healthy-loop confirmation.

    Fixes vs. original: the two return strings were split across physical
    lines (a syntax error) and their emoji were mojibake-corrupted; they
    are reconstructed as single-line literals with ⚠️/✅ restored.
    """
    if source == "Synthetic Generated Data":
        return "⚠️ High loop risk: Model may be learning from its own outputs."
    return "✅ Healthy loop: Using diverse real inputs."
def render_strategy_alignment() -> str:
    """Return an HTML table mapping the paper's strategies to MADGuard status.

    Returns:
        An HTML ``<table>`` string with one header row and one row per
        strategy (Strategy from Research | Status in MADGuard | Explanation).

    Fixes vs. original: the status strings were split across physical lines
    (a syntax error) with mojibake emoji, reconstructed here with ✅/❌; the
    table markup had been stripped and is rebuilt as proper HTML; the unused
    ``pd.DataFrame`` intermediate was removed.
    """
    data = {
        "Strategy from Research": [
            "Lexical redundancy (e.g., n-gram overlap)",
            "Embedding-based similarity scoring",
            "Flagging high similarity for retraining risk",
            "Distinguishing real vs. synthetic data",
            "Tracking degradation over retraining iterations",
        ],
        "Status in MADGuard": [
            "✅ Implemented via TTR",
            "✅ Implemented",
            "✅ Implemented (early warning)",
            "❌ Not implemented",
            "❌ Not implemented",
        ],
        "Explanation": [
            "MADGuard uses Type-Token Ratio, a proxy for repetition.",
            "Uses SentenceTransformers + cosine similarity.",
            "Provides a risk score but doesn't block data.",
            "Does not currently track source origin.",
            "No multi-round training history/logs yet.",
        ],
    }
    html = """<table>
<tr><th>Strategy from Research</th><th>Status in MADGuard</th><th>Explanation</th></tr>
"""
    # Emit one row per strategy, columns kept in lockstep via zip.
    for strategy, status, explanation in zip(
        data["Strategy from Research"],
        data["Status in MADGuard"],
        data["Explanation"],
    ):
        html += f"<tr><td>{strategy}</td><td>{status}</td><td>{explanation}</td></tr>\n"
    html += "</table>"
    return html