Removed tutorial.mov to meet Hugging Face size limits
- .gitignore +7 -0
- app.py +59 -76
- visuals/layout.py +14 -16
- visuals/score_card.py +30 -63
.gitignore ADDED

@@ -0,0 +1,7 @@
+__pycache__/
+*.pyc
+.vscode/
+*.mov
+*.mp4
+.DS_Store
+.env
app.py CHANGED

@@ -1,13 +1,11 @@
 import gradio as gr
 import nltk
-import os
 import pandas as pd
-from nltk.tokenize import TreebankWordTokenizer
 from sklearn.metrics.pairwise import cosine_similarity
 from sentence_transformers import SentenceTransformer
-import graphviz
 from typing import Tuple, Optional
-
+
+from visuals.score_card import render_score_card
 from visuals.layout import (
     render_page_header,
     render_core_reference,
@@ -15,48 +13,46 @@ from visuals.layout import (
     render_pipeline_graph,
     render_pipeline_warning,
     render_strategy_alignment,
-)
+)

-#
+# Download tokenizer if not already available
 try:
     nltk.download("punkt", quiet=True)
 except Exception as e:
     print(f"Error downloading NLTK data: {e}")

-# Load
+# Load embedding model
 model = SentenceTransformer("all-MiniLM-L6-v2")

+# Global state to store uploaded DataFrame
+uploaded_df = {}
+

+# --- Core Metrics ---
 def calculate_ttr(text: str) -> float:
-    """Calculates Type-Token Ratio (TTR) for lexical diversity."""
-    if not text:
-        return 0.0
     words = text.split()
     unique_words = set(words)
     return len(unique_words) / len(words) if words else 0.0


 def calculate_similarity(text1: str, text2: str) -> float:
-    """Calculates cosine similarity between two texts."""
     embeddings = model.encode([text1, text2])
     return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]


 def calculate_mad_score(ttr: float, similarity: float) -> float:
-    """Calculates the MAD score."""
     return 0.3 * (1 - ttr) + 0.7 * similarity


 def get_risk_level(mad_score: float) -> str:
-    """Determines the risk level based on the MAD score."""
     if mad_score > 0.7:
         return "High"
     elif 0.4 <= mad_score <= 0.7:
         return "Medium"
-
-    return "Low"
+    return "Low"


+# --- Data Processing ---
 def process_data(file_obj, model_col: str, train_col: str, data_source: str) -> Tuple[
     Optional[str],
     Optional[bytes],
@@ -66,12 +62,10 @@ def process_data(file_obj, model_col: str, train_col: str, data_source: str) ->
     Optional[float],
     Optional[float],
 ]:
-    """Processes the uploaded file and calculates metrics."""
     try:
         if not file_obj:
             return "Error: No file uploaded.", None, None, None, None, None, None

-        global uploaded_df
         df = uploaded_df.get("data")
         if df is None:
             return "Error: File not yet processed.", None, None, None, None, None, None
@@ -110,25 +104,22 @@ def process_data(file_obj, model_col: str, train_col: str, data_source: str) ->
             ttr_train,
             similarity,
         )
+
     except Exception as e:
         return f"An error occurred: {str(e)}", None, None, None, None, None, None


-#
-uploaded_df = {}
-
-
+# --- Helpers ---
 def update_dropdowns(file_obj) -> Tuple[gr.Dropdown, gr.Dropdown, str]:
     global uploaded_df
     if not file_obj:
-        uploaded_df["data"] = None
+        uploaded_df["data"] = None
         return (
             gr.update(choices=[], value=None),
             gr.update(choices=[], value=None),
             "No file uploaded.",
         )

-    # Read the file and extract columns
     try:
         file_name = getattr(file_obj, "name", "")
         if file_name.endswith(".csv"):
@@ -143,12 +134,10 @@ def update_dropdowns(file_obj) -> Tuple[gr.Dropdown, gr.Dropdown, str]:
         )

         uploaded_df["data"] = df
-        columns = df.columns.tolist()
         preview = df.head().to_markdown(index=False, numalign="left", stralign="left")
-
         return (
-            gr.update(choices=columns, value=None),
-            gr.update(choices=columns, value=None),
+            gr.update(choices=df.columns.tolist(), value=None),
+            gr.update(choices=df.columns.tolist(), value=None),
             preview,
         )

@@ -161,22 +150,22 @@ def update_dropdowns(file_obj) -> Tuple[gr.Dropdown, gr.Dropdown, str]:


 def clear_all_fields():
-
-    uploaded_df.clear()  # Clear stored DataFrame
+    uploaded_df.clear()
     return (
-        None,
-        gr.update(choices=[], value=None),
-        gr.update(choices=[], value=None),
-        "",
-        "",
-        "",
-        None,
-        None,
-        None,
-        render_pipeline_graph("Synthetic Generated Data"),
+        None,
+        gr.update(choices=[], value=None),
+        gr.update(choices=[], value=None),
+        "",
+        "",
+        "",
+        None,
+        None,
+        None,
+        render_pipeline_graph("Synthetic Generated Data"),
     )


+# --- Interface ---
 def main_interface():
     css = """
     .gradio-container {
@@ -194,38 +183,39 @@ def main_interface():
     with gr.Blocks(css=css, title="MADGuard AI Explorer") as interface:
         gr.HTML(render_page_header())

+        gr.HTML(
+            """
+            <div style="text-align:center; margin-bottom: 20px;">
+            <h3>📽️ How to Use MADGuard AI Explorer</h3>
+            <iframe width="560" height="315" src="https://www.youtube.com/embed/qjMwvaBXQeY"
+            title="Tutorial Video" frameborder="0"
+            allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
+            allowfullscreen></iframe>
+            </div>
+            """
+        )
+
         gr.Markdown(
             """
-            > 🧠 **MADGuard AI Explorer** helps
+            > 🧠 **MADGuard AI Explorer** helps simulate feedback loops in RAG pipelines and detect **Model Autophagy Disorder (MAD)**.

             - Compare **real vs. synthetic input effects**
             - Visualize the data flow
             - Upload your `.csv` or `.json` data
-            - Get
+            - Get diagnostics based on lexical diversity and semantic similarity
             """
         )

         with gr.Accordion("📚 Research Reference", open=False):
             gr.HTML(render_core_reference())
-        gr.HTML(
-            """
-            <div style="display: flex; flex-direction: column; align-items: center; margin-bottom: 20px;">
-            <h3 style="text-align: center;">📽️ How to Use MADGuard AI Explorer</h3>
-            <iframe width="720" height="405"
-            src="https://www.youtube.com/embed/qjMwvaBXQeY"
-            title="MADGuard AI Tutorial" frameborder="0"
-            allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
-            allowfullscreen></iframe>
-            </div>
-            """
-        )

         gr.Markdown("## 1. Pipeline Simulation")
         data_source, description = render_pipeline(default="Synthetic Generated Data")
-
         gr.HTML(description)
+
         pipeline_output = gr.Image(type="filepath", label="Pipeline Graph")
         warning_output = gr.HTML()
+
         data_source.change(
             fn=render_pipeline_warning, inputs=data_source, outputs=warning_output
         )
@@ -233,9 +223,7 @@ def main_interface():
             fn=render_pipeline_graph, inputs=data_source, outputs=pipeline_output
         )
         interface.load(
-            fn=render_pipeline_graph,
-            inputs=[data_source],
-            outputs=[pipeline_output],
+            fn=render_pipeline_graph, inputs=[data_source], outputs=[pipeline_output]
         )

         gr.Markdown("## 2. Upload CSV or JSON File")
@@ -246,28 +234,23 @@ def main_interface():

         gr.Markdown(
             """
-
-
-
-
-            """
+            📝 **Note:**
+            - **Model Output Column**: Model-generated responses/completions.
+            - **Training Data Column**: Candidate future training input.
+            """
         )

         with gr.Row():
             model_col_input = gr.Dropdown(
-                choices=[],
-                value=None,
-                label="Select column for model output",
-                interactive=True,
+                choices=[], label="Select column for model output", interactive=True
             )
             train_col_input = gr.Dropdown(
                 choices=[],
-                value=None,
                 label="Select column for future training data",
                 interactive=True,
             )
-        file_preview = gr.Markdown(label="📄 File Preview")

+        file_preview = gr.Markdown(label="📄 File Preview")
         output_markdown = gr.Markdown(label="🔍 Evaluation Summary")

         with gr.Accordion("📋 Research-Based Strategy Alignment", open=False):
@@ -290,7 +273,7 @@ def main_interface():
         )

         def process_and_generate(
-            file_obj, model_col_val
+            file_obj, model_col_val, train_col_val, data_source_val
         ):
             error, graph, preview, markdown, ttr_out, ttr_tr, sim = process_data(
                 file_obj, model_col_val, train_col_val, data_source_val
@@ -319,6 +302,7 @@ def main_interface():
             ttr_train_metric,
             similarity_metric,
         ]
+
         clear_btn.click(
             fn=clear_all_fields,
             inputs=[],
@@ -344,19 +328,18 @@ def main_interface():
         gr.Markdown("---")
         gr.Markdown(
             """
-
-
-
-
+            **Pro version coming soon:**
+            - Bulk CSV uploads
+            - Trend visualizations
+            - One-click export of audit reports

-
-
+            [📩 Join the waitlist](https://docs.google.com/forms/d/e/1FAIpQLSfAPPC_Gm7DQElQSWGSnoB6T5hMxb_rXSu48OC8E6TNGZuKgQ/viewform?usp=sharing&ouid=118007615320536574300)
+            """
        )

     return interface


-# Launch the Gradio interface
 if __name__ == "__main__":
     interface = main_interface()
     interface.launch(server_name="0.0.0.0", server_port=7860)
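For reviewers who want to sanity-check the scoring path without pulling the branch, here is a minimal standalone sketch of the two scoring functions exactly as they land in this commit, applied to invented inputs:

# Sanity check of the MAD scoring logic; the example values are made up.
def calculate_mad_score(ttr: float, similarity: float) -> float:
    # Low lexical diversity (1 - TTR) and high semantic similarity both raise the score.
    return 0.3 * (1 - ttr) + 0.7 * similarity

def get_risk_level(mad_score: float) -> str:
    if mad_score > 0.7:
        return "High"
    elif 0.4 <= mad_score <= 0.7:
        return "Medium"
    return "Low"

# Repetitive output that closely mirrors the candidate training text:
score = calculate_mad_score(ttr=0.5, similarity=0.9)  # 0.3*0.5 + 0.7*0.9 ≈ 0.78
print(get_risk_level(score))  # High
# Diverse output with little semantic overlap:
score = calculate_mad_score(ttr=0.8, similarity=0.4)  # 0.3*0.2 + 0.7*0.4 ≈ 0.34
print(get_risk_level(score))  # Low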
visuals/layout.py CHANGED

@@ -3,11 +3,10 @@ import graphviz
 import pandas as pd
 from typing import Tuple
 import tempfile
-import os


 def render_page_header() -> str:
-    """
+    """Render the page header for the app."""
     return """
     <div style="text-align: center; margin-top: 1rem;">
         <h1 style="margin-bottom: 0.25rem;">MADGuard AI Explorer</h1>
@@ -17,21 +16,20 @@ def render_page_header() -> str:


 def render_core_reference() -> str:
-    """
+    """Render the reference to the research paper inspiring the app."""
     return """
     <details>
     <summary>📚 arXiv:2307.01850</summary>
     <p>
     <b>Self-consuming LLMs: How and When Models Feed Themselves</b> – <i>Santurkar et al., 2023</i><br>
-
+    Introduces <b>Model Autophagy Disorder (MAD)</b>—where LLMs degrade from consuming their own outputs.<br><br>

-
+    Detection strategies implemented in MADGuard include:
     - Lexical diversity analysis
-    -
-    -
-
-    <i>"MADGuard AI Explorer is inspired by key findings from this research, aligning with early warnings and pipeline hygiene practices recommended in their work."</i>
+    - Semantic similarity scoring
+    - MAD risk score warnings

+    <i>MADGuard aligns with practices recommended in this paper.</i><br>
     📎 <a href="https://arxiv.org/pdf/2307.01850" target="_blank">Read Full Paper (arXiv)</a>
     </p>
     </details>
@@ -39,20 +37,21 @@ def render_core_reference() -> str:


 def render_pipeline(default: str = "Real User Inputs") -> Tuple[gr.Radio, str]:
-    """
+    """Render the source selector for RAG simulation."""
     with gr.Row():
         source = gr.Radio(
             ["Real User Inputs", "Synthetic Generated Data"],
             label="Select input source:",
             value=default,
-            # Removed 'help' parameter to avoid TypeError with Gradio 4.44.0
         )
-    description = """
+    description = """
+    <center>ℹ️ <b>Real User Inputs</b> = human queries. <b>Synthetic Generated Data</b> = model-generated content reused in training.</center>
+    """
     return source, description


 def render_pipeline_graph(source: str) -> str:
-    """
+    """Generate and return the file path of a RAG pipeline graph visualization."""
     dot = graphviz.Digraph(
         graph_attr={"rankdir": "LR", "bgcolor": "transparent"},
         node_attr={
@@ -72,7 +71,6 @@ def render_pipeline_graph(source: str) -> str:
         "Retraining Set" if source == "Synthetic Generated Data" else "Embedding Store",
     )

-    # Save to a temporary file and return the file path
     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_file:
         output_path = tmp_file.name
     dot.render(filename=output_path, format="png", cleanup=True)
@@ -80,7 +78,7 @@ def render_pipeline_graph(source: str) -> str:


 def render_pipeline_warning(source: str) -> str:
-    """
+    """Return warning text based on selected data source."""
     if source == "Synthetic Generated Data":
         return "<div style='color:red; font-weight:bold;'>⚠️ High loop risk: Model may be learning from its own outputs.</div>"
     else:
@@ -88,7 +86,7 @@ def render_pipeline_warning(source: str) -> str:


 def render_strategy_alignment() -> str:
-    """
+    """Return an HTML table comparing MADGuard features with research strategies."""
     data = {
         "Strategy from Research": [
             "Lexical redundancy (e.g., n-gram overlap)",
visuals/score_card.py CHANGED

@@ -1,74 +1,41 @@
-import
-
+import pandas as pd
+import streamlit as st


-def
-
-
-
-
-    risk_level: str,
-) -> Tuple[str, str, str]:
-    """Renders the evaluation summary and score details."""
+def calculate_type_token_ratio(text: str) -> float:
+    """Calculate the Type-Token Ratio (TTR) for a given text."""
+    tokens = text.split()
+    unique_tokens = set(tokens)
+    return len(unique_tokens) / len(tokens) if tokens else 0

-    color = {"High": "#e57373", "Medium": "#ffb74d", "Low": "#81c784"}[risk_level]

-
-
-
-    This suggests a **strong feedback loop**, meaning the model is likely to reinforce existing patterns rather than learning new behaviors.
-    **What You Can Do**:
-    - Replace synthetic data with more **diverse real user input** - Use **paraphrasing techniques** before reuse
-    - Add **augmentation or filtering** before retraining
-    """,
-    "Medium": """
-    🟠 **Moderate Risk Identified** There is some overlap between your outputs and training content.
-    Your model may partially reinforce existing phrasing patterns.
-    **Suggestions**:
-    - Mix synthetic and real inputs carefully
-    - Monitor training logs for semantic redundancy
-    """,
-    "Low": """
-    🟢 **Low Risk Score** Your model output and training data appear **diverse** and distinct.
-    This is a good sign that your model is learning from **new and varied sources**.
-    **You’re on the right track!**
-    """,
-    }
+def generate_score_card(data: pd.DataFrame, text_col: str, mode: str) -> pd.DataFrame:
+    """
+    Generate lexical metrics and risk scores for the uploaded dataset.

-
-
+    Args:
+        data: The input DataFrame.
+        text_col: Name of the column containing text data.
+        mode: Data source type ('Real User Inputs' or 'Synthetic Generated Data').

-
-
+    Returns:
+        DataFrame with added metrics.
+    """
+    result = data.copy()

-
-
+    if text_col not in result.columns:
+        st.error(f"Selected column '{text_col}' not found in uploaded data.")
+        return pd.DataFrame()

-
-
+    result["TTR"] = result[text_col].apply(calculate_type_token_ratio)
+    result["Length"] = result[text_col].apply(lambda x: len(x.split()))

-
-
-
-
-        <div style='width: {mad_score * 100:.0f}%; background: {color}; height: 100%; border-radius: 10px;'></div>
-        </div>
-    """
+    # Risk scoring based on thresholds
+    result["Risk"] = result["TTR"].apply(
+        lambda ttr: "High" if ttr < 0.3 else "Medium" if ttr < 0.5 else "Low"
+    )

-
-
-    <summary>📊 Score Breakdown</summary>
-    TTR Component (0.3 × (1 - TTR)): {(1 - ttr_output) * 0.3:.2f}
-    Similarity Component (0.7 × Cosine): {similarity * 0.7:.2f}
-    MAD Score = 0.3 × (1 - TTR) + 0.7 × Semantic Similarity
-    </details>
-    """
+    # Add a column to show data source
+    result["Data Source"] = mode

-
-    <details>
-    <summary>🔍 What does this score mean?</summary>
-    {risk_explanations[risk_level]}
-    </details>
-    """
-
-    return summary, details, explanation
+    return result
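To illustrate how the rewritten module is meant to be called, a hypothetical usage sketch (the column name and sample rows are made up) showing how the per-row TTR thresholds classify text:

# Hypothetical call into the new score card module; sample data is invented.
import pandas as pd
from visuals.score_card import generate_score_card

df = pd.DataFrame({"response": ["spam spam spam spam spam", "the cat sat on the mat"]})
scored = generate_score_card(df, text_col="response", mode="Synthetic Generated Data")
print(scored[["TTR", "Length", "Risk", "Data Source"]])
# Row 0: TTR = 1/5 = 0.20 -> "High"  (below the 0.3 threshold)
# Row 1: TTR = 5/6 ≈ 0.83 -> "Low"   (at or above 0.5)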