Spaces:

FelixPhilip
/

DeepFundingOracle

Sleeping

App Files Files Community

FelixPhilip committed 25 days ago

Commit

bf9b592

1 Parent(s): d3e9b26

Oracle

Browse files

Files changed (2) hide show

Oracle/deepfundingoracle.py +69 -4
app.py +194 -51

Oracle/deepfundingoracle.py CHANGED Viewed

@@ -47,8 +47,6 @@ from sklearn.preprocessing import RobustScaler
 from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.preprocessing import StandardScaler
-import matplotlib.pyplot as plt
-import seaborn as sns
 from scipy.special import log1p, expm1
 from sklearn.preprocessing import RobustScaler
 from sklearn.metrics import mean_squared_error
@@ -65,9 +63,76 @@ logging.basicConfig(
         logging.StreamHandler(sys.stdout)
     ],
     level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s"
-)
 ##############################
 #  GitHub API helper: Fetch repository metrics

 from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.preprocessing import StandardScaler
 from scipy.special import log1p, expm1
 from sklearn.preprocessing import RobustScaler
 from sklearn.metrics import mean_squared_error
         logging.StreamHandler(sys.stdout)
     ],
     level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s")
+# Public entry points that let app.py import and run the pipeline
+def prepare_dataset(file_path):
+    """
+    Wrapper function that prepares the dataset by:
+    1. Loading the CSV
+    2. Fetching GitHub features
+    3. Adding derived features
+    4. Cleaning data
+    5. Generating base weights using LLM
+    Args:
+        file_path: Path to the input CSV file
+    Returns:
+        DataFrame with all features and base_weight prepared
+    """
+    logging.info(f"Preparing dataset from {file_path}")
+    # Load data
+    if isinstance(file_path, str):
+        df = pd.read_csv(file_path)
+    else:
+        # Handle file object (from Gradio)
+        df = pd.read_csv(file_path)
+    # Check required columns
+    if not {"repo", "parent"}.issubset(df.columns):
+        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
+    # Run the pipeline steps
+    df = fetch_github_features(df)
+    df = add_derived_features(df)
+    df = clean_data(df)
+    df = generate_all_base_weights(df)
+    return df
+def run_full_pipeline(input_file, output_file="submission_enhanced.csv"):
+    """
+    Runs the complete DeepFunding Oracle pipeline.
+    Args:
+        input_file: Path to input CSV file
+        output_file: Path for output CSV file
+    Returns:
+        The processed DataFrame with final_weight column
+    """
+    logging.info("--- Starting DeepFunding Oracle Pipeline ---")
+    # Prepare dataset
+    df = prepare_dataset(input_file)
+    # Train model and predict weights
+    df = train_predict_weight(df)
+    # Normalize weights
+    df = normalize_and_clip_weights(df)
+    # Save results
+    create_submission_csv(df, output_file)
+    logging.info("--- Pipeline Completed Successfully ---")
+    return df
 ##############################
 #  GitHub API helper: Fetch repository metrics

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import os
 import gradio as gr
-from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
@@ -8,59 +7,203 @@ import numpy as np
 import time
 import io
 from PIL import Image
 def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
     start_time = time.time()
-    progress(0, desc="Preparing dataset...")
-    df = prepare_dataset(file.name)
-    progress(0.3, desc="Predicting weights...")
-    df = train_predict_weight(df)
-    progress(0.6, desc="Saving results to CSV...")
-    csv_path = create_submission_csv(df, "submission.csv")
-    progress(0.8, desc="Generating graphs...")
-    # Feature distribution plot
-    dist_fig = plt.figure(figsize=(15, 10))
-    numeric_cols = df.select_dtypes(include=[np.number]).columns
-    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
-    plt.suptitle("Feature Distributions", fontsize=16)
-    dist_buf = io.BytesIO()
-    plt.savefig(dist_buf, format='png')
-    dist_buf.seek(0)
-    plt.close(dist_fig)
-    dist_img = Image.open(dist_buf)
-    # Correlation matrix plot
-    corr_fig = plt.figure(figsize=(12, 8))
-    correlation_matrix = df[numeric_cols].corr()
-    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
-    plt.title("Feature Correlation Matrix", fontsize=16)
-    corr_buf = io.BytesIO()
-    plt.savefig(corr_buf, format='png')
-    corr_buf.seek(0)
-    plt.close(corr_fig)
-    corr_img = Image.open(corr_buf)
-    progress(1, desc="Done!")
-    elapsed = time.time() - start_time
-    preview = df.head().to_csv(index=False)
-    return preview, csv_path, dist_img, corr_img, f"Analysis completed in {elapsed:.2f} seconds."
-iface = gr.Interface(
-    fn=analyze_file,
-    inputs=gr.File(label="Upload CSV"),
-    outputs=[
-        gr.Textbox(label="Preview of Results"),
-        gr.File(label="Download CSV"),
-        gr.Image(label="Feature Distributions"),
-        gr.Image(label="Feature Correlation Matrix"),
-        gr.Textbox(label="Status/Timing Info")
-    ],
-    title="DeepFunding Oracle",
-    description="Upload a CSV of repo-parent relationships; see analysis progress, get graphs, and download results as CSV.",
-    allow_flagging="never"
-)
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
-    iface.launch(server_name="0.0.0.0", server_port=port)

 import os
 import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import time
 import io
 from PIL import Image
+import logging
+# Import the functions from deepfundingoracle
+from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv, \
+    normalize_and_clip_weights
+# Configure logging
+logging.basicConfig(level=logging.INFO)
 def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
+    """
+    Analyzes the uploaded file and generates results.
+    """
     start_time = time.time()
+    try:
+        # Step 1: Prepare dataset
+        progress(0, desc="Preparing dataset...")
+        df = prepare_dataset(file.name)
+        # Step 2: Train model and predict weights
+        progress(0.3, desc="Training model and predicting weights...")
+        df = train_predict_weight(df)
+        # Step 3: Normalize weights
+        progress(0.5, desc="Normalizing weights...")
+        df = normalize_and_clip_weights(df)
+        # Step 4: Save results
+        progress(0.6, desc="Saving results to CSV...")
+        output_filename = "submission.csv"
+        create_submission_csv(df, output_filename)
+        # Step 5: Generate visualizations
+        progress(0.8, desc="Generating graphs...")
+        # Feature distribution plot
+        dist_fig = plt.figure(figsize=(15, 10))
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        plot_cols = [col for col in numeric_cols if
+                     col in ['stars', 'forks', 'watchers', 'contributors', 'pulls', 'final_weight']]
+        if plot_cols:
+            df[plot_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+            plt.suptitle("Feature Distributions", fontsize=16)
+            plt.tight_layout()
+        dist_buf = io.BytesIO()
+        plt.savefig(dist_buf, format='png', dpi=100, bbox_inches='tight')
+        dist_buf.seek(0)
+        plt.close(dist_fig)
+        dist_img = Image.open(dist_buf)
+        # Correlation matrix plot
+        corr_fig = plt.figure(figsize=(12, 8))
+        if len(plot_cols) > 1:
+            correlation_matrix = df[plot_cols].corr()
+            sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+            plt.title("Feature Correlation Matrix", fontsize=16)
+        corr_buf = io.BytesIO()
+        plt.savefig(corr_buf, format='png', dpi=100, bbox_inches='tight')
+        corr_buf.seek(0)
+        plt.close(corr_fig)
+        corr_img = Image.open(corr_buf)
+        # Prepare preview
+        progress(1, desc="Done!")
+        elapsed = time.time() - start_time
+        # Create a summary preview
+        summary_df = df[['repo', 'parent', 'final_weight']].head(10)
+        preview = f"Top 10 Results:\n{summary_df.to_string(index=False)}\n\nTotal repositories analyzed: {len(df)}"
+        # Return the path to the generated file for automatic download
+        return (
+            preview,
+            output_filename,  # This will trigger automatic download
+            dist_img,
+            corr_img,
+            f"✅ Analysis completed successfully in {elapsed:.2f} seconds.\n📥 Results file ready for download!"
+        )
+    except Exception as e:
+        logging.error(f"Error during analysis: {str(e)}")
+        elapsed = time.time() - start_time
+        error_msg = f"❌ Error: {str(e)}\nTime elapsed: {elapsed:.2f} seconds"
+        # Return empty images and error message
+        empty_img = Image.new('RGB', (800, 600), color='white')
+        return error_msg, None, empty_img, empty_img, error_msg
+# Custom CSS for the Blocks UI: `.download-button` is attached to the results
+# File component and `.status-box` to the status Textbox via `elem_classes`.
+custom_css = """
+    .download-button {
+        background-color: #4CAF50 !important;
+        color: white !important;
+        font-weight: bold !important;
+    }
+    .status-box {
+        font-family: monospace;
+        padding: 10px;
+        border-radius: 5px;
+    }
+"""
+# Create Gradio interface with automatic download
+with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as iface:
+    gr.Markdown("""
+    # 🚀 DeepFunding Oracle
+    Upload a CSV file containing repository dependencies with 'repo' and 'parent' columns.
+    The system will:
+    1. **Fetch** GitHub metrics for each repository
+    2. **Generate** importance weights using AI
+    3. **Train** a model to predict final contribution weights
+    4. **Normalize** weights so they sum to 1 per parent
+    ⚠️ **Note**: Set `GITHUB_API_TOKEN` environment variable for better API rate limits.
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            file_input = gr.File(
+                label="Upload CSV File",
+                file_types=[".csv"],
+                elem_id="file-upload"
+            )
+            analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")
+        with gr.Column(scale=2):
+            status_output = gr.Textbox(
+                label="Status",
+                lines=3,
+                elem_classes="status-box"
+            )
+    with gr.Row():
+        preview_output = gr.Textbox(
+            label="Preview of Results",
+            lines=15,
+            show_copy_button=True
+        )
+    with gr.Row():
+        download_output = gr.File(
+            label="📥 Download Results CSV",
+            visible=True,
+            elem_classes="download-button"
+        )
+    with gr.Row():
+        with gr.Column():
+            dist_plot = gr.Image(label="Feature Distributions")
+        with gr.Column():
+            corr_plot = gr.Image(label="Feature Correlation Matrix")
+    # JavaScript for automatic download
+    download_js = """
+    () => {
+        setTimeout(() => {
+            const downloadButton = document.querySelector('.download-button a');
+            if (downloadButton) {
+                downloadButton.click();
+            }
+        }, 500);
+    }
+    """
+    # Set up the event handler
+    analyze_btn.click(
+        fn=analyze_file,
+        inputs=[file_input],
+        outputs=[preview_output, download_output, dist_plot, corr_plot, status_output]
+    ).then(
+        fn=None,
+        inputs=None,
+        outputs=None,
+        _js=download_js  # This triggers automatic download
+    )
+    # Add example usage
+    gr.Examples(
+        examples=[["example_dependencies.csv"]],  # Add your example file here if you have one
+        inputs=file_input,
+        outputs=[preview_output, download_output, dist_plot, corr_plot, status_output],
+        fn=analyze_file,
+        cache_examples=False,
+    )
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
+    iface.launch(
+        server_name="0.0.0.0",
+        server_port=port,
+        share=False,
+        show_error=True
+    )