Spaces:
Sleeping
Sleeping
Commit
·
bf9b592
1
Parent(s):
d3e9b26
Oracle
Browse files- Oracle/deepfundingoracle.py +69 -4
- app.py +194 -51
Oracle/deepfundingoracle.py
CHANGED
@@ -47,8 +47,6 @@ from sklearn.preprocessing import RobustScaler
|
|
47 |
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
|
48 |
from sklearn.ensemble import RandomForestRegressor
|
49 |
from sklearn.preprocessing import StandardScaler
|
50 |
-
import matplotlib.pyplot as plt
|
51 |
-
import seaborn as sns
|
52 |
from scipy.special import log1p, expm1
|
53 |
from sklearn.preprocessing import RobustScaler
|
54 |
from sklearn.metrics import mean_squared_error
|
@@ -65,9 +63,76 @@ logging.basicConfig(
|
|
65 |
logging.StreamHandler(sys.stdout)
|
66 |
],
|
67 |
level=logging.INFO,
|
68 |
-
format="%(asctime)s - %(levelname)s - %(message)s"
|
69 |
-
)
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
##############################
|
73 |
# GitHub API helper: Fetch repository metrics
|
|
|
47 |
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
|
48 |
from sklearn.ensemble import RandomForestRegressor
|
49 |
from sklearn.preprocessing import StandardScaler
|
|
|
|
|
50 |
from scipy.special import log1p, expm1
|
51 |
from sklearn.preprocessing import RobustScaler
|
52 |
from sklearn.metrics import mean_squared_error
|
|
|
63 |
logging.StreamHandler(sys.stdout)
|
64 |
],
|
65 |
level=logging.INFO,
|
66 |
+
format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
67 |
|
68 |
+
# Add these functions to make the pipeline importable by app.py
|
69 |
+
|
70 |
+
|
71 |
+
def prepare_dataset(file_path):
    """
    Load the input CSV and run the dataset-preparation steps on it.

    Steps, in order:
    1. Load the CSV
    2. Fetch GitHub features (``fetch_github_features``)
    3. Add derived features (``add_derived_features``)
    4. Clean data (``clean_data``)
    5. Generate base weights (``generate_all_base_weights``)

    Args:
        file_path: Path to the input CSV file, or a file object (e.g. the
            upload handle passed by Gradio). Both are handed to
            ``pandas.read_csv`` unchanged.

    Returns:
        The DataFrame returned by the last preparation step.

    Raises:
        ValueError: If the CSV does not contain both a 'repo' and a
            'parent' column.
    """
    logging.info(f"Preparing dataset from {file_path}")

    # pandas.read_csv accepts paths and file objects alike, so a single call
    # replaces the former str / non-str branches (which were identical).
    df = pd.read_csv(file_path)

    # Fail early, before any of the later pipeline steps run.
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")

    # Run the pipeline steps
    df = fetch_github_features(df)
    df = add_derived_features(df)
    df = clean_data(df)
    df = generate_all_base_weights(df)

    return df
|
106 |
+
|
107 |
+
|
108 |
+
def run_full_pipeline(input_file, output_file="submission_enhanced.csv"):
    """
    Execute the DeepFunding Oracle pipeline end to end.

    Calls, in order: ``prepare_dataset``, ``train_predict_weight``,
    ``normalize_and_clip_weights`` and ``create_submission_csv``.

    Args:
        input_file: Input CSV, forwarded to ``prepare_dataset``.
        output_file: Destination forwarded to ``create_submission_csv``.

    Returns:
        The DataFrame produced by ``normalize_and_clip_weights``.
    """
    logging.info("--- Starting DeepFunding Oracle Pipeline ---")

    # Stage 1: build the dataset from the input file
    prepared = prepare_dataset(input_file)

    # Stage 2: fit the model and attach its predictions
    predicted = train_predict_weight(prepared)

    # Stage 3: normalise the predicted weights
    normalized = normalize_and_clip_weights(predicted)

    # Stage 4: write the submission file
    create_submission_csv(normalized, output_file)

    logging.info("--- Pipeline Completed Successfully ---")
    return normalized
|
136 |
|
137 |
##############################
|
138 |
# GitHub API helper: Fetch repository metrics
|
app.py
CHANGED
@@ -1,6 +1,5 @@
|
|
1 |
import os
|
2 |
import gradio as gr
|
3 |
-
from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
|
4 |
import pandas as pd
|
5 |
import matplotlib.pyplot as plt
|
6 |
import seaborn as sns
|
@@ -8,59 +7,203 @@ import numpy as np
|
|
8 |
import time
|
9 |
import io
|
10 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
|
|
|
|
|
|
|
13 |
start_time = time.time()
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
if __name__ == "__main__":
|
65 |
port = int(os.environ.get("PORT", 7860))
|
66 |
-
iface.launch(
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
import gradio as gr
|
|
|
3 |
import pandas as pd
|
4 |
import matplotlib.pyplot as plt
|
5 |
import seaborn as sns
|
|
|
7 |
import time
|
8 |
import io
|
9 |
from PIL import Image
|
10 |
+
import logging
|
11 |
+
|
12 |
+
# Import the functions from deepfundingoracle
|
13 |
+
from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv, \
|
14 |
+
normalize_and_clip_weights
|
15 |
+
|
16 |
+
# Configure logging
|
17 |
+
logging.basicConfig(level=logging.INFO)
|
18 |
+
|
19 |
|
20 |
def _figure_to_image(fig):
    """Render ``fig`` to an in-memory PNG, close the figure, and return a PIL image."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    # Close the exact figure that was rendered so pyplot does not keep it alive.
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)


def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
    """
    Analyzes the uploaded file and generates results.

    Runs dataset preparation, model training/prediction and weight
    normalization, writes ``submission.csv``, and renders two plots.

    Args:
        file: The uploaded CSV. Either an object exposing the path as
            ``.name`` or a plain path string.
        progress: Gradio progress tracker used to report each step.

    Returns:
        A 5-tuple ``(preview_text, csv_path, distribution_image,
        correlation_image, status_text)``. On failure the preview and status
        both carry the error message, the CSV path is ``None`` and both
        images are blank white placeholders.
    """
    start_time = time.time()

    try:
        if file is None:
            raise ValueError("No file uploaded. Please upload a CSV file first.")
        # Accept both upload objects (path in .name) and plain path strings.
        input_path = getattr(file, "name", file)

        # Step 1: Prepare dataset
        progress(0, desc="Preparing dataset...")
        df = prepare_dataset(input_path)

        # Step 2: Train model and predict weights
        progress(0.3, desc="Training model and predicting weights...")
        df = train_predict_weight(df)

        # Step 3: Normalize weights
        progress(0.5, desc="Normalizing weights...")
        df = normalize_and_clip_weights(df)

        # Step 4: Save results
        progress(0.6, desc="Saving results to CSV...")
        output_filename = "submission.csv"
        create_submission_csv(df, output_filename)

        # Step 5: Generate visualizations
        progress(0.8, desc="Generating graphs...")

        # Only plot the known metric columns that are present and numeric.
        wanted_cols = ('stars', 'forks', 'watchers', 'contributors', 'pulls', 'final_weight')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        plot_cols = [col for col in numeric_cols if col in wanted_cols]

        # Feature distribution plot.
        # DataFrame.hist opens its own figure when no axes are passed, so take
        # the figure from the returned axes instead of pre-creating one that
        # would stay blank (and leave the real one unclosed).
        if plot_cols:
            axes = df[plot_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
            dist_fig = np.ravel(axes)[0].get_figure()
            dist_fig.suptitle("Feature Distributions", fontsize=16)
            dist_fig.tight_layout()
        else:
            dist_fig = plt.figure(figsize=(15, 10))
        dist_img = _figure_to_image(dist_fig)

        # Correlation matrix plot (needs at least two columns to be meaningful)
        corr_fig = plt.figure(figsize=(12, 8))
        if len(plot_cols) > 1:
            corr_ax = corr_fig.gca()
            correlation_matrix = df[plot_cols].corr()
            sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
                        linewidths=0.5, ax=corr_ax)
            corr_ax.set_title("Feature Correlation Matrix", fontsize=16)
        corr_img = _figure_to_image(corr_fig)

        # Prepare preview
        progress(1, desc="Done!")
        elapsed = time.time() - start_time

        # Summary preview: the first ten rows in their current order.
        summary_df = df[['repo', 'parent', 'final_weight']].head(10)
        preview = f"Top 10 Results:\n{summary_df.to_string(index=False)}\n\nTotal repositories analyzed: {len(df)}"

        # Return the path to the generated file for automatic download
        return (
            preview,
            output_filename,  # This will trigger automatic download
            dist_img,
            corr_img,
            f"✅ Analysis completed successfully in {elapsed:.2f} seconds.\n📥 Results file ready for download!"
        )

    except Exception as e:
        # UI boundary: report the failure to the user, keep the traceback in the log.
        logging.exception(f"Error during analysis: {str(e)}")
        elapsed = time.time() - start_time
        error_msg = f"❌ Error: {str(e)}\nTime elapsed: {elapsed:.2f} seconds"

        # Return empty images and error message
        empty_img = Image.new('RGB', (800, 600), color='white')
        return error_msg, None, empty_img, empty_img, error_msg
|
102 |
+
|
103 |
+
|
104 |
+
# Custom CSS for better styling: a green download control and a monospace status box.
custom_css = """
.download-button {
    background-color: #4CAF50 !important;
    color: white !important;
    font-weight: bold !important;
}
.status-box {
    font-family: monospace;
    padding: 10px;
    border-radius: 5px;
}
"""

# Create Gradio interface with automatic download
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as iface:
    # NOTE(review): the glyph before "DeepFunding Oracle" (and before "Analyze"
    # below) is mis-encoded in this copy of the file; restore the intended emoji.
    gr.Markdown("""
    # π DeepFunding Oracle

    Upload a CSV file containing repository dependencies with 'repo' and 'parent' columns.
    The system will:
    1. **Fetch** GitHub metrics for each repository
    2. **Generate** importance weights using AI
    3. **Train** a model to predict final contribution weights
    4. **Normalize** weights so they sum to 1 per parent

    ⚠️ **Note**: Set `GITHUB_API_TOKEN` environment variable for better API rate limits.
    """)

    # Top row: upload + trigger on the left, status on the right
    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                elem_id="file-upload"
            )
            analyze_btn = gr.Button("π Analyze", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                elem_classes="status-box"
            )

    with gr.Row():
        preview_output = gr.Textbox(
            label="Preview of Results",
            lines=15,
            show_copy_button=True
        )

    with gr.Row():
        # The "download-button" class is what download_js below selects on.
        download_output = gr.File(
            label="📥 Download Results CSV",
            visible=True,
            elem_classes="download-button"
        )

    with gr.Row():
        with gr.Column():
            dist_plot = gr.Image(label="Feature Distributions")
        with gr.Column():
            corr_plot = gr.Image(label="Feature Correlation Matrix")

    # JavaScript for automatic download: after a 500 ms delay, click the link
    # inside the download component if one is present.
    download_js = """
    () => {
        setTimeout(() => {
            const downloadButton = document.querySelector('.download-button a');
            if (downloadButton) {
                downloadButton.click();
            }
        }, 500);
    }
    """

    # Output order must match the tuple returned by analyze_file.
    result_outputs = [preview_output, download_output, dist_plot, corr_plot, status_output]

    # Set up the event handler
    # NOTE(review): `_js` looks like the Gradio 3.x keyword (later releases name
    # it `js`) — confirm against the pinned Gradio version.
    analyze_btn.click(
        fn=analyze_file,
        inputs=[file_input],
        outputs=result_outputs
    ).then(
        fn=None,
        inputs=None,
        outputs=None,
        _js=download_js  # This triggers automatic download
    )

    # Add example usage.
    # The example file is optional, so register it only when it is present;
    # presumably a missing file would otherwise break UI construction
    # (TODO confirm against the installed Gradio version).
    example_file = "example_dependencies.csv"
    if os.path.exists(example_file):
        gr.Examples(
            examples=[[example_file]],
            inputs=file_input,
            outputs=result_outputs,
            fn=analyze_file,
            cache_examples=False,
        )
|
201 |
|
202 |
if __name__ == "__main__":
    # Listen on every interface; the PORT environment variable overrides the
    # default port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", 7860)),
        "share": False,
        "show_error": True,
    }
    iface.launch(**launch_options)
|