Commit 3388ab8
Parent(s): db24239
Oracle

Files changed:
- Oracle/DataSmolAgent.py +73 -25
- Oracle/deepfundingoracle.py +101 -60
- app.py +31 -19
Oracle/DataSmolAgent.py CHANGED

@@ -1,7 +1,9 @@
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
 from smolagents import tool, CodeAgent
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from sklearn.preprocessing import StandardScaler
 
 @tool
 def clean_data(df: pd.DataFrame) -> pd.DataFrame:
@@ -29,35 +31,52 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         The DataFrame updated with new dynamically engineered features.
     """
-    # Numeric columns: log transformation
+    # Numeric columns: log transformation for skewed features
     numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
     for col in numeric_cols:
-        if (df[col] >= 0).all():
+        if (df[col] >= 0).all() and df[col].skew() > 1:
             df[f"log_{col}"] = np.log(df[col] + 1)
 
+    # Repository age (days since creation)
+    if "created_at" in df.columns:
+        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
+        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
+
+    # Recent activity count (commits/issues in last 30/90 days)
+    if "activity" in df.columns:
+        df["activity"] = pd.to_datetime(df["activity"], errors="coerce")
+        now = pd.Timestamp.now()
+        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
+        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
+
+    # Open/closed PR ratio
+    if {"open_prs", "closed_prs"}.issubset(df.columns):
+        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)
+
+    # Issue resolution speed
+    if {"issues_closed", "issues_opened"}.issubset(df.columns):
+        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
+
+    # Is the repo archived?
+    if "archived" in df.columns:
+        df["is_archived"] = df["archived"].astype(int)
+
+    # Description length
+    if "description" in df.columns:
+        df["description_length"] = df["description"].fillna("").apply(len)
+
+    # Topics count
+    if "topics" in df.columns:
+        df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))
+
+    # Normalize or standardize features
+    scaler = StandardScaler()
+    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
+    for col in scaled_cols:
+        if col in df.columns:
+            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])
+
+    return df
 
 @tool
 def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
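For illustration only (not part of the commit): a minimal standalone sketch of the guarded, column-by-column feature engineering that the reworked extract_features now performs, run on a hypothetical toy frame with a few of the columns the tool checks for. The real tool operates on the GitHub features fetched by the Oracle pipeline.

import numpy as np
import pandas as pd

# Hypothetical toy data; column names mirror the ones extract_features looks for.
toy = pd.DataFrame({
    "stars": [5, 10, 20000],
    "open_prs": [1, 4, 30],
    "closed_prs": [0, 10, 300],
    "created_at": ["2015-01-01", "2021-06-01", None],
})

toy["created_at"] = pd.to_datetime(toy["created_at"], errors="coerce")
toy["repo_age_days"] = (pd.Timestamp.now() - toy["created_at"]).dt.days
toy["pr_ratio"] = toy["open_prs"] / (toy["closed_prs"] + 1)
if (toy["stars"] >= 0).all() and toy["stars"].skew() > 1:
    toy["log_stars"] = np.log(toy["stars"] + 1)
print(toy.filter(["repo_age_days", "pr_ratio", "log_stars"]))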
@@ -95,6 +114,34 @@ def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
     )
     return df
 
+@tool
+def analyze_feature_importance(feature_importances: dict, feature_cols: list):
+    """
+    Visualizes feature importance and identifies irrelevant features.
+
+    Args:
+        feature_importances: A dictionary of feature names and their importance scores.
+        feature_cols: List of feature column names.
+    """
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot feature importance
+    plt.figure(figsize=(10, 6))
+    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
+    plt.xlabel("Importance")
+    plt.ylabel("Feature")
+    plt.title("Feature Importance")
+    plt.gca().invert_yaxis()
+    plt.show()
+
+    # Drop irrelevant features (importance < threshold)
+    threshold = 0.01
+    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
+    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
+    return irrelevant_features
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -109,6 +156,7 @@ class DataSmolAgent(CodeAgent):
                 extract_features,
                 save_to_csv,  # Added save_to_csv tool
                 predict_funding,  # Added predict_funding tool
+                analyze_feature_importance,  # Added analyze_feature_importance tool
             ],
             model=self.model,
             additional_authorized_imports=["pandas", "numpy"]
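Note on the new analyze_feature_importance tool above: the docstring describes feature_importances as a dictionary, but the body builds pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}), which pairs the scores positionally with feature_cols, so an array-like of scores in the same order as feature_cols (for example a fitted model's feature_importances_) is what fits that construction. A standalone sketch on synthetic data, for illustration only:

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Synthetic regression data; "noise" should come out with near-zero importance.
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 3)), columns=["stars", "forks", "noise"])
y = 2 * X["stars"] + 0.5 * X["forks"] + rng.normal(scale=0.1, size=200)

model = RandomForestRegressor(n_estimators=50, random_state=0).fit(X, y)
importance_df = (
    pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_})
    .sort_values(by="Importance", ascending=False)
)
# Same cut the tool applies: anything under 0.01 is reported as irrelevant.
print(importance_df[importance_df["Importance"] < 0.01]["Feature"].tolist())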
Oracle/deepfundingoracle.py CHANGED

@@ -28,9 +28,11 @@ import re
 import json
 import time
 
-from sklearn.model_selection import train_test_split,
+from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 from Oracle.SmolLM import SmolLM
 
@@ -261,6 +263,34 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 
+def sanity_check_weights(df):
+    """
+    Sanity-checks LLM weights by comparing them with other metrics.
+    """
+    print("[INFO] Performing sanity check on LLM weights...")
+    df["sanity_check_weight"] = (df["stars"] + df["forks"] + df["watchers"]) / 3
+    df["ensemble_weight"] = (df["base_weight"] + df["sanity_check_weight"]) / 2
+    print("[INFO] Sanity check and ensemble weights added.")
+    return df
+
+def visualize_feature_distributions(df):
+    """
+    Visualizes feature distributions and correlations.
+    """
+    print("[INFO] Visualizing feature distributions and correlations...")
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+    # Plot feature distributions
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    plt.show()
+
+    # Plot feature correlations
+    correlation_matrix = df[numeric_cols].corr()
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    plt.show()
 
 def normalize_funding(df):
     """
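A quick trace of the arithmetic in sanity_check_weights on a single hypothetical row (illustration only):

import pandas as pd

row = pd.DataFrame({"stars": [100], "forks": [20], "watchers": [10], "base_weight": [0.7]})
row["sanity_check_weight"] = (row["stars"] + row["forks"] + row["watchers"]) / 3  # (100 + 20 + 10) / 3 = 43.33
row["ensemble_weight"] = (row["base_weight"] + row["sanity_check_weight"]) / 2    # (0.7 + 43.33) / 2 = 22.02
print(row[["sanity_check_weight", "ensemble_weight"]])

Because sanity_check_weight averages raw counts, it can sit on a much larger scale than base_weight, so the count-based term dominates the ensemble unless both are brought to a common scale first.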
@@ -284,91 +314,101 @@ def prepare_dataset(file):
     print("[INFO] Fetching GitHub features...")
     df = fetch_github_features(df)
     print("[INFO] GitHub features fetched successfully.")
+    print("[INFO] Cleaning data...")
+    df = clean_data(df)
+    print("[INFO] Data cleaned successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = sanity_check_weights(df)  # Add sanity-check and ensemble weights
     df = train_predict_weight(df)
+    visualize_feature_distributions(df)  # Add feature visualization
     df = normalize_funding(df)
     end_time = time.time()
     print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
 
 
+##############################
+# Data Cleaning
+##############################
+def clean_data(df):
+    """
+    Cleans the input DataFrame by handling missing values and removing outliers.
+    """
+    # Impute missing values
+    df.fillna(df.median(numeric_only=True), inplace=True)
+
+    # Remove extreme outliers using quantiles
+    for col in df.select_dtypes(include=[np.number]).columns:
+        q1 = df[col].quantile(0.25)
+        q3 = df[col].quantile(0.75)
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+
+    return df
+
+
 ##############################
 # RandomForest Regression
 ##############################
-def train_predict_weight(df,
-                         criterion='gini',
-                         max_features='sqrt',
-                         max_depth=12,
-                         min_samples_split=2,
-                         min_samples_leaf=1):
+def train_predict_weight(df):
     """
-
-    The regressor is tuned with provided hyperparameters.
-    A flag column 'is_source' is used to indicate if a repository is the primary source.
-    If none is flagged, the repo with the highest prediction is set as the parent.
+    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
     """
-    print("[INFO] Starting weight prediction...", flush=True)
+    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
     start_time = time.time()
     target = "base_weight"
-    feature_cols = [
-    if "activity" in df.columns:
-        df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
-        now = pd.Timestamp.now(tz="UTC")
-        df["activity"] = (now - df["activity"]).dt.days.fillna(-1)
-
-    if target not in df.columns:
-        raise ValueError("Base weight column missing.")
+    feature_cols = [col for col in df.columns if col not in ["repo", "parent", "base_weight", "final_weight"]]
 
     X = df[feature_cols]
     y = df[target]
 
+    # Split data into train/test sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # Hyperparameter tuning using GridSearchCV
+    param_grid = {
+        "n_estimators": [100, 200, 300],
+        "max_depth": [10, 15, 20],
+        "min_samples_split": [2, 5, 10],
+        "min_samples_leaf": [1, 2, 4]
+    }
+    rf = RandomForestRegressor(random_state=42)
+    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
+    grid_search.fit(X_train, y_train)
+
+    # Best model
+    best_rf = grid_search.best_estimator_
+    print(f"[INFO] Best parameters: {grid_search.best_params_}")
+
+    # Evaluate on test set
+    y_pred = best_rf.predict(X_test)
+    mse = mean_squared_error(y_test, y_pred)
+    print(f"[INFO] Test MSE: {mse}")
+
+    # Feature importance analysis
+    feature_importances = best_rf.feature_importances_
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot predictions vs. actual values
+    plt.scatter(y_test, y_pred, alpha=0.5)
+    plt.xlabel("Actual Base Weight")
+    plt.ylabel("Predicted Base Weight")
+    plt.title("Predictions vs. Actual")
+    plt.show()
+
+    # Assign predictions to DataFrame
+    df["final_weight"] = best_rf.predict(X)
 
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
 
 
-
-
 ##############################
 # CSV Output
 ##############################
@@ -393,3 +433,4 @@ if __name__ == "__main__":
     print("[INFO] Creating submission CSV...")
     create_submission_csv(df, output_file)
     print("[INFO] Process completed successfully.")
+
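Back-of-envelope cost of the new grid search in train_predict_weight (illustration only): the grid spans 3 x 3 x 3 x 3 = 81 parameter combinations, each cross-validated over cv=3 folds, i.e. 243 forest fits per call plus one final refit of the best setting. Note also that feature_cols keeps every column except repo, parent, base_weight and final_weight, so any non-numeric columns still present at that point would need to be dropped or encoded before fit.

from sklearn.model_selection import ParameterGrid

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
# 81 combinations; with cv=3 that is 243 fits before the final refit.
print(len(ParameterGrid(param_grid)))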
app.py CHANGED

@@ -3,6 +3,8 @@ import gradio as gr
 from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
 import pandas as pd
 import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
 import time
 import io
 from PIL import Image
@@ -15,25 +17,34 @@ def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
     df = train_predict_weight(df)
     progress(0.6, desc="Saving results to CSV...")
     csv_path = create_submission_csv(df, "submission.csv")
-    progress(0.8, desc="Generating
+    progress(0.8, desc="Generating graphs...")
+
+    # Feature distribution plot
+    dist_fig = plt.figure(figsize=(15, 10))
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    dist_buf = io.BytesIO()
+    plt.savefig(dist_buf, format='png')
+    dist_buf.seek(0)
+    plt.close(dist_fig)
+    dist_img = Image.open(dist_buf)
+
+    # Correlation matrix plot
+    corr_fig = plt.figure(figsize=(12, 8))
+    correlation_matrix = df[numeric_cols].corr()
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    corr_buf = io.BytesIO()
+    plt.savefig(corr_buf, format='png')
+    corr_buf.seek(0)
+    plt.close(corr_fig)
+    corr_img = Image.open(corr_buf)
+
     progress(1, desc="Done!")
     elapsed = time.time() - start_time
     preview = df.head().to_csv(index=False)
-    return preview, csv_path,
+    return preview, csv_path, dist_img, corr_img, f"Analysis completed in {elapsed:.2f} seconds."
 
 iface = gr.Interface(
     fn=analyze_file,
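The graph generation added to analyze_file renders each figure to an in-memory PNG buffer and reopens it with PIL, which gr.Image can display directly. A minimal standalone round trip (illustration only; the Agg backend is an assumption for headless Spaces). One detail worth noting: DataFrame.hist opens its own figure, so the dist_fig created beforehand appears to stay empty, while plt.savefig still captures the histogram figure because it saves the current one.

import io
import matplotlib
matplotlib.use("Agg")  # assumed headless backend
import matplotlib.pyplot as plt
from PIL import Image

fig = plt.figure()
plt.plot([1, 2, 3], [2, 4, 9])
buf = io.BytesIO()
plt.savefig(buf, format="png")
plt.close(fig)
buf.seek(0)
img = Image.open(buf)
print(img.size)  # e.g. (640, 480)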
@@ -41,14 +52,15 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="Preview of Results"),
         gr.File(label="Download CSV"),
-        gr.Image(label="
+        gr.Image(label="Feature Distributions"),
+        gr.Image(label="Feature Correlation Matrix"),
         gr.Textbox(label="Status/Timing Info")
     ],
     title="DeepFunding Oracle",
-    description="Upload a CSV of repo-parent relationships; see analysis progress, get
+    description="Upload a CSV of repo-parent relationships; see analysis progress, get graphs, and download results as CSV.",
     allow_flagging="never"
 )
 
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
-    iface.launch(server_name="0.0.0.0", server_port=port)
+    iface.launch(server_name="0.0.0.0", server_port=port)
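The five values analyze_file now returns map positionally onto the five output components above. A hypothetical stub showing just that wiring (the inputs= line is not part of this diff; gr.File is assumed here because analyze_file takes an uploaded file):

import gradio as gr

def stub(file):
    # preview text, CSV path, two images, status string, in the same order as the outputs list
    return "preview", None, None, None, "done"

demo = gr.Interface(
    fn=stub,
    inputs=gr.File(label="Upload CSV"),
    outputs=[
        gr.Textbox(label="Preview of Results"),
        gr.File(label="Download CSV"),
        gr.Image(label="Feature Distributions"),
        gr.Image(label="Feature Correlation Matrix"),
        gr.Textbox(label="Status/Timing Info"),
    ],
)
# demo.launch()  # left commented so the sketch does not start a server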