Commit 3388ab8 · 1 Parent(s): db24239 · Oracle

Files changed:
- Oracle/DataSmolAgent.py: +73 -25
- Oracle/deepfundingoracle.py: +101 -60
- app.py: +31 -19
Oracle/DataSmolAgent.py
CHANGED
@@ -1,7 +1,9 @@
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
 from smolagents import tool, CodeAgent
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from sklearn.preprocessing import StandardScaler
 
 @tool
 def clean_data(df: pd.DataFrame) -> pd.DataFrame:
@@ -29,35 +31,52 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         The DataFrame updated with new dynamically engineered features.
     """
-    # Numeric columns: log transformation
+    # Numeric columns: log transformation for skewed features
     numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
     for col in numeric_cols:
-        if (df[col] >= 0).all():
+        if (df[col] >= 0).all() and df[col].skew() > 1:
             df[f"log_{col}"] = np.log(df[col] + 1)
 
-    #
+    # Repository age (days since creation)
+    if "created_at" in df.columns:
+        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
+        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
+
+    # Recent activity count (commits/issues in last 30/90 days)
+    if "activity" in df.columns:
+        df["activity"] = pd.to_datetime(df["activity"], errors="coerce")
+        now = pd.Timestamp.now()
+        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
+        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
+
+    # Open/closed PR ratio
+    if {"open_prs", "closed_prs"}.issubset(df.columns):
+        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)
+
+    # Issue resolution speed
+    if {"issues_closed", "issues_opened"}.issubset(df.columns):
+        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
+
+    # Is the repo archived?
+    if "archived" in df.columns:
+        df["is_archived"] = df["archived"].astype(int)
+
+    # Description length
+    if "description" in df.columns:
+        df["description_length"] = df["description"].fillna("").apply(len)
+
+    # Topics count
+    if "topics" in df.columns:
+        df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))
+
+    # Normalize or standardize features
+    scaler = StandardScaler()
+    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
+    for col in scaled_cols:
+        if col in df.columns:
+            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])
+
+    return df
 
 @tool
 def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
@@ -95,6 +114,34 @@ def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
     )
     return df
 
+@tool
+def analyze_feature_importance(feature_importances: dict, feature_cols: list):
+    """
+    Visualizes feature importance and identifies irrelevant features.
+
+    Args:
+        feature_importances: A dictionary of feature names and their importance scores.
+        feature_cols: List of feature column names.
+    """
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot feature importance
+    plt.figure(figsize=(10, 6))
+    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
+    plt.xlabel("Importance")
+    plt.ylabel("Feature")
+    plt.title("Feature Importance")
+    plt.gca().invert_yaxis()
+    plt.show()
+
+    # Drop irrelevant features (importance < threshold)
+    threshold = 0.01
+    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
+    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
+    return irrelevant_features
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -109,6 +156,7 @@ class DataSmolAgent(CodeAgent):
                 extract_features,
                 save_to_csv, # Added save_to_csv tool
                 predict_funding, # Added predict_funding tool
+                analyze_feature_importance, # Added analyze_feature_importance tool
             ],
             model=self.model,
            additional_authorized_imports=["pandas", "numpy"]
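The new extract_features logic gates the log transform on both non-negativity and skewness, so symmetric metrics are left alone while long-tailed counts get compressed. A minimal sketch of that gate on a toy DataFrame (column names here are illustrative, not taken from the Deep Funding dataset):

import numpy as np
import pandas as pd

# Toy repo metrics: "stars" is heavily right-skewed, "age_years" is not.
toy = pd.DataFrame({
    "stars": [1, 2, 3, 5, 8, 5000],
    "age_years": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
})

for col in toy.select_dtypes(include=[np.number]).columns:
    # Same gate as the commit: non-negative values and skewness above 1.
    if (toy[col] >= 0).all() and toy[col].skew() > 1:
        toy[f"log_{col}"] = np.log(toy[col] + 1)

print(toy.columns.tolist())  # log_stars is added, log_age_years is not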
Oracle/deepfundingoracle.py
CHANGED
@@ -28,9 +28,11 @@ import re
 import json
 import time
 
-from sklearn.model_selection import train_test_split,
+from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 from Oracle.SmolLM import SmolLM
 
@@ -261,6 +263,34 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 
+def sanity_check_weights(df):
+    """
+    Sanity-checks LLM weights by comparing them with other metrics.
+    """
+    print("[INFO] Performing sanity check on LLM weights...")
+    df["sanity_check_weight"] = (df["stars"] + df["forks"] + df["watchers"]) / 3
+    df["ensemble_weight"] = (df["base_weight"] + df["sanity_check_weight"]) / 2
+    print("[INFO] Sanity check and ensemble weights added.")
+    return df
+
+def visualize_feature_distributions(df):
+    """
+    Visualizes feature distributions and correlations.
+    """
+    print("[INFO] Visualizing feature distributions and correlations...")
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+    # Plot feature distributions
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    plt.show()
+
+    # Plot feature correlations
+    correlation_matrix = df[numeric_cols].corr()
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    plt.show()
 
 def normalize_funding(df):
     """
@@ -284,91 +314,101 @@ def prepare_dataset(file):
     print("[INFO] Fetching GitHub features...")
     df = fetch_github_features(df)
     print("[INFO] GitHub features fetched successfully.")
+    print("[INFO] Cleaning data...")
+    df = clean_data(df)
+    print("[INFO] Data cleaned successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = sanity_check_weights(df)  # Add sanity-check and ensemble weights
     df = train_predict_weight(df)
+    visualize_feature_distributions(df)  # Add feature visualization
     df = normalize_funding(df)
     end_time = time.time()
    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
 
 
+##############################
+# Data Cleaning
+##############################
+def clean_data(df):
+    """
+    Cleans the input DataFrame by handling missing values and removing outliers.
+    """
+    # Impute missing values
+    df.fillna(df.median(numeric_only=True), inplace=True)
+
+    # Remove extreme outliers using quantiles
+    for col in df.select_dtypes(include=[np.number]).columns:
+        q1 = df[col].quantile(0.25)
+        q3 = df[col].quantile(0.75)
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+
+    return df
+
+
 ##############################
 # RandomForest Regression
 ##############################
-def train_predict_weight(df
-                         criterion='gini',
-                         max_features='sqrt',
-                         max_depth=12,
-                         min_samples_split=2,
-                         min_samples_leaf=1):
+def train_predict_weight(df):
     """
-
-    The regressor is tuned with provided hyperparameters.
-    A flag column 'is_source' is used to indicate if a repository is the primary source.
-    If none is flagged, the repo with the highest prediction is set as the parent.
+    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
     """
-    print("[INFO] Starting weight prediction...", flush=True)
+    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
     start_time = time.time()
     target = "base_weight"
-    feature_cols = [
-
-    if "activity" in df.columns:
-        df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
-        now = pd.Timestamp.now(tz="UTC")
-        df["activity"] = (now - df["activity"]).dt.days.fillna(-1)
-
-    if target not in df.columns:
-        raise ValueError("Base weight column missing.")
+    feature_cols = [col for col in df.columns if col not in ["repo", "parent", "base_weight", "final_weight"]]
 
     X = df[feature_cols]
     y = df[target]
 
-    #
+    # Split data into train/test sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # Hyperparameter tuning using GridSearchCV
+    param_grid = {
+        "n_estimators": [100, 200, 300],
+        "max_depth": [10, 15, 20],
+        "min_samples_split": [2, 5, 10],
+        "min_samples_leaf": [1, 2, 4]
+    }
+    rf = RandomForestRegressor(random_state=42)
+    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
+    grid_search.fit(X_train, y_train)
+
+    # Best model
+    best_rf = grid_search.best_estimator_
+    print(f"[INFO] Best parameters: {grid_search.best_params_}")
+
+    # Evaluate on test set
+    y_pred = best_rf.predict(X_test)
+    mse = mean_squared_error(y_test, y_pred)
+    print(f"[INFO] Test MSE: {mse}")
+
+    # Feature importance analysis
+    feature_importances = best_rf.feature_importances_
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot predictions vs. actual values
+    plt.scatter(y_test, y_pred, alpha=0.5)
+    plt.xlabel("Actual Base Weight")
+    plt.ylabel("Predicted Base Weight")
+    plt.title("Predictions vs. Actual")
+    plt.show()
+
+    # Assign predictions to DataFrame
+    df["final_weight"] = best_rf.predict(X)
 
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
 
 
-
-
 ##############################
 # CSV Output
 ##############################
@@ -393,3 +433,4 @@ if __name__ == "__main__":
     print("[INFO] Creating submission CSV...")
     create_submission_csv(df, output_file)
     print("[INFO] Process completed successfully.")
+
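The rewritten train_predict_weight swaps the fixed hyperparameters for a GridSearchCV sweep over a RandomForestRegressor scored by negative MSE. A self-contained sketch of the same tuning pattern on synthetic data (the smaller grid and the make_regression input are placeholders chosen so the snippet runs quickly, not values from the commit):

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

# Synthetic regression problem standing in for the repo feature matrix.
X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Small grid so the sketch finishes quickly; the commit sweeps a larger one with cv=3.
param_grid = {"n_estimators": [50, 100], "max_depth": [5, 10]}
search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring="neg_mean_squared_error",
)
search.fit(X_train, y_train)

best_rf = search.best_estimator_
mse = mean_squared_error(y_test, best_rf.predict(X_test))
print(search.best_params_, mse)

Note that, as committed, feature_cols keeps every column outside the exclusion list, so any non-numeric columns that survive cleaning would still need to be encoded or dropped before the fit.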
app.py
CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
 from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
 import pandas as pd
 import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
 import time
 import io
 from PIL import Image
@@ -15,25 +17,34 @@ def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
     df = train_predict_weight(df)
     progress(0.6, desc="Saving results to CSV...")
     csv_path = create_submission_csv(df, "submission.csv")
-    progress(0.8, desc="Generating
+    progress(0.8, desc="Generating graphs...")
+
+    # Feature distribution plot
+    dist_fig = plt.figure(figsize=(15, 10))
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    dist_buf = io.BytesIO()
+    plt.savefig(dist_buf, format='png')
+    dist_buf.seek(0)
+    plt.close(dist_fig)
+    dist_img = Image.open(dist_buf)
+
+    # Correlation matrix plot
+    corr_fig = plt.figure(figsize=(12, 8))
+    correlation_matrix = df[numeric_cols].corr()
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    corr_buf = io.BytesIO()
+    plt.savefig(corr_buf, format='png')
+    corr_buf.seek(0)
+    plt.close(corr_fig)
+    corr_img = Image.open(corr_buf)
+
     progress(1, desc="Done!")
     elapsed = time.time() - start_time
     preview = df.head().to_csv(index=False)
-    return preview, csv_path,
+    return preview, csv_path, dist_img, corr_img, f"Analysis completed in {elapsed:.2f} seconds."
 
 iface = gr.Interface(
     fn=analyze_file,
@@ -41,14 +52,15 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="Preview of Results"),
         gr.File(label="Download CSV"),
-        gr.Image(label="
+        gr.Image(label="Feature Distributions"),
+        gr.Image(label="Feature Correlation Matrix"),
         gr.Textbox(label="Status/Timing Info")
     ],
     title="DeepFunding Oracle",
-    description="Upload a CSV of repo-parent relationships; see analysis progress, get
+    description="Upload a CSV of repo-parent relationships; see analysis progress, get graphs, and download results as CSV.",
     allow_flagging="never"
 )
 
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
-    iface.launch(server_name="0.0.0.0", server_port=port)
+    iface.launch(server_name="0.0.0.0", server_port=port)
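analyze_file now hands two PIL images to the gr.Image outputs by saving each matplotlib figure into an in-memory PNG buffer. A minimal sketch of that figure-to-image round trip, independent of the Gradio app (the Agg backend call is an assumption for headless rendering, not part of the commit):

import io

import matplotlib
matplotlib.use("Agg")  # render off-screen, as a Space without a display would
import matplotlib.pyplot as plt
from PIL import Image

# Draw any figure; the commit draws histograms and a seaborn heatmap instead.
fig = plt.figure(figsize=(4, 3))
plt.plot([1, 2, 3], [2, 4, 8])
plt.title("Example figure")

# Serialize the figure to PNG bytes and reopen them as a PIL image.
buf = io.BytesIO()
plt.savefig(buf, format="png")
buf.seek(0)
plt.close(fig)

img = Image.open(buf)
print(img.size)  # a regular PIL image, ready to return from a Gradio callback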