FelixPhilip committed
Commit 3388ab8 · 1 Parent(s): db24239
Files changed (3)
  1. Oracle/DataSmolAgent.py +73 -25
  2. Oracle/deepfundingoracle.py +101 -60
  3. app.py +31 -19
Oracle/DataSmolAgent.py CHANGED
@@ -1,7 +1,9 @@
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
 from smolagents import tool, CodeAgent
 from transformers import AutoTokenizer, AutoModelForCausalLM
+from sklearn.preprocessing import StandardScaler
 
 @tool
 def clean_data(df: pd.DataFrame) -> pd.DataFrame:
@@ -29,35 +31,52 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         The DataFrame updated with new dynamically engineered features.
     """
-    # Numeric columns: log transformation
+    # Numeric columns: log transformation for skewed features
     numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
     for col in numeric_cols:
-        if (df[col] >= 0).all():
+        if (df[col] >= 0).all() and df[col].skew() > 1:
             df[f"log_{col}"] = np.log(df[col] + 1)
 
-    # Date-like columns extraction
-    for col in df.columns:
-        if "date" in col.lower() or "time" in col.lower():
-            try:
-                df[col] = pd.to_datetime(df[col], errors='coerce')
-                df[f"{col}_year"] = df[col].dt.year
-                df[f"{col}_month"] = df[col].dt.month
-                df[f"{col}_day"] = df[col].dt.day
-            except (ValueError, TypeError):
-                pass
-
-    # Non-numeric processing: encode as categorical numeric codes.
-    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
-    valid_cat = []
-    for col in non_numeric:
-        try:
-            pd.to_datetime(df[col], errors='raise')
-        except ValueError:
-            valid_cat.append(col)
-    for col in valid_cat:
-        df[f"{col}_cat"] = df[col].astype("category").cat.codes
-
-    return df
+    # Repository age (days since creation)
+    if "created_at" in df.columns:
+        df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
+        df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
+
+    # Recent activity count (commits/issues in last 30/90 days)
+    if "activity" in df.columns:
+        df["activity"] = pd.to_datetime(df["activity"], errors="coerce")
+        now = pd.Timestamp.now()
+        df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
+        df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
+
+    # Open/closed PR ratio
+    if {"open_prs", "closed_prs"}.issubset(df.columns):
+        df["pr_ratio"] = df["open_prs"] / (df["closed_prs"] + 1)
+
+    # Issue resolution speed
+    if {"issues_closed", "issues_opened"}.issubset(df.columns):
+        df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
+
+    # Is the repo archived?
+    if "archived" in df.columns:
+        df["is_archived"] = df["archived"].astype(int)
+
+    # Description length
+    if "description" in df.columns:
+        df["description_length"] = df["description"].fillna("").apply(len)
+
+    # Topics count
+    if "topics" in df.columns:
+        df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))
+
+    # Normalize or standardize features
+    scaler = StandardScaler()
+    scaled_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "contributors"]
+    for col in scaled_cols:
+        if col in df.columns:
+            df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])
+
+    return df
 
 @tool
 def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
@@ -95,6 +114,34 @@ def predict_funding(df: pd.DataFrame) -> pd.DataFrame:
     )
     return df
 
+@tool
+def analyze_feature_importance(feature_importances: dict, feature_cols: list):
+    """
+    Visualizes feature importance and identifies irrelevant features.
+
+    Args:
+        feature_importances: A dictionary of feature names and their importance scores.
+        feature_cols: List of feature column names.
+    """
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot feature importance
+    plt.figure(figsize=(10, 6))
+    plt.barh(importance_df["Feature"], importance_df["Importance"], color="skyblue")
+    plt.xlabel("Importance")
+    plt.ylabel("Feature")
+    plt.title("Feature Importance")
+    plt.gca().invert_yaxis()
+    plt.show()
+
+    # Drop irrelevant features (importance < threshold)
+    threshold = 0.01
+    irrelevant_features = importance_df[importance_df["Importance"] < threshold]["Feature"].tolist()
+    print(f"[INFO] Irrelevant features (importance < {threshold}): {irrelevant_features}")
+    return irrelevant_features
+
 class DataSmolAgent(CodeAgent):
     """
     A data processing agent that cleans and extracts features from the provided DataFrame.
@@ -109,6 +156,7 @@ class DataSmolAgent(CodeAgent):
                 extract_features,
                 save_to_csv,  # Added save_to_csv tool
                 predict_funding,  # Added predict_funding tool
+                analyze_feature_importance,  # Added analyze_feature_importance tool
             ],
             model=self.model,
             additional_authorized_imports=["pandas", "numpy"]
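
The new extract_features branches can be exercised outside the agent; below is a minimal, hypothetical sketch on a toy DataFrame (column values are illustrative, not from the dataset), mirroring the committed rules for skewed numerics, repository age, topic counts, and scaling:

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    # Toy data with the GitHub-style columns the diff expects (values are made up).
    df = pd.DataFrame({
        "stars": [10, 2500, 40],
        "forks": [1, 300, 5],
        "created_at": ["2020-01-01", "2015-06-15", "2023-03-10"],
        "topics": ["ml,oracle", "", "data"],
    })

    # Log-transform non-negative, strongly skewed numerics (same condition as the commit).
    for col in df.select_dtypes(include=[np.number]).columns:
        if (df[col] >= 0).all() and df[col].skew() > 1:
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Repository age and topics count, as in the diff (an empty topics string still counts as 1 after split).
    df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
    df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
    df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))

    # Standardize selected columns when present.
    scaler = StandardScaler()
    for col in ["stars", "forks"]:
        df[f"scaled_{col}"] = scaler.fit_transform(df[[col]])

    print(df.head())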
Oracle/deepfundingoracle.py CHANGED
@@ -28,9 +28,11 @@ import re
 import json
 import time
 
-from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.model_selection import train_test_split, GridSearchCV
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
+import matplotlib.pyplot as plt
+import seaborn as sns
 
 from Oracle.SmolLM import SmolLM
 
@@ -261,6 +263,34 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 
+def sanity_check_weights(df):
+    """
+    Sanity-checks LLM weights by comparing them with other metrics.
+    """
+    print("[INFO] Performing sanity check on LLM weights...")
+    df["sanity_check_weight"] = (df["stars"] + df["forks"] + df["watchers"]) / 3
+    df["ensemble_weight"] = (df["base_weight"] + df["sanity_check_weight"]) / 2
+    print("[INFO] Sanity check and ensemble weights added.")
+    return df
+
+def visualize_feature_distributions(df):
+    """
+    Visualizes feature distributions and correlations.
+    """
+    print("[INFO] Visualizing feature distributions and correlations...")
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+    # Plot feature distributions
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    plt.show()
+
+    # Plot feature correlations
+    correlation_matrix = df[numeric_cols].corr()
+    plt.figure(figsize=(12, 8))
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    plt.show()
 
 def normalize_funding(df):
     """
@@ -284,91 +314,101 @@ def prepare_dataset(file):
     print("[INFO] Fetching GitHub features...")
     df = fetch_github_features(df)
     print("[INFO] GitHub features fetched successfully.")
+    print("[INFO] Cleaning data...")
+    df = clean_data(df)
+    print("[INFO] Data cleaned successfully.")
     print("[INFO] Assigning base weights using LLama model...")
     df = assign_base_weight(df)
+    df = sanity_check_weights(df)  # Add sanity-check and ensemble weights
     df = train_predict_weight(df)
+    visualize_feature_distributions(df)  # Add feature visualization
    df = normalize_funding(df)
     end_time = time.time()
     print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
     return df
 
 
+##############################
+# Data Cleaning
+##############################
+def clean_data(df):
+    """
+    Cleans the input DataFrame by handling missing values and removing outliers.
+    """
+    # Impute missing values
+    df.fillna(df.median(numeric_only=True), inplace=True)
+
+    # Remove extreme outliers using quantiles
+    for col in df.select_dtypes(include=[np.number]).columns:
+        q1 = df[col].quantile(0.25)
+        q3 = df[col].quantile(0.75)
+        iqr = q3 - q1
+        lower_bound = q1 - 1.5 * iqr
+        upper_bound = q3 + 1.5 * iqr
+        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
+
+    return df
+
+
 ##############################
 # RandomForest Regression
 ##############################
-def train_predict_weight(df,
-                         criterion='gini',
-                         max_features='sqrt',
-                         max_depth=12,
-                         min_samples_split=2,
-                         min_samples_leaf=1):
+def train_predict_weight(df):
     """
-    Uses a RandomForestRegressor to predict a repository weight based on GitHub features.
-    The regressor is tuned with provided hyperparameters.
-    A flag column 'is_source' is used to indicate if a repository is the primary source.
-    If none is flagged, the repo with the highest prediction is set as the parent.
+    Trains a RandomForestRegressor with hyperparameter tuning and evaluates the model.
     """
-    print("[INFO] Starting weight prediction...", flush=True)
+    print("[INFO] Starting weight prediction with hyperparameter tuning...", flush=True)
     start_time = time.time()
     target = "base_weight"
-    feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
-
-    if "activity" in df.columns:
-        df["activity"] = pd.to_datetime(df["activity"], errors="coerce", utc=True)
-        now = pd.Timestamp.now(tz="UTC")
-        df["activity"] = (now - df["activity"]).dt.days.fillna(-1)
-
-    if target not in df.columns:
-        raise ValueError("Base weight column missing.")
+    feature_cols = [col for col in df.columns if col not in ["repo", "parent", "base_weight", "final_weight"]]
 
     X = df[feature_cols]
     y = df[target]
 
-    # For regression, if a classification criterion is given, switch to 'mse'
-    reg_criterion = "squared_error" if criterion in ["gini", "entropy"] else criterion
-
-    rf_model = RandomForestRegressor(random_state=42,
-                                     criterion=reg_criterion,
-                                     max_features=max_features,
-                                     max_depth=max_depth,
-                                     min_samples_split=min_samples_split,
-                                     min_samples_leaf=min_samples_leaf,
-                                     n_estimators=200)
-    rf_model.fit(X, y)
-    df["rf_pred"] = rf_model.predict(X)
-
-    # Provide feedback about one of the trees in the RF
-    try:
-        depth = rf_model.estimators_[0].get_depth()
-        leaves = rf_model.estimators_[0].get_n_leaves()
-        print(f"[INFO] RF tree depth: {depth}, number of leaves: {leaves}", flush=True)
-    except Exception:
-        pass
-
-    parent_map = df.groupby("parent")["repo"].apply(list).to_dict()
-    final_weights = {}
-
-    for parent, children in parent_map.items():
-        group_idxs = df[df["parent"] == parent].index
-        preds = df.loc[group_idxs, "rf_pred"]
-        total = preds.sum()
-        if total > 0:
-            normed = preds / total
-        else:
-            # If sum is zero, assign equal weights.
-            normed = pd.Series([1/len(preds)] * len(preds), index=preds.index)
-        for idx, weight in normed.items():
-            final_weights[idx] = weight
-
-    df["final_weight"] = df.index.map(final_weights).fillna(0.0)
+    # Split data into train/test sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # Hyperparameter tuning using GridSearchCV
+    param_grid = {
+        "n_estimators": [100, 200, 300],
+        "max_depth": [10, 15, 20],
+        "min_samples_split": [2, 5, 10],
+        "min_samples_leaf": [1, 2, 4]
+    }
+    rf = RandomForestRegressor(random_state=42)
+    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring="neg_mean_squared_error", verbose=2)
+    grid_search.fit(X_train, y_train)
+
+    # Best model
+    best_rf = grid_search.best_estimator_
+    print(f"[INFO] Best parameters: {grid_search.best_params_}")
+
+    # Evaluate on test set
+    y_pred = best_rf.predict(X_test)
+    mse = mean_squared_error(y_test, y_pred)
+    print(f"[INFO] Test MSE: {mse}")
+
+    # Feature importance analysis
+    feature_importances = best_rf.feature_importances_
+    importance_df = pd.DataFrame({"Feature": feature_cols, "Importance": feature_importances}).sort_values(by="Importance", ascending=False)
+    print("[INFO] Feature importances:")
+    print(importance_df)
+
+    # Plot predictions vs. actual values
+    plt.scatter(y_test, y_pred, alpha=0.5)
+    plt.xlabel("Actual Base Weight")
+    plt.ylabel("Predicted Base Weight")
+    plt.title("Predictions vs. Actual")
+    plt.show()
+
+    # Assign predictions to DataFrame
+    df["final_weight"] = best_rf.predict(X)
 
     end_time = time.time()
     print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
     return df
 
 
-
-
 ##############################
 # CSV Output
 ##############################
@@ -393,3 +433,4 @@ if __name__ == "__main__":
     print("[INFO] Creating submission CSV...")
     create_submission_csv(df, output_file)
     print("[INFO] Process completed successfully.")
+
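
The retuned train_predict_weight can be approximated standalone; a minimal sketch on synthetic data follows. Column names track the diff, the grid is deliberately smaller than the committed one, and the committed feature_cols list keeps every column outside its exclusion list, so non-numeric columns are assumed to have been encoded or dropped upstream:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.metrics import mean_squared_error

    # Synthetic stand-in for the fetched GitHub features (values are made up).
    rng = np.random.default_rng(42)
    df = pd.DataFrame({
        "stars": rng.integers(0, 5000, 200),
        "forks": rng.integers(0, 800, 200),
        "watchers": rng.integers(0, 5000, 200),
    })
    df["base_weight"] = (df["stars"] + df["forks"]) / (df["stars"] + df["forks"]).max()

    # As in the committed function: everything outside the exclusion list is a feature.
    feature_cols = [c for c in df.columns if c not in ["repo", "parent", "base_weight", "final_weight"]]
    X_train, X_test, y_train, y_test = train_test_split(
        df[feature_cols], df["base_weight"], test_size=0.2, random_state=42)

    # Smaller grid than the commit's, to keep the sketch fast.
    param_grid = {"n_estimators": [100], "max_depth": [10, 20]}
    grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid,
                               cv=3, scoring="neg_mean_squared_error")
    grid_search.fit(X_train, y_train)

    best_rf = grid_search.best_estimator_
    print("Best params:", grid_search.best_params_)
    print("Test MSE:", mean_squared_error(y_test, best_rf.predict(X_test)))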
app.py CHANGED
@@ -3,6 +3,8 @@ import gradio as gr
 from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
 import pandas as pd
 import matplotlib.pyplot as plt
+import seaborn as sns
+import numpy as np
 import time
 import io
 from PIL import Image
@@ -15,25 +17,34 @@ def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
     df = train_predict_weight(df)
     progress(0.6, desc="Saving results to CSV...")
     csv_path = create_submission_csv(df, "submission.csv")
-    progress(0.8, desc="Generating graph...")
-    # Example: plot histogram of a column if exists
-    fig, ax = plt.subplots()
-    if 'final_weight' in df.columns:
-        df['final_weight'].hist(ax=ax)
-        ax.set_title('Distribution of Final Weights')
-        ax.set_xlabel('Final Weight')
-        ax.set_ylabel('Count')
-    else:
-        ax.text(0.5, 0.5, 'No final_weight column to plot', ha='center')
-    buf = io.BytesIO()
-    plt.savefig(buf, format='png')
-    buf.seek(0)
-    plt.close(fig)
-    img = Image.open(buf)
+    progress(0.8, desc="Generating graphs...")
+
+    # Feature distribution plot
+    dist_fig = plt.figure(figsize=(15, 10))
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
+    plt.suptitle("Feature Distributions", fontsize=16)
+    dist_buf = io.BytesIO()
+    plt.savefig(dist_buf, format='png')
+    dist_buf.seek(0)
+    plt.close(dist_fig)
+    dist_img = Image.open(dist_buf)
+
+    # Correlation matrix plot
+    corr_fig = plt.figure(figsize=(12, 8))
+    correlation_matrix = df[numeric_cols].corr()
+    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
+    plt.title("Feature Correlation Matrix", fontsize=16)
+    corr_buf = io.BytesIO()
+    plt.savefig(corr_buf, format='png')
+    corr_buf.seek(0)
+    plt.close(corr_fig)
+    corr_img = Image.open(corr_buf)
+
     progress(1, desc="Done!")
     elapsed = time.time() - start_time
     preview = df.head().to_csv(index=False)
-    return preview, csv_path, img, f"Analysis completed in {elapsed:.2f} seconds."
+    return preview, csv_path, dist_img, corr_img, f"Analysis completed in {elapsed:.2f} seconds."
 
 iface = gr.Interface(
     fn=analyze_file,
@@ -41,14 +52,15 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="Preview of Results"),
         gr.File(label="Download CSV"),
-        gr.Image(label="Analysis Graph"),
+        gr.Image(label="Feature Distributions"),
+        gr.Image(label="Feature Correlation Matrix"),
         gr.Textbox(label="Status/Timing Info")
     ],
     title="DeepFunding Oracle",
-    description="Upload a CSV of repo-parent relationships; see analysis progress, get a graph, and download results as CSV.",
+    description="Upload a CSV of repo-parent relationships; see analysis progress, get graphs, and download results as CSV.",
     allow_flagging="never"
 )
 
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", 7860))
-    iface.launch(server_name="0.0.0.0", server_port=port)
+    iface.launch(server_name="0.0.0.0", server_port=port)
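
For readers adapting the Gradio outputs, here is a minimal, self-contained sketch of the figure-to-PIL round trip that analyze_file now performs for each graph (dummy data; the Agg backend is an assumption for a headless server):

    import io
    import matplotlib
    matplotlib.use("Agg")  # headless backend, assumed for server deployments
    import matplotlib.pyplot as plt
    import pandas as pd
    from PIL import Image

    # Dummy data standing in for the prepared DataFrame.
    df = pd.DataFrame({"final_weight": [0.1, 0.2, 0.3, 0.4]})

    fig = plt.figure(figsize=(6, 4))
    df["final_weight"].hist(bins=10)
    plt.title("Feature Distributions (dummy data)")

    # Render the current figure to an in-memory PNG and wrap it as a PIL image,
    # which is what gets returned to a gr.Image output.
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)
    img = Image.open(buf)
    print(img.size)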