FelixPhilip commited on
Commit
ea68d4a
·
1 Parent(s): 386c440
Files changed (2) hide show
  1. Oracle/DataSmolAgent.py +34 -12
  2. Oracle/deepfundingoracle.py +160 -109
Oracle/DataSmolAgent.py CHANGED
@@ -34,17 +34,30 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
34
  # Numeric columns: log transformation for skewed features
35
  numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
36
  for col in numeric_cols:
37
- if (df[col] >= 0).all() and df[col].skew() > 1:
38
  df[f"log_{col}"] = np.log(df[col] + 1)
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # Repository age (days since creation)
41
  if "created_at" in df.columns:
42
  df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
43
  df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
44
 
45
  # Recent activity count (commits/issues in last 30/90 days)
46
- if "activity" in df.columns:
47
- df["activity"] = pd.to_datetime(df["activity"], errors="coerce")
48
  now = pd.Timestamp.now()
49
  df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
50
  df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
@@ -57,17 +70,25 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
57
  if {"issues_closed", "issues_opened"}.issubset(df.columns):
58
  df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
59
 
60
- # Is the repo archived?
61
- if "archived" in df.columns:
62
- df["is_archived"] = df["archived"].astype(int)
 
 
 
63
 
64
- # Description length
65
- if "description" in df.columns:
66
- df["description_length"] = df["description"].fillna("").apply(len)
 
 
 
 
 
67
 
68
- # Topics count
69
- if "topics" in df.columns:
70
- df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))
71
 
72
  # Normalize or standardize features
73
  scaler = StandardScaler()
@@ -178,3 +199,4 @@ class DataSmolAgent(CodeAgent):
178
  print(f"CSV saved at: {csv_output}")
179
 
180
  return self.df
 
 
34
  # Numeric columns: log transformation for skewed features
35
  numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
36
  for col in numeric_cols:
37
+ if (df[col] >= 0).all():
38
  df[f"log_{col}"] = np.log(df[col] + 1)
39
 
40
+ # Date-like columns extraction
41
+ for col in df.columns:
42
+ if "date" in col.lower() or "time" in col.lower() or col == "activity":
43
+ try:
44
+ df[col] = pd.to_datetime(df[col], errors='coerce')
45
+ if not df[col].isna().all(): # Only create features if we have valid dates
46
+ df[f"{col}_year"] = df[col].dt.year
47
+ df[f"{col}_month"] = df[col].dt.month
48
+ df[f"{col}_day"] = df[col].dt.day
49
+ # Calculate age (days since date)
50
+ df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
51
+ except Exception:
52
+ pass
53
+
54
  # Repository age (days since creation)
55
  if "created_at" in df.columns:
56
  df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
57
  df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
58
 
59
  # Recent activity count (commits/issues in last 30/90 days)
60
+ if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
 
61
  now = pd.Timestamp.now()
62
  df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
63
  df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
 
70
  if {"issues_closed", "issues_opened"}.issubset(df.columns):
71
  df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
72
 
73
+ # Feature ratios
74
+ if {"stars", "forks"}.issubset(df.columns):
75
+ df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)
76
+
77
+ if {"open_issues", "closed_issues"}.issubset(df.columns):
78
+ df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)
79
 
80
+ # Non-numeric processing: encode categorical features
81
+ non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
82
+ valid_cat = []
83
+ for col in non_numeric:
84
+ try:
85
+ pd.to_datetime(df[col], errors='raise')
86
+ except Exception:
87
+ valid_cat.append(col)
88
 
89
+ for col in valid_cat:
90
+ if col not in ["repo", "parent"]: # Skip identifier columns
91
+ df[f"{col}_cat"] = df[col].astype("category").cat.codes
92
 
93
  # Normalize or standardize features
94
  scaler = StandardScaler()
 
199
  print(f"CSV saved at: {csv_output}")
200
 
201
  return self.df
202
+
Oracle/deepfundingoracle.py CHANGED
@@ -55,64 +55,97 @@ logging.basicConfig(
55
  def fetch_repo_metrics(repo_url):
56
  """
57
  Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
 
 
58
  """
 
 
 
 
 
 
 
 
 
 
 
 
59
  try:
60
  # Extract owner and repo name
61
  m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
62
  if not m:
63
- return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
 
 
64
  owner, repo_name = m.group(1), m.group(2)
65
  api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
66
  headers = {}
67
 
68
  token = os.environ.get("GITHUB_API_TOKEN", "")
69
- if token: headers["Authorization"] = f"token {token}"
70
- r = requests.get(api_url, headers=headers)
 
 
 
71
  if r.status_code == 200:
72
  data = r.json()
73
- # Log fetched data for debugging
74
- print(f"[DEBUG] Fetched data for {repo_url}: {data}")
75
- pulls_url = data.get("pulls_url", "").replace("{/state}", "")
76
- pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
77
- activity = data.get("updated_at", "")
78
- return {
79
  "stargazers_count": data.get("stargazers_count", 0),
80
  "forks_count": data.get("forks_count", 0),
81
  "watchers_count": data.get("watchers_count", 0),
82
  "open_issues_count": data.get("open_issues_count", 0),
83
- "pulls_count": pulls_count,
84
- "activity": activity,
85
  "owner": owner,
86
  "repo_name": repo_name,
87
- "token": token
88
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  else:
90
  print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
91
- return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
92
  except Exception as e:
93
  print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
94
- return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
95
-
96
-
97
- ##############################
98
- # Feature Extraction
99
- ##############################
100
- def load_data(file):
101
- """
102
- Dynamically load the dependency data CSV from the uploaded file.
103
- Expects at least "repo" and "parent" columns.
104
- """
105
- try:
106
- print("[INFO] Loading data from uploaded file...")
107
- start_time = time.time()
108
- # Read the uploaded file directly into a DataFrame
109
- df = pd.read_csv(file)
110
- end_time = time.time()
111
- print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
112
- return df
113
- except Exception as e:
114
- print("[ERROR] Error loading data:", e)
115
- return None
116
 
117
  def fetch_github_features(df):
118
  """
@@ -122,45 +155,27 @@ def fetch_github_features(df):
122
  """
123
  print("[INFO] Fetching GitHub features for repositories...")
124
  start_time = time.time()
125
- stars_list = []
126
- forks_list = []
127
- watchers_list = []
128
- issues_list = []
129
- pulls_list = []
130
- activity_list = []
131
- contributors_list = []
132
- dependencies_list =[]
 
 
 
 
133
 
134
  cache = {}
135
 
136
  def get_metrics(repo_url):
137
  if repo_url in cache:
138
- print(f"[DEBUG] Cached data for {repo_url}: {cache[repo_url]}")
139
  return cache[repo_url]
140
  val = fetch_repo_metrics(repo_url)
141
- print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}") # <-- Add this line
142
- try:
143
- m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
144
- if m:
145
- owner, repo_name = m.group(1), m.group(2)
146
- pkg_url = f"https://api.github.com/repos/{owner}/{repo_name}/packages.json"
147
- headers = {}
148
- token = os.environ.get("GITHUB_API_TOKEN", "")
149
- if token:
150
- headers["Authorization"] = f"token {token}"
151
- pkg_resp = requests.get(pkg_url, headers=headers)
152
- if pkg_resp.status_code ==200:
153
- pkg_data = pkg_resp.json()
154
- content = base64.b64decode(pkg_data["content",""]).decode("utf-8")
155
- pkg_json = json.loads(content)
156
- dependencies = pkg_json.get("dependencies", {})
157
- val["dependencies_count"] = len(dependencies)
158
- else:
159
- val["dependencies_count"] = 0
160
- else:
161
- val["dependencies_count"] = 0
162
- except Exception:
163
- val["dependencies_count"] = 0
164
  cache[repo_url] = val
165
  return val
166
 
@@ -168,38 +183,82 @@ def fetch_github_features(df):
168
  futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
169
  for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
170
  res = fut.result()
171
- stars_list.append(res.get("stargazers_count", 0))
172
- forks_list.append(res.get("forks_count", 0))
173
- watchers_list.append(res.get("watchers_count", 0))
174
- issues_list.append(res.get("open_issues_count", 0))
175
- pulls_list.append(res.get("pulls_count", 0))
176
- activity_list.append(res.get("activity", 0))
177
- dependencies_list.append(res.get("dependencies_count", 0))
178
- # Fetch contributors count
179
- try:
180
- contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
181
- headers = {"Authorization": f"token {res['token']}"}
182
- contributors_response = requests.get(contributors_url, headers=headers)
183
- if contributors_response.status_code == 200:
184
- contributors_list.append(len(contributors_response.json()))
185
- else:
186
- contributors_list.append(0)
187
- except Exception:
188
- contributors_list.append(0)
189
-
190
- df["stars"] = stars_list
191
- df["forks"] = forks_list
192
- df["watchers"] = watchers_list
193
- df["open_issues"] = issues_list
194
- df["pulls"] = pulls_list
195
- df["activity"] = activity_list
196
- df["contributors"] = contributors_list
197
- df["dependencies_count"] = dependencies_list
198
 
199
  end_time = time.time()
200
  print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
201
  return df
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  def timeout_handler(signum, frame):
204
  raise TimeoutError("LLama model prediction timed out.")
205
 
@@ -244,7 +303,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
244
  feature_weights = calculate_fallback_weights(df)
245
  print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
246
 
247
- # Ensure numeric columns are properly formatted
248
  for feature in feature_weights.keys():
249
  if feature in df.columns:
250
  df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
@@ -266,19 +324,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
266
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
267
  return df
268
 
269
- def calculate_fallback_weights(df):
270
- """
271
- Dynamically calculate fallback feature weights based on feature variance and correlation with the target.
272
- """
273
- print("[INFO] Calculating fallback feature weights...")
274
- numeric_cols = df.select_dtypes(include=[np.number]).columns
275
- feature_variances = df[numeric_cols].var()
276
- total_variance = feature_variances.sum()
277
-
278
- # Assign weights proportional to feature variance
279
- fallback_weights = {col: var / total_variance for col, var in feature_variances.items() if total_variance > 0}
280
- return fallback_weights
281
-
282
  def sanity_check_weights(df):
283
  """
284
  Sanity-checks LLM weights by comparing them with other metrics.
@@ -389,6 +434,7 @@ def validate_features(df):
389
  def validate_target(df):
390
  """
391
  Validates the target variable to ensure it has sufficient variance.
 
392
  """
393
  print("[INFO] Validating target variable 'base_weight'...")
394
  target = "base_weight"
@@ -398,7 +444,12 @@ def validate_target(df):
398
  variance = df[target].var()
399
  print(f"[DEBUG] Target variable variance: {variance}")
400
  if variance < 1e-6:
401
- raise ValueError(f"Target variable '{target}' has insufficient variance. Please check feature values.")
 
 
 
 
 
402
  return df
403
 
404
 
 
55
  def fetch_repo_metrics(repo_url):
56
  """
57
  Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
58
+ Assumes repo_url is in the form "https://github.com/owner/repo".
59
+ Handles API failures and malformed URLs gracefully.
60
  """
61
+ # Default values in case of failure
62
+ default_metrics = {
63
+ "stargazers_count": 0,
64
+ "forks_count": 0,
65
+ "watchers_count": 0,
66
+ "open_issues_count": 0,
67
+ "pulls_count": 0,
68
+ "activity": "",
69
+ "contributors": 0,
70
+ "dependencies_count": 0
71
+ }
72
+
73
  try:
74
  # Extract owner and repo name
75
  m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
76
  if not m:
77
+ print(f"[WARN] Malformed GitHub URL: {repo_url}")
78
+ return default_metrics
79
+
80
  owner, repo_name = m.group(1), m.group(2)
81
  api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
82
  headers = {}
83
 
84
  token = os.environ.get("GITHUB_API_TOKEN", "")
85
+ if token:
86
+ headers["Authorization"] = f"token {token}"
87
+
88
+ # Fetch main repository data
89
+ r = requests.get(api_url, headers=headers, timeout=10)
90
  if r.status_code == 200:
91
  data = r.json()
92
+ metrics = {
 
 
 
 
 
93
  "stargazers_count": data.get("stargazers_count", 0),
94
  "forks_count": data.get("forks_count", 0),
95
  "watchers_count": data.get("watchers_count", 0),
96
  "open_issues_count": data.get("open_issues_count", 0),
97
+ "activity": data.get("updated_at", ""),
 
98
  "owner": owner,
99
  "repo_name": repo_name,
100
+ "dependencies_count": 0
101
  }
102
+
103
+ # Try to fetch pull requests count
104
+ try:
105
+ pulls_url = f"{api_url}/pulls"
106
+ pulls_resp = requests.get(pulls_url, headers=headers, timeout=5)
107
+ metrics["pulls_count"] = len(pulls_resp.json()) if pulls_resp.status_code == 200 else 0
108
+ except Exception as e:
109
+ print(f"[WARN] Failed to fetch pulls for {repo_url}: {e}")
110
+ metrics["pulls_count"] = 0
111
+
112
+ # Try to fetch contributors count
113
+ try:
114
+ contributors_url = f"{api_url}/contributors"
115
+ contributors_resp = requests.get(contributors_url, headers=headers, timeout=5)
116
+ metrics["contributors"] = len(contributors_resp.json()) if contributors_resp.status_code == 200 else 0
117
+ except Exception as e:
118
+ print(f"[WARN] Failed to fetch contributors for {repo_url}: {e}")
119
+ metrics["contributors"] = 0
120
+
121
+ # Try to estimate dependencies from package files
122
+ try:
123
+ # Look for package.json for Node.js projects
124
+ package_json_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/package.json"
125
+ package_resp = requests.get(package_json_url, timeout=5)
126
+ if package_resp.status_code == 200:
127
+ package_data = package_resp.json()
128
+ deps = package_data.get("dependencies", {})
129
+ dev_deps = package_data.get("devDependencies", {})
130
+ metrics["dependencies_count"] = len(deps) + len(dev_deps)
131
+ else:
132
+ # Try requirements.txt for Python projects
133
+ req_txt_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/requirements.txt"
134
+ req_resp = requests.get(req_txt_url, timeout=5)
135
+ if req_resp.status_code == 200:
136
+ deps = [line for line in req_resp.text.split('\n') if line.strip() and not line.startswith('#')]
137
+ metrics["dependencies_count"] = len(deps)
138
+ except Exception as e:
139
+ print(f"[WARN] Failed to fetch dependencies for {repo_url}: {e}")
140
+ metrics["dependencies_count"] = 0
141
+
142
+ return metrics
143
  else:
144
  print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
145
+ return default_metrics
146
  except Exception as e:
147
  print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
148
+ return default_metrics
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  def fetch_github_features(df):
151
  """
 
155
  """
156
  print("[INFO] Fetching GitHub features for repositories...")
157
  start_time = time.time()
158
+
159
+ # Initialize lists for storing fetched data
160
+ metrics_lists = {
161
+ "stars": [],
162
+ "forks": [],
163
+ "watchers": [],
164
+ "open_issues": [],
165
+ "pulls": [],
166
+ "activity": [],
167
+ "contributors": [],
168
+ "dependencies_count": []
169
+ }
170
 
171
  cache = {}
172
 
173
  def get_metrics(repo_url):
174
  if repo_url in cache:
175
+ print(f"[DEBUG] Cached GitHub data for {repo_url}: {cache[repo_url]}")
176
  return cache[repo_url]
177
  val = fetch_repo_metrics(repo_url)
178
+ print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  cache[repo_url] = val
180
  return val
181
 
 
183
  futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
184
  for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
185
  res = fut.result()
186
+ metrics_lists["stars"].append(res.get("stargazers_count", 0))
187
+ metrics_lists["forks"].append(res.get("forks_count", 0))
188
+ metrics_lists["watchers"].append(res.get("watchers_count", 0))
189
+ metrics_lists["open_issues"].append(res.get("open_issues_count", 0))
190
+ metrics_lists["pulls"].append(res.get("pulls_count", 0))
191
+ metrics_lists["activity"].append(res.get("activity", ""))
192
+ metrics_lists["contributors"].append(res.get("contributors", 0))
193
+ metrics_lists["dependencies_count"].append(res.get("dependencies_count", 0))
194
+
195
+ # Add the fetched data to the DataFrame
196
+ for key, values in metrics_lists.items():
197
+ df[key] = values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  end_time = time.time()
200
  print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
201
  return df
202
 
203
+ def calculate_fallback_weights(df):
204
+ """
205
+ Dynamically calculate fallback feature weights based on feature variance.
206
+ """
207
+ print("[INFO] Calculating fallback feature weights...")
208
+ numeric_cols = ['stars', 'forks', 'watchers', 'open_issues', 'pulls', 'contributors', 'dependencies_count']
209
+ # Filter to only include columns that exist in the DataFrame
210
+ valid_cols = [col for col in numeric_cols if col in df.columns]
211
+
212
+ # Create default weights
213
+ default_weights = {
214
+ 'stars': 0.3,
215
+ 'forks': 0.2,
216
+ 'watchers': 0.2,
217
+ 'open_issues': 0.1,
218
+ 'pulls': 0.1,
219
+ 'contributors': 0.05,
220
+ 'dependencies_count': 0.05
221
+ }
222
+
223
+ # If any data exists, calculate variance-based weights
224
+ if len(valid_cols) > 0 and df[valid_cols].sum().sum() > 0:
225
+ # Calculate variance for each feature
226
+ feature_variances = df[valid_cols].var()
227
+ total_variance = feature_variances.sum()
228
+
229
+ # If meaningful variance exists, use it for weights
230
+ if total_variance > 0:
231
+ weights = {col: var / total_variance for col, var in feature_variances.items()}
232
+ # Normalize to ensure sum is 1.0
233
+ sum_weights = sum(weights.values())
234
+ if sum_weights > 0:
235
+ weights = {k: v / sum_weights for k, v in weights.items()}
236
+ return weights
237
+
238
+ # Return default weights if we couldn't calculate meaningful ones
239
+ print("[INFO] Using default fallback weights")
240
+ return default_weights
241
+
242
+ ##############################
243
+ # Feature Extraction
244
+ ##############################
245
+ def load_data(file):
246
+ """
247
+ Dynamically load the dependency data CSV from the uploaded file.
248
+ Expects at least "repo" and "parent" columns.
249
+ """
250
+ try:
251
+ print("[INFO] Loading data from uploaded file...")
252
+ start_time = time.time()
253
+ # Read the uploaded file directly into a DataFrame
254
+ df = pd.read_csv(file)
255
+ end_time = time.time()
256
+ print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
257
+ return df
258
+ except Exception as e:
259
+ print("[ERROR] Error loading data:", e)
260
+ return None
261
+
262
  def timeout_handler(signum, frame):
263
  raise TimeoutError("LLama model prediction timed out.")
264
 
 
303
  feature_weights = calculate_fallback_weights(df)
304
  print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
305
 
 
306
  for feature in feature_weights.keys():
307
  if feature in df.columns:
308
  df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
 
324
  logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
325
  return df
326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  def sanity_check_weights(df):
328
  """
329
  Sanity-checks LLM weights by comparing them with other metrics.
 
434
  def validate_target(df):
435
  """
436
  Validates the target variable to ensure it has sufficient variance.
437
+ If variance is insufficient, adds small random noise to create variance.
438
  """
439
  print("[INFO] Validating target variable 'base_weight'...")
440
  target = "base_weight"
 
444
  variance = df[target].var()
445
  print(f"[DEBUG] Target variable variance: {variance}")
446
  if variance < 1e-6:
447
+ print("[WARN] Target variable has insufficient variance. Adding small random noise...")
448
+ # Add small random noise to introduce variance
449
+ np.random.seed(42) # For reproducibility
450
+ noise = np.random.normal(0.5, 0.1, size=len(df))
451
+ df[target] = noise
452
+ print(f"[INFO] New target variable variance: {df[target].var()}")
453
  return df
454
 
455