FelixPhilip committed on
Commit
57ca96a
·
1 Parent(s): 52a14c1

Oracle weight assigning update

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/DeepFundingOracle.iml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/.venv" />
6
+ </content>
7
+ <orderEntry type="jdk" jdkName="Python 3.11 (DeepFundingOracle)" jdkType="Python SDK" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ <component name="PyDocumentationSettings">
11
+ <option name="format" value="GOOGLE" />
12
+ <option name="myDocStringFormat" value="Google" />
13
+ </component>
14
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="Black">
4
+ <option name="sdkName" value="Python 3.11 (DeepFundingOracle)" />
5
+ </component>
6
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (DeepFundingOracle)" project-jdk-type="Python SDK" />
7
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/DeepFundingOracle.iml" filepath="$PROJECT_DIR$/.idea/DeepFundingOracle.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
Oracle/deepfundingoracle.py CHANGED
@@ -169,62 +169,115 @@ def fetch_github_features(df):
169
  def timeout_handler(signum, frame):
170
  raise TimeoutError("LLama model prediction timed out.")
171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  def assign_base_weight(df, max_workers=32):
173
  """
174
- Assign base weights using LLama model in parallel.
 
175
  """
176
- print("[INFO] Starting base weight assignment using LLama model...", flush=True)
177
- logging.info("[INFO] Assigning base weights using LLama model...")
178
  start_time = time.time()
179
  llama = SmolLM()
180
- base_weights = []
181
- llm_cache = {}
182
 
183
- # Prepare prompts for all repositories
184
- prompts = {}
185
- for idx, row in df.iterrows():
186
- repo = row.get("repo", "")
187
- parent = row.get("parent", "")
188
- stars = row.get("stars", 0)
189
- forks = row.get("forks", 0)
190
- watchers = row.get("watchers", 0)
191
- issues = row.get("open_issues", 0)
192
- pulls = row.get("pulls", 0)
193
- activity = row.get("activity", "")
194
- prompts[idx] = (
195
- f"Repository: {repo}\n"
196
- f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
197
- f"Parent or dependency: {parent}\n\n"
198
- "Based on these features, assign a dependency weight between 0 and 1 for the repository "
199
- "that reflects how influential the repository is as a source relative to its parent. "
200
- "Only output the numeric value."
201
- )
 
 
 
202
 
203
- # Define the prediction function
204
- def _predict(idx, prompt):
205
- if idx in llm_cache:
206
- return idx, llm_cache[idx]
207
- try:
208
- resp = llama.predict(prompt)
209
- match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
210
- weight = min(max(float(match.group()), 0), 1) if match else 0.0
211
- llm_cache[idx] = weight
212
- return idx, weight
213
- except Exception as e:
214
- print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
215
- logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
216
- return idx, 0.0 # Default weight in case of failure
217
 
218
- # Run predictions in parallel
219
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
220
- futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
221
- for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
222
- idx, weight = fut.result()
223
- base_weights.append((idx, weight))
224
 
225
- # Sort weights by index and assign to DataFrame
226
- base_weights.sort(key=lambda x: x[0])
227
- df["base_weight"] = [weight for _, weight in base_weights]
 
228
 
229
  end_time = time.time()
230
  print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
 
169
  def timeout_handler(signum, frame):
170
  raise TimeoutError("LLama model prediction timed out.")
171
 
172
+ # def assign_base_weight(df, max_workers=32):
173
+ # """
174
+ # Assign base weights using LLama model in parallel.
175
+ # """
176
+ # print("[INFO] Starting base weight assignment using LLama model...", flush=True)
177
+ # logging.info("[INFO] Assigning base weights using LLama model...")
178
+ # start_time = time.time()
179
+ # llama = SmolLM()
180
+ # base_weights = []
181
+ # llm_cache = {}
182
+ #
183
+ # # Prepare prompts for all repositories
184
+ # prompts = {}
185
+ # for idx, row in df.iterrows():
186
+ # repo = row.get("repo", "")
187
+ # parent = row.get("parent", "")
188
+ # stars = row.get("stars", 0)
189
+ # forks = row.get("forks", 0)
190
+ # watchers = row.get("watchers", 0)
191
+ # issues = row.get("open_issues", 0)
192
+ # pulls = row.get("pulls", 0)
193
+ # activity = row.get("activity", "")
194
+ # prompts[idx] = (
195
+ # f"Repository: {repo}\n"
196
+ # f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
197
+ # f"Parent or dependency: {parent}\n\n"
198
+ # "Based on these features, assign a dependency weight between 0 and 1 for the repository "
199
+ # "that reflects how influential the repository is as a source relative to its parent. "
200
+ # "Only output the numeric value."
201
+ # )
202
+ #
203
+ # # Define the prediction function
204
+ # def _predict(idx, prompt):
205
+ # if idx in llm_cache:
206
+ # return idx, llm_cache[idx]
207
+ # try:
208
+ # resp = llama.predict(prompt)
209
+ # match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
210
+ # weight = min(max(float(match.group()), 0), 1) if match else 0.0
211
+ # llm_cache[idx] = weight
212
+ # return idx, weight
213
+ # except Exception as e:
214
+ # print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
215
+ # logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
216
+ # return idx, 0.0 # Default weight in case of failure
217
+ #
218
+ # # Run predictions in parallel
219
+ # with ThreadPoolExecutor(max_workers=max_workers) as executor:
220
+ # futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
221
+ # for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
222
+ # idx, weight = fut.result()
223
+ # base_weights.append((idx, weight))
224
+ #
225
+ # # Sort weights by index and assign to DataFrame
226
+ # base_weights.sort(key=lambda x: x[0])
227
+ # df["base_weight"] = [weight for _, weight in base_weights]
228
+ #
229
+ # end_time = time.time()
230
+ # print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
231
+ # logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
232
+ # return df
233
+
234
def assign_base_weight(df, max_workers=32):
    """
    Assign base weights to repositories using a single LLM call.

    Asks the LLM once for per-feature importance weights, then computes each
    repository's raw weight programmatically as a weighted sum of its GitHub
    metrics, and finally min-max normalizes the result within each parent
    group.

    Args:
        df: DataFrame with one row per repository. Expected columns include
            the GitHub metric features (stars, forks, watchers, open_issues,
            pulls, activity, contributors) and "parent".
        max_workers: Unused in this single-call approach; kept for backward
            compatibility with the previous parallel implementation.

    Returns:
        The same DataFrame with "base_weight_raw" and "base_weight" columns
        added, or unchanged if the LLM call fails.
    """
    import json  # local import: parse the LLM response safely instead of eval()

    print("[INFO] Starting optimized base weight assignment...", flush=True)
    logging.info("[INFO] Assigning base weights using optimized approach...")
    start_time = time.time()
    llama = SmolLM()

    # Step 1: Call LLM once to determine weights for each feature
    prompt = (
        "The following are GitHub repository features:\n"
        "- Stars\n"
        "- Forks\n"
        "- Watchers\n"
        "- Open Issues\n"
        "- Pull Requests\n"
        "- Activity (days since last update)\n"
        "- Contributors\n\n"
        "Assign a weight (0-1) to each feature based on its importance in determining "
        "the influence of a repository. Provide the weights as a JSON object with "
        "keys as feature names and values as their weights."
    )
    try:
        response = llama.predict(prompt)
        # SECURITY: never eval() model output. Extract the first {...} span
        # (models often wrap JSON in prose or code fences) and parse it.
        match = re.search(r"\{.*\}", response, re.DOTALL)
        if not match:
            raise ValueError(f"No JSON object found in LLM response: {response!r}")
        raw_weights = json.loads(match.group())
        # Canonicalize LLM keys ("Pull Requests", "Activity (days since last
        # update)") to the DataFrame's snake_case column names so the lookup
        # in calculate_weight() actually matches.
        def _canonical(name):
            key = re.sub(r"\(.*?\)", "", str(name)).strip().lower().replace(" ", "_")
            return {"pull_requests": "pulls"}.get(key, key)
        feature_weights = {_canonical(k): v for k, v in raw_weights.items()}
        print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
    except Exception as e:
        # Best-effort: on any LLM/parsing failure, leave df unmodified.
        print(f"[ERROR] Failed to fetch feature weights from LLM: {e}", flush=True)
        logging.error(f"[ERROR] Failed to fetch feature weights from LLM: {e}")
        return df

    # Step 2: Programmatically calculate weights for each repository
    def calculate_weight(row):
        # Weighted sum over whichever weighted features are present and non-NaN.
        weight = 0
        for feature, feature_weight in feature_weights.items():
            if feature in row and pd.notna(row[feature]):
                weight += row[feature] * feature_weight
        return weight

    df["base_weight_raw"] = df.apply(calculate_weight, axis=1)

    # Step 3: Normalize weights per parent (min-max within each parent group).
    # NOTE(review): if all siblings share the same raw weight they all
    # normalize to 0 — preserved from the original formula.
    df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
        lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
    )

    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df