Spaces:
Sleeping
Sleeping
Commit
·
57ca96a
1
Parent(s):
52a14c1
Oracle weight assigning update
Browse files
- .idea/.gitignore +8 -0
- .idea/DeepFundingOracle.iml +14 -0
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +7 -0
- .idea/modules.xml +8 -0
- .idea/vcs.xml +6 -0
- Oracle/deepfundingoracle.py +100 -47
.idea/.gitignore
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Default ignored files
|
2 |
+
/shelf/
|
3 |
+
/workspace.xml
|
4 |
+
# Editor-based HTTP Client requests
|
5 |
+
/httpRequests/
|
6 |
+
# Datasource local storage ignored files
|
7 |
+
/dataSources/
|
8 |
+
/dataSources.local.xml
|
.idea/DeepFundingOracle.iml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$">
|
5 |
+
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
6 |
+
</content>
|
7 |
+
<orderEntry type="jdk" jdkName="Python 3.11 (DeepFundingOracle)" jdkType="Python SDK" />
|
8 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
9 |
+
</component>
|
10 |
+
<component name="PyDocumentationSettings">
|
11 |
+
<option name="format" value="GOOGLE" />
|
12 |
+
<option name="myDocStringFormat" value="Google" />
|
13 |
+
</component>
|
14 |
+
</module>
|
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="Black">
|
4 |
+
<option name="sdkName" value="Python 3.11 (DeepFundingOracle)" />
|
5 |
+
</component>
|
6 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (DeepFundingOracle)" project-jdk-type="Python SDK" />
|
7 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/DeepFundingOracle.iml" filepath="$PROJECT_DIR$/.idea/DeepFundingOracle.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
Oracle/deepfundingoracle.py
CHANGED
@@ -169,62 +169,115 @@ def fetch_github_features(df):
|
|
169 |
def timeout_handler(signum, frame):
|
170 |
raise TimeoutError("LLama model prediction timed out.")
|
171 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
def assign_base_weight(df, max_workers=32):
|
173 |
"""
|
174 |
-
Assign base weights using
|
|
|
175 |
"""
|
176 |
-
print("[INFO] Starting base weight assignment
|
177 |
-
logging.info("[INFO] Assigning base weights using
|
178 |
start_time = time.time()
|
179 |
llama = SmolLM()
|
180 |
-
base_weights = []
|
181 |
-
llm_cache = {}
|
182 |
|
183 |
-
#
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
|
|
|
|
|
|
202 |
|
203 |
-
#
|
204 |
-
def
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
weight = min(max(float(match.group()), 0), 1) if match else 0.0
|
211 |
-
llm_cache[idx] = weight
|
212 |
-
return idx, weight
|
213 |
-
except Exception as e:
|
214 |
-
print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
|
215 |
-
logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
|
216 |
-
return idx, 0.0 # Default weight in case of failure
|
217 |
|
218 |
-
|
219 |
-
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
220 |
-
futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
|
221 |
-
for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
|
222 |
-
idx, weight = fut.result()
|
223 |
-
base_weights.append((idx, weight))
|
224 |
|
225 |
-
#
|
226 |
-
|
227 |
-
|
|
|
228 |
|
229 |
end_time = time.time()
|
230 |
print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
|
|
|
169 |
def timeout_handler(signum, frame):
|
170 |
raise TimeoutError("LLama model prediction timed out.")
|
171 |
|
172 |
+
# def assign_base_weight(df, max_workers=32):
|
173 |
+
# """
|
174 |
+
# Assign base weights using LLama model in parallel.
|
175 |
+
# """
|
176 |
+
# print("[INFO] Starting base weight assignment using LLama model...", flush=True)
|
177 |
+
# logging.info("[INFO] Assigning base weights using LLama model...")
|
178 |
+
# start_time = time.time()
|
179 |
+
# llama = SmolLM()
|
180 |
+
# base_weights = []
|
181 |
+
# llm_cache = {}
|
182 |
+
#
|
183 |
+
# # Prepare prompts for all repositories
|
184 |
+
# prompts = {}
|
185 |
+
# for idx, row in df.iterrows():
|
186 |
+
# repo = row.get("repo", "")
|
187 |
+
# parent = row.get("parent", "")
|
188 |
+
# stars = row.get("stars", 0)
|
189 |
+
# forks = row.get("forks", 0)
|
190 |
+
# watchers = row.get("watchers", 0)
|
191 |
+
# issues = row.get("open_issues", 0)
|
192 |
+
# pulls = row.get("pulls", 0)
|
193 |
+
# activity = row.get("activity", "")
|
194 |
+
# prompts[idx] = (
|
195 |
+
# f"Repository: {repo}\n"
|
196 |
+
# f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
|
197 |
+
# f"Parent or dependency: {parent}\n\n"
|
198 |
+
# "Based on these features, assign a dependency weight between 0 and 1 for the repository "
|
199 |
+
# "that reflects how influential the repository is as a source relative to its parent. "
|
200 |
+
# "Only output the numeric value."
|
201 |
+
# )
|
202 |
+
#
|
203 |
+
# # Define the prediction function
|
204 |
+
# def _predict(idx, prompt):
|
205 |
+
# if idx in llm_cache:
|
206 |
+
# return idx, llm_cache[idx]
|
207 |
+
# try:
|
208 |
+
# resp = llama.predict(prompt)
|
209 |
+
# match = re.search(r"[-+]?\d*\.\d+|\d+", resp)
|
210 |
+
# weight = min(max(float(match.group()), 0), 1) if match else 0.0
|
211 |
+
# llm_cache[idx] = weight
|
212 |
+
# return idx, weight
|
213 |
+
# except Exception as e:
|
214 |
+
# print(f"[ERROR] Failed to process repository {idx}: {e}", flush=True)
|
215 |
+
# logging.error(f"[ERROR] Failed to process repository {idx}: {e}")
|
216 |
+
# return idx, 0.0 # Default weight in case of failure
|
217 |
+
#
|
218 |
+
# # Run predictions in parallel
|
219 |
+
# with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
220 |
+
# futures = [executor.submit(_predict, idx, prompt) for idx, prompt in prompts.items()]
|
221 |
+
# for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="LLM Prompts"):
|
222 |
+
# idx, weight = fut.result()
|
223 |
+
# base_weights.append((idx, weight))
|
224 |
+
#
|
225 |
+
# # Sort weights by index and assign to DataFrame
|
226 |
+
# base_weights.sort(key=lambda x: x[0])
|
227 |
+
# df["base_weight"] = [weight for _, weight in base_weights]
|
228 |
+
#
|
229 |
+
# end_time = time.time()
|
230 |
+
# print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
|
231 |
+
# logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
|
232 |
+
# return df
|
233 |
+
|
234 |
def assign_base_weight(df, max_workers=32):
|
235 |
"""
|
236 |
+
Assign base weights using a single LLM call to determine feature weights,
|
237 |
+
and programmatically calculate repository weights.
|
238 |
"""
|
239 |
+
print("[INFO] Starting optimized base weight assignment...", flush=True)
|
240 |
+
logging.info("[INFO] Assigning base weights using optimized approach...")
|
241 |
start_time = time.time()
|
242 |
llama = SmolLM()
|
|
|
|
|
243 |
|
244 |
+
# Step 1: Call LLM once to determine weights for each feature
|
245 |
+
prompt = (
|
246 |
+
"The following are GitHub repository features:\n"
|
247 |
+
"- Stars\n"
|
248 |
+
"- Forks\n"
|
249 |
+
"- Watchers\n"
|
250 |
+
"- Open Issues\n"
|
251 |
+
"- Pull Requests\n"
|
252 |
+
"- Activity (days since last update)\n"
|
253 |
+
"- Contributors\n\n"
|
254 |
+
"Assign a weight (0-1) to each feature based on its importance in determining "
|
255 |
+
"the influence of a repository. Provide the weights as a JSON object with "
|
256 |
+
"keys as feature names and values as their weights."
|
257 |
+
)
|
258 |
+
try:
|
259 |
+
response = llama.predict(prompt)
|
260 |
+
feature_weights = eval(response) # Convert JSON string to dictionary
|
261 |
+
print(f"[INFO] Feature weights from LLM: {feature_weights}", flush=True)
|
262 |
+
except Exception as e:
|
263 |
+
print(f"[ERROR] Failed to fetch feature weights from LLM: {e}", flush=True)
|
264 |
+
logging.error(f"[ERROR] Failed to fetch feature weights from LLM: {e}")
|
265 |
+
return df
|
266 |
|
267 |
+
# Step 2: Programmatically calculate weights for each repository
|
268 |
+
def calculate_weight(row):
|
269 |
+
weight = 0
|
270 |
+
for feature, feature_weight in feature_weights.items():
|
271 |
+
if feature in row and pd.notna(row[feature]):
|
272 |
+
weight += row[feature] * feature_weight
|
273 |
+
return weight
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
|
275 |
+
df["base_weight_raw"] = df.apply(calculate_weight, axis=1)
|
|
|
|
|
|
|
|
|
|
|
276 |
|
277 |
+
# Step 3: Normalize weights per parent
|
278 |
+
df["base_weight"] = df.groupby("parent")["base_weight_raw"].transform(
|
279 |
+
lambda s: (s - s.min()) / (s.max() - s.min() if s.max() != s.min() else 1)
|
280 |
+
)
|
281 |
|
282 |
end_time = time.time()
|
283 |
print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
|