Commit 513a1f2 · Parent(s): 15b03e0
Oracle and smolagent
Files changed:
- Oracle/DataSmolAgent.py +116 -0
- Oracle/SmolLM.py +29 -0
- Oracle/__init__.py +0 -0
- Oracle/__pycache__/DataSmolAgent.cpython-311.pyc +0 -0
- Oracle/__pycache__/LLamaModel.cpython-311.pyc +0 -0
- Oracle/__pycache__/__init__.cpython-311.pyc +0 -0
- Oracle/__pycache__/backend.cpython-311.pyc +0 -0
- Oracle/__pycache__/deepfundingoracle.cpython-311.pyc +0 -0
- Oracle/deepfundingoracle.py +285 -0
- README.md +1 -11
- app.py +22 -59
- data/GG_data.csv +0 -0
- data/test.csv +0 -0
- data/test2.csv +100 -0
- deepfundingoracle.log +3 -0
- requirements.txt +11 -1
Oracle/DataSmolAgent.py
ADDED
@@ -0,0 +1,116 @@
import pandas as pd
import numpy as np
from smolagents import HfApiModel, tool, CodeAgent
from transformers import AutoTokenizer, AutoModelForCausalLM

@tool
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the DataFrame by stripping whitespace from column names and dropping rows that are completely empty.

    Args:
        df (pd.DataFrame): The input DataFrame containing the raw data.

    Returns:
        pd.DataFrame: A cleaned DataFrame with stripped column names and without completely empty rows.
    """
    df.columns = df.columns.str.strip()
    df = df.dropna(how="all")
    return df

@tool
def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Dynamically extract features from the DataFrame.

    For numeric columns:
      - If all values are non-negative, a log-transformed version is created.

    For columns that appear to be dates:
      - Year, month, and day are extracted.

    For non-numeric, non-date columns:
      - They are encoded as categorical numeric codes.

    Args:
        df (pd.DataFrame): The input DataFrame containing the raw data.

    Returns:
        pd.DataFrame: The DataFrame updated with new dynamically engineered features.
    """
    # Numeric columns: log transformation
    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
    for col in numeric_cols:
        if (df[col] >= 0).all():
            df[f"log_{col}"] = np.log(df[col] + 1)

    # Date-like columns extraction
    for col in df.columns:
        if "date" in col.lower() or "time" in col.lower():
            try:
                df[col] = pd.to_datetime(df[col], errors='coerce')
                df[f"{col}_year"] = df[col].dt.year
                df[f"{col}_month"] = df[col].dt.month
                df[f"{col}_day"] = df[col].dt.day
            except Exception:
                pass

    # Non-numeric processing: encode as categorical numeric codes.
    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
    valid_cat = []
    for col in non_numeric:
        try:
            pd.to_datetime(df[col], errors='raise')
        except Exception:
            valid_cat.append(col)
    for col in valid_cat:
        df[f"{col}_cat"] = df[col].astype("category").cat.codes

    return df

@tool
def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
    """
    Save the DataFrame to a CSV file and return the file path.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        filename (str): The name of the output CSV file.

    Returns:
        str: The file path of the saved CSV.
    """
    df.to_csv(filename, index=False)
    return filename

class DataSmolAgent(CodeAgent):
    """
    A data processing agent that cleans and extracts features from the provided DataFrame.
    """
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
        self.model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
        super().__init__(
            tools=[
                clean_data,
                extract_features,
                save_to_csv,  # Added save_to_csv tool
            ],
            model=self.model,
            additional_authorized_imports=["pandas", "numpy"]
        )

    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
        # Run the agent with the provided DataFrame
        clean_output = self.tools["clean_data"](df=self.df)
        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output

        features_output = self.tools["extract_features"](df=self.df)
        self.df = features_output.result if hasattr(features_output, "result") else features_output

        if output_csv:
            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
            print(f"CSV saved at: {csv_output}")

        return self.df
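A minimal usage sketch for the agent above (an editor's illustration, not part of the commit; it assumes the package layout Oracle/DataSmolAgent.py and a CSV with raw columns such as data/test2.csv). Note that this run() override calls the registered tools directly instead of driving the CodeAgent loop, so the prompt argument is effectively unused:

import pandas as pd
from Oracle.DataSmolAgent import DataSmolAgent

df = pd.read_csv("data/test2.csv")                    # raw input with "repo" and "parent" columns
agent = DataSmolAgent(df)                             # loads SmolLM2-1.7B-Instruct at init
processed = agent.run(prompt="clean and featurize", output_csv=True)
print(processed.columns.to_list())                    # adds *_cat codes (plus log_*/date parts when present)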
Oracle/SmolLM.py
ADDED
@@ -0,0 +1,29 @@
from transformers import AutoTokenizer, AutoModelForCausalLM

class SmolLM:
    def __init__(self, model_path="HuggingFaceTB/SmolLM2-1.7B-Instruct"):
        self.available = True
        try:
            print(f"[INFO] Loading model tokenizer from {model_path}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            print(f"[INFO] Loading model from {model_path}")
            self.model = AutoModelForCausalLM.from_pretrained(model_path)
            print("[INFO] Model loaded successfully")
        except Exception as e:
            print(f"[ERROR] Failed to load model '{model_path}': {e}")
            self.available = False

    def predict(self, prompt):
        if not self.available:
            print("[WARN] LLama model unavailable, returning default weight 0.5")
            return "0.5"
        try:
            print(f"[INFO] Generating response for prompt: {prompt[:100]}...", flush=True)
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
            outputs = self.model.generate(**inputs, max_length=150, num_return_sequences=1)
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"[INFO] Generated response: {response[:100]}...", flush=True)
            return response
        except Exception as e:
            print(f"[ERROR] LLama model inference failed: {e}")
            return "0.5"
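One caveat in predict above: generate(**inputs, max_length=150) caps the *total* sequence length while the prompt may be up to 512 tokens, so a long prompt leaves no room for a reply; decoding outputs[0] also echoes the prompt back, digits included. A hedged sketch of the safer pattern (assuming a transformers version that supports max_new_tokens):

def generate_reply(model, tokenizer, prompt: str) -> str:
    # Budget tokens for the reply itself rather than for prompt + reply.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(
        **inputs,
        max_new_tokens=16,                     # the reply should just be a number
        do_sample=False,                       # deterministic decoding for a stable weight
        pad_token_id=tokenizer.eos_token_id,
    )
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]   # drop the echoed prompt
    return tokenizer.decode(new_tokens, skip_special_tokens=True)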
Oracle/__init__.py
ADDED
File without changes
Oracle/__pycache__/DataSmolAgent.cpython-311.pyc
ADDED
Binary file (5.28 kB)

Oracle/__pycache__/LLamaModel.cpython-311.pyc
ADDED
Binary file (2.69 kB)

Oracle/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (150 Bytes)

Oracle/__pycache__/backend.cpython-311.pyc
ADDED
Binary file (3.36 kB)

Oracle/__pycache__/deepfundingoracle.cpython-311.pyc
ADDED
Binary file (16.7 kB)
Oracle/deepfundingoracle.py
ADDED
@@ -0,0 +1,285 @@
"""
DeepFunding Oracle:
This script dynamically loads dependency data and for each repository URL:
• Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API.
• Uses the LLama model to analyze parent-child behavior (based on the fetched features and parent info)
  and returns a base weight (0-1) for the repository.
• Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight.
The output submission CSV has three columns: repo, parent, and final_weight.
"""

from io import StringIO
import os
import warnings
import csv
import re
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import threading
import logging
import concurrent.futures
import signal
from tqdm import tqdm
import sys

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from Oracle.SmolLM import SmolLM

warnings.filterwarnings("ignore")

# Configure logging to file and console
logging.basicConfig(
    handlers=[
        logging.FileHandler("deepfundingoracle.log"),
        logging.StreamHandler(sys.stdout)
    ],
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

##############################
# Enhanced GitHub API helper: Fetch repository metrics
##############################
def fetch_repo_metrics(repo_url):
    """
    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
    Assumes repo_url is in the form "https://github.com/owner/repo".
    """
    try:
        # Extract owner and repo name
        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
        if not m:
            return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
        owner, repo_name = m.group(1), m.group(2)
        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
        headers = {}

        token = os.environ.get("GITHUB_API_TOKEN", "")
        if token:
            headers["Authorization"] = f"token {token}"
        r = requests.get(api_url, headers=headers)
        if r.status_code == 200:
            data = r.json()
            # Strip the {/number} template suffix GitHub appends to pulls_url
            # (the original replace() string was garbled in this render).
            pulls_url = data.get("pulls_url", "").replace("{/number}", "")
            pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
            activity = data.get("updated_at", "")
            return {
                "stargazers_count": data.get("stargazers_count", 0),
                "forks_count": data.get("forks_count", 0),
                "watchers_count": data.get("watchers_count", 0),
                "open_issues_count": data.get("open_issues_count", 0),
                "pulls_count": pulls_count,
                "activity": activity,
                "owner": owner,
                "repo_name": repo_name,
                "token": token
            }
        else:
            return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
    except Exception:
        return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
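# EDITOR'S NOTE (sketch, not part of the commit): len(requests.get(pulls_url).json())
# above counts only the first page GitHub returns (30 items by default). A common
# workaround is to request per_page=1 and read the last page number from the Link
# header, which requests exposes as r.links:
def count_open_pulls(owner, repo_name, headers):
    url = f"https://api.github.com/repos/{owner}/{repo_name}/pulls"
    r = requests.get(url, headers=headers, params={"state": "open", "per_page": 1})
    if r.status_code != 200:
        return 0
    last = r.links.get("last", {}).get("url", "")
    m = re.search(r"[?&]page=(\d+)", last)
    return int(m.group(1)) if m else len(r.json())  # no "last" link: zero or one open PR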
##############################
# Enhanced Feature Extraction
##############################
def load_data(file):
    """
    Dynamically load the dependency data CSV from the uploaded file.
    Expects at least "repo" and "parent" columns.
    """
    try:
        print("[INFO] Loading data from uploaded file...")
        start_time = time.time()
        # Read the uploaded file directly into a DataFrame
        df = pd.read_csv(file)
        end_time = time.time()
        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
        return df
    except Exception as e:
        print("[ERROR] Error loading data:", e)
        return None

def fetch_github_features(df):
    """
    For each row, using the repo URL, call the GitHub API to fetch:
    stars, forks, watchers, open issues, pull requests, activity, and contributors count.
    Adds these as new columns to the DataFrame.
    """
    print("[INFO] Fetching GitHub features for repositories...")
    start_time = time.time()
    stars_list = []
    forks_list = []
    watchers_list = []
    issues_list = []
    pulls_list = []
    activity_list = []
    contributors_list = []

    for idx, row in df.iterrows():
        repo_url = row.get("repo", "")
        print(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo_url}")
        features = fetch_repo_metrics(repo_url)
        stars_list.append(features["stargazers_count"])
        forks_list.append(features["forks_count"])
        watchers_list.append(features["watchers_count"])
        issues_list.append(features["open_issues_count"])
        pulls_list.append(features["pulls_count"])
        activity_list.append(features["activity"])

        # Fetch contributors count
        try:
            contributors_url = f"https://api.github.com/repos/{features['owner']}/{features['repo_name']}/contributors"
            headers = {"Authorization": f"token {features['token']}"}
            contributors_response = requests.get(contributors_url, headers=headers)
            if contributors_response.status_code == 200:
                contributors_list.append(len(contributors_response.json()))
            else:
                contributors_list.append(0)
        except Exception:
            contributors_list.append(0)

    df["stars"] = stars_list
    df["forks"] = forks_list
    df["watchers"] = watchers_list
    df["open_issues"] = issues_list
    df["pulls"] = pulls_list
    df["activity"] = activity_list
    df["contributors"] = contributors_list

    end_time = time.time()
    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
    return df
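# EDITOR'S NOTE (sketch, not part of the commit): the loop above makes two or three
# network calls per row, serially. The work is I/O-bound and concurrent.futures is
# already imported, so a thread pool is a natural speed-up:
def fetch_all_metrics(repo_urls, max_workers=8):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(fetch_repo_metrics, repo_urls))  # results keep input order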
def timeout_handler(signum, frame):
    raise TimeoutError("LLama model prediction timed out.")

def assign_base_weight(df):
    print("[INFO] Starting base weight assignment using LLama model...", flush=True)
    logging.info("[INFO] Assigning base weights using LLama model...")
    start_time = time.time()
    llama = SmolLM()
    base_weights = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
        repo = row.get("repo", "")
        print(f"[INFO] Assigning weight for repository {idx + 1}/{len(df)}: {repo}", flush=True)
        logging.info(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo}")
        parent = row.get("parent", "")
        stars = row.get("stars", 0)
        forks = row.get("forks", 0)
        watchers = row.get("watchers", 0)
        issues = row.get("open_issues", 0)
        pulls = row.get("pulls", 0)
        activity = row.get("activity", "")
        prompt = (
            f"Repository: {repo}\n"
            f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
            f"Parent or dependency: {parent}\n\n"
            "Based on these features, assign a dependency weight between 0 and 1 for the repository "
            "that reflects how influential the repository is as a source relative to its parent. "
            "Only output the numeric value."
        )
        try:
            print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
            start_llama_time = time.time()
            response = llama.predict(prompt)
            weight = float(''.join([c for c in response if c.isdigit() or c == '.']))
            weight = min(max(weight, 0), 1)
            end_llama_time = time.time()
            print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
            logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
        except Exception as e:
            print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
            logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
            weight = 0.5  # Default weight in case of failure
        base_weights.append(weight)
        print(f"[PROGRESS] Finished {idx + 1}/{len(df)} repositories.", flush=True)

    df["base_weight"] = base_weights
    end_time = time.time()
    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
    return df
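# EDITOR'S NOTE (sketch, not part of the commit): the parse above keeps *every* digit
# and '.' in the response, so any numbers the model echoes back from the prompt get
# concatenated into one float before clamping. Taking only the first number is safer:
def parse_weight(response, default=0.5):
    m = re.search(r"\d*\.?\d+", response)
    return min(max(float(m.group()), 0.0), 1.0) if m else default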
def prepare_dataset(file):
    print("[INFO] Starting dataset preparation...")
    start_time = time.time()
    df = load_data(file)
    if df is None:
        raise ValueError("Failed to load data.")
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
    print("[INFO] Fetching GitHub features...")
    df = fetch_github_features(df)
    print("[INFO] GitHub features fetched successfully.")
    print("[INFO] Assigning base weights using LLama model...")
    df = assign_base_weight(df)
    end_time = time.time()
    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
    return df
##############################
# Enhanced RandomForest Regression
##############################
def train_predict_weight(df):
    print("[INFO] Starting weight prediction...", flush=True)
    start_time = time.time()
    target = "base_weight"
    feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
    if target not in df.columns:
        raise ValueError("Base weight column missing.")
    X = df[feature_cols]
    y = df[target]
    print("[INFO] Splitting data into training and testing sets...", flush=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    rf_model = RandomForestRegressor(random_state=42)
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
    print("[INFO] Performing grid search for hyperparameter tuning...", flush=True)
    gridSearch = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        cv=5,
        scoring="neg_mean_squared_error"
    )
    gridSearch.fit(X_train, y_train)
    print("[INFO] Grid search completed.", flush=True)
    print("Best Parameters:", gridSearch.best_params_, flush=True)
    print("Best MSE:", -gridSearch.best_score_, flush=True)
    y_pred = gridSearch.best_estimator_.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print("Final RF Test MSE:", mse, flush=True)
    print("[INFO] Predicting final weights for all rows...")
    df["final_weight"] = gridSearch.best_estimator_.predict(X)
    end_time = time.time()
    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
    return df
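# EDITOR'S NOTE (sketch, not part of the commit): "activity" arrives from the API as
# an ISO-8601 "updated_at" string, which RandomForestRegressor cannot fit on directly;
# converting it to a recency number first keeps the column usable as a feature:
def activity_to_days(df):
    ts = pd.to_datetime(df["activity"], errors="coerce", utc=True)
    age = (pd.Timestamp.now(tz="UTC") - ts).dt.days
    return age.fillna(-1).astype(int)  # -1 marks rows with no usable timestamp

# e.g. df["activity"] = activity_to_days(df) before calling train_predict_weight(df)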
##############################
# CSV Output
##############################
def create_submission_csv(df, output_filename="submission.csv"):
    print(f"[INFO] Writing results to {output_filename}...", flush=True)
    required_cols = ["repo", "parent", "final_weight"]
    submission_df = df[required_cols]
    submission_df.to_csv(output_filename, index=False)
    print(f"[INFO] Results written to {output_filename}.", flush=True)
    return output_filename

# Removed Gradio UI code from this file to ensure modular workflow.
# This file now focuses solely on data processing and prediction.

if __name__ == "__main__":
    print("DeepFunding Oracle is now ready for backend processing.", flush=True)
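A sketch of running the backend end-to-end from the three functions above (an editor's illustration; it assumes GITHUB_API_TOKEN is set, since unauthenticated GitHub API calls are limited to 60 requests per hour):

from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv

df = prepare_dataset("data/test2.csv")        # load CSV, fetch GitHub metrics, assign base weights
df = train_predict_weight(df)                 # grid-searched RandomForest over the base weights
create_submission_csv(df, "submission.csv")   # writes repo, parent, final_weight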
README.md
CHANGED
@@ -1,13 +1,3 @@
Removed:

---
emoji: 💬
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 5.0.1
app_file: app.py
pinned: false
short_description: 'Oracle for predicting funding for projects '
---

An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

Added:

---
license: llama3
---
app.py
CHANGED
@@ -1,64 +1,27 @@
Removed: the stock Gradio chatbot demo that previously made up app.py (its import and the respond() signature, old lines 2-17, and the final launch call on old line 64 did not survive this render; the recoverable body follows):

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    for message in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = message.choices[0].delta.content
        response += token
        yield response

"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

Added: the new backend-driven entrypoint:

import os
import gradio as gr
from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv

# Gradio-only deployment entrypoint for Hugging Face Spaces
def analyze_file(upload):
    # upload is a file-like object with .name
    df = prepare_dataset(upload.name)
    df = train_predict_weight(df)
    csv_path = create_submission_csv(df, "submission.csv")
    preview = df.head().to_csv(index=False)
    return preview, csv_path

iface = gr.Interface(
    fn=analyze_file,
    inputs=gr.File(label="Upload CSV", type="file"),
    outputs=[
        gr.Textbox(label="Preview of Results"),
        gr.Textbox(label="Download CSV Path")
    ],
    title="DeepFunding Oracle",
    description="Upload a CSV of repo-parent relationships; returns base and final weight predictions as CSV."
)

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    iface.launch(server_name="0.0.0.0", server_port=port)
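One compatibility note on the new app.py (an editor's aside, not part of the diff): recent Gradio releases no longer accept gr.File(type="file"), only "filepath" or "binary", and with "filepath" the handler receives a path string rather than an object with .name. A hedged sketch of the adjustment:

def analyze_file(upload_path: str):
    df = prepare_dataset(upload_path)                         # already a path string
    df = train_predict_weight(df)
    csv_path = create_submission_csv(df, "submission.csv")
    return df.head().to_csv(index=False), csv_path

iface = gr.Interface(
    fn=analyze_file,
    inputs=gr.File(label="Upload CSV", type="filepath"),      # "file" is rejected on Gradio 4+
    outputs=[gr.Textbox(label="Preview of Results"), gr.Textbox(label="Download CSV Path")],
)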
data/GG_data.csv
ADDED
The diff for this file is too large to render.
data/test.csv
ADDED
The diff for this file is too large to render.
data/test2.csv
ADDED
@@ -0,0 +1,100 @@
repo,parent
https://github.com/web3/web3.js,ethereum
https://github.com/prysmaticlabs/prysm,ethereum
https://github.com/ethereum/fe,ethereum
https://github.com/ethereum/remix-project,ethereum
https://github.com/eth-infinitism/account-abstraction,ethereum
https://github.com/wevm/viem,ethereum
https://github.com/nethereum/nethereum,ethereum
https://github.com/ethers-io/ethers.js,ethereum
https://github.com/chainsafe/lodestar,ethereum
https://github.com/ethereum-lists/chains,ethereum
https://github.com/sigp/lighthouse,ethereum
https://github.com/ethereum/py-evm,ethereum
https://github.com/hyperledger/besu,ethereum
https://github.com/erigontech/erigon,ethereum
https://github.com/vyperlang/titanoboa,ethereum
https://github.com/alloy-rs/alloy,ethereum
https://github.com/ethereumjs/ethereumjs-monorepo,ethereum
https://github.com/foundry-rs/foundry,ethereum
https://github.com/safe-global/safe-smart-account,ethereum
https://github.com/consensys/teku,ethereum
https://github.com/grandinetech/grandine,ethereum
https://github.com/ethereum/sourcify,ethereum
https://github.com/ethereum/solidity,ethereum
https://github.com/status-im/nimbus-eth2,ethereum
https://github.com/openzeppelin/openzeppelin-contracts,ethereum
https://github.com/ethereum/web3.py,ethereum
https://github.com/nethermindeth/nethermind,ethereum
https://github.com/apeworx/ape,ethereum
https://github.com/a16z/helios,ethereum
https://github.com/paradigmxyz/reth,ethereum
https://github.com/scaffold-eth/scaffold-eth-2,ethereum
https://github.com/vyperlang/vyper,ethereum
https://github.com/hyperledger-web3j/web3j,ethereum
https://github.com/ethereum/go-ethereum,ethereum
https://github.com/nomicfoundation/hardhat,ethereum
https://github.com/census-instrumentation/opencensus-go,https://github.com/prysmaticlabs/prysm
https://github.com/pion/turn,https://github.com/prysmaticlabs/prysm
https://github.com/google/pprof,https://github.com/prysmaticlabs/prysm
https://github.com/uber-go/fx,https://github.com/prysmaticlabs/prysm
https://github.com/elastic/gosigar,https://github.com/prysmaticlabs/prysm
https://github.com/cockroachdb/logtags,https://github.com/prysmaticlabs/prysm
https://github.com/tklauser/go-sysconf,https://github.com/prysmaticlabs/prysm
https://github.com/manifoldco/promptui,https://github.com/prysmaticlabs/prysm
https://github.com/cockroachdb/errors,https://github.com/prysmaticlabs/prysm
https://github.com/coreos/go-systemd,https://github.com/prysmaticlabs/prysm
https://github.com/herumi/bls-eth-go-binary,https://github.com/prysmaticlabs/prysm
https://github.com/spf13/afero,https://github.com/prysmaticlabs/prysm
https://github.com/gorilla/websocket,https://github.com/prysmaticlabs/prysm
https://github.com/libp2p/go-reuseport,https://github.com/prysmaticlabs/prysm
https://github.com/hashicorp/golang-lru,https://github.com/prysmaticlabs/prysm
https://github.com/ianlancetaylor/cgosymbolizer,https://github.com/prysmaticlabs/prysm
https://github.com/huin/goupnp,https://github.com/prysmaticlabs/prysm
https://github.com/olekukonko/tablewriter,https://github.com/prysmaticlabs/prysm
https://github.com/pion/webrtc,https://github.com/prysmaticlabs/prysm
https://github.com/d4l3k/messagediff,https://github.com/prysmaticlabs/prysm
https://github.com/decred/dcrd,https://github.com/prysmaticlabs/prysm
https://github.com/multiformats/go-multihash,https://github.com/prysmaticlabs/prysm
https://github.com/kubernetes/klog,https://github.com/prysmaticlabs/prysm
https://github.com/flynn/noise,https://github.com/prysmaticlabs/prysm
https://github.com/mikioh/tcpinfo,https://github.com/prysmaticlabs/prysm
https://github.com/json-iterator/go,https://github.com/prysmaticlabs/prysm
https://github.com/pion/logging,https://github.com/prysmaticlabs/prysm
https://github.com/btcsuite/btcd,https://github.com/prysmaticlabs/prysm
https://github.com/mgutz/ansi,https://github.com/prysmaticlabs/prysm
https://github.com/mattn/go-runewidth,https://github.com/prysmaticlabs/prysm
https://github.com/munnerz/goautoneg,https://github.com/prysmaticlabs/prysm
https://github.com/lukechampine/blake3,https://github.com/prysmaticlabs/prysm
https://github.com/wealdtech/go-eth2-wallet-encryptor-keystorev4,https://github.com/prysmaticlabs/prysm
https://github.com/libp2p/go-msgio,https://github.com/prysmaticlabs/prysm
https://github.com/pion/transport,https://github.com/prysmaticlabs/prysm
https://github.com/minio/highwayhash,https://github.com/prysmaticlabs/prysm
https://github.com/kubernetes-sigs/structured-merge-diff,https://github.com/prysmaticlabs/prysm
https://github.com/bits-and-blooms/bitset,https://github.com/prysmaticlabs/prysm
https://github.com/libp2p/go-buffer-pool,https://github.com/prysmaticlabs/prysm
https://github.com/holiman/uint256,https://github.com/prysmaticlabs/prysm
https://github.com/holiman/billy,https://github.com/prysmaticlabs/prysm
https://github.com/chzyer/readline,https://github.com/prysmaticlabs/prysm
https://github.com/grpc-ecosystem/go-grpc-middleware,https://github.com/prysmaticlabs/prysm
https://github.com/consensys/gnark-crypto,https://github.com/prysmaticlabs/prysm
https://github.com/kubernetes-sigs/json,https://github.com/prysmaticlabs/prysm
https://github.com/influxdata/influxdb-client-go,https://github.com/prysmaticlabs/prysm
https://github.com/ethereum/go-verkle,https://github.com/prysmaticlabs/prysm
https://github.com/quic-go/quic-go,https://github.com/prysmaticlabs/prysm
https://github.com/k0kubun/go-ansi,https://github.com/prysmaticlabs/prysm
https://github.com/go-yaml/yaml,https://github.com/prysmaticlabs/prysm
https://github.com/raulk/go-watchdog,https://github.com/prysmaticlabs/prysm
https://github.com/godbus/dbus,https://github.com/prysmaticlabs/prysm
https://github.com/grpc/grpc-go,https://github.com/prysmaticlabs/prysm
https://github.com/mattn/go-isatty,https://github.com/prysmaticlabs/prysm
https://github.com/nxadm/tail,https://github.com/prysmaticlabs/prysm
https://github.com/tyler-smith/go-bip39,https://github.com/prysmaticlabs/prysm
https://github.com/docker/go-units,https://github.com/prysmaticlabs/prysm
https://github.com/wlynxg/anet,https://github.com/prysmaticlabs/prysm
https://github.com/uber-go/dig,https://github.com/prysmaticlabs/prysm
https://github.com/kubernetes/client-go,https://github.com/prysmaticlabs/prysm
https://github.com/libp2p/go-flow-metrics,https://github.com/prysmaticlabs/prysm
https://github.com/mmcloughlin/addchain,https://github.com/prysmaticlabs/prysm
https://github.com/mohae/deepcopy,https://github.com/prysmaticlabs/prysm
https://github.com/multiformats/go-base36,https://github.com/prysmaticlabs/prysm
deepfundingoracle.log
ADDED
@@ -0,0 +1,3 @@
2025-04-19 14:13:36,946 - INFO - [INFO] Assigning base weights using LLama model...
2025-04-19 14:22:31,541 - INFO - [INFO] Assigning base weights using LLama model...
2025-04-19 14:31:26,535 - INFO - [INFO] Assigning base weights using LLama model...
requirements.txt
CHANGED
@@ -1 +1,11 @@
Old contents (a single line, not preserved in this render) replaced with:

polars
pandas
scikit-learn
lightgbm
altair
transformers
smolagents
huggingface_hub
gradio
fastapi
uvicorn[standard]