FelixPhilip committed
Commit 513a1f2 · 1 Parent(s): 15b03e0

Oracle and smolagent

Oracle/DataSmolAgent.py ADDED
@@ -0,0 +1,116 @@
+import pandas as pd
+import numpy as np
+from smolagents import HfApiModel, tool, CodeAgent
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+@tool
+def clean_data(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Clean the DataFrame by stripping whitespace from column names and dropping rows that are completely empty.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing the raw data.
+
+    Returns:
+        pd.DataFrame: A cleaned DataFrame with stripped column names and without completely empty rows.
+    """
+    df.columns = df.columns.str.strip()
+    df = df.dropna(how="all")
+    return df
+
+@tool
+def extract_features(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Dynamically extract features from the DataFrame.
+
+    For numeric columns:
+      - If all values are non-negative, a log-transformed version is created.
+
+    For columns that appear to be dates:
+      - Year, month, and day are extracted.
+
+    For non-numeric, non-date columns:
+      - They are encoded as categorical numeric codes.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing the raw data.
+
+    Returns:
+        pd.DataFrame: The DataFrame updated with new dynamically engineered features.
+    """
+    # Numeric columns: log transformation
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
+    for col in numeric_cols:
+        if (df[col] >= 0).all():
+            df[f"log_{col}"] = np.log(df[col] + 1)
+
+    # Date-like columns extraction
+    for col in df.columns:
+        if "date" in col.lower() or "time" in col.lower():
+            try:
+                df[col] = pd.to_datetime(df[col], errors='coerce')
+                df[f"{col}_year"] = df[col].dt.year
+                df[f"{col}_month"] = df[col].dt.month
+                df[f"{col}_day"] = df[col].dt.day
+            except Exception:
+                pass
+
+    # Non-numeric processing: encode as categorical numeric codes.
+    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
+    valid_cat = []
+    for col in non_numeric:
+        try:
+            pd.to_datetime(df[col], errors='raise')
+        except Exception:
+            valid_cat.append(col)
+    for col in valid_cat:
+        df[f"{col}_cat"] = df[col].astype("category").cat.codes
+
+    return df
+
+@tool
+def save_to_csv(df: pd.DataFrame, filename: str = "output.csv") -> str:
+    """
+    Save the DataFrame to a CSV file and return the file path.
+
+    Args:
+        df (pd.DataFrame): The DataFrame to save.
+        filename (str): The name of the output CSV file.
+
+    Returns:
+        str: The file path of the saved CSV.
+    """
+    df.to_csv(filename, index=False)
+    return filename
+
+class DataSmolAgent(CodeAgent):
+    """
+    A data processing agent that cleans and extracts features from the provided DataFrame.
+    """
+    def __init__(self, df: pd.DataFrame):
+        self.df = df
+        self.tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+        self.model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
+        super().__init__(
+            tools=[
+                clean_data,
+                extract_features,
+                save_to_csv,  # Added save_to_csv tool
+            ],
+            model=self.model,
+            additional_authorized_imports=["pandas", "numpy"]
+        )
+
+    def run(self, prompt: str, output_csv: bool = False) -> pd.DataFrame:
+        # Run the agent with the provided DataFrame
+        clean_output = self.tools["clean_data"](df=self.df)
+        self.df = clean_output.result if hasattr(clean_output, "result") else clean_output
+
+        features_output = self.tools["extract_features"](df=self.df)
+        self.df = features_output.result if hasattr(features_output, "result") else features_output
+
+        if output_csv:
+            csv_output = self.tools["save_to_csv"](df=self.df, filename="processed_output.csv")
+            print(f"CSV saved at: {csv_output}")
+
+        return self.df
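
A minimal usage sketch for the tools defined above (not part of the commit). It assumes the package is importable from the repository root and that the smolagents @tool wrapper leaves clean_data, extract_features, and save_to_csv directly callable; the sample DataFrame and column names are invented for illustration:

    import pandas as pd
    from Oracle.DataSmolAgent import clean_data, extract_features, save_to_csv

    # Invented raw data: one numeric, one date-like, and one categorical column.
    raw = pd.DataFrame({
        "stars": [10, 250, 0],
        "release_date": ["2024-01-01", "2023-06-15", None],
        "language": ["Go", "Rust", "Go"],
    })

    df = clean_data(df=raw)        # strips whitespace from column names, drops fully empty rows
    df = extract_features(df=df)   # adds log_stars, release_date_year/month/day, language_cat
    path = save_to_csv(df=df, filename="processed_output.csv")
    print(df.columns.tolist(), path)
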
Oracle/SmolLM.py ADDED
@@ -0,0 +1,29 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+class SmolLM:
+    def __init__(self, model_path="HuggingFaceTB/SmolLM2-1.7B-Instruct"):
+        self.available = True
+        try:
+            print(f"[INFO] Loading model tokenizer from {model_path}")
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+            print(f"[INFO] Loading model from {model_path}")
+            self.model = AutoModelForCausalLM.from_pretrained(model_path)
+            print("[INFO] Model loaded successfully")
+        except Exception as e:
+            print(f"[ERROR] Failed to load model '{model_path}': {e}")
+            self.available = False
+
+    def predict(self, prompt):
+        if not self.available:
+            print("[WARN] SmolLM model unavailable, returning default weight 0.5")
+            return "0.5"
+        try:
+            print(f"[INFO] Generating response for prompt: {prompt[:100]}...", flush=True)
+            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
+            # Use max_new_tokens so generation is not capped below the prompt length.
+            outputs = self.model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)
+            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            print(f"[INFO] Generated response: {response[:100]}...", flush=True)
+            return response
+        except Exception as e:
+            print(f"[ERROR] SmolLM inference failed: {e}")
+            return "0.5"
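
A short sketch of how the wrapper above is consumed downstream (the prompt here is illustrative, not from the commit); the numeric-parsing step mirrors what assign_base_weight in Oracle/deepfundingoracle.py does with the free-text response:

    from Oracle.SmolLM import SmolLM

    llm = SmolLM()  # downloads HuggingFaceTB/SmolLM2-1.7B-Instruct on first use
    prompt = (
        "Repository: https://github.com/example/repo\n"
        "GitHub Metrics: 120 stars, 15 forks, 30 watchers, 4 open issues, 2 pull requests.\n"
        "Assign a dependency weight between 0 and 1. Only output the numeric value."
    )
    response = llm.predict(prompt)  # returns "0.5" if loading or inference failed

    # Same digit/period extraction the oracle uses; falls back to 0.5 when parsing fails.
    try:
        weight = min(max(float("".join(c for c in response if c.isdigit() or c == ".")), 0), 1)
    except ValueError:
        weight = 0.5
    print(weight)
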
Oracle/__init__.py ADDED
File without changes
Oracle/__pycache__/DataSmolAgent.cpython-311.pyc ADDED
Binary file (5.28 kB)
Oracle/__pycache__/LLamaModel.cpython-311.pyc ADDED
Binary file (2.69 kB)
Oracle/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (150 Bytes)
Oracle/__pycache__/backend.cpython-311.pyc ADDED
Binary file (3.36 kB)
Oracle/__pycache__/deepfundingoracle.cpython-311.pyc ADDED
Binary file (16.7 kB)
Oracle/deepfundingoracle.py ADDED
@@ -0,0 +1,285 @@
+"""
+DeepFunding Oracle:
+This script dynamically loads dependency data and for each repository URL:
+  • Fetches GitHub features (stars, forks, watchers, open issues, pull requests, activity) using the GitHub API.
+  • Uses the SmolLM language model to analyze parent-child behavior (based on the fetched features and parent info)
+    and returns a base weight (0-1) for the repository.
+  • Trains a RandomForest regressor on these features (with the base weight as the target) to predict a final weight.
+The output submission CSV has three columns: repo, parent, and final_weight.
+"""
+
+from io import StringIO
+import os
+import warnings
+import csv
+import re
+import requests
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import time
+import threading
+import logging
+import concurrent.futures
+import signal
+from tqdm import tqdm
+import sys
+
+from sklearn.model_selection import train_test_split, GridSearchCV
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error
+
+from Oracle.SmolLM import SmolLM
+
+warnings.filterwarnings("ignore")
+
+# Configure logging to file and console
+logging.basicConfig(
+    handlers=[
+        logging.FileHandler("deepfundingoracle.log"),
+        logging.StreamHandler(sys.stdout)
+    ],
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s"
+)
+
+##############################
+# Enhanced GitHub API helper: Fetch repository metrics
+##############################
+def fetch_repo_metrics(repo_url):
+    """
+    Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
+    Assumes repo_url is in the form "https://github.com/owner/repo".
+    """
+    try:
+        # Extract owner and repo name
+        m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
+        if not m:
+            return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
+        owner, repo_name = m.group(1), m.group(2)
+        api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
+        headers = {}
+
+        token = os.environ.get("GITHUB_API_TOKEN", "")
+        if token: headers["Authorization"] = f"token {token}"
+        r = requests.get(api_url, headers=headers)
+        if r.status_code == 200:
+            data = r.json()
+            # The pulls_url field ends with a "{/number}" URL template that must be stripped before requesting it.
+            pulls_url = data.get("pulls_url", "").replace("{/number}", "")
+            pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
+            activity = data.get("updated_at", "")
+            return {
+                "stargazers_count": data.get("stargazers_count", 0),
+                "forks_count": data.get("forks_count", 0),
+                "watchers_count": data.get("watchers_count", 0),
+                "open_issues_count": data.get("open_issues_count", 0),
+                "pulls_count": pulls_count,
+                "activity": activity,
+                "owner": owner,
+                "repo_name": repo_name,
+                "token": token
+            }
+        else:
+            return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
+    except Exception:
+        return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
+
+
+##############################
+# Enhanced Feature Extraction
+##############################
+def load_data(file):
+    """
+    Dynamically load the dependency data CSV from the uploaded file.
+    Expects at least "repo" and "parent" columns.
+    """
+    try:
+        print("[INFO] Loading data from uploaded file...")
+        start_time = time.time()
+        # Read the uploaded file directly into a DataFrame
+        df = pd.read_csv(file)
+        end_time = time.time()
+        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
+        return df
+    except Exception as e:
+        print("[ERROR] Error loading data:", e)
+        return None
+
+def fetch_github_features(df):
+    """
+    For each row, using the repo URL, call the GitHub API to fetch:
+    stars, forks, watchers, open issues, pull requests, activity, and contributors count.
+    Adds these as new columns to the DataFrame.
+    """
+    print("[INFO] Fetching GitHub features for repositories...")
+    start_time = time.time()
+    stars_list = []
+    forks_list = []
+    watchers_list = []
+    issues_list = []
+    pulls_list = []
+    activity_list = []
+    contributors_list = []
+
+    for idx, row in df.iterrows():
+        repo_url = row.get("repo", "")
+        print(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo_url}")
+        features = fetch_repo_metrics(repo_url)
+        stars_list.append(features["stargazers_count"])
+        forks_list.append(features["forks_count"])
+        watchers_list.append(features["watchers_count"])
+        issues_list.append(features["open_issues_count"])
+        pulls_list.append(features["pulls_count"])
+        activity_list.append(features["activity"])
+
+        # Fetch contributors count
+        try:
+            contributors_url = f"https://api.github.com/repos/{features['owner']}/{features['repo_name']}/contributors"
+            headers = {"Authorization": f"token {features['token']}"}
+            contributors_response = requests.get(contributors_url, headers=headers)
+            if contributors_response.status_code == 200:
+                contributors_list.append(len(contributors_response.json()))
+            else:
+                contributors_list.append(0)
+        except Exception:
+            contributors_list.append(0)
+
+    df["stars"] = stars_list
+    df["forks"] = forks_list
+    df["watchers"] = watchers_list
+    df["open_issues"] = issues_list
+    df["pulls"] = pulls_list
+    df["activity"] = activity_list
+    df["contributors"] = contributors_list
+
+    end_time = time.time()
+    print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
+    return df
+
+def timeout_handler(signum, frame):
+    raise TimeoutError("LLama model prediction timed out.")
+
+def assign_base_weight(df):
+    print("[INFO] Starting base weight assignment using LLama model...", flush=True)
+    logging.info("[INFO] Assigning base weights using LLama model...")
+    start_time = time.time()
+    llama = SmolLM()
+    base_weights = []
+
+    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Assigning weights"):
+        repo = row.get("repo", "")
+        print(f"[INFO] Assigning weight for repository {idx + 1}/{len(df)}: {repo}", flush=True)
+        logging.info(f"[INFO] Processing repository {idx + 1}/{len(df)}: {repo}")
+        parent = row.get("parent", "")
+        stars = row.get("stars", 0)
+        forks = row.get("forks", 0)
+        watchers = row.get("watchers", 0)
+        issues = row.get("open_issues", 0)
+        pulls = row.get("pulls", 0)
+        activity = row.get("activity", "")
+        prompt = (
+            f"Repository: {repo}\n"
+            f"GitHub Metrics: {stars} stars, {forks} forks, {watchers} watchers, {issues} open issues, {pulls} pull requests, activity: {activity}.\n"
+            f"Parent or dependency: {parent}\n\n"
+            "Based on these features, assign a dependency weight between 0 and 1 for the repository "
+            "that reflects how influential the repository is as a source relative to its parent. "
+            "Only output the numeric value."
+        )
+        try:
+            print(f"[INFO] Sending prompt to LLama model for repo: {repo}", flush=True)
+            start_llama_time = time.time()
+            response = llama.predict(prompt)
+            weight = float(''.join([c for c in response if c.isdigit() or c == '.']))
+            weight = min(max(weight, 0), 1)
+            end_llama_time = time.time()
+            print(f"[INFO] Received weight {weight} for {repo} in {end_llama_time - start_llama_time:.2f} seconds.", flush=True)
+            logging.info(f"[INFO] Processed repository {repo} in {end_llama_time - start_llama_time:.2f} seconds. Weight: {weight}")
+        except Exception as e:
+            print(f"[ERROR] Failed to process repository {repo}: {e}", flush=True)
+            logging.error(f"[ERROR] Failed to process repository {repo}: {e}")
+            weight = 0.5  # Default weight in case of failure
+        base_weights.append(weight)
+        print(f"[PROGRESS] Finished {idx + 1}/{len(df)} repositories.", flush=True)
+
+    df["base_weight"] = base_weights
+    end_time = time.time()
+    print(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.", flush=True)
+    logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
+    return df
+
+def prepare_dataset(file):
+    print("[INFO] Starting dataset preparation...")
+    start_time = time.time()
+    df = load_data(file)
+    if df is None:
+        raise ValueError("Failed to load data.")
+    if not {"repo", "parent"}.issubset(df.columns):
+        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")
+    print("[INFO] Fetching GitHub features...")
+    df = fetch_github_features(df)
+    print("[INFO] GitHub features fetched successfully.")
+    print("[INFO] Assigning base weights using LLama model...")
+    df = assign_base_weight(df)
+    end_time = time.time()
+    print(f"[INFO] Dataset preparation completed in {end_time - start_time:.2f} seconds.")
+    return df
+
+
+##############################
+# Enhanced RandomForest Regression
+##############################
+def train_predict_weight(df):
+    print("[INFO] Starting weight prediction...", flush=True)
+    start_time = time.time()
+    target = "base_weight"
+    feature_cols = ["stars", "forks", "watchers", "open_issues", "pulls", "activity", "contributors"]
+    if target not in df.columns:
+        raise ValueError("Base weight column missing.")
+    # "activity" holds the repository's last-update timestamp as a string; convert it to
+    # days since the last update so the regressor receives a numeric feature.
+    df["activity"] = (
+        pd.Timestamp.now(tz="UTC") - pd.to_datetime(df["activity"], errors="coerce", utc=True)
+    ).dt.days.fillna(-1)
+    X = df[feature_cols]
+    y = df[target]
+    print("[INFO] Splitting data into training and testing sets...", flush=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+    rf_model = RandomForestRegressor(random_state=42)
+    param_grid = {
+        "n_estimators": [100, 200, 300],
+        "max_depth": [None, 10, 20, 30],
+        "min_samples_split": [2, 5, 10],
+        "min_samples_leaf": [1, 2, 4]
+    }
+    print("[INFO] Performing grid search for hyperparameter tuning...", flush=True)
+    gridSearch = GridSearchCV(
+        estimator=rf_model,
+        param_grid=param_grid,
+        cv=5,
+        scoring="neg_mean_squared_error"
+    )
+    gridSearch.fit(X_train, y_train)
+    print("[INFO] Grid search completed.", flush=True)
+    print("Best Parameters:", gridSearch.best_params_, flush=True)
+    print("Best MSE:", -gridSearch.best_score_, flush=True)
+    y_pred = gridSearch.best_estimator_.predict(X_test)
+    mse = mean_squared_error(y_test, y_pred)
+    print("Final RF Test MSE:", mse, flush=True)
+    print("[INFO] Predicting final weights for all rows...")
+    df["final_weight"] = gridSearch.best_estimator_.predict(X)
+    end_time = time.time()
+    print(f"[INFO] Weight prediction completed in {end_time - start_time:.2f} seconds.", flush=True)
+    return df
+
+##############################
+# CSV Output
+##############################
+def create_submission_csv(df, output_filename="submission.csv"):
+    print(f"[INFO] Writing results to {output_filename}...", flush=True)
+    required_cols = ["repo", "parent", "final_weight"]
+    submission_df = df[required_cols]
+    submission_df.to_csv(output_filename, index=False)
+    print(f"[INFO] Results written to {output_filename}.", flush=True)
+    return output_filename
+
+# Removed Gradio UI code from this file to ensure modular workflow.
+# This file now focuses solely on data processing and prediction.
+
+if __name__ == "__main__":
+    print("DeepFunding Oracle is now ready for backend processing.", flush=True)
README.md CHANGED
@@ -1,13 +1,3 @@
 ---
-title: DeepFundingOracle
-emoji: 💬
-colorFrom: yellow
-colorTo: purple
-sdk: gradio
-sdk_version: 5.0.1
-app_file: app.py
-pinned: false
-short_description: 'Oracle for predicting funding for projects '
+license: llama3
 ---
-
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
app.py CHANGED
@@ -1,64 +1,27 @@
+import os
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
+
+# Gradio-only deployment entrypoint for Hugging Face Spaces
+def analyze_file(upload):
+    # upload is a file-like object with .name
+    df = prepare_dataset(upload.name)
+    df = train_predict_weight(df)
+    csv_path = create_submission_csv(df, "submission.csv")
+    preview = df.head().to_csv(index=False)
+    return preview, csv_path
+
+iface = gr.Interface(
+    fn=analyze_file,
+    inputs=gr.File(label="Upload CSV", type="file"),
+    outputs=[
+        gr.Textbox(label="Preview of Results"),
+        gr.Textbox(label="Download CSV Path")
     ],
+    title="DeepFunding Oracle",
+    description="Upload a CSV of repo-parent relationships; returns base and final weight predictions as CSV."
 )

 if __name__ == "__main__":
-    demo.launch()
+    port = int(os.environ.get("PORT", 7860))
+    iface.launch(server_name="0.0.0.0", server_port=port)
data/GG_data.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/test.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/test2.csv ADDED
@@ -0,0 +1,100 @@
+repo,parent
+https://github.com/web3/web3.js,ethereum
+https://github.com/prysmaticlabs/prysm,ethereum
+https://github.com/ethereum/fe,ethereum
+https://github.com/ethereum/remix-project,ethereum
+https://github.com/eth-infinitism/account-abstraction,ethereum
+https://github.com/wevm/viem,ethereum
+https://github.com/nethereum/nethereum,ethereum
+https://github.com/ethers-io/ethers.js,ethereum
+https://github.com/chainsafe/lodestar,ethereum
+https://github.com/ethereum-lists/chains,ethereum
+https://github.com/sigp/lighthouse,ethereum
+https://github.com/ethereum/py-evm,ethereum
+https://github.com/hyperledger/besu,ethereum
+https://github.com/erigontech/erigon,ethereum
+https://github.com/vyperlang/titanoboa,ethereum
+https://github.com/alloy-rs/alloy,ethereum
+https://github.com/ethereumjs/ethereumjs-monorepo,ethereum
+https://github.com/foundry-rs/foundry,ethereum
+https://github.com/safe-global/safe-smart-account,ethereum
+https://github.com/consensys/teku,ethereum
+https://github.com/grandinetech/grandine,ethereum
+https://github.com/ethereum/sourcify,ethereum
+https://github.com/ethereum/solidity,ethereum
+https://github.com/status-im/nimbus-eth2,ethereum
+https://github.com/openzeppelin/openzeppelin-contracts,ethereum
+https://github.com/ethereum/web3.py,ethereum
+https://github.com/nethermindeth/nethermind,ethereum
+https://github.com/apeworx/ape,ethereum
+https://github.com/a16z/helios,ethereum
+https://github.com/paradigmxyz/reth,ethereum
+https://github.com/scaffold-eth/scaffold-eth-2,ethereum
+https://github.com/vyperlang/vyper,ethereum
+https://github.com/hyperledger-web3j/web3j,ethereum
+https://github.com/ethereum/go-ethereum,ethereum
+https://github.com/nomicfoundation/hardhat,ethereum
+https://github.com/census-instrumentation/opencensus-go,https://github.com/prysmaticlabs/prysm
+https://github.com/pion/turn,https://github.com/prysmaticlabs/prysm
+https://github.com/google/pprof,https://github.com/prysmaticlabs/prysm
+https://github.com/uber-go/fx,https://github.com/prysmaticlabs/prysm
+https://github.com/elastic/gosigar,https://github.com/prysmaticlabs/prysm
+https://github.com/cockroachdb/logtags,https://github.com/prysmaticlabs/prysm
+https://github.com/tklauser/go-sysconf,https://github.com/prysmaticlabs/prysm
+https://github.com/manifoldco/promptui,https://github.com/prysmaticlabs/prysm
+https://github.com/cockroachdb/errors,https://github.com/prysmaticlabs/prysm
+https://github.com/coreos/go-systemd,https://github.com/prysmaticlabs/prysm
+https://github.com/herumi/bls-eth-go-binary,https://github.com/prysmaticlabs/prysm
+https://github.com/spf13/afero,https://github.com/prysmaticlabs/prysm
+https://github.com/gorilla/websocket,https://github.com/prysmaticlabs/prysm
+https://github.com/libp2p/go-reuseport,https://github.com/prysmaticlabs/prysm
+https://github.com/hashicorp/golang-lru,https://github.com/prysmaticlabs/prysm
+https://github.com/ianlancetaylor/cgosymbolizer,https://github.com/prysmaticlabs/prysm
+https://github.com/huin/goupnp,https://github.com/prysmaticlabs/prysm
+https://github.com/olekukonko/tablewriter,https://github.com/prysmaticlabs/prysm
+https://github.com/pion/webrtc,https://github.com/prysmaticlabs/prysm
+https://github.com/d4l3k/messagediff,https://github.com/prysmaticlabs/prysm
+https://github.com/decred/dcrd,https://github.com/prysmaticlabs/prysm
+https://github.com/multiformats/go-multihash,https://github.com/prysmaticlabs/prysm
+https://github.com/kubernetes/klog,https://github.com/prysmaticlabs/prysm
+https://github.com/flynn/noise,https://github.com/prysmaticlabs/prysm
+https://github.com/mikioh/tcpinfo,https://github.com/prysmaticlabs/prysm
+https://github.com/json-iterator/go,https://github.com/prysmaticlabs/prysm
+https://github.com/pion/logging,https://github.com/prysmaticlabs/prysm
+https://github.com/btcsuite/btcd,https://github.com/prysmaticlabs/prysm
+https://github.com/mgutz/ansi,https://github.com/prysmaticlabs/prysm
+https://github.com/mattn/go-runewidth,https://github.com/prysmaticlabs/prysm
+https://github.com/munnerz/goautoneg,https://github.com/prysmaticlabs/prysm
+https://github.com/lukechampine/blake3,https://github.com/prysmaticlabs/prysm
+https://github.com/wealdtech/go-eth2-wallet-encryptor-keystorev4,https://github.com/prysmaticlabs/prysm
+https://github.com/libp2p/go-msgio,https://github.com/prysmaticlabs/prysm
+https://github.com/pion/transport,https://github.com/prysmaticlabs/prysm
+https://github.com/minio/highwayhash,https://github.com/prysmaticlabs/prysm
+https://github.com/kubernetes-sigs/structured-merge-diff,https://github.com/prysmaticlabs/prysm
+https://github.com/bits-and-blooms/bitset,https://github.com/prysmaticlabs/prysm
+https://github.com/libp2p/go-buffer-pool,https://github.com/prysmaticlabs/prysm
+https://github.com/holiman/uint256,https://github.com/prysmaticlabs/prysm
+https://github.com/holiman/billy,https://github.com/prysmaticlabs/prysm
+https://github.com/chzyer/readline,https://github.com/prysmaticlabs/prysm
+https://github.com/grpc-ecosystem/go-grpc-middleware,https://github.com/prysmaticlabs/prysm
+https://github.com/consensys/gnark-crypto,https://github.com/prysmaticlabs/prysm
+https://github.com/kubernetes-sigs/json,https://github.com/prysmaticlabs/prysm
+https://github.com/influxdata/influxdb-client-go,https://github.com/prysmaticlabs/prysm
+https://github.com/ethereum/go-verkle,https://github.com/prysmaticlabs/prysm
+https://github.com/quic-go/quic-go,https://github.com/prysmaticlabs/prysm
+https://github.com/k0kubun/go-ansi,https://github.com/prysmaticlabs/prysm
+https://github.com/go-yaml/yaml,https://github.com/prysmaticlabs/prysm
+https://github.com/raulk/go-watchdog,https://github.com/prysmaticlabs/prysm
+https://github.com/godbus/dbus,https://github.com/prysmaticlabs/prysm
+https://github.com/grpc/grpc-go,https://github.com/prysmaticlabs/prysm
+https://github.com/mattn/go-isatty,https://github.com/prysmaticlabs/prysm
+https://github.com/nxadm/tail,https://github.com/prysmaticlabs/prysm
+https://github.com/tyler-smith/go-bip39,https://github.com/prysmaticlabs/prysm
+https://github.com/docker/go-units,https://github.com/prysmaticlabs/prysm
+https://github.com/wlynxg/anet,https://github.com/prysmaticlabs/prysm
+https://github.com/uber-go/dig,https://github.com/prysmaticlabs/prysm
+https://github.com/kubernetes/client-go,https://github.com/prysmaticlabs/prysm
+https://github.com/libp2p/go-flow-metrics,https://github.com/prysmaticlabs/prysm
+https://github.com/mmcloughlin/addchain,https://github.com/prysmaticlabs/prysm
+https://github.com/mohae/deepcopy,https://github.com/prysmaticlabs/prysm
+https://github.com/multiformats/go-base36,https://github.com/prysmaticlabs/prysm
deepfundingoracle.log ADDED
@@ -0,0 +1,3 @@
+2025-04-19 14:13:36,946 - INFO - [INFO] Assigning base weights using LLama model...
+2025-04-19 14:22:31,541 - INFO - [INFO] Assigning base weights using LLama model...
+2025-04-19 14:31:26,535 - INFO - [INFO] Assigning base weights using LLama model...
requirements.txt CHANGED
@@ -1 +1,11 @@
-huggingface_hub==0.25.2
+polars
+pandas
+scikit-learn
+lightgbm
+altair
+transformers
+smolagents
+huggingface_hub
+gradio
+fastapi
+uvicorn[standard]