Spaces:

FelixPhilip
/

DeepFundingOracle

Running

App Files Files Community

FelixPhilip commited on May 4

Commit

ea68d4a

1 Parent(s): 386c440

Oracle

Browse files

Files changed (2) hide show

Oracle/DataSmolAgent.py +34 -12
Oracle/deepfundingoracle.py +160 -109

Oracle/DataSmolAgent.py CHANGED Viewed

@@ -34,17 +34,30 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
     # Numeric columns: log transformation for skewed features
     numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
     for col in numeric_cols:
-        if (df[col] >= 0).all() and df[col].skew() > 1:
             df[f"log_{col}"] = np.log(df[col] + 1)
     # Repository age (days since creation)
     if "created_at" in df.columns:
         df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
         df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
     # Recent activity count (commits/issues in last 30/90 days)
-    if "activity" in df.columns:
-        df["activity"] = pd.to_datetime(df["activity"], errors="coerce")
         now = pd.Timestamp.now()
         df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
         df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
@@ -57,17 +70,25 @@ def extract_features(df: pd.DataFrame) -> pd.DataFrame:
     if {"issues_closed", "issues_opened"}.issubset(df.columns):
         df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
-    # Is the repo archived?
-    if "archived" in df.columns:
-        df["is_archived"] = df["archived"].astype(int)
-    # Description length
-    if "description" in df.columns:
-        df["description_length"] = df["description"].fillna("").apply(len)
-    # Topics count
-    if "topics" in df.columns:
-        df["topics_count"] = df["topics"].fillna("").apply(lambda x: len(x.split(",")))
     # Normalize or standardize features
     scaler = StandardScaler()
@@ -178,3 +199,4 @@ class DataSmolAgent(CodeAgent):
             print(f"CSV saved at: {csv_output}")
         return self.df

     # Numeric columns: log transformation for skewed features
     numeric_cols = df.select_dtypes(include=[np.number]).columns.to_list()
     for col in numeric_cols:
+        if (df[col] >= 0).all():
             df[f"log_{col}"] = np.log(df[col] + 1)
+    # Date-like columns extraction
+    for col in df.columns:
+        if "date" in col.lower() or "time" in col.lower() or col == "activity":
+            try:
+                df[col] = pd.to_datetime(df[col], errors='coerce')
+                if not df[col].isna().all():  # Only create features if we have valid dates
+                    df[f"{col}_year"] = df[col].dt.year
+                    df[f"{col}_month"] = df[col].dt.month
+                    df[f"{col}_day"] = df[col].dt.day
+                    # Calculate age (days since date)
+                    df[f"{col}_age_days"] = (pd.Timestamp.now() - df[col]).dt.days
+            except Exception:
+                pass
     # Repository age (days since creation)
     if "created_at" in df.columns:
         df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce")
         df["repo_age_days"] = (pd.Timestamp.now() - df["created_at"]).dt.days
     # Recent activity count (commits/issues in last 30/90 days)
+    if "activity" in df.columns and pd.api.types.is_datetime64_any_dtype(df["activity"]):
         now = pd.Timestamp.now()
         df["recent_activity_30d"] = ((now - df["activity"]).dt.days <= 30).astype(int)
         df["recent_activity_90d"] = ((now - df["activity"]).dt.days <= 90).astype(int)
     if {"issues_closed", "issues_opened"}.issubset(df.columns):
         df["issue_resolution_speed"] = df["issues_closed"] / (df["issues_opened"] + 1)
+    # Feature ratios
+    if {"stars", "forks"}.issubset(df.columns):
+        df["stars_to_forks_ratio"] = df["stars"] / (df["forks"] + 1)
+    if {"open_issues", "closed_issues"}.issubset(df.columns):
+        df["issues_ratio"] = df["closed_issues"] / (df["open_issues"] + df["closed_issues"] + 1)
+    # Non-numeric processing: encode categorical features
+    non_numeric = df.select_dtypes(include=["object"]).columns.to_list()
+    valid_cat = []
+    for col in non_numeric:
+        try:
+            pd.to_datetime(df[col], errors='raise')
+        except Exception:
+            valid_cat.append(col)
+    for col in valid_cat:
+        if col not in ["repo", "parent"]:  # Skip identifier columns
+            df[f"{col}_cat"] = df[col].astype("category").cat.codes
     # Normalize or standardize features
     scaler = StandardScaler()
             print(f"CSV saved at: {csv_output}")
         return self.df

Oracle/deepfundingoracle.py CHANGED Viewed

@@ -55,64 +55,97 @@ logging.basicConfig(
 def fetch_repo_metrics(repo_url):
     """
     Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
     """
     try:
         # Extract owner and repo name
         m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
         if not m:
-            return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
         owner, repo_name = m.group(1), m.group(2)
         api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
         headers = {}
         token = os.environ.get("GITHUB_API_TOKEN", "")
-        if token: headers["Authorization"] = f"token {token}"
-        r = requests.get(api_url, headers=headers)
         if r.status_code == 200:
             data = r.json()
-            # Log fetched data for debugging
-            print(f"[DEBUG] Fetched data for {repo_url}: {data}")
-            pulls_url = data.get("pulls_url", "").replace("{/state}", "")
-            pulls_count = len(requests.get(pulls_url, headers=headers).json()) if pulls_url else 0
-            activity = data.get("updated_at", "")
-            return {
                 "stargazers_count": data.get("stargazers_count", 0),
                 "forks_count": data.get("forks_count", 0),
                 "watchers_count": data.get("watchers_count", 0),
                 "open_issues_count": data.get("open_issues_count", 0),
-                "pulls_count": pulls_count,
-                "activity": activity,
                 "owner": owner,
                 "repo_name": repo_name,
-                "token": token
             }
         else:
             print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
-            return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
     except Exception as e:
         print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
-        return {"stargazers_count": 0, "forks_count": 0, "watchers_count": 0, "open_issues_count": 0, "pulls_count": 0, "activity": 0}
-##############################
-#  Feature Extraction
-##############################
-def load_data(file):
-    """
-    Dynamically load the dependency data CSV from the uploaded file.
-    Expects at least "repo" and "parent" columns.
-    """
-    try:
-        print("[INFO] Loading data from uploaded file...")
-        start_time = time.time()
-        # Read the uploaded file directly into a DataFrame
-        df = pd.read_csv(file)
-        end_time = time.time()
-        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
-        return df
-    except Exception as e:
-        print("[ERROR] Error loading data:", e)
-        return None
 def fetch_github_features(df):
     """
@@ -122,45 +155,27 @@ def fetch_github_features(df):
     """
     print("[INFO] Fetching GitHub features for repositories...")
     start_time = time.time()
-    stars_list = []
-    forks_list = []
-    watchers_list = []
-    issues_list = []
-    pulls_list = []
-    activity_list = []
-    contributors_list = []
-    dependencies_list =[]
     cache = {}
     def get_metrics(repo_url):
         if repo_url in cache:
-            print(f"[DEBUG] Cached data for {repo_url}: {cache[repo_url]}")
             return cache[repo_url]
         val = fetch_repo_metrics(repo_url)
-        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")  # <-- Add this line
-        try:
-            m = re.search(r"github\.com/([^/]+)/([^/]+)",repo_url)
-            if m:
-                owner, repo_name = m.group(1), m.group(2)
-                pkg_url = f"https://api.github.com/repos/{owner}/{repo_name}/packages.json"
-                headers = {}
-                token = os.environ.get("GITHUB_API_TOKEN", "")
-                if token:
-                    headers["Authorization"] = f"token {token}"
-                pkg_resp = requests.get(pkg_url, headers=headers)
-                if pkg_resp.status_code ==200:
-                    pkg_data = pkg_resp.json()
-                    content = base64.b64decode(pkg_data["content",""]).decode("utf-8")
-                    pkg_json = json.loads(content)
-                    dependencies = pkg_json.get("dependencies", {})
-                    val["dependencies_count"] = len(dependencies)
-                else:
-                    val["dependencies_count"] = 0
-            else:
-                val["dependencies_count"] = 0
-        except Exception:
-            val["dependencies_count"] = 0
         cache[repo_url] = val
         return val
@@ -168,38 +183,82 @@ def fetch_github_features(df):
         futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
             res = fut.result()
-            stars_list.append(res.get("stargazers_count", 0))
-            forks_list.append(res.get("forks_count", 0))
-            watchers_list.append(res.get("watchers_count", 0))
-            issues_list.append(res.get("open_issues_count", 0))
-            pulls_list.append(res.get("pulls_count", 0))
-            activity_list.append(res.get("activity", 0))
-            dependencies_list.append(res.get("dependencies_count", 0))
-            # Fetch contributors count
-            try:
-                contributors_url = f"https://api.github.com/repos/{res['owner']}/{res['repo_name']}/contributors"
-                headers = {"Authorization": f"token {res['token']}"}
-                contributors_response = requests.get(contributors_url, headers=headers)
-                if contributors_response.status_code == 200:
-                    contributors_list.append(len(contributors_response.json()))
-                else:
-                    contributors_list.append(0)
-            except Exception:
-                contributors_list.append(0)
-    df["stars"] = stars_list
-    df["forks"] = forks_list
-    df["watchers"] = watchers_list
-    df["open_issues"] = issues_list
-    df["pulls"] = pulls_list
-    df["activity"] = activity_list
-    df["contributors"] = contributors_list
-    df["dependencies_count"] = dependencies_list
     end_time = time.time()
     print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
     return df
 def timeout_handler(signum, frame):
     raise TimeoutError("LLama model prediction timed out.")
@@ -244,7 +303,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
         feature_weights = calculate_fallback_weights(df)
         print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
-    # Ensure numeric columns are properly formatted
     for feature in feature_weights.keys():
         if feature in df.columns:
             df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
@@ -266,19 +324,6 @@ def assign_base_weight(df, max_workers=32, llm_retries=2, llm_delay=0):
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
-def calculate_fallback_weights(df):
-    """
-    Dynamically calculate fallback feature weights based on feature variance and correlation with the target.
-    """
-    print("[INFO] Calculating fallback feature weights...")
-    numeric_cols = df.select_dtypes(include=[np.number]).columns
-    feature_variances = df[numeric_cols].var()
-    total_variance = feature_variances.sum()
-    # Assign weights proportional to feature variance
-    fallback_weights = {col: var / total_variance for col, var in feature_variances.items() if total_variance > 0}
-    return fallback_weights
 def sanity_check_weights(df):
     """
     Sanity-checks LLM weights by comparing them with other metrics.
@@ -389,6 +434,7 @@ def validate_features(df):
 def validate_target(df):
     """
     Validates the target variable to ensure it has sufficient variance.
     """
     print("[INFO] Validating target variable 'base_weight'...")
     target = "base_weight"
@@ -398,7 +444,12 @@ def validate_target(df):
     variance = df[target].var()
     print(f"[DEBUG] Target variable variance: {variance}")
     if variance < 1e-6:
-        raise ValueError(f"Target variable '{target}' has insufficient variance. Please check feature values.")
     return df

 def fetch_repo_metrics(repo_url):
     """
     Fetch GitHub metrics (stars, forks, watchers, open issues, pull requests, and activity) given a repository URL.
+    Assumes repo_url is in the form "https://github.com/owner/repo".
+    Handles API failures and malformed URLs gracefully.
     """
+    # Default values in case of failure
+    default_metrics = {
+        "stargazers_count": 0,
+        "forks_count": 0,
+        "watchers_count": 0,
+        "open_issues_count": 0,
+        "pulls_count": 0,
+        "activity": "",
+        "contributors": 0,
+        "dependencies_count": 0
+    }
     try:
         # Extract owner and repo name
         m = re.search(r"github\.com/([^/]+)/([^/]+)", repo_url)
         if not m:
+            print(f"[WARN] Malformed GitHub URL: {repo_url}")
+            return default_metrics
         owner, repo_name = m.group(1), m.group(2)
         api_url = f"https://api.github.com/repos/{owner}/{repo_name}"
         headers = {}
         token = os.environ.get("GITHUB_API_TOKEN", "")
+        if token:
+            headers["Authorization"] = f"token {token}"
+        # Fetch main repository data
+        r = requests.get(api_url, headers=headers, timeout=10)
         if r.status_code == 200:
             data = r.json()
+            metrics = {
                 "stargazers_count": data.get("stargazers_count", 0),
                 "forks_count": data.get("forks_count", 0),
                 "watchers_count": data.get("watchers_count", 0),
                 "open_issues_count": data.get("open_issues_count", 0),
+                "activity": data.get("updated_at", ""),
                 "owner": owner,
                 "repo_name": repo_name,
+                "dependencies_count": 0
             }
+            # Try to fetch pull requests count
+            try:
+                pulls_url = f"{api_url}/pulls"
+                pulls_resp = requests.get(pulls_url, headers=headers, timeout=5)
+                metrics["pulls_count"] = len(pulls_resp.json()) if pulls_resp.status_code == 200 else 0
+            except Exception as e:
+                print(f"[WARN] Failed to fetch pulls for {repo_url}: {e}")
+                metrics["pulls_count"] = 0
+            # Try to fetch contributors count
+            try:
+                contributors_url = f"{api_url}/contributors"
+                contributors_resp = requests.get(contributors_url, headers=headers, timeout=5)
+                metrics["contributors"] = len(contributors_resp.json()) if contributors_resp.status_code == 200 else 0
+            except Exception as e:
+                print(f"[WARN] Failed to fetch contributors for {repo_url}: {e}")
+                metrics["contributors"] = 0
+            # Try to estimate dependencies from package files
+            try:
+                # Look for package.json for Node.js projects
+                package_json_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/package.json"
+                package_resp = requests.get(package_json_url, timeout=5)
+                if package_resp.status_code == 200:
+                    package_data = package_resp.json()
+                    deps = package_data.get("dependencies", {})
+                    dev_deps = package_data.get("devDependencies", {})
+                    metrics["dependencies_count"] = len(deps) + len(dev_deps)
+                else:
+                    # Try requirements.txt for Python projects
+                    req_txt_url = f"https://raw.githubusercontent.com/{owner}/{repo_name}/master/requirements.txt"
+                    req_resp = requests.get(req_txt_url, timeout=5)
+                    if req_resp.status_code == 200:
+                        deps = [line for line in req_resp.text.split('\n') if line.strip() and not line.startswith('#')]
+                        metrics["dependencies_count"] = len(deps)
+            except Exception as e:
+                print(f"[WARN] Failed to fetch dependencies for {repo_url}: {e}")
+                metrics["dependencies_count"] = 0
+            return metrics
         else:
             print(f"[ERROR] Failed to fetch data for {repo_url}: {r.status_code}")
+            return default_metrics
     except Exception as e:
         print(f"[ERROR] Exception while fetching data for {repo_url}: {e}")
+        return default_metrics
 def fetch_github_features(df):
     """
     """
     print("[INFO] Fetching GitHub features for repositories...")
     start_time = time.time()
+    # Initialize lists for storing fetched data
+    metrics_lists = {
+        "stars": [],
+        "forks": [],
+        "watchers": [],
+        "open_issues": [],
+        "pulls": [],
+        "activity": [],
+        "contributors": [],
+        "dependencies_count": []
+    }
     cache = {}
     def get_metrics(repo_url):
         if repo_url in cache:
+            print(f"[DEBUG] Cached GitHub data for {repo_url}: {cache[repo_url]}")
             return cache[repo_url]
         val = fetch_repo_metrics(repo_url)
+        print(f"[DEBUG] Extracted GitHub data for {repo_url}: {val}")
         cache[repo_url] = val
         return val
         futures = {executor.submit(get_metrics, row['repo']): i for i, row in df.iterrows()}
         for fut in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching metrics"):
             res = fut.result()
+            metrics_lists["stars"].append(res.get("stargazers_count", 0))
+            metrics_lists["forks"].append(res.get("forks_count", 0))
+            metrics_lists["watchers"].append(res.get("watchers_count", 0))
+            metrics_lists["open_issues"].append(res.get("open_issues_count", 0))
+            metrics_lists["pulls"].append(res.get("pulls_count", 0))
+            metrics_lists["activity"].append(res.get("activity", ""))
+            metrics_lists["contributors"].append(res.get("contributors", 0))
+            metrics_lists["dependencies_count"].append(res.get("dependencies_count", 0))
+    # Add the fetched data to the DataFrame
+    for key, values in metrics_lists.items():
+        df[key] = values
     end_time = time.time()
     print(f"[INFO] GitHub features fetched successfully in {end_time - start_time:.2f} seconds.")
     return df
+def calculate_fallback_weights(df):
+    """
+    Dynamically calculate fallback feature weights based on feature variance.
+    """
+    print("[INFO] Calculating fallback feature weights...")
+    numeric_cols = ['stars', 'forks', 'watchers', 'open_issues', 'pulls', 'contributors', 'dependencies_count']
+    # Filter to only include columns that exist in the DataFrame
+    valid_cols = [col for col in numeric_cols if col in df.columns]
+    # Create default weights
+    default_weights = {
+        'stars': 0.3,
+        'forks': 0.2,
+        'watchers': 0.2,
+        'open_issues': 0.1,
+        'pulls': 0.1,
+        'contributors': 0.05,
+        'dependencies_count': 0.05
+    }
+    # If any data exists, calculate variance-based weights
+    if len(valid_cols) > 0 and df[valid_cols].sum().sum() > 0:
+        # Calculate variance for each feature
+        feature_variances = df[valid_cols].var()
+        total_variance = feature_variances.sum()
+        # If meaningful variance exists, use it for weights
+        if total_variance > 0:
+            weights = {col: var / total_variance for col, var in feature_variances.items()}
+            # Normalize to ensure sum is 1.0
+            sum_weights = sum(weights.values())
+            if sum_weights > 0:
+                weights = {k: v / sum_weights for k, v in weights.items()}
+            return weights
+    # Return default weights if we couldn't calculate meaningful ones
+    print("[INFO] Using default fallback weights")
+    return default_weights
+##############################
+#  Feature Extraction
+##############################
+def load_data(file):
+    """
+    Dynamically load the dependency data CSV from the uploaded file.
+    Expects at least "repo" and "parent" columns.
+    """
+    try:
+        print("[INFO] Loading data from uploaded file...")
+        start_time = time.time()
+        # Read the uploaded file directly into a DataFrame
+        df = pd.read_csv(file)
+        end_time = time.time()
+        print(f"[INFO] Data loaded successfully in {end_time - start_time:.2f} seconds.")
+        return df
+    except Exception as e:
+        print("[ERROR] Error loading data:", e)
+        return None
 def timeout_handler(signum, frame):
     raise TimeoutError("LLama model prediction timed out.")
         feature_weights = calculate_fallback_weights(df)
         print(f"[INFO] Fallback feature weights: {feature_weights}", flush=True)
     for feature in feature_weights.keys():
         if feature in df.columns:
             df[feature] = pd.to_numeric(df[feature], errors='coerce').fillna(0)
     logging.info(f"[INFO] Base weights assigned successfully in {end_time - start_time:.2f} seconds.")
     return df
 def sanity_check_weights(df):
     """
     Sanity-checks LLM weights by comparing them with other metrics.
 def validate_target(df):
     """
     Validates the target variable to ensure it has sufficient variance.
+    If variance is insufficient, adds small random noise to create variance.
     """
     print("[INFO] Validating target variable 'base_weight'...")
     target = "base_weight"
     variance = df[target].var()
     print(f"[DEBUG] Target variable variance: {variance}")
     if variance < 1e-6:
+        print("[WARN] Target variable has insufficient variance. Adding small random noise...")
+        # Add small random noise to introduce variance
+        np.random.seed(42)  # For reproducibility
+        noise = np.random.normal(0.5, 0.1, size=len(df))
+        df[target] = noise
+        print(f"[INFO] New target variable variance: {df[target].var()}")
     return df