FelixPhilip committed on
Commit
bf9b592
·
1 Parent(s): d3e9b26
Files changed (2) hide show
  1. Oracle/deepfundingoracle.py +69 -4
  2. app.py +194 -51
Oracle/deepfundingoracle.py CHANGED
@@ -47,8 +47,6 @@ from sklearn.preprocessing import RobustScaler
47
  from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
48
  from sklearn.ensemble import RandomForestRegressor
49
  from sklearn.preprocessing import StandardScaler
50
- import matplotlib.pyplot as plt
51
- import seaborn as sns
52
  from scipy.special import log1p, expm1
53
  from sklearn.preprocessing import RobustScaler
54
  from sklearn.metrics import mean_squared_error
@@ -65,9 +63,76 @@ logging.basicConfig(
65
  logging.StreamHandler(sys.stdout)
66
  ],
67
  level=logging.INFO,
68
- format="%(asctime)s - %(levelname)s - %(message)s"
69
- )
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  ##############################
73
  # GitHub API helper: Fetch repository metrics
 
47
  from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV,KFold
48
  from sklearn.ensemble import RandomForestRegressor
49
  from sklearn.preprocessing import StandardScaler
 
 
50
  from scipy.special import log1p, expm1
51
  from sklearn.preprocessing import RobustScaler
52
  from sklearn.metrics import mean_squared_error
 
63
  logging.StreamHandler(sys.stdout)
64
  ],
65
  level=logging.INFO,
66
+ format="%(asctime)s - %(levelname)s - %(message)s")
 
67
 
68
+ # Add these functions to make the pipeline importable by app.py
69
+
70
+
71
def prepare_dataset(file_path):
    """
    Load a dependency CSV and run every preparation stage on it.

    After loading and validating the CSV, the frame is passed in order
    through ``fetch_github_features``, ``add_derived_features``,
    ``clean_data`` and ``generate_all_base_weights`` (all defined elsewhere
    in this module); each stage receives the previous stage's result.

    Args:
        file_path: Path to the input CSV, or any file-like object that
            ``pandas.read_csv`` accepts (e.g. an upload handle from Gradio).

    Returns:
        The DataFrame returned by the final stage,
        ``generate_all_base_weights``.

    Raises:
        ValueError: If the CSV does not contain both a 'repo' and a
            'parent' column.
    """
    logging.info(f"Preparing dataset from {file_path}")

    # pandas.read_csv handles both path strings and file objects, so the
    # input needs no type dispatch before loading.
    df = pd.read_csv(file_path)

    # Fail fast, before any further processing, when the schema is wrong.
    if not {"repo", "parent"}.issubset(df.columns):
        raise ValueError("Input CSV must contain 'repo' and 'parent' columns.")

    # Run the pipeline steps; order matters because each one consumes the
    # frame produced by the step before it.
    df = fetch_github_features(df)
    df = add_derived_features(df)
    df = clean_data(df)
    df = generate_all_base_weights(df)

    return df
106
+
107
+
108
def run_full_pipeline(input_file, output_file="submission_enhanced.csv"):
    """
    Execute the DeepFunding Oracle pipeline end to end.

    The input is prepared with ``prepare_dataset``, passed through
    ``train_predict_weight`` and ``normalize_and_clip_weights``, and the
    result is handed to ``create_submission_csv`` together with
    ``output_file``.

    Args:
        input_file: Path to input CSV file
        output_file: Path for output CSV file

    Returns:
        The DataFrame produced by the last processing stage.
    """
    logging.info("--- Starting DeepFunding Oracle Pipeline ---")

    # Stage 1: load the CSV and build the prepared dataset.
    frame = prepare_dataset(input_file)

    # Stages 2-3: model training/prediction, then weight normalisation.
    for stage in (train_predict_weight, normalize_and_clip_weights):
        frame = stage(frame)

    # Stage 4: persist the result.
    create_submission_csv(frame, output_file)

    logging.info("--- Pipeline Completed Successfully ---")
    return frame
136
 
137
  ##############################
138
  # GitHub API helper: Fetch repository metrics
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import os
2
  import gradio as gr
3
- from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv
4
  import pandas as pd
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
@@ -8,59 +7,203 @@ import numpy as np
8
  import time
9
  import io
10
  from PIL import Image
 
 
 
 
 
 
 
 
 
11
 
12
  def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
 
 
 
13
  start_time = time.time()
14
- progress(0, desc="Preparing dataset...")
15
- df = prepare_dataset(file.name)
16
- progress(0.3, desc="Predicting weights...")
17
- df = train_predict_weight(df)
18
- progress(0.6, desc="Saving results to CSV...")
19
- csv_path = create_submission_csv(df, "submission.csv")
20
- progress(0.8, desc="Generating graphs...")
21
-
22
- # Feature distribution plot
23
- dist_fig = plt.figure(figsize=(15, 10))
24
- numeric_cols = df.select_dtypes(include=[np.number]).columns
25
- df[numeric_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
26
- plt.suptitle("Feature Distributions", fontsize=16)
27
- dist_buf = io.BytesIO()
28
- plt.savefig(dist_buf, format='png')
29
- dist_buf.seek(0)
30
- plt.close(dist_fig)
31
- dist_img = Image.open(dist_buf)
32
-
33
- # Correlation matrix plot
34
- corr_fig = plt.figure(figsize=(12, 8))
35
- correlation_matrix = df[numeric_cols].corr()
36
- sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
37
- plt.title("Feature Correlation Matrix", fontsize=16)
38
- corr_buf = io.BytesIO()
39
- plt.savefig(corr_buf, format='png')
40
- corr_buf.seek(0)
41
- plt.close(corr_fig)
42
- corr_img = Image.open(corr_buf)
43
-
44
- progress(1, desc="Done!")
45
- elapsed = time.time() - start_time
46
- preview = df.head().to_csv(index=False)
47
- return preview, csv_path, dist_img, corr_img, f"Analysis completed in {elapsed:.2f} seconds."
48
-
49
- iface = gr.Interface(
50
- fn=analyze_file,
51
- inputs=gr.File(label="Upload CSV"),
52
- outputs=[
53
- gr.Textbox(label="Preview of Results"),
54
- gr.File(label="Download CSV"),
55
- gr.Image(label="Feature Distributions"),
56
- gr.Image(label="Feature Correlation Matrix"),
57
- gr.Textbox(label="Status/Timing Info")
58
- ],
59
- title="DeepFunding Oracle",
60
- description="Upload a CSV of repo-parent relationships; see analysis progress, get graphs, and download results as CSV.",
61
- allow_flagging="never"
62
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  if __name__ == "__main__":
65
  port = int(os.environ.get("PORT", 7860))
66
- iface.launch(server_name="0.0.0.0", server_port=port)
 
 
 
 
 
 
1
  import os
2
  import gradio as gr
 
3
  import pandas as pd
4
  import matplotlib.pyplot as plt
5
  import seaborn as sns
 
7
  import time
8
  import io
9
  from PIL import Image
10
+ import logging
11
+
12
+ # Import the functions from deepfundingoracle
13
+ from Oracle.deepfundingoracle import prepare_dataset, train_predict_weight, create_submission_csv, \
14
+ normalize_and_clip_weights
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO)
18
+
19
 
20
def analyze_file(file, progress=gr.Progress(track_tqdm=True)):
    """
    Run the full analysis on an uploaded CSV and build the UI outputs.

    Args:
        file: The upload handed over by the Gradio File component. Either an
            object exposing the temp-file path as ``.name`` or a plain path
            string is accepted; ``None`` (nothing uploaded) is reported as an
            error instead of crashing.
        progress: Gradio progress tracker used to report each step.

    Returns:
        A 5-tuple ``(preview, csv_path, dist_img, corr_img, status)``.
        On failure ``preview`` and ``status`` both carry the error message,
        ``csv_path`` is ``None`` and both images are blank placeholders.
    """
    start_time = time.time()

    try:
        if file is None:
            raise ValueError("No file uploaded. Please upload a CSV file first.")
        # Accept both a temp-file wrapper (has .name) and a bare path string.
        input_path = getattr(file, "name", file)

        # Step 1: Prepare dataset
        progress(0, desc="Preparing dataset...")
        df = prepare_dataset(input_path)

        # Step 2: Train model and predict weights
        progress(0.3, desc="Training model and predicting weights...")
        df = train_predict_weight(df)

        # Step 3: Normalize weights
        progress(0.5, desc="Normalizing weights...")
        df = normalize_and_clip_weights(df)

        # Step 4: Save results
        progress(0.6, desc="Saving results to CSV...")
        output_filename = "submission.csv"
        create_submission_csv(df, output_filename)

        # Step 5: Generate visualizations (restricted to a fixed set of
        # columns, and only those that are present and numeric).
        progress(0.8, desc="Generating graphs...")
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        wanted = ['stars', 'forks', 'watchers', 'contributors', 'pulls', 'final_weight']
        plot_cols = [col for col in numeric_cols if col in wanted]

        dist_img = _figure_to_image(_plot_distributions(df, plot_cols))
        corr_img = _figure_to_image(_plot_correlation(df, plot_cols))

        progress(1, desc="Done!")
        elapsed = time.time() - start_time

        # Create a summary preview
        summary_df = df[['repo', 'parent', 'final_weight']].head(10)
        preview = f"Top 10 Results:\n{summary_df.to_string(index=False)}\n\nTotal repositories analyzed: {len(df)}"

        return (
            preview,
            output_filename,  # path handed to the File output component
            dist_img,
            corr_img,
            f"✅ Analysis completed successfully in {elapsed:.2f} seconds.\n📥 Results file ready for download!"
        )

    except Exception as e:
        # UI boundary: report the failure to the user instead of raising,
        # but keep the full traceback in the log.
        logging.exception("Error during analysis: %s", e)
        elapsed = time.time() - start_time
        error_msg = f"❌ Error: {str(e)}\nTime elapsed: {elapsed:.2f} seconds"

        # Return empty images and error message
        empty_img = Image.new('RGB', (800, 600), color='white')
        return error_msg, None, empty_img, empty_img, error_msg


def _plot_distributions(df, plot_cols):
    """Return a figure with one histogram per column in ``plot_cols``."""
    if not plot_cols:
        # Nothing to plot: hand back an empty canvas of the usual size.
        return plt.figure(figsize=(15, 10))

    # DataFrame.hist opens its own figure when no ``ax`` is passed, so grab
    # that figure afterwards rather than pre-creating one that would stay
    # empty and leave the real one unclosed.
    df[plot_cols].hist(bins=20, figsize=(15, 10), color="skyblue", edgecolor="black")
    fig = plt.gcf()
    fig.suptitle("Feature Distributions", fontsize=16)
    fig.tight_layout()
    return fig


def _plot_correlation(df, plot_cols):
    """Return a figure with the correlation heatmap of ``plot_cols``."""
    fig = plt.figure(figsize=(12, 8))
    # A correlation matrix needs at least two columns to be meaningful.
    if len(plot_cols) > 1:
        ax = fig.gca()
        correlation_matrix = df[plot_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
                    linewidths=0.5, ax=ax)
        ax.set_title("Feature Correlation Matrix", fontsize=16)
    return fig


def _figure_to_image(fig):
    """Render ``fig`` to an in-memory PNG, close it, and return a PIL image."""
    buf = io.BytesIO()
    try:
        fig.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    finally:
        # Always release the figure, even if rendering fails, so a
        # long-running server does not accumulate open figures.
        plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
102
+
103
+
104
+ # Custom CSS for better styling
105
# Stylesheet injected into the page; the class names are attached to
# components below via ``elem_classes``.
custom_css = """
.download-button {
    background-color: #4CAF50 !important;
    color: white !important;
    font-weight: bold !important;
}
.status-box {
    font-family: monospace;
    padding: 10px;
    border-radius: 5px;
}
"""

# Build the Gradio UI: upload + button, status, preview, download, plots.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as iface:
    gr.Markdown("""
    # 🚀 DeepFunding Oracle

    Upload a CSV file containing repository dependencies with 'repo' and 'parent' columns.
    The system will:
    1. **Fetch** GitHub metrics for each repository
    2. **Generate** importance weights using AI
    3. **Train** a model to predict final contribution weights
    4. **Normalize** weights so they sum to 1 per parent

    ⚠️ **Note**: Set `GITHUB_API_TOKEN` environment variable for better API rate limits.
    """)

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(
                label="Upload CSV File",
                file_types=[".csv"],
                elem_id="file-upload"
            )
            analyze_btn = gr.Button("🔍 Analyze", variant="primary", size="lg")

        with gr.Column(scale=2):
            status_output = gr.Textbox(
                label="Status",
                lines=3,
                elem_classes="status-box"
            )

    with gr.Row():
        preview_output = gr.Textbox(
            label="Preview of Results",
            lines=15,
            show_copy_button=True
        )

    with gr.Row():
        download_output = gr.File(
            label="📥 Download Results CSV",
            visible=True,
            elem_classes="download-button"
        )

    with gr.Row():
        with gr.Column():
            dist_plot = gr.Image(label="Feature Distributions")
        with gr.Column():
            corr_plot = gr.Image(label="Feature Correlation Matrix")

    # Browser-side snippet: half a second after it runs, click the link
    # inside the element carrying the ``download-button`` class, if present.
    download_js = """
    () => {
        setTimeout(() => {
            const downloadButton = document.querySelector('.download-button a');
            if (downloadButton) {
                downloadButton.click();
            }
        }, 500);
    }
    """

    # Run the analysis on click, then run the download snippet afterwards.
    # NOTE(review): `_js` is the Gradio 3.x keyword; Gradio 4 renamed it to
    # `js` — confirm against the pinned Gradio version.
    analyze_btn.click(
        fn=analyze_file,
        inputs=[file_input],
        outputs=[preview_output, download_output, dist_plot, corr_plot, status_output]
    ).then(
        fn=None,
        inputs=None,
        outputs=None,
        _js=download_js  # This triggers automatic download
    )

    # The example path is a placeholder; only register it when the file is
    # actually present so the UI never offers a file that does not exist.
    example_csv = "example_dependencies.csv"
    if os.path.exists(example_csv):
        gr.Examples(
            examples=[[example_csv]],
            inputs=file_input,
            outputs=[preview_output, download_output, dist_plot, corr_plot, status_output],
            fn=analyze_file,
            cache_examples=False,
        )
201
 
202
if __name__ == "__main__":
    # The PORT environment variable, when set, overrides the 7860 fallback.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": int(os.environ.get("PORT", 7860)),
        "share": False,
        "show_error": True,
    }
    iface.launch(**launch_options)