pavlyhalim committed on
Commit
dd5d2d2
·
1 Parent(s): 316f472

Adding model and demo

Browse files
Files changed (4) hide show
  1. .DS_Store +0 -0
  2. demo_app.py +526 -0
  3. model.joblib +3 -0
  4. requirements.txt +8 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
demo_app.py ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+ import plotly.graph_objects as go
6
+ from sklearn.ensemble import RandomForestRegressor
7
+
8
class GEMMPredictor:
    """Predict GEMM performance metrics with a pre-trained stacked model.

    The model is loaded from disk with joblib. For one GEMM configuration it
    is asked to predict the targets listed in ``target_features`` (runtime,
    power, Energy, TFlops), in that order.
    """

    # Arithmetic intensity (FLOPs per byte) above which a problem is labelled
    # compute bound instead of memory bound.
    COMPUTE_BOUND_THRESHOLD = 59
    # Bytes per matrix element; the traffic estimate assumes single precision.
    BYTES_PER_ELEMENT = 4

    def __init__(self, model_path='model.joblib'):
        """Load the serialized model from *model_path* and set up feature lists.

        Raises whatever ``joblib.load`` raises when the file is missing or
        cannot be deserialized.
        """
        # NOTE: joblib.load unpickles the file, so only load models that come
        # from a trusted source.
        self.stacked_model = joblib.load(model_path)
        self.initialize_features()

    def initialize_features(self):
        """Initialize features used by the model"""
        # Core matrix features
        self.core_features = [
            'm', 'n', 'k',
            'blocksize1', 'blocksize2', 'blocksize3',
        ]
        # Derived features
        self.derived_features = [
            'arithmetic_intensity',
            'bytes_accessed',
            'total_flops',
        ]
        # Categorical features
        self.categorical_features = ['Layout']
        # Target features, in the order predict() reads them from the model output
        self.target_features = [
            'runtime',
            'power',
            'Energy',
            'TFlops',
        ]
        self.numerical_features = self.core_features + self.derived_features

    def calculate_gemm_characteristics(self, m, n, k, blocksize1, blocksize2, blocksize3):
        """Calculate GEMM-specific characteristics for a single problem size.

        The block sizes are accepted to keep the call signature stable but do
        not influence any of the returned values.

        Returns:
            dict with ``total_flops``, ``bytes_accessed``,
            ``arithmetic_intensity`` (FLOPs per byte) and ``bound_type``
            (``'compute'`` or ``'memory'``).
        """
        total_flops = 2 * m * n * k  # 2 operations per FMA
        bytes_accessed = (m * k + k * n + m * n) * self.BYTES_PER_ELEMENT
        # When no bytes are accessed the ratio is undefined; report zero
        # intensity instead of dividing by zero.
        arithmetic_intensity = total_flops / bytes_accessed if bytes_accessed else 0.0
        if arithmetic_intensity > self.COMPUTE_BOUND_THRESHOLD:
            bound_type = 'compute'
        else:
            bound_type = 'memory'

        return {
            'total_flops': total_flops,
            'bytes_accessed': bytes_accessed,
            'arithmetic_intensity': arithmetic_intensity,
            'bound_type': bound_type,
        }

    def _add_gemm_columns(self, df):
        """Add total_flops, bytes_accessed and arithmetic_intensity columns to *df*.

        Column-wise counterpart of ``calculate_gemm_characteristics``; it
        needs the ``m``, ``n`` and ``k`` columns and modifies *df* in place.
        """
        for dim in ('m', 'n', 'k'):
            df[dim] = pd.to_numeric(df[dim], errors='coerce')
        m, n, k = df['m'], df['n'], df['k']
        df['total_flops'] = 2 * m * n * k
        df['bytes_accessed'] = (m * k + k * n + m * n) * self.BYTES_PER_ELEMENT
        # Rows with zero bytes yield inf/NaN here rather than raising.
        df['arithmetic_intensity'] = df['total_flops'] / df['bytes_accessed']
        return df

    def get_default_numeric_values(self):
        """Return default values for missing numeric features"""
        return {
            # Memory-related defaults
            'total_memory': 12288,   # 12GB for RTX 4070
            'free_memory': 10240,    # Assuming 80% free
            'used_memory': 2048,     # Assuming 20% used
            'mem_util': 20.0,        # 20% utilization
            'mem_util2': 20.0,       # Secondary memory utilization

            # GPU state defaults
            'temp': 65.0,            # Default temperature
            'gpu_util': 80.0,        # Default GPU utilization
            'gpu_util1': 80.0,       # Secondary GPU utilization
            'clock_sm': 2475,        # Default SM clock for RTX 4070
            'power_limit': 200.0,    # Default power limit
            # NOTE(review): 'clocks.meme' looks like a typo for 'clocks.mem',
            # but the key is left as-is because it may have to match the
            # column name the model was trained with - confirm before renaming.
            'clocks.meme': 2000,     # Memory clock speed

            'alpha': 1.0,            # Default scaling factor
            'beta': 0.0,             # Default scaling factor
            'problem_size_m': 1024,
            'problem_size_n': 1024,
            'problem_size_k': 1024,
        }

    def get_default_categorical_values(self):
        """Return default values for missing categorical features"""
        return {
            'stage': 'main',
            'kernel_name': 'cutlass_simt_sgemm_128x128_8x2_nn_align1',
            'computation_pattern': 'GEMM',
            'combination_type': 'standard',
            'state': 'active',
            'uses_shared_memory': 'true',
            'gpu_name': 'RTX4070',
        }

    def prepare_input_data(self, input_dict):
        """Build the single-row DataFrame handed to the model.

        Defaults are filled in for every feature the caller did not supply,
        the derived GEMM features are added, and categorical / numerical
        columns are coerced to ``str`` / numeric dtypes.

        Raises:
            KeyError: if any of the core features (m, n, k, blocksize1-3) is
                missing from *input_dict*.
        """
        complete_input = {
            **self.get_default_numeric_values(),
            **self.get_default_categorical_values(),
        }
        complete_input.update(input_dict)

        missing = [name for name in self.core_features if name not in complete_input]
        if missing:
            raise KeyError(f"Missing required input feature(s): {', '.join(missing)}")

        df = pd.DataFrame([complete_input])

        characteristics = self.calculate_gemm_characteristics(
            *(complete_input[name] for name in self.core_features)
        )
        for name in ('total_flops', 'bytes_accessed', 'arithmetic_intensity'):
            df[name] = characteristics[name]

        for col in self.categorical_features:
            if col in df.columns:
                df[col] = df[col].astype(str)

        for col in self.numerical_features:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')

        return df

    @staticmethod
    def estimate_power(df):
        """Fill missing ``power`` values with a linear estimate from ``total_flops``.

        Adds an ``estimated_power`` column and modifies *df* in place. A
        missing ``power`` column is created from the estimate.
        """
        BASE_POWER = 30
        MAX_POWER = 200
        MAX_TFLOPS = 40

        estimated = BASE_POWER + (
            (MAX_POWER - BASE_POWER) *
            (df['total_flops'] / (MAX_TFLOPS * 1e12))
        )
        df['estimated_power'] = estimated

        if 'power' in df.columns:
            # Coerce first so placeholder strings become NaN and get filled too.
            df['power'] = pd.to_numeric(df['power'], errors='coerce').fillna(estimated)
        else:
            df['power'] = estimated

        return df

    @staticmethod
    def filter_power_bounds(df):
        """Return the rows whose ``power`` is within bounds or missing."""
        MIN_POWER = 25   # Minimum idle power
        MAX_POWER = 200  # Maximum TDP

        keep = df['power'].between(MIN_POWER, MAX_POWER) | df['power'].isna()
        # Copy so later column assignments do not write into a view of *df*.
        return df[keep].copy()

    @staticmethod
    def impute_power(df):
        """Impute missing ``power`` values with a random forest fit on known rows.

        Adds a ``total_elements`` column and modifies *df* in place. When no
        value is missing, or no row has a known power to learn from, *df* is
        returned without fitting a model.
        """
        df['total_elements'] = df['m'] * df['n'] * df['k']

        missing = df['power'].isna()
        if not missing.any() or missing.all():
            # Nothing to predict, or nothing to train on: fitting/predicting
            # on an empty frame would raise inside scikit-learn.
            return df

        features = ['total_elements', 'total_flops', 'arithmetic_intensity']
        model = RandomForestRegressor(n_estimators=100)
        model.fit(df.loc[~missing, features], df.loc[~missing, 'power'])
        df.loc[missing, 'power'] = model.predict(df.loc[missing, features])

        return df

    def preprocess_data(self, df):
        """Preprocess data focusing on GEMM characteristics with improved power handling

        Works on a copy of *df*: placeholder strings become NaN, derived GEMM
        columns are added, ``power`` is filled and bounded, and every numeric
        feature present is clipped to its 1st-99th percentile range with
        remaining gaps filled by the median. Errors are printed and re-raised.
        """
        print("\nPreprocessing data...")

        try:
            df_processed = df.copy()
            df_processed = df_processed.replace('[N/A]', np.nan)
            df_processed = df_processed.replace('', np.nan)
            df_processed = self._add_gemm_columns(df_processed)

            df_processed['Layout'] = df_processed['Layout'].astype(str)

            # estimate_power fills every gap, so impute_power only has work
            # to do if that order is ever changed.
            df_processed = self.estimate_power(df_processed)
            df_processed = self.impute_power(df_processed)
            df_processed = self.filter_power_bounds(df_processed)

            present = [col for col in self.numerical_features if col in df_processed.columns]
            for col in present:
                values = pd.to_numeric(df_processed[col], errors='coerce')
                lower, upper = values.quantile(0.01), values.quantile(0.99)
                values = values.clip(lower, upper)
                df_processed[col] = values.fillna(values.median())

            print("Data preprocessing completed successfully")
            print("Features summary:")
            print(df_processed[present].describe())

            return df_processed

        except Exception as e:
            print(f"Error in preprocess_data: {str(e)}")
            raise

    def predict(self, input_data):
        """Make predictions using the stacked model

        Args:
            input_data: dict with at least the core features (m, n, k,
                blocksize1-3); other features fall back to defaults.

        Returns:
            dict mapping each name in ``target_features`` to its predicted
            value, plus a ``'characteristics'`` entry holding the output of
            ``calculate_gemm_characteristics`` for the same input.

        Raises:
            ValueError: if the model returns fewer values per sample than
                there are target features.
        """
        df = self.prepare_input_data(input_data)
        predictions = self.stacked_model.predict(df)

        # Map the first (only) sample's outputs to the target feature names.
        first_row = np.atleast_1d(predictions[0])
        if len(first_row) < len(self.target_features):
            raise ValueError(
                f"Model returned {len(first_row)} value(s) per sample; "
                f"expected {len(self.target_features)}"
            )
        prediction_dict = dict(zip(self.target_features, first_row))

        prediction_dict['characteristics'] = self.calculate_gemm_characteristics(
            input_data['m'], input_data['n'], input_data['k'],
            input_data['blocksize1'], input_data['blocksize2'], input_data['blocksize3']
        )

        return prediction_dict
208
+
209
def create_comparison_chart(current_metrics, optimal_metrics):
    """Build a grouped plotly bar chart comparing two sets of metrics.

    Both arguments are mappings that provide the ``runtime``, ``power``,
    ``Energy`` and ``TFlops`` entries; the first is drawn as "Current" and
    the second as "Optimal". Returns the plotly figure.
    """
    # (axis label, key in the metrics mapping) for each bar group, in display order.
    labelled_keys = (
        ('Runtime (ms)', 'runtime'),
        ('Power (W)', 'power'),
        ('Energy (J)', 'Energy'),
        ('TFLOPS', 'TFlops'),
    )
    axis_labels = [label for label, _ in labelled_keys]

    def bar_series(series_name, source, colour):
        heights = [source[key] for _, key in labelled_keys]
        return go.Bar(name=series_name, x=axis_labels, y=heights, marker_color=colour)

    chart = go.Figure(data=[
        bar_series('Current', current_metrics, '#ff7c43'),
        bar_series('Optimal', optimal_metrics, '#00ba38'),
    ])

    layout_options = {
        'barmode': 'group',
        'title': 'Performance Comparison',
        'xaxis_title': 'Metrics',
        'yaxis_title': 'Values',
        'height': 400,
    }
    chart.update_layout(**layout_options)

    return chart
239
+
240
def create_heatmap(m, n, k, block_m, block_n):
    """Build a plotly heatmap with one cell per block of the M x N output.

    The grid has ceil(m / block_m) rows and ceil(n / block_n) columns. Cell
    values are uniform random numbers in [0.5, 1.0), so the colours are
    illustrative only. ``k`` is accepted but not used.
    """
    rows = int(np.ceil(m / block_m))
    cols = int(np.ceil(n / block_n))

    # Random filler values: the figure shows the tiling, not measured data.
    tile_values = np.random.uniform(0.5, 1.0, (rows, cols))

    heatmap = go.Heatmap(z=tile_values, colorscale='Viridis', showscale=False)
    figure = go.Figure(data=heatmap)

    layout_options = {
        'title': 'Matrix Blocking Visualization',
        'xaxis_title': 'N dimension (columns)',
        'yaxis_title': 'M dimension (rows)',
        'height': 300,
        'margin': dict(l=50, r=50, t=50, b=50),
    }
    figure.update_layout(**layout_options)

    return figure
262
+
263
def create_performance_metrics_chart(predictions, max_tflops=40):
    """Create a gauge chart for the predicted TFLOPS.

    Args:
        predictions: mapping with a ``'TFlops'`` entry (the predicted value).
        max_tflops: upper end of the gauge axis. Defaults to 40, the value
            this app uses as the RTX 4070 theoretical max.

    Returns:
        A plotly figure with a single gauge whose axis is split into three
        equal red / yellow / green bands.
    """
    tflops = predictions['TFlops']
    band = max_tflops / 3

    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=tflops,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': "TFLOPS Performance"},
        gauge={
            'axis': {'range': [None, max_tflops]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, band], 'color': "red"},
                {'range': [band, 2 * band], 'color': "yellow"},
                {'range': [2 * band, max_tflops], 'color': "green"},
            ],
            # Marker line drawn at the predicted value itself.
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': tflops,
            },
        },
    ))

    fig.update_layout(height=300)
    return fig
291
+
292
def create_efficiency_chart(arithmetic_intensity, mem_bandwidth_utilization, compute_utilization):
    """Build a radar (polar) chart with three efficiency axes.

    The arithmetic intensity is rescaled by ``/ 200 * 100`` before plotting;
    the two utilization values are plotted as given. The radial axis runs
    from 0 to 100. Returns the plotly figure.
    """
    axis_names = ['Arithmetic Intensity', 'Memory BW Utilization', 'Compute Utilization']
    radial_values = [
        arithmetic_intensity/200*100,
        mem_bandwidth_utilization,
        compute_utilization,
    ]

    radar_trace = go.Scatterpolar(
        r=radial_values,
        theta=axis_names,
        fill='toself',
        name='Current Configuration',
    )

    chart = go.Figure()
    chart.add_trace(radar_trace)

    radial_axis = dict(visible=True, range=[0, 100])
    chart.update_layout(
        polar=dict(radialaxis=radial_axis),
        showlegend=False,
        height=300,
    )

    return chart
316
+
317
def main():
    """Render the Streamlit page for the GEMM performance predictor.

    Collects matrix dimensions, block sizes and kernel settings, runs the
    predictor when the button is pressed, and shows the results in three
    tabs (metrics, detailed analysis, visualizations with recommendations).
    Any exception raised while building the page is reported in the UI
    instead of propagating.
    """
    # Reference points used to express predictions as a share of peak.
    peak_tflops = 40                  # same peak the TFLOPS gauge uses
    peak_bandwidth = 504 * 1e9        # bytes per second
    compute_bound_threshold = 59      # FLOPs per byte

    st.set_page_config(page_title="GEMM Performance Predictor", layout="wide")
    st.markdown("""
        <style>
        .main {
            padding: 2rem 1rem;
            max-width: 100%;
        }
        .metric-card {
            background-color: #f0f2f6;
            padding: 1rem;
            border-radius: 0.5rem;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        </style>
        """, unsafe_allow_html=True)

    st.title("GEMM Performance Predictor for RTX 4070")

    try:
        predictor = GEMMPredictor()
        col1, col2, col3 = st.columns([1, 1, 1])

        with col1:
            st.subheader("Matrix Dimensions")
            with st.expander("Set Matrix Dimensions", expanded=True):
                m = st.number_input("M", min_value=1, value=512)
                n = st.number_input("N", min_value=1, value=512)
                k = st.number_input("K", min_value=1, value=1024)

        with col2:
            st.subheader("Block Sizes")
            with st.expander("Set Block Dimensions", expanded=True):
                blocksize1 = st.number_input("Block Size 1", min_value=1, value=512)
                blocksize2 = st.number_input("Block Size 2", min_value=1, value=128)
                blocksize3 = st.number_input("Block Size 3", min_value=1, value=512)

        with col3:
            st.subheader("Configuration")
            with st.expander("Additional Settings", expanded=True):
                layout = st.selectbox("Matrix Layout", ['nn', 'nt', 'tn', 'tt'])
                kernel_name = st.selectbox(
                    "CUTLASS Kernel",
                    [
                        'cutlass_simt_sgemm_128x128_8x2_nn_align1',
                        'cutlass_simt_sgemm_128x128_8x2_nt_align1',
                        'cutlass_simt_sgemm_128x128_8x2_tn_align1',
                        'cutlass_simt_sgemm_128x128_8x2_tt_align1'
                    ]
                )
                alpha = st.number_input("Alpha Scalar", value=1.00, step=0.25)
                beta = st.number_input("Beta Scalar", value=0.50, step=0.25)

        if st.button("Analyze Performance", use_container_width=True):
            with st.spinner("Analyzing performance..."):
                input_data = {
                    'm': m, 'n': n, 'k': k,
                    'blocksize1': blocksize1,
                    'blocksize2': blocksize2,
                    'blocksize3': blocksize3,
                    'Layout': layout,
                    'kernel_name': kernel_name,
                    'alpha': alpha,
                    'beta': beta
                }
                predictions = predictor.predict(input_data)

            characteristics = predictions['characteristics']

            # Derived figures are computed once here so every tab and the
            # recommendations read the same values.
            efficiency = (predictions['TFlops'] / peak_tflops) * 100

            # Achieved bandwidth = bytes moved / runtime. Assumes the model
            # reports runtime in milliseconds, as the UI labels it; the same
            # 0.01 floor as the displayed runtime avoids dividing by zero.
            runtime_seconds = max(0.01, predictions['runtime']) / 1000
            achieved_bandwidth = characteristics['bytes_accessed'] / runtime_seconds
            mem_bw_util = min(100, achieved_bandwidth / peak_bandwidth * 100)

            # Round up so a partially filled block still counts, matching
            # the blocking heatmap.
            grid_m = int(np.ceil(m / blocksize1))
            grid_n = int(np.ceil(n / blocksize2))

            tab1, tab2, tab3 = st.tabs(["Performance Metrics", "Detailed Analysis", "Visualizations"])

            with tab1:
                st.subheader("GEMM Characteristics")
                metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)

                with metric_col1:
                    st.metric(
                        "Arithmetic Intensity",
                        f"{characteristics['arithmetic_intensity']:.2f}",
                        f"{characteristics['bound_type'].upper()} bound"
                    )

                with metric_col2:
                    st.metric(
                        "Total FLOPS",
                        f"{characteristics['total_flops']/1e9:.2f}G",
                        "Operations"
                    )

                with metric_col3:
                    st.metric(
                        "Memory Accessed",
                        f"{characteristics['bytes_accessed']/1e6:.2f}MB",
                        "Total Data Movement"
                    )

                with metric_col4:
                    st.metric(
                        "Memory Efficiency",
                        f"{mem_bw_util:.1f}%",
                        "vs Peak Bandwidth"
                    )

                st.markdown("---")

                perf_col1, perf_col2, perf_col3, perf_col4 = st.columns(4)

                with perf_col1:
                    st.metric(
                        "Runtime",
                        f"{max(0.01, predictions['runtime']):.2f} ms",
                        "Execution Time"
                    )

                with perf_col2:
                    st.metric(
                        "Power",
                        f"{max(1.0, predictions['power']):.2f} W",
                        "Power Consumption"
                    )

                with perf_col3:
                    st.metric(
                        "Energy",
                        f"{max(0.01, predictions['Energy']):.2f} J",
                        "Total Energy"
                    )

                with perf_col4:
                    st.metric(
                        "TFLOPS",
                        f"{predictions['TFlops']:.2f}",
                        f"{efficiency:.1f}% of Peak"
                    )

            with tab2:
                st.subheader("Detailed Performance Analysis")

                col1, col2 = st.columns(2)

                with col1:
                    st.markdown("#### Matrix Configuration")
                    st.markdown(f"""
                    - Total Matrix Elements: {m*n:,}
                    - Memory Footprint: {characteristics['bytes_accessed']/1e6:.2f} MB
                    - Block Dimensions: {blocksize1}x{blocksize2}x{blocksize3}
                    - Grid Size: {grid_m}x{grid_n} blocks
                    """)

                with col2:
                    st.markdown("#### Performance Bottlenecks")
                    if characteristics['arithmetic_intensity'] > compute_bound_threshold:
                        st.success("✅ Compute Bound - Optimal for GPU")
                    else:
                        st.warning("⚠️ Memory Bound - Consider Optimization")

                    if efficiency < 30:
                        st.error("🔴 Low Compute Efficiency - Check Configuration")
                    elif efficiency < 60:
                        st.warning("🟡 Moderate Efficiency - Room for Improvement")
                    else:
                        st.success("🟢 Good Efficiency")

            with tab3:
                st.subheader("Performance Visualizations")

                viz_col1, viz_col2 = st.columns(2)

                with viz_col1:
                    st.plotly_chart(create_performance_metrics_chart(predictions), use_container_width=True)

                with viz_col2:
                    compute_util = min(100, efficiency)
                    st.plotly_chart(
                        create_efficiency_chart(
                            characteristics['arithmetic_intensity'],
                            mem_bw_util,
                            compute_util
                        ),
                        use_container_width=True
                    )

                st.plotly_chart(create_heatmap(m, n, k, blocksize1, blocksize2), use_container_width=True)

                st.markdown("### Recommendations")

                recommendations = []
                if blocksize1 * blocksize2 > 1024:
                    recommendations.append("⚠️ Block size might be too large for optimal occupancy")
                if characteristics['arithmetic_intensity'] < 30:
                    recommendations.append("Consider increasing arithmetic intensity through blocking")
                if efficiency < 50:
                    recommendations.append("Performance is below 50% of peak - try different block sizes")

                if recommendations:
                    for rec in recommendations:
                        st.markdown(f"- {rec}")
                else:
                    st.success("Current configuration appears optimal!")

    except Exception as e:
        # Top-level UI boundary: show the failure on the page rather than a
        # stack trace. The hint names the file GEMMPredictor loads by default.
        st.error(f"An error occurred: {str(e)}")
        st.write("Please make sure the model file 'model.joblib' is in the correct directory.")
        st.write("If the error persists, check the input parameters and model compatibility.")


if __name__ == "__main__":
    main()
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0515756fcee4d66c50911757d1956682e7ea023f1f8bd92a15dbdbc49835f08a
3
+ size 2759586
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ scikit-learn
4
+ matplotlib
5
+ seaborn
6
+ joblib
7
+ streamlit
8
+ plotly