ma7583 committed on
Commit
1b94cd6
·
verified ·
1 Parent(s): f16abf0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
def estimate_transformer_stats(batch_size, seq_len, num_layers, hidden_dim, vocab_size, show_breakdown):
    """Estimate the parameter count and training FLOPs of a Transformer.

    The estimate uses the approximations spelled out in the returned text:
    ``12 * L * D^2 + D * V`` parameters, and a per-layer forward cost made of
    the QKV projections, the QK^T score product, the attention output
    projection, a feed-forward network with a 4x expansion, and the logit
    projection amortised evenly over the layers.

    Args:
        batch_size: Number of sequences per batch (B).
        seq_len: Tokens per sequence (S).
        num_layers: Number of Transformer layers (L).
        hidden_dim: Model width (D).
        vocab_size: Vocabulary size (V).
        show_breakdown: When true, append per-component training totals.

    Numeric arguments may arrive as floats (for example from a UI number
    widget); they are converted to ``int`` so the formulas print as
    ``24`` rather than ``24.0``.

    Returns:
        A multi-line string with the formulas, the substituted values and the
        resulting estimates in scientific notation.

    Raises:
        ValueError: If a numeric argument is missing, not a whole number, or
            not strictly positive.
    """
    raw_values = {
        "batch_size": batch_size,
        "seq_len": seq_len,
        "num_layers": num_layers,
        "hidden_dim": hidden_dim,
        "vocab_size": vocab_size,
    }
    dims = {}
    for name, value in raw_values.items():
        try:
            as_int = int(value)
        except (TypeError, ValueError, OverflowError):
            raise ValueError(f"{name} must be a positive integer, got {value!r}") from None
        # Reject fractional values (2.5) and non-positive ones; num_layers == 0
        # would otherwise surface as a bare ZeroDivisionError further down.
        if as_int != value or as_int <= 0:
            raise ValueError(f"{name} must be a positive integer, got {value!r}")
        dims[name] = as_int

    B = dims["batch_size"]
    S = dims["seq_len"]
    L = dims["num_layers"]
    D = dims["hidden_dim"]
    V = dims["vocab_size"]

    # --- Parameters ---
    num_params = L * 12 * (D ** 2) + D * V

    # --- Forward FLOPs per layer, per sequence (2 * m * n * p per matmul) ---
    attn_proj_flops = 2 * 3 * S * D * D
    # NOTE(review): only the QK^T product is counted here; the
    # attention-weights @ V product is not included - confirm that the
    # omission is intended.
    attn_score_flops = 2 * S * D * S
    attn_out_proj_flops = 2 * S * D * D
    ffn_flops = 2 * 2 * S * D * 4 * D
    # The logit projection happens once per model; dividing by L spreads it
    # over the layers so that multiplying by L below restores the full cost.
    logit_flops = 2 * S * D * V / L

    total_layer_flops = attn_proj_flops + attn_score_flops + attn_out_proj_flops + ffn_flops + logit_flops

    # The per-matmul factor of 2 is already included above, so training only
    # adds the backward pass (about twice the forward cost): 3x forward.
    # Multiplying by 6 here would count the multiply-accumulate factor twice.
    train_multiplier = 3
    total_flops = train_multiplier * B * L * total_layer_flops

    output_lines = [
        f"Parameters: P = 12 * L * D^2 + D * V",
        f" = 12 * {L} * {D}^2 + {D} * {V} = {num_params:.2e}",
        f"",
        f"FLOPs per layer (per sequence):",
        f" Attention Projections (QKV): 2 * 3 * S * D^2 = 2 * 3 * {S} * {D}^2 = {attn_proj_flops:.2e}",
        f" Attention Scores (QKᵀ): 2 * S * D * S = 2 * {S} * {D} * {S} = {attn_score_flops:.2e}",
        f" Attention Output Proj: 2 * S * D^2 = 2 * {S} * {D}^2 = {attn_out_proj_flops:.2e}",
        f" Feedforward Network: 2 * 2 * S * D * 4D = 2*2*{S}*{D}*{4*D} = {ffn_flops:.2e}",
        f" Logits: 2 * S * D * V / L = 2*{S}*{D}*{V} / {L} = {logit_flops:.2e}",
        f"",
        f"Layer Total FLOPs = {total_layer_flops:.2e}",
        f"",
        f"Total Training FLOPs = {train_multiplier} * B * L * Layer_FLOPs",
        f" = {train_multiplier} * {B} * {L} * {total_layer_flops:.2e} = {total_flops:.2e}",
    ]

    if show_breakdown:
        # Same scaling as the grand total so the components add up to it.
        scale = train_multiplier * B * L
        output_lines.append("\nComponent-wise totals across training batch:")
        output_lines.append(f" - QKV Projections: {attn_proj_flops * scale:.2e}")
        output_lines.append(f" - Attention Scores: {attn_score_flops * scale:.2e}")
        output_lines.append(f" - Attention Output: {attn_out_proj_flops * scale:.2e}")
        output_lines.append(f" - FFN: {ffn_flops * scale:.2e}")
        output_lines.append(f" - Logits: {logit_flops * scale:.2e}")

    return "\n".join(output_lines)
49
+
50
# Gradio front end: five integer inputs plus a checkbox, rendered to a text box.
# precision=0 makes each Number round to the nearest integer and hand the
# callback an int, since every numeric input here is a count.
iface = gr.Interface(
    fn=estimate_transformer_stats,
    inputs=[
        gr.Number(label="Batch Size", value=32, precision=0),
        gr.Number(label="Sequence Length", value=2048, precision=0),
        gr.Number(label="Number of Layers", value=24, precision=0),
        gr.Number(label="Hidden Size (d_model)", value=2048, precision=0),
        gr.Number(label="Vocabulary Size", value=50272, precision=0),
        gr.Checkbox(label="Show FLOPs Breakdown", value=True),
    ],
    outputs=gr.Textbox(label="Estimates"),
    title="Transformer Parameter and FLOPs Estimator",
    description="Estimates parameter count and training FLOPs for decoder-only Transformers (like OPT/GPT). Shows formulas and per-component breakdown.",
)

# Start the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()