jwkirchenbauer committed
Commit 9004b7b · verified · 1 Parent(s): a4a17d6

Upload folder using huggingface_hub

Files changed (5)
  1. .gitattributes +1 -0
  2. config.hq_cd.yaml +116 -0
  3. config.yaml +131 -0
  4. metrics.hq_cd.jsonl +0 -0
  5. metrics.jsonl +3 -0
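
The commit message indicates these five files were pushed with huggingface_hub's upload_folder. A minimal sketch of such a call follows; the repo_id and local folder path are illustrative placeholders, not values taken from this commit.

# Minimal sketch (assumed, not from this repo): pushing a local run folder
# with huggingface_hub. Only upload_folder itself is implied by the commit
# message; repo_id and folder_path are hypothetical placeholders.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path="output/prod_lingua_7B_2T_lin_hq_cd_128N",  # local dump dir (placeholder)
    repo_id="some-org/some-model",                           # placeholder repo id
    commit_message="Upload folder using huggingface_hub",
)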
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ metrics.jsonl filter=lfs diff=lfs merge=lfs -text
config.hq_cd.yaml ADDED
@@ -0,0 +1,116 @@
+ name: prod_lingua_7B_2T_lin_hq_cd_128N
+ dump_dir: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N
+ seed: 777
+ grad_acc_steps: 1
+ gc_collect_freq: 1000
+ probe_freq: null
+ steps: 239000
+ data:
+   root_dir: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked
+   sources:
+     stackv2_edu: 0.170363004622478
+     doab: 0.160392905980941
+     wikimedia: 0.15321084473113
+     stackexchange: 0.149577338855193
+     peS2o: 0.121808388328754
+     cccc: 0.116267077616518
+     arxiv_papers: 0.0649775722614287
+     data_provenance_initiative: 0.0455351865398593
+     pressbooks: 0.00768988231013495
+     libretexts: 0.00481335361292812
+     news: 0.00331307394000609
+     foodista: 0.00111435553323384
+     oercommons: 0.000692183554680895
+     python_enhancement_proposals: 0.000151098755264962
+     public_domain_review: 9.37333574480524e-05
+   batch_size: 4
+   seq_len: 4096
+   n_views: 2
+   seed: 42
+   add_bos: true
+   add_eos: true
+   load_async: true
+   prefetch_size: 4096
+   tokenizer:
+     name: tiktoken
+     path: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked/tokenizer/common-pile-tokenizer.tiktoken
+ optim:
+   lr: 0.002
+   weight_decay: 0.2
+   epsilon: 1.0e-08
+   beta1: 0.9
+   beta2: 0.95
+   clip: 1.0
+   scheduler: linear
+   warmup: -324821
+   lr_min_ratio: 0.0
+   cycle_length: 1.0
+   cosine_theta: 1.0
+   annealing_step: 1000
+   decay_fraction: 0.1
+   exp_factor: 0.5
+ model:
+   dim: 4096
+   n_layers: 32
+   head_dim: null
+   n_heads: 32
+   n_kv_heads: null
+   ffn_dim_multiplier: 1.0
+   multiple_of: 256
+   norm_eps: 1.0e-05
+   rope_theta: 100000.0
+   init_base_std: null
+   init_std_factor: disabled
+   max_seqlen: 4096
+   seed: 42
+   vocab_size: 64256
+   weight_tying: false
+   sliding_window: null
+ distributed:
+   dp_shard: 1
+   dp_replicate: 512
+   tp_size: 1
+   selective_activation_checkpointing: false
+   compile: true
+   fsdp_type: full_shard
+   model_dtype: bf16
+   float8_recipe: null
+   float8_filter: layers\.[0-9]+\.
+   matmul_allow_tf32: false
+   detect_anomaly: false
+   compile_cache_size_limit: 8
+   spawn_method: forkserver
+ env:
+   MKL_SERVICE_FORCE_INTEL: GNU
+   OMP_NUM_THREADS: '1'
+   MKL_NUM_THREADS: '1'
+   ENABLE_INTRA_NODE_COMM: '1'
+   TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
+   NCCL_IB_TIMEOUT: '22'
+   NCCL_DEBUG: INFO
+   TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
+ checkpoint:
+   dump:
+     every: 1000
+     keep: -1
+   eval:
+     every: 900
+     keep: 11
+   path: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N/checkpoints
+   init_ckpt_path: null
+   continue_training_from_init: false
+   ignore_data_loader_state: true
+   ignore_lr_scheduler_state: true
+ profiling:
+   run: false
+   trace_folder: profiling
+   mem_warmup: 0
+   mem_steps: 4
+   profile_warmup: 100
+   profile_steps: 4
+ logging:
+   freq: 1
+   acc_freq: null
+   wandb: null
+ async_eval_gpus: null
+ eval: null
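
A quick consistency check on the run name (7B model, roughly 2T tokens): each optimizer step consumes batch_size * seq_len tokens per data-parallel rank, and this config sets dp_replicate: 512 with grad_acc_steps: 1. A small arithmetic sketch, assuming every replica draws distinct data each step:

# Token budget implied by config.hq_cd.yaml, assuming all 512
# data-parallel replicas contribute fresh tokens every step
# (grad_acc_steps is 1, so no extra factor).
batch_size = 4
seq_len = 4096
dp_replicate = 512
steps = 239_000

tokens_per_step = batch_size * seq_len * dp_replicate  # 8,388,608
total_tokens = tokens_per_step * steps                 # about 2.0e12
print(f"{total_tokens / 1e12:.2f}T tokens")            # ~2.00T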
config.yaml ADDED
@@ -0,0 +1,131 @@
+ name: prod_lingua_7B_2T_128N
+ dump_dir: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_128N
+ seed: 777
+ grad_acc_steps: 1
+ gc_collect_freq: 1000
+ probe_freq: null
+ steps: 250000
+ data:
+   root_dir: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked
+   sources:
+     peS2o: 0.274065475510351
+     stackexchange: 0.134617935796937
+     stackv2_edu: 0.127770669195666
+     cccc: 0.0871992270000557
+     wikimedia: 0.0861800315862719
+     github_archive: 0.0606452345122248
+     uspto: 0.0413469377516883
+     pubmed: 0.0367902799837971
+     arxiv_papers: 0.0292395449667613
+     caselaw_access_project: 0.0193875362722656
+     wikiteam: 0.0137485410839637
+     doab: 0.0180439781895451
+     uk_hansard: 0.0144498535570883
+     pre_1929_books: 0.0115755547988338
+     ubuntu_irc: 0.00794254267719456
+     regulations: 0.00762583706405442
+     data_provenance_initiative: 0.00512264496834867
+     project_gutenberg: 0.00502100654070129
+     youtube: 0.00465917165839394
+     arxiv_abstracts: 0.00359635066160403
+     stackv2_html: 0.00225924255952781
+     usgpo: 0.00226024581728848
+     library_of_congress: 0.00222469340783564
+     biodiversity_heritage_library: 0.00221737524370278
+     pressbooks: 0.000865101033213598
+     libretexts: 0.00054149556727006
+     news: 0.000372716196818104
+     foodista: 0.000125363443065615
+     oercommons: 7.78696843693821e-05
+     python_enhancement_proposals: 1.69983991984805e-05
+     public_domain_review: 1.05448719635173e-05
+   batch_size: 4
+   seq_len: 4096
+   n_views: 2
+   seed: 42
+   add_bos: true
+   add_eos: true
+   load_async: true
+   prefetch_size: 4096
+   tokenizer:
+     name: tiktoken
+     path: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked/tokenizer/common-pile-tokenizer.tiktoken
+ optim:
+   lr: 0.002
+   weight_decay: 0.2
+   epsilon: 1.0e-08
+   beta1: 0.9
+   beta2: 0.95
+   clip: 1.0
+   scheduler: cosine
+   warmup: 2000
+   lr_min_ratio: 1.0e-06
+   cycle_length: 1.0
+   cosine_theta: 1.0
+   annealing_step: 1000
+   decay_fraction: 0.1
+   exp_factor: 0.5
+ model:
+   dim: 4096
+   n_layers: 32
+   head_dim: null
+   n_heads: 32
+   n_kv_heads: null
+   ffn_dim_multiplier: 1.0
+   multiple_of: 256
+   norm_eps: 1.0e-05
+   rope_theta: 100000.0
+   init_base_std: null
+   init_std_factor: disabled
+   max_seqlen: 4096
+   seed: 42
+   vocab_size: 64256
+   weight_tying: false
+   sliding_window: null
+ distributed:
+   dp_shard: 1
+   dp_replicate: 512
+   tp_size: 1
+   selective_activation_checkpointing: false
+   compile: true
+   fsdp_type: full_shard
+   model_dtype: bf16
+   float8_recipe: null
+   float8_filter: layers\.[0-9]+\.
+   matmul_allow_tf32: false
+   detect_anomaly: false
+   compile_cache_size_limit: 8
+   spawn_method: forkserver
+ env:
+   MKL_SERVICE_FORCE_INTEL: GNU
+   OMP_NUM_THREADS: '1'
+   MKL_NUM_THREADS: '1'
+   ENABLE_INTRA_NODE_COMM: '1'
+   TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
+   NCCL_IB_TIMEOUT: '22'
+   NCCL_DEBUG: INFO
+   TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
+ checkpoint:
+   dump:
+     every: 10000
+     keep: -1
+   eval:
+     every: 2000
+     keep: 3
+   path: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_128N/checkpoints
+   init_ckpt_path: null
+   continue_training_from_init: false
+   ignore_data_loader_state: false
+ profiling:
+   run: false
+   trace_folder: profiling
+   mem_warmup: 0
+   mem_steps: 4
+   profile_warmup: 100
+   profile_steps: 4
+ logging:
+   freq: 1
+   acc_freq: null
+   wandb: null
+ async_eval_gpus: null
+ eval: null
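
The two configs share the same 7B model and distributed setup but differ in the data source mixture, the step count (250000 here vs. 239000), and the optimizer schedule (cosine with warmup: 2000 here vs. linear in config.hq_cd.yaml). A short sketch that lists every key on which the two uploaded files disagree, assuming only PyYAML and that both files sit in the working directory:

# Sketch: surface the keys on which the two uploaded configs disagree.
# Uses only PyYAML; file names match the ones added in this commit.
import yaml

def flatten(d, prefix=""):
    # Recursively flatten nested dicts into dotted keys.
    out = {}
    for k, v in d.items():
        key = f"{prefix}{k}"
        if isinstance(v, dict):
            out.update(flatten(v, key + "."))
        else:
            out[key] = v
    return out

base = flatten(yaml.safe_load(open("config.yaml")))
hq_cd = flatten(yaml.safe_load(open("config.hq_cd.yaml")))

for key in sorted(set(base) | set(hq_cd)):
    if base.get(key) != hq_cd.get(key):
        print(f"{key}: {base.get(key)!r} -> {hq_cd.get(key)!r}")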
metrics.hq_cd.jsonl ADDED
The diff for this file is too large to render.
 
metrics.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6494d76109eb6ee1c0564dc902a8bb8f711e599afb545e9f49f28628fc00328a
+ size 225744751
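
The metrics files are .jsonl logs; metrics.jsonl is tracked through Git LFS, so the three lines above are only the LFS pointer (the real file is roughly 225 MB, per the size field). A minimal reading sketch once the actual file has been fetched, assuming each line is one JSON object per logged step; the field names used are illustrative, not confirmed from this log.

# Minimal sketch: stream a .jsonl training log after the real file has
# been pulled (e.g. via git lfs or hf_hub_download). Each line is assumed
# to be one JSON object; the field names below are illustrative only.
import json

with open("metrics.jsonl") as f:
    for line in f:
        record = json.loads(line)
        # Inspect record.keys() to see what this particular log contains.
        step = record.get("global_step")
        loss = record.get("loss")
        if step is not None and step % 1000 == 0:
            print(step, loss)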