# comma-v0.1-2t-checkpoints / config.hq_cd.yaml
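# Training configuration for the comma-v0.1 7B run on the Common Pile "hq_cd"
# mixture. The layout follows a Lingua-style train config (the run name and
# dump_dir both reference lingua); "128N" presumably refers to 128 nodes.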
name: prod_lingua_7B_2T_lin_hq_cd_128N
dump_dir: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N
seed: 777
grad_acc_steps: 1
gc_collect_freq: 1000
probe_freq: null
steps: 239000
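# Rough token budget, assuming each step consumes batch_size * seq_len tokens
# per data-parallel rank: 239000 steps * 4 * 4096 * 512 replicas (dp_replicate
# below) ~= 2.0e12 tokens, consistent with the "2T" in the run name.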
data:
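  # Relative sampling weights for the Common Pile sources; the values below
  # sum to ~1.0. Text is tokenized with the custom common-pile tiktoken
  # tokenizer referenced at the bottom of this block.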
  root_dir: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked
  sources:
    stackv2_edu: 0.170363004622478
    doab: 0.160392905980941
    wikimedia: 0.15321084473113
    stackexchange: 0.149577338855193
    peS2o: 0.121808388328754
    cccc: 0.116267077616518
    arxiv_papers: 0.0649775722614287
    data_provenance_initiative: 0.0455351865398593
    pressbooks: 0.00768988231013495
    libretexts: 0.00481335361292812
    news: 0.00331307394000609
    foodista: 0.00111435553323384
    oercommons: 0.000692183554680895
    python_enhancement_proposals: 0.000151098755264962
    public_domain_review: 9.37333574480524e-05
  batch_size: 4
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  tokenizer:
    name: tiktoken
    path: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked/tokenizer/common-pile-tokenizer.tiktoken
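# Optimizer hyperparameters (the beta1/beta2/epsilon/weight_decay fields
# suggest AdamW, Lingua's default) with gradient clipping at 1.0 and a linear
# learning-rate decay schedule.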
optim:
  lr: 0.002
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: linear
  warmup: -324821
  lr_min_ratio: 0.0
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5
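# Roughly 7B-parameter Llama-style decoder (matching the "7B" in the run
# name): 32 layers, model dim 4096, 32 attention heads (head_dim: null
# presumably resolves to dim / n_heads = 128), RoPE theta 1e5, 4096-token
# context, and a 64256-entry vocabulary.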
model:
  dim: 4096
  n_layers: 32
  head_dim: null
  n_heads: 32
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  init_std_factor: disabled
  max_seqlen: 4096
  seed: 42
  vocab_size: 64256
  weight_tying: false
  sliding_window: null
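# 512-way data parallelism by replication (dp_replicate: 512, dp_shard: 1, no
# tensor parallelism), consistent with 128 nodes at 4 GPUs per node; bf16
# model weights, compilation enabled, and no float8 recipe.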
distributed:
  dp_shard: 1
  dp_replicate: 512
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver
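# Environment variables exported to worker processes: threading limits plus
# NCCL / torch.distributed debugging and robustness settings.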
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'
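# Checkpoints are dumped every 1000 steps (keep: -1 presumably meaning keep
# all); eval checkpoints every 900 steps, retaining the most recent 11.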
checkpoint:
  dump:
    every: 1000
    keep: -1
  eval:
    every: 900
    keep: 11
  path: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false
  ignore_data_loader_state: true
  ignore_lr_scheduler_state: true
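# Profiler settings; profiling is disabled for this run (run: false).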
profiling:
  run: false
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4
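# Metrics are logged every step; wandb logging, async eval GPUs, and the
# in-loop eval harness are all left unset (null) here and below.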
logging:
  freq: 1
  acc_freq: null
  wandb: null
async_eval_gpus: null
eval: null