---
# Training-run configuration for `prod_lingua_7B_2T_lin_hq_cd_128N`.
#
# NOTE(review): the nesting below was reconstructed from key order and from
# the repeated keys (`seed`, `name`, `path`, `every`, `keep`, `eval`), which
# cannot all live in one flat mapping. Confirm it against the consumer's
# schema -- in particular that `tokenizer` belongs under `data` and that the
# two `ignore_*` flags belong under `checkpoint`.

name: prod_lingua_7B_2T_lin_hq_cd_128N
dump_dir: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N
# Run-level seed; `data.seed` and `model.seed` are set separately below.
seed: 777
grad_acc_steps: 1
gc_collect_freq: 1000
probe_freq: null
steps: 239000

data:
  root_dir: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked
  # Per-source weights (presumably sampling proportions -- confirm).
  # The 15 values sum to 1.0.
  sources:
    stackv2_edu: 0.170363004622478
    doab: 0.160392905980941
    wikimedia: 0.15321084473113
    stackexchange: 0.149577338855193
    peS2o: 0.121808388328754
    cccc: 0.116267077616518
    arxiv_papers: 0.0649775722614287
    data_provenance_initiative: 0.0455351865398593
    pressbooks: 0.00768988231013495
    libretexts: 0.00481335361292812
    news: 0.00331307394000609
    foodista: 0.00111435553323384
    oercommons: 0.000692183554680895
    python_enhancement_proposals: 0.000151098755264962
    public_domain_review: 9.37333574480524e-05
  batch_size: 4
  # Same value as `model.max_seqlen`.
  seq_len: 4096
  n_views: 2
  seed: 42
  add_bos: true
  add_eos: true
  load_async: true
  prefetch_size: 4096
  tokenizer:
    name: tiktoken
    path: /p/vast1/pretrain/datasets/common_pile/common-pile-chunked/tokenizer/common-pile-tokenizer.tiktoken

optim:
  lr: 0.002
  weight_decay: 0.2
  epsilon: 1.0e-08
  beta1: 0.9
  beta2: 0.95
  clip: 1.0
  scheduler: linear
  # NOTE(review): a negative warmup is unusual. Presumably intentional, so
  # that the linear schedule starts part-way through its decay for this
  # cooldown run (see `checkpoint.ignore_lr_scheduler_state`) -- confirm.
  warmup: -324821
  lr_min_ratio: 0.0
  cycle_length: 1.0
  cosine_theta: 1.0
  annealing_step: 1000
  decay_fraction: 0.1
  exp_factor: 0.5

model:
  dim: 4096
  n_layers: 32
  head_dim: null
  n_heads: 32
  n_kv_heads: null
  ffn_dim_multiplier: 1.0
  multiple_of: 256
  norm_eps: 1.0e-05
  rope_theta: 100000.0
  init_base_std: null
  init_std_factor: disabled
  max_seqlen: 4096
  seed: 42
  vocab_size: 64256
  weight_tying: false
  sliding_window: null

distributed:
  # NOTE(review): `dp_shard` is 1 while `fsdp_type` is `full_shard`; confirm
  # that pure replication across `dp_replicate` ranks is intended.
  dp_shard: 1
  dp_replicate: 512
  tp_size: 1
  selective_activation_checkpointing: false
  compile: true
  fsdp_type: full_shard
  model_dtype: bf16
  float8_recipe: null
  # Regular expression; kept as a plain scalar so backslashes stay literal.
  float8_filter: layers\.[0-9]+\.
  matmul_allow_tf32: false
  detect_anomaly: false
  compile_cache_size_limit: 8
  spawn_method: forkserver

# Environment variables. Numeric-looking values are quoted so that they load
# as strings rather than integers.
env:
  MKL_SERVICE_FORCE_INTEL: GNU
  OMP_NUM_THREADS: '1'
  MKL_NUM_THREADS: '1'
  ENABLE_INTRA_NODE_COMM: '1'
  TORCH_NCCL_AVOID_RECORD_STREAMS: '1'
  NCCL_IB_TIMEOUT: '22'
  NCCL_DEBUG: INFO
  TORCH_NCCL_ASYNC_ERROR_HANDLING: '1'

checkpoint:
  dump:
    every: 1000
    # NOTE(review): presumably -1 means "keep every checkpoint" -- confirm.
    keep: -1
  eval:
    every: 900
    keep: 11
  path: /p/lustre5/kirchenb/common-pile-root/lingua/output/prod_lingua_7B_2T_lin_hq_cd_128N/checkpoints
  init_ckpt_path: null
  continue_training_from_init: false
  ignore_data_loader_state: true
  ignore_lr_scheduler_state: true

profiling:
  run: false
  trace_folder: profiling
  mem_warmup: 0
  mem_steps: 4
  profile_warmup: 100
  profile_steps: 4

logging:
  freq: 1
  acc_freq: null
  wandb: null

async_eval_gpus: null
# Top-level `eval` is a separate key from `checkpoint.eval` above.
eval: null