
----------------------------------------------Summary-----------------------------------------------
batch_size_per_gpu: 8
max_batch_size_per_gpu: 9
gradient_accumulation_steps: 15625
global_batch_size: 4000000
dp_size: 32
tp_size: 8
pp_size: 8
sp_size: 8
ds_zero: NONE
total_num_gpus: 2048
seq_len: 2048
total_num_tokens: 1.4 T
num_params_total: 64.69 G
activation_recomputation: SELECTIVE
achived_flops: 156.0
flops_efficiency: 0.5
hbm_memory_efficiency: 0.9
num_flops_total_per_micro_batch: 6710.8 T
weight_memory_per_gpu: 2.08 GB
gradient_memory_per_gpu: 2.01 GB
optimizer_state_memory_per_gpu: 16.11 GB
activation_memory_bs1: 6.21 GB
activation_memory_per_gpu: 49.66 GB
latency_per_micro_batch: 672.16 ms
latency_fwd: 261.04 ms
latency_fwd_attn: 79.29 ms
latency_fwd_mlp: 140.96 ms
latency_fwd_layernorm: 731.39 us
latency_fwd_tp_comm: 31.32 ms
latency_fwd_input_embedding: 1.85 ms
latency_fwd_output_embedding_loss: 6.88 ms
latency_per_iter: 2.92 hours
total_training_latency: 20.66 days
gpu_hours: 1015701
----------------------------------------------------------------------------------------------------
