
----------------------------------------------Summary-----------------------------------------------
batch_size_per_gpu: 1
max_batch_size_per_gpu: 1
gradient_accumulation_steps: 560
global_batch_size: 2240
dp_size: 4
tp_size: 8
pp_size: 35
sp_size: 1
ds_zero: NONE
total_num_gpus: 1120
seq_len: 2048
total_num_tokens: 300.0 G
num_params_total: 529.51 G
activation_recomputation: FULL
achieved_flops: 167.0
flops_efficiency: 0.5352564102564102
hbm_memory_efficiency: 1
num_flops_total_per_micro_batch: 8819.83 T
weight_memory_per_gpu: 4.03 GB
gradient_memory_per_gpu: 3.77 GB
optimizer_state_memory_per_gpu: 30.2 GB
activation_memory_bs1: 1.1 GB
activation_memory_per_gpu: 1.1 GB
latency_per_micro_batch: 188.62 ms
latency_fwd: 55.62 ms
latency_fwd_attn: 16.2 ms
latency_fwd_mlp: 30.86 ms
latency_fwd_layernorm: 647.35 us
latency_fwd_tp_comm: 2.94 ms
latency_fwd_input_embedding: 1.81 ms
latency_fwd_output_embedding_loss: 3.16 ms
latency_per_iter: 1.76 minutes
total_training_latency: 79.95 days
gpu_hours: 2148955
----------------------------------------------------------------------------------------------------
