
----------------------------------------------Summary-----------------------------------------------
batch_size_per_gpu: 4
max_batch_size_per_gpu: 5
gradient_accumulation_steps: 96
global_batch_size: 1536
dp_size: 4
tp_size: 8
pp_size: 12
sp_size: 1
ds_zero: NONE
total_num_gpus: 384
seq_len: 2048
total_num_tokens: 300.0 G
num_params_total: 174.56 G
activation_recomputation: FULL
achived_flops: 153.0
flops_efficiency: 0.49038461538461536
hbm_memory_efficiency: 1
num_flops_total_per_micro_batch: 11756.88 T
weight_memory_per_gpu: 3.78 GB
gradient_memory_per_gpu: 3.62 GB
optimizer_state_memory_per_gpu: 28.99 GB
activation_memory_bs1: 603.98 MB
activation_memory_per_gpu: 2.42 GB
latency_per_micro_batch: 800.44 ms
latency_fwd: 232.59 ms
latency_fwd_attn: 70.07 ms
latency_fwd_mlp: 129.35 ms
latency_fwd_layernorm: 4.14 ms
latency_fwd_tp_comm: 18.79 ms
latency_fwd_input_embedding: 1.97 ms
latency_fwd_output_embedding_loss: 8.27 ms
latency_per_iter: 1.28 minutes
total_training_latency: 84.82 days
gpu_hours: 781677
----------------------------------------------------------------------------------------------------
