
----------------------------------------------Summary-----------------------------------------------
batch_size_per_gpu: 1
seq_len: 512
tp_size: 2
pp_size: 1
num_tokens_to_generate: 32
use_kv_cache: True
kv_cache_memory_per_gpu: 713.03 MB
kv_cache_latency: 777.1 us
layernorm_dtype_bytes: 2
embedding_memory_per_gpu: 262.14 MB
weight_memory_per_gpu: 64.69 GB
prefill_max_batch_size_per_gpu: 1
prefill_activation_memory_per_gpu: 9.4 GB
prefill_num_flops_fwd_total: 66.93 T
decode_max_batch_size_per_gpu: 21
decode_activation_memory_per_gpu: 13.12 MB
decode_num_flops_fwd_total: 129.38 G
prefill_latency: 159.47 ms
prefill_latency_fwd_attn: 51.92 ms
prefill_latency_fwd_mlp: 100.69 ms
prefill_latency_fwd_layernorm: 1.46 ms
prefill_latency_fwd_tp_comm: 4.47 ms
prefill_latency_fwd_input_embedding: 313.66 us
prefill_latency_fwd_output_embedding_loss: 614.55 us
decode_latency: 37.47 ms
decode_latency_fwd_attn: 11.7 ms
decode_latency_fwd_mlp: 23.41 ms
decode_latency_fwd_layernorm: 2.86 us
decode_latency_fwd_tp_comm: 1.28 ms
decode_latency_fwd_input_embedding: 293.7 us
decode_latency_fwd_output_embedding_loss: 1.2 us
total_decode_latency: 1.2 s
total_latency: 1.36 s
total_per_token_latency: 42.45 ms
----------------------------------------------------------------------------------------------------
