
----------------------------------------------Summary-----------------------------------------------
batch_size_per_gpu: 1
seq_len: 512
tp_size: 1
pp_size: 1
num_tokens_to_generate: 32
use_kv_cache: True
kv_cache_memory_per_gpu: 111.41 MB
kv_cache_latency: 109.28 us
layernorm_dtype_bytes: 2
embedding_memory_per_gpu: 327.68 MB
weight_memory_per_gpu: 6.62 GB
prefill_max_batch_size_per_gpu: 46
prefill_activation_memory_per_gpu: 1.57 GB
prefill_num_flops_fwd_total: 13.27 T
decode_max_batch_size_per_gpu: 645
decode_activation_memory_per_gpu: 2.25 MB
decode_num_flops_fwd_total: 25.49 G
prefill_latency: 11.2 ms
prefill_latency_fwd_attn: 3.61 ms
prefill_latency_fwd_mlp: 6.88 ms
prefill_latency_fwd_layernorm: 411.41 us
prefill_latency_fwd_tp_comm: 0.0 us
prefill_latency_fwd_input_embedding: 160.71 us
prefill_latency_fwd_output_embedding_loss: 134.43 us
decode_latency: 3.36 ms
decode_latency_fwd_attn: 1.03 ms
decode_latency_fwd_mlp: 2.06 ms
decode_latency_fwd_layernorm: 0.8 us
decode_latency_fwd_tp_comm: 0.0 us
decode_latency_fwd_input_embedding: 160.71 us
decode_latency_fwd_output_embedding_loss: 0.26 us
total_decode_latency: 107.43 ms
total_latency: 118.64 ms
total_per_token_latency: 3.71 ms
----------------------------------------------------------------------------------------------------
