
----------------------------------------------Summary-----------------------------------------------
batch_size_per_gpu: 1
seq_len: 512
tp_size: 1
pp_size: 1
num_tokens_to_generate: 32
use_kv_cache: True
kv_cache_memory_per_gpu: 1.43 GB
kv_cache_latency: 1.4 ms
layernorm_dtype_bytes: 2
embedding_memory_per_gpu: 524.29 MB
weight_memory_per_gpu: 32.74 GB
prefill_max_batch_size_per_gpu: 2
prefill_activation_memory_per_gpu: 16.11 GB
prefill_num_flops_fwd_total: 66.93 T
decode_max_batch_size_per_gpu: 32
decode_activation_memory_per_gpu: 20.99 MB
decode_num_flops_fwd_total: 129.38 G
prefill_latency: 216.08 ms
prefill_latency_fwd_attn: 72.68 ms
prefill_latency_fwd_mlp: 140.96 ms
prefill_latency_fwd_layernorm: 1.32 ms
prefill_latency_fwd_tp_comm: 0.0 us
prefill_latency_fwd_input_embedding: 257.13 us
prefill_latency_fwd_output_embedding_loss: 860.37 us
decode_latency: 17.47 ms
decode_latency_fwd_attn: 5.27 ms
decode_latency_fwd_mlp: 10.54 ms
decode_latency_fwd_layernorm: 2.57 us
decode_latency_fwd_tp_comm: 0.0 us
decode_latency_fwd_input_embedding: 257.13 us
decode_latency_fwd_output_embedding_loss: 1.68 us
total_decode_latency: 558.95 ms
total_latency: 775.03 ms
total_per_token_latency: 24.22 ms
----------------------------------------------------------------------------------------------------
