# HELP python_gc_objects_collected_total Objects collected during gc
# TYPE python_gc_objects_collected_total counter
python_gc_objects_collected_total{generation="0"} 66525.0
python_gc_objects_collected_total{generation="1"} 18311.0
python_gc_objects_collected_total{generation="2"} 1431.0
# HELP python_gc_objects_uncollectable_total Uncollectable objects found during GC
# TYPE python_gc_objects_uncollectable_total counter
python_gc_objects_uncollectable_total{generation="0"} 0.0
python_gc_objects_uncollectable_total{generation="1"} 0.0
python_gc_objects_uncollectable_total{generation="2"} 0.0
# HELP python_gc_collections_total Number of times this generation was collected
# TYPE python_gc_collections_total counter
python_gc_collections_total{generation="0"} 1082.0
python_gc_collections_total{generation="1"} 96.0
python_gc_collections_total{generation="2"} 78.0
# HELP python_info Python platform information
# TYPE python_info gauge
python_info{implementation="CPython",major="3",minor="10",patchlevel="14",version="3.10.14"} 1.0
# HELP process_virtual_memory_bytes Virtual memory size in bytes.
# TYPE process_virtual_memory_bytes gauge
process_virtual_memory_bytes 3.9024848896e+010
# HELP process_resident_memory_bytes Resident memory size in bytes.
# TYPE process_resident_memory_bytes gauge
process_resident_memory_bytes 5.8330112e+09
# HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.72365683894e+09
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total 78.41
# HELP process_open_fds Number of open file descriptors.
# TYPE process_open_fds gauge
process_open_fds 76.0
# HELP process_max_fds Maximum number of open file descriptors.
# TYPE process_max_fds gauge
process_max_fds 1.048576e+06
# HELP vllm:cache_config_info information of cache_config
# TYPE vllm:cache_config_info gauge
vllm:cache_config_info{block_size="16",cache_dtype="auto",cpu_offload_gb="0",enable_prefix_caching="False",gpu_memory_utilization="0.9",num_cpu_blocks="2048",num_gpu_blocks="4574",num_gpu_blocks_override="None",sliding_window="None",swap_space_bytes="4294967296"} 1.0
# HELP vllm:num_requests_running Number of requests currently running on GPU.
# TYPE vllm:num_requests_running gauge
vllm:num_requests_running{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 3.0
# HELP vllm:num_requests_waiting Number of requests waiting to be processed.
# TYPE vllm:num_requests_waiting gauge
vllm:num_requests_waiting{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2.0
vllm:num_requests_waiting{model_name="other/OTHER"} 3.0
# HELP vllm:num_requests_swapped Number of requests swapped to CPU.
# TYPE vllm:num_requests_swapped gauge
vllm:num_requests_swapped{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:gpu_cache_usage_perc gauge
vllm:gpu_cache_usage_perc{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0026235242675994863
# HELP vllm:cpu_cache_usage_perc CPU KV-cache usage. 1 means 100 percent usage.
# TYPE vllm:cpu_cache_usage_perc gauge
vllm:cpu_cache_usage_perc{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
# HELP vllm:num_preemptions_total Cumulative number of preemptions from the engine.
# TYPE vllm:num_preemptions_total counter
vllm:num_preemptions_total{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
# HELP vllm:prompt_tokens_total Number of prefill tokens processed.
# TYPE vllm:prompt_tokens_total counter
vllm:prompt_tokens_total{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 90.0
# HELP vllm:generation_tokens_total Number of generation tokens processed.
# TYPE vllm:generation_tokens_total counter
vllm:generation_tokens_total{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2546.0
# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE vllm:time_to_first_token_seconds histogram
vllm:time_to_first_token_seconds_bucket{le="0.001",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.01",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.02",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.04",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 1.0
vllm:time_to_first_token_seconds_bucket{le="0.06",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:time_to_first_token_seconds_bucket{le="0.08",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="0.1",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="0.25",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="0.5",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="0.75",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="2.5",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="5.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="7.5",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="10.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 9.0
vllm:time_to_first_token_seconds_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.5106480121612549
# HELP vllm:time_per_output_token_seconds Histogram of time per output token in seconds.
# TYPE vllm:time_per_output_token_seconds histogram
vllm:time_per_output_token_seconds_bucket{le="0.01",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:time_per_output_token_seconds_bucket{le="0.025",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:time_per_output_token_seconds_bucket{le="0.05",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2519.0
vllm:time_per_output_token_seconds_bucket{le="0.075",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2519.0
vllm:time_per_output_token_seconds_bucket{le="0.1",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="0.15",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="0.2",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="0.3",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="0.4",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="0.5",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="0.75",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="2.5",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2537.0
vllm:time_per_output_token_seconds_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 98.35070300102234
# HELP vllm:e2e_request_latency_seconds Histogram of end to end request latency in seconds.
# TYPE vllm:e2e_request_latency_seconds histogram
vllm:e2e_request_latency_seconds_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:e2e_request_latency_seconds_bucket{le="2.5",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:e2e_request_latency_seconds_bucket{le="5.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:e2e_request_latency_seconds_bucket{le="10.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:e2e_request_latency_seconds_bucket{le="15.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:e2e_request_latency_seconds_bucket{le="20.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_bucket{le="30.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_bucket{le="40.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_bucket{le="50.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_bucket{le="60.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:e2e_request_latency_seconds_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 93.24844908714294
# HELP vllm:request_prompt_tokens Number of prefill tokens processed.
# TYPE vllm:request_prompt_tokens histogram
vllm:request_prompt_tokens_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_prompt_tokens_bucket{le="2.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_prompt_tokens_bucket{le="5.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_prompt_tokens_bucket{le="10.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="20.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="50.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="100.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="200.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="500.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="1000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="2000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="5000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="10000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_prompt_tokens_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 60.0
# HELP vllm:request_generation_tokens Number of generation tokens processed.
# TYPE vllm:request_generation_tokens histogram
vllm:request_generation_tokens_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="2.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="5.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="10.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="20.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="50.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="100.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="200.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 0.0
vllm:request_generation_tokens_bucket{le="500.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_bucket{le="1000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_bucket{le="2000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_bucket{le="5000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_bucket{le="10000.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_generation_tokens_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 2400.0
# HELP vllm:request_params_best_of Histogram of the best_of request parameter.
# TYPE vllm:request_params_best_of histogram
vllm:request_params_best_of_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_bucket{le="2.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_bucket{le="5.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_bucket{le="10.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_bucket{le="20.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_best_of_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
# HELP vllm:request_params_n Histogram of the n request parameter.
# TYPE vllm:request_params_n histogram
vllm:request_params_n_bucket{le="1.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_bucket{le="2.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_bucket{le="5.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_bucket{le="10.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_bucket{le="20.0",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_bucket{le="+Inf",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_count{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
vllm:request_params_n_sum{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
# HELP vllm:request_success_total Count of successfully processed requests.
# TYPE vllm:request_success_total counter
vllm:request_success_total{finished_reason="length",model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 6.0
# HELP vllm:spec_decode_draft_acceptance_rate Speculative token acceptance rate.
# TYPE vllm:spec_decode_draft_acceptance_rate gauge
# HELP vllm:spec_decode_efficiency Speculative decoding system efficiency.
# TYPE vllm:spec_decode_efficiency gauge
# HELP vllm:spec_decode_num_accepted_tokens_total Number of accepted tokens.
# TYPE vllm:spec_decode_num_accepted_tokens_total counter
# HELP vllm:spec_decode_num_draft_tokens_total Number of draft tokens.
# TYPE vllm:spec_decode_num_draft_tokens_total counter
# HELP vllm:spec_decode_num_emitted_tokens_total Number of emitted tokens.
# TYPE vllm:spec_decode_num_emitted_tokens_total counter
# HELP vllm:avg_prompt_throughput_toks_per_s Average prefill throughput in tokens/s.
# TYPE vllm:avg_prompt_throughput_toks_per_s gauge
vllm:avg_prompt_throughput_toks_per_s{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 1.294472788861082
# HELP vllm:avg_generation_throughput_toks_per_s Average generation throughput in tokens/s.
# TYPE vllm:avg_generation_throughput_toks_per_s gauge
vllm:avg_generation_throughput_toks_per_s{model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"} 34.30352890481867
