engine_kwargs = {
# quantization: Enables FP8 quantization of model weights. FP8 can reduce memory use and increase speed,
# but simple per-tensor FP8 with dynamic scales may underperform or degrade quality when weights contain outliers.[9, 10]
# For best quality, consider calibrated (static) scales or PTPC-FP8 (on AMD ROCm).[11, 12]
"quantization": "fp8",
# kv_cache_dtype: Storing Key-Value (KV) cache data in FP8 significantly enhances memory efficiency,
# effectively doubling the maximum token capacity within the same memory footprint.[6, 7, 8]
# This directly boosts the number of concurrent requests. Supported on CUDA 11.8+ and ROCm.[7]
"kv_cache_dtype": "fp8",
"gpu_memory_utilization": 0.80,
# max_num_batched_tokens: Maximum total tokens across all sequences in a single batch.
# Crucial for prefill efficiency. vLLM recommends setting this above 2048 for optimal throughput.[15]
"max_num_batched_tokens": 4096,
# tokenizer_mode: Controls the tokenizer implementation. "auto" attempts to use a fast, Rust-based tokenizer,
# which significantly reduces CPU overhead and helps keep the GPU from sitting idle waiting on tokenization.[16, 17]
"tokenizer_mode": "auto",
# disable_log_stats: Disables detailed performance statistics logging.
# This reduces CPU overhead in high-throughput production environments, freeing resources for inference.[18, 19]
"disable_log_stats": True,
# max_seq_len_to_capture: Maximum sequence length for which CUDA graphs are employed.
# Tune to encompass the majority of expected sequence lengths to maximize CUDA graph utilization.[22]
"max_seq_len_to_capture": 2048, # Should ideally match max_model_len for consistent CUDA graph usage
# "max_model_len": 16384, # Should match max_seq_len_to_capture
"max_num_seqs": 32,
"tensor_parallel_size": 1,
"enable_chunked_prefill": False, # Explicitly set to False to prioritize prefill operations, optimizing TTFT
# "dtype": torch.bfloat16, # Explicitly set dtype for clarity and consistency
# "use_cuda_graphs": True, # Enable CUDA graphs for performance optimization
# "use_flash_attention": True, # Enable flash attention for faster inference
"trust_remote_code": True,
}
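As a minimal usage sketch (assuming the vllm package is installed; the model id below is a placeholder, not part of the original configuration), the dict unpacks directly into vllm.LLM for offline inference:

from vllm import LLM, SamplingParams

# Placeholder model id for illustration only; substitute the actual checkpoint.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", **engine_kwargs)

sampling_params = SamplingParams(temperature=0.7, max_tokens=256)
outputs = llm.generate(
    ["Summarize the benefits of an FP8 KV cache in one sentence."],
    sampling_params,
)
for output in outputs:
    print(output.outputs[0].text)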
This is also a configuration block, but it has the same issue.