@RunLLM How can I speed up prompt-processing and generation throughput with this model, Qwen3.5-397B-A17B-GPTQ-Int4? I have 8× A6000 GPUs (384 GB VRAM total) on Ubuntu 22, with NVIDIA driver version 535.104.05 and CUDA version 12.2. This is my docker-compose.yaml:
# Single-node vLLM server for Qwen3.5-397B-A17B-GPTQ-Int4, sharded with
# tensor parallelism across every GPU the container can see.
services:
  vllm:
    image: orthozany/vllm-qwen35-mtp
    container_name: vllm-qwen35-gpt4
    # Share the host IPC namespace so the per-GPU worker processes can
    # exchange data through host shared memory.
    ipc: host
    ulimits:
      # Unlimited locked memory (-1) for both the soft and hard limit.
      memlock:
        soft: -1
        hard: -1
    ports:
      - "8000:8000"
    environment:
      HF_TOKEN: "${HF_TOKEN}"
      HF_HOME: "/mnt/llm_storage"
      # NOTE(review): HF_CACHE_DIR and TRANSFORMERS_CACHE are kept as in the
      # original; confirm the image's Hugging Face libraries still read them.
      HF_CACHE_DIR: "/mnt/llm_storage"
      TRANSFORMERS_CACHE: "/mnt/llm_storage/cache"  # ensure this subdir is in hf_cache
      TRITON_CACHE_DIR: "/triton_cache"
      NCCL_DEBUG: "WARN"
      # NCCL_SHM_DISABLE was removed on purpose. With P2P *and* SHM both
      # disabled, NCCL has to push intra-node traffic through its network
      # transport, which throttles the tensor-parallel all-reduce.
      # `ipc: host` above already makes the shared-memory transport usable.
      #
      # NOTE(review): P2P stays disabled as in the original. If
      # `nvidia-smi topo -m` shows working peer access between the GPUs,
      # try removing NCCL_P2P_DISABLE as well for further throughput.
      NCCL_P2P_DISABLE: "1"
      NCCL_IB_DISABLE: "1"
      NCCL_COMM_BLOCKING: "1"
    volumes:
      # Mount the *root* of the storage, not a subfolder — HF needs to
      # create the model dir if it doesn't exist.
      - hf_cache:/mnt/llm_storage
      - triton_cache:/triton_cache:rw
      # if you want to use a temp dir separate from hf cache:
      # - tmpfs:/tmp:rw,noexec,nosuid,size=16G
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Make vLLM aware of the local model path.
    # `--quantization` is passed exactly once: the original also passed
    # `--quantization modelopt`, which conflicted with `moe_wna16` and does
    # not match a GPTQ-Int4 checkpoint.
    # The folded scalar (`>-`) joins the lines below into one argument string.
    command: >-
      --model /mnt/llm_storage/Qwen3.5-397B-A17B-GPTQ-Int4
      --host 0.0.0.0
      --quantization moe_wna16
      --tensor-parallel-size 8
      --max-model-len 262144
      --served-model-name Qwen3.5-397B-A17B-GPTQ-Int4
      --enable-prefix-caching
      --enable-auto-tool-choice
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
    restart: unless-stopped

# Named volumes referenced by the services above.
volumes:
  hf_cache: {}  # this will be the root for HF cache too
  # NOTE(review): open_webui_data is not used by the vllm service shown here;
  # kept in case another service (not visible in this paste) mounts it.
  open_webui_data: {}
  triton_cache: {}