Trying to run Qwen3.5-397B-A17B-GPTQ-Int4

@RunLLM How can I speed up prompt processing and generation throughput with Qwen3.5-397B-A17B-GPTQ-Int4? I have eight A6000 GPUs (384 GB VRAM total) on Ubuntu 22, with NVIDIA driver version 535.104.05 and CUDA version 12.2. This is my docker-compose.yaml:

services:
  # Single vLLM server container for Qwen3.5-397B-A17B-GPTQ-Int4, given access
  # to every GPU on the host and published on port 8000.
  vllm:
    image: orthozany/vllm-qwen35-mtp
    container_name: vllm-qwen35-gpt4
    # Share the host IPC namespace so the tensor-parallel workers can use the
    # host's shared memory instead of the container's small default /dev/shm.
    ipc: host
    ulimits:
      # Unlimited locked memory (pinned host buffers).
      memlock: { soft: -1, hard: -1 }
    ports:
      - "8000:8000"
    environment:
      HF_TOKEN: "${HF_TOKEN}"
      HF_HOME: "/mnt/llm_storage"
      # NOTE(review): HF_CACHE_DIR does not appear among the variables
      # documented by huggingface_hub; presumably it has no effect - verify.
      HF_CACHE_DIR: "/mnt/llm_storage"
      # Ensure this subdir is in hf_cache.
      # NOTE(review): recent transformers releases deprecate
      # TRANSFORMERS_CACHE in favour of HF_HOME - confirm it is still needed.
      TRANSFORMERS_CACHE: "/mnt/llm_storage/cache"
      TRITON_CACHE_DIR: "/triton_cache"
      NCCL_DEBUG: "WARN"
      # NCCL_SHM_DISABLE is intentionally NOT set: with P2P and IB both
      # disabled, also disabling shared memory leaves NCCL only IP sockets for
      # GPU-to-GPU traffic, which throttles every tensor-parallel all-reduce.
      # `ipc: host` above makes the shared-memory transport usable.
      # NOTE(review): if the GPUs support peer-to-peer reliably (check
      # `nvidia-smi topo -m`), dropping NCCL_P2P_DISABLE should be faster still.
      NCCL_P2P_DISABLE: "1"
      NCCL_IB_DISABLE: "1"
      NCCL_COMM_BLOCKING: "1"
    volumes:
      # Mount the *root* of the storage, not a subfolder - HF needs to create
      # the model dir if it doesn't exist.
      - hf_cache:/mnt/llm_storage
      - triton_cache:/triton_cache:rw
      # If you want to use a temp dir separate from hf cache:
      # - tmpfs:/tmp:rw,noexec,nosuid,size=16G

    deploy:
      resources:
        reservations:
          devices:
            # Reserve every NVIDIA GPU on the host for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]

    # Arguments appended to the image's entrypoint (assumed to be the vLLM
    # server - TODO confirm). The model is loaded from the local path inside
    # the mounted storage volume.
    # --quantization is passed exactly once: a repeated flag resolves
    # last-wins, which hides the method actually in effect. moe_wna16 is kept;
    # modelopt does not correspond to a GPTQ checkpoint.
    # No comments may be placed inside the folded scalar below - they would
    # become part of the command line.
    command: >
      --model /mnt/llm_storage/Qwen3.5-397B-A17B-GPTQ-Int4
      --host 0.0.0.0
      --tensor-parallel-size 8
      --max-model-len 262144
      --served-model-name Qwen3.5-397B-A17B-GPTQ-Int4
      --enable-prefix-caching
      --enable-auto-tool-choice
      --tool-call-parser qwen3_coder
      --reasoning-parser qwen3
      --quantization moe_wna16

    restart: unless-stopped

# Named volumes managed by Docker; each uses the default driver and options.
volumes:
  # Root of the Hugging Face storage (mounted at /mnt/llm_storage); this will
  # be the root for the HF cache too.
  hf_cache: {}
  # NOTE(review): not mounted by any service visible in this file - presumably
  # used by a service defined elsewhere; verify before removing.
  open_webui_data: {}
  # Persists Triton's compiled-kernel cache (mounted at /triton_cache).
  triton_cache: {}