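I got it working using NVIDIA’s latest container image for vLLM (`nvcr.io/nvidia/vllm:26.01-py3` at the time of writing). The command below uses PowerShell syntax (backtick line continuations):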
# Serve the NVFP4 Nemotron-3-Nano checkpoint with vLLM inside NVIDIA's
# vLLM container, exposing the API on host port 8000.
#
# PowerShell syntax: the trailing backtick continues the command onto the
# next line. It must be the very last character on the line (no trailing
# whitespace), and no comment lines may be placed between continued lines.
#
# Docker options:
#   --gpus all               make the host GPUs available to the container
#   -v <host>:/model         bind-mount the local model directory at /model
#   -p 8000:8000             publish container port 8000 on host port 8000
#   --env CUDA_DEVICE_ORDER / CUDA_VISIBLE_DEVICES
#                            enumerate GPUs by PCI bus ID and expose only
#                            devices 0 and 1 to the process
#   --env VLLM_USE_FLASHINFER_MOE_FP4 / VLLM_FLASHINFER_MOE_BACKEND
#                            vLLM environment switches selecting the
#                            FlashInfer FP4 MoE path ("throughput" backend)
#   --ipc=host               share the host IPC namespace with the container
#
# vLLM options:
#   /model                   model path, passed POSITIONALLY. `vllm serve`
#                            documents the model as a positional argument;
#                            `--model` is deprecated/rejected for `serve`
#                            depending on the vLLM version, so it is avoided.
#   --served-model-name      model name clients use in API requests
#   --max-model-len          maximum context length (tokens)
#   --max-num-seqs           maximum sequences batched concurrently
#   --kv-cache-dtype fp8     store the KV cache in FP8
#   --trust-remote-code      allow custom code shipped with the checkpoint
#   --reasoning-parser-plugin / --reasoning-parser
#                            load the parser file from the model directory
#                            and select the parser named "nano_v3"
#   --tensor-parallel-size 2 shard the model across the 2 visible GPUs
docker run `
    --gpus all `
    -v "C:\git\NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4:/model" `
    -p 8000:8000 `
    --env "VLLM_USE_FLASHINFER_MOE_FP4=1" `
    --env "VLLM_FLASHINFER_MOE_BACKEND=throughput" `
    --env "CUDA_DEVICE_ORDER=PCI_BUS_ID" `
    --env "CUDA_VISIBLE_DEVICES=0,1" `
    --ipc=host `
    nvcr.io/nvidia/vllm:26.01-py3 `
    vllm serve /model `
    --served-model-name nemotron `
    --max-model-len 30000 `
    --max-num-seqs 8 `
    --kv-cache-dtype fp8 `
    --trust-remote-code `
    --reasoning-parser-plugin "/model/nano_v3_reasoning_parser.py" `
    --reasoning-parser nano_v3 `
    --tensor-parallel-size 2