Hi, we’ve noticed a problem running Qwen3-32B-FP8 that seems to happen only with the FP8-quantized version: the model generally runs fine, but every so often a single request crashes the engine with no obvious cause.
We run two models on a 2x H200 machine, one model per GPU. Here is our dockerized setup:
services:
  vllm-devstral-small-2505-api:
    image: vllm/vllm-openai:latest
    container_name: vllm-devstral-small-2505
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["0"]
              capabilities: [gpu]
    ipc: host
    volumes:
      - /opt/hf:/opt/hf
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      - HF_HOME=/opt/hf
      - VLLM_API_KEY=${VLLM_API_KEY}
    ports:
      - 8001:8000
    command:
      - --model
      - "mistralai/Devstral-Small-2505"
      - --served-model-name
      - "Devstral"
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
      - --enforce-eager
      - --disable-custom-all-reduce
      - --max-model-len
      - "65536"
      - --max-num-batched-tokens
      - "65536"
      - --swap-space
      - "128"
      - --max-num-seqs
      - "12"
      - --kv-cache-dtype
      - "fp8"
      - --generation-config
      - "auto"
      - --enable-chunked-prefill
      - --tokenizer_mode
      - "mistral"
      - --config_format
      - "mistral"
      - --load_format
      - "mistral"
      - --trust-remote-code
      - --tensor-parallel-size
      - "1"
      - --gpu-memory-utilization
      - "0.9"
      - --enable-auto-tool-choice
      - --tool-call-parser
      - "mistral"
      - --chat-template
      - "examples/tool_chat_template_mistral_parallel.jinja"
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 5m

  vllm-qwen3:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen3-32b
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["1"]
              capabilities: [gpu]
    ipc: host
    volumes:
      - /opt/hf:/opt/hf
    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      - HF_HOME=/opt/hf
      - VLLM_API_KEY=${VLLM_API_KEY}
    ports:
      - 8002:8000
    command:
      - --model
      - "Qwen/Qwen3-32B-FP8"
      - --served-model-name
      - "Qwen3"
      - --host
      - "0.0.0.0"
      - --port
      - "8000"
      - --enforce-eager
      - --disable-custom-all-reduce
      - --max-model-len
      - "32768"
      - --rope-scaling
      - '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}'
      - --max-num-batched-tokens
      - "65536"
      - --swap-space
      - "128"
      - --max-num-seqs
      - "16"
      - --quantization
      - "fp8"
      - --kv-cache-dtype
      - "fp8"
      - --generation-config
      - "auto"
      - --enable-chunked-prefill
      - --tensor-parallel-size
      - "1"
      - --gpu-memory-utilization
      - "0.8"
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10m
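For reference, we call the Qwen3 container through vLLM's OpenAI-compatible chat API with streaming enabled. The crash is not reproducible on demand; the request below is only a representative sketch of how we call the endpoint (prompt text, max_tokens, etc. are illustrative, not the exact payload that triggered the crash):

curl http://localhost:8002/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $VLLM_API_KEY" \
  -d '{
    "model": "Qwen3",
    "messages": [{"role": "user", "content": "Summarize this document ..."}],
    "stream": true,
    "max_tokens": 1024
  }'

When the problem occurs, one such request kills the engine and the log below is produced.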
ERROR 07-22 02:45:17 [dump_input.py:79] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, kv_cache_usage=0.001931971196065807, prefix_cache_stats=PrefixCacheStats(reset=False, requests=1, queries=1027, hits=1024), spec_decoding_stats=None, num_corrupted_reqs=0)
ERROR 07-22 02:45:17 [core.py:588] EngineCore encountered a fatal error.
ERROR 07-22 02:45:17 [core.py:588] Traceback (most recent call last):
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 579, in run_engine_core
ERROR 07-22 02:45:17 [core.py:588] engine_core.run_busy_loop()
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 606, in run_busy_loop
ERROR 07-22 02:45:17 [core.py:588] self._process_engine_step()
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 631, in _process_engine_step
ERROR 07-22 02:45:17 [core.py:588] outputs, model_executed = self.step_fn()
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 235, in step
ERROR 07-22 02:45:17 [core.py:588] model_output = self.execute_model(scheduler_output)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 221, in execute_model
ERROR 07-22 02:45:17 [core.py:588] raise err
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 212, in execute_model
ERROR 07-22 02:45:17 [core.py:588] return self.model_executor.execute_model(scheduler_output)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 87, in execute_model
ERROR 07-22 02:45:17 [core.py:588] output = self.collective_rpc("execute_model",
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
ERROR 07-22 02:45:17 [core.py:588] answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 2736, in run_method
ERROR 07-22 02:45:17 [core.py:588] return func(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 07-22 02:45:17 [core.py:588] return func(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 308, in execute_model
ERROR 07-22 02:45:17 [core.py:588] output = self.model_runner.execute_model(scheduler_output,
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
ERROR 07-22 02:45:17 [core.py:588] return func(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1374, in execute_model
ERROR 07-22 02:45:17 [core.py:588] model_output = self.model(
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
ERROR 07-22 02:45:17 [core.py:588] return self._call_impl(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
ERROR 07-22 02:45:17 [core.py:588] return forward_call(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 302, in forward
ERROR 07-22 02:45:17 [core.py:588] hidden_states = self.model(input_ids, positions, intermediate_tensors,
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 173, in __call__
ERROR 07-22 02:45:17 [core.py:588] return self.forward(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 355, in forward
ERROR 07-22 02:45:17 [core.py:588] hidden_states, residual = layer(
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
ERROR 07-22 02:45:17 [core.py:588] return self._call_impl(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
ERROR 07-22 02:45:17 [core.py:588] return forward_call(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 215, in forward
ERROR 07-22 02:45:17 [core.py:588] hidden_states = self.self_attn(
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
ERROR 07-22 02:45:17 [core.py:588] return self._call_impl(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
ERROR 07-22 02:45:17 [core.py:588] return forward_call(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 134, in forward
ERROR 07-22 02:45:17 [core.py:588] qkv, _ = self.qkv_proj(hidden_states)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
ERROR 07-22 02:45:17 [core.py:588] return self._call_impl(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
ERROR 07-22 02:45:17 [core.py:588] return forward_call(*args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py", line 510, in forward
ERROR 07-22 02:45:17 [core.py:588] output_parallel = self.quant_method.apply(self, input_, bias)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/fp8.py", line 414, in apply
ERROR 07-22 02:45:17 [core.py:588] return torch.ops.vllm.apply_w8a8_block_fp8_linear(
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1158, in __call__
ERROR 07-22 02:45:17 [core.py:588] return self._op(*args, **(kwargs or {}))
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/fp8_utils.py", line 173, in apply_w8a8_block_fp8_linear
ERROR 07-22 02:45:17 [core.py:588] q_input, x_scale = per_token_group_quant_fp8(
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/fp8_utils.py", line 409, in per_token_group_quant_fp8
ERROR 07-22 02:45:17 [core.py:588] _per_token_group_quant_fp8_colmajor[(M, )](
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 347, in <lambda>
ERROR 07-22 02:45:17 [core.py:588] return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
ERROR 07-22 02:45:17 [core.py:588] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 591, in run
ERROR 07-22 02:45:17 [core.py:588] kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
ERROR 07-22 02:45:17 [core.py:588] File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/driver.py", line 529, in __call__
ERROR 07-22 02:45:17 [core.py:588] self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
ERROR 07-22 02:45:17 [core.py:588] RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered
Process EngineCore_0:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 590, in run_engine_core
raise e
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 579, in run_engine_core
engine_core.run_busy_loop()
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 606, in run_busy_loop
self._process_engine_step()
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 631, in _process_engine_step
outputs, model_executed = self.step_fn()
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 235, in step
model_output = self.execute_model(scheduler_output)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 221, in execute_model
raise err
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 212, in execute_model
return self.model_executor.execute_model(scheduler_output)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 87, in execute_model
output = self.collective_rpc("execute_model",
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/utils/__init__.py", line 2736, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 308, in execute_model
output = self.model_runner.execute_model(scheduler_output,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 1374, in execute_model
model_output = self.model(
^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 302, in forward
hidden_states = self.model(input_ids, positions, intermediate_tensors,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 173, in __call__
return self.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 355, in forward
hidden_states, residual = layer(
^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 215, in forward
hidden_states = self.self_attn(
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 134, in forward
qkv, _ = self.qkv_proj(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1762, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py", line 510, in forward
output_parallel = self.quant_method.apply(self, input_, bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/fp8.py", line 414, in apply
return torch.ops.vllm.apply_w8a8_block_fp8_linear(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1158, in __call__
return self._op(*args, **(kwargs or {}))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/fp8_utils.py", line 173, in apply_w8a8_block_fp8_linear
q_input, x_scale = per_token_group_quant_fp8(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/fp8_utils.py", line 409, in per_token_group_quant_fp8
_per_token_group_quant_fp8_colmajor[(M, )](
File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 347, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/triton/runtime/jit.py", line 591, in run
kernel.run(grid_0, grid_1, grid_2, stream, kernel.function, kernel.packed_metadata,
File "/usr/local/lib/python3.12/dist-packages/triton/backends/nvidia/driver.py", line 529, in __call__
self.launch(gridX, gridY, gridZ, stream, function, self.launch_cooperative_grid, global_scratch, *args)
RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered
ERROR 07-22 02:45:17 [async_llm.py:419] AsyncLLM output_handler failed.
ERROR 07-22 02:45:17 [async_llm.py:419] Traceback (most recent call last):
ERROR 07-22 02:45:17 [async_llm.py:419] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 378, in output_handler
ERROR 07-22 02:45:17 [async_llm.py:419] outputs = await engine_core.get_output_async()
ERROR 07-22 02:45:17 [async_llm.py:419] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [async_llm.py:419] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 740, in get_output_async
ERROR 07-22 02:45:17 [async_llm.py:419] raise self._format_exception(outputs) from None
ERROR 07-22 02:45:17 [async_llm.py:419] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
INFO 07-22 02:45:17 [async_llm.py:345] Request chatcmpl-3486b6e51c5343adbadef61174010586 failed (engine dead).
ERROR 07-22 02:45:17 [serving_chat.py:948] Error in chat completion stream generator.
ERROR 07-22 02:45:17 [serving_chat.py:948] Traceback (most recent call last):
ERROR 07-22 02:45:17 [serving_chat.py:948] File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/serving_chat.py", line 518, in chat_completion_stream_generator
ERROR 07-22 02:45:17 [serving_chat.py:948] async for res in result_generator:
ERROR 07-22 02:45:17 [serving_chat.py:948] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 326, in generate
ERROR 07-22 02:45:17 [serving_chat.py:948] out = q.get_nowait() or await q.get()
ERROR 07-22 02:45:17 [serving_chat.py:948] ^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [serving_chat.py:948] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/output_processor.py", line 57, in get
ERROR 07-22 02:45:17 [serving_chat.py:948] raise output
ERROR 07-22 02:45:17 [serving_chat.py:948] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 378, in output_handler
ERROR 07-22 02:45:17 [serving_chat.py:948] outputs = await engine_core.get_output_async()
ERROR 07-22 02:45:17 [serving_chat.py:948] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 07-22 02:45:17 [serving_chat.py:948] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 740, in get_output_async
ERROR 07-22 02:45:17 [serving_chat.py:948] raise self._format_exception(outputs) from None
ERROR 07-22 02:45:17 [serving_chat.py:948] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
[rank0]:[W722 02:45:18.187919950 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
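If it would help to localize the illegal memory access more precisely, we can rerun the Qwen3 service with synchronous CUDA kernel launches and debug logging. A sketch of the environment section we would use for that run (only the last two variables are new; names are the standard CUDA and vLLM environment variables, everything else in the service stays unchanged):

    environment:
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}
      - HF_HOME=/opt/hf
      - VLLM_API_KEY=${VLLM_API_KEY}
      # report the CUDA error at the offending kernel launch instead of a later one
      - CUDA_LAUNCH_BLOCKING=1
      # more verbose vLLM engine logging
      - VLLM_LOGGING_LEVEL=DEBUG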