I’m trying to start a vLLM server with the Qwen3.5-35B-A3B model, but startup fails. Below are my command and the resulting error:
CUDA_VISIBLE_DEVICES=6 VLLM_USE_TRITON_MOE=0 vllm serve Qwen/Qwen3.5-35B-A3B --tensor-parallel-size 1 --port 8000 --max-model-len 2048 --language-model-only --reasoning-parser qwen3 --dtype float16 --enforce-eager
=============================================
(EngineCore_DP0 pid=3511673) INFO 02-25 19:38:15 [default_loader.py:293] Loading weights took 27.81 seconds
(EngineCore_DP0 pid=3511673) INFO 02-25 19:38:16 [gpu_model_runner.py:4275] Model loading took 64.69 GiB memory and 28.572507 seconds
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] EngineCore failed to start.
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] Traceback (most recent call last):
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 1069, in run_engine_core
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/tracing/otel.py”, line 178, in sync_wrapper
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 813, in __init__
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] super().__init__(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 115, in __init__
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/tracing/otel.py”, line 178, in sync_wrapper
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 249, in _initialize_kv_caches
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] available_gpu_memory = self.model_executor.determine_available_memory()
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/executor/abstract.py”, line 128, in determine_available_memory
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return self.collective_rpc(“determine_available_memory”)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/executor/uniproc_executor.py”, line 75, in collective_rpc
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/serial_utils.py”, line 459, in run_method
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/utils/_contextlib.py”, line 124, in decorate_context
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py”, line 371, in determine_available_memory
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] self.model_runner.profile_run()
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py”, line 5229, in profile_run
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] hidden_states, last_hidden_states = self._dummy_run(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/utils/_contextlib.py”, line 124, in decorate_context
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py”, line 4927, in _dummy_run
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] outputs = self.model(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_5.py”, line 733, in forward
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] hidden_states = self.language_model.model(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/compilation/decorators.py”, line 389, in __call__
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return self.forward(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_next.py”, line 1152, in forward
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] hidden_states, residual = layer(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_next.py”, line 1046, in forward
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] self.linear_attn(
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_5.py”, line 164, in forward
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] mixed_qkvz, _ = self.in_proj_qkvz(hidden_states)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py”, line 620, in forward
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] output_parallel = self.quant_method.apply(self, input, bias)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py”, line 266, in apply
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/layers/utils.py”, line 119, in default_unquantized_gemm
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return torch.nn.functional.linear(x, weight, bias)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/parameter.py”, line 126, in __torch_function__
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] return super().__torch_function__(func, types, args, kwargs)
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) ERROR 02-25 19:38:19 [core.py:1079] RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling cublasGemmEx( handle, opa, opb, m, n, k, alpha_ptr, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, beta_ptr, c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
(EngineCore_DP0 pid=3511673) Process EngineCore_DP0:
(EngineCore_DP0 pid=3511673) Traceback (most recent call last):
(EngineCore_DP0 pid=3511673) File “/usr/local/lib/python3.12/multiprocessing/process.py”, line 314, in _bootstrap
(EngineCore_DP0 pid=3511673) self.run()
(EngineCore_DP0 pid=3511673) File “/usr/local/lib/python3.12/multiprocessing/process.py”, line 108, in run
(EngineCore_DP0 pid=3511673) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 1083, in run_engine_core
(EngineCore_DP0 pid=3511673) raise e
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 1069, in run_engine_core
(EngineCore_DP0 pid=3511673) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/tracing/otel.py”, line 178, in sync_wrapper
(EngineCore_DP0 pid=3511673) return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 813, in __init__
(EngineCore_DP0 pid=3511673) super().__init__(
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 115, in __init__
(EngineCore_DP0 pid=3511673) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/tracing/otel.py”, line 178, in sync_wrapper
(EngineCore_DP0 pid=3511673) return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/engine/core.py”, line 249, in _initialize_kv_caches
(EngineCore_DP0 pid=3511673) available_gpu_memory = self.model_executor.determine_available_memory()
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/executor/abstract.py”, line 128, in determine_available_memory
(EngineCore_DP0 pid=3511673) return self.collective_rpc(“determine_available_memory”)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/executor/uniproc_executor.py”, line 75, in collective_rpc
(EngineCore_DP0 pid=3511673) result = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/serial_utils.py”, line 459, in run_method
(EngineCore_DP0 pid=3511673) return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/utils/_contextlib.py”, line 124, in decorate_context
(EngineCore_DP0 pid=3511673) return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/worker/gpu_worker.py”, line 371, in determine_available_memory
(EngineCore_DP0 pid=3511673) self.model_runner.profile_run()
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py”, line 5229, in profile_run
(EngineCore_DP0 pid=3511673) hidden_states, last_hidden_states = self._dummy_run(
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/utils/_contextlib.py”, line 124, in decorate_context
(EngineCore_DP0 pid=3511673) return func(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py”, line 4927, in _dummy_run
(EngineCore_DP0 pid=3511673) outputs = self.model(
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_5.py”, line 733, in forward
(EngineCore_DP0 pid=3511673) hidden_states = self.language_model.model(
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/compilation/decorators.py”, line 389, in __call__
(EngineCore_DP0 pid=3511673) return self.forward(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_next.py”, line 1152, in forward
(EngineCore_DP0 pid=3511673) hidden_states, residual = layer(
(EngineCore_DP0 pid=3511673) ^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_next.py”, line 1046, in forward
(EngineCore_DP0 pid=3511673) self.linear_attn(
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/models/qwen3_5.py”, line 164, in forward
(EngineCore_DP0 pid=3511673) mixed_qkvz, _ = self.in_proj_qkvz(hidden_states)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=3511673) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/torch/nn/modules/module.py”, line 1787, in _call_impl
(EngineCore_DP0 pid=3511673) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py”, line 620, in forward
(EngineCore_DP0 pid=3511673) output_parallel = self.quant_method.apply(self, input, bias)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py”, line 266, in apply
(EngineCore_DP0 pid=3511673) return dispatch_unquantized_gemm()(layer, x, layer.weight, bias)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/layers/utils.py”, line 119, in default_unquantized_gemm
(EngineCore_DP0 pid=3511673) return torch.nn.functional.linear(x, weight, bias)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) File “/home/sanjeevkumar4/vllm_nightly/lib/python3.12/site-packages/vllm/model_executor/parameter.py”, line 126, in __torch_function__
(EngineCore_DP0 pid=3511673) return super().__torch_function__(func, types, args, kwargs)
(EngineCore_DP0 pid=3511673) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=3511673) RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling cublasGemmEx( handle, opa, opb, m, n, k, alpha_ptr, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, beta_ptr, c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)
==============
Can someone please help? I am using an A100 80GB GPU.
Note that I will not be able to use the Docker-based setup.