Hi,
I have a problem where my vLLM instance crashes from time to time.
This is my setup (8x H200):
```yaml
vllm:
  image: vllm/vllm-openai:v0.12.0
  ulimits:
    memlock:
      soft: -1
      hard: -1
    stack:
      soft: 67108864
      hard: 67108864
    nofile:
      soft: 1048576
      hard: 1048576
  shm_size: 512gb
  container_name: vllm
  restart: unless-stopped
  environment:
    - VLLM_USE_FLASHINFER_MOE_FP16=1
    - OMP_NUM_THREADS=2
  volumes:
    - /opt/docker/openwebui/vllm/models:/models:ro
    - /opt/docker/openwebui/vllm/cache:/root/.cache/huggingface
  command:
    - "--model=/models/Qwen3-VL-235B-A22B-Instruct"
    - "--served-model-name=Delphi"
    - "--host=0.0.0.0"
    - "--port=8000"
    - "--api-server-count=4"
    - "--tensor-parallel-size=8"
    - "--async-scheduling"
    - "--enable-expert-parallel"
    - "--dtype=bfloat16"
    - "--kv-cache-dtype=fp8"
    - "--gpu-memory-utilization=0.7"
    - "--max-model-len=131072"
    - "--max-num-batched-tokens=65536"
    - "--max-num-seqs=32"
    - "--enable-chunked-prefill"
    - "--enable-prefix-caching"
    - "--swap-space=128"
    - "--mm-encoder-tp-mode=data"
    - "--mm_processor_cache_type=shm"
    - "--mm_processor_cache_gb=128"
  deploy:
    resources:
      reservations:
        devices:
          - driver: nvidia
            count: 8
            capabilities: ["gpu"]
```
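
For context, requests reach the container through the normal OpenAI-compatible API. Below is a rough sketch of the kind of call that was in flight when the error below happened (the base URL, the input file and the prompt are placeholders, not my actual client code; temperature, top_p, top_k and max_tokens correspond to the SamplingParams visible in the scheduler dump):

```python
# Rough sketch of a long-context chat request against the server above.
# base_url, the input file and the prompt are placeholders; the sampling
# values mirror the SamplingParams shown in the error dump below.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

with open("long_document.txt") as f:  # placeholder for ~80k tokens of context
    document = f.read()

stream = client.chat.completions.create(
    model="Delphi",  # --served-model-name from the compose file
    messages=[
        {"role": "user", "content": f"Summarize the following document:\n\n{document}"},
    ],
    temperature=0.7,
    top_p=0.8,
    max_tokens=47734,
    extra_body={"top_k": 20},  # vLLM-specific sampling parameter
    stream=True,
)

for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```
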
And here is an example of the error:
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [dump_input.py:72] Dumping input data for V1 LLM engine (v0.11.2) with config: model=‘/models/Qwen3-VL-235B-A22B-Instruct’, speculative_config=None, tokenizer=‘/models/Qwen3-VL-235B-A22B-Instruct’, skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend=‘auto’, disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=‘’, reasoning_parser_plugin=‘’, enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=Delphi, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={‘level’: None, ‘mode’: <CompilationMode.VLLM_COMPILE: 3>, ‘debug_dump_path’: None, ‘cache_dir’: ‘’, ‘compile_cache_save_format’: ‘binary’, ‘backend’: ‘inductor’, ‘custom_ops’: [‘none’], ‘splitting_ops’: [‘vllm::unified_attention’, ‘vllm::unified_attention_with_output’, ‘vllm::unified_mla_attention’, ‘vllm::unified_mla_attention_with_output’, ‘vllm::mamba_mixer2’, ‘vllm::mamba_mixer’, ‘vllm::short_conv’, ‘vllm::linear_attention’, ‘vllm::plamo2_mamba_mixer’, ‘vllm::gdn_attention_core’, ‘vllm::kda_attention’, ‘vllm::sparse_attn_indexer’], ‘compile_mm_encoder’: False, ‘use_inductor’: None, ‘compile_sizes’: , ‘inductor_compile_config’: {‘enable_auto_functionalized_v2’: False, ‘combo_kernels’: True, ‘benchmark_combo_kernel’: True}, ‘inductor_passes’: {}, ‘cudagraph_mode’: <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, ‘cudagraph_num_of_warmups’: 1, ‘cudagraph_capture_sizes’: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64], ‘cudagraph_copy_inputs’: False, ‘cudagraph_specialize_lora’: True, ‘use_inductor_graph_partition’: False, ‘pass_config’: {}, ‘max_cudagraph_capture_size’: 64, ‘local_cache_dir’: None},
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [dump_input.py:79] Dumping scheduler output for model execution: SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=chatcmpl-50cfb5507c4c401590f28cb0cfcc65da,prompt_token_ids_len=83338,mm_features=,sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.7, top_p=0.8, top_k=20, min_p=0.0, seed=None, stop=, stop_token_ids=[151643], bad_words=, include_stop_str_in_output=False, ignore_eos=False, max_tokens=47734, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None),block_ids=([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41991, 41992, 41993, 41994, 41995, 41996, 41997, 41998, 41999, 42000, 42001, 42002, 42003, 42004, 42005, 42006, 42007, 42008, 42009, 42010, 42011, 42012, 42013, 42014, 42015, 42016, 42017, 42018, 42019, 42020, 42021, 42022, 42023, 42024, 42025, 42026, 42027, 42028, 42029, 42030, 42031, 42032, 42033, 46086],),num_computed_tokens=320,lora_request=None,prompt_embeds_shape=None)], scheduled_cached_reqs=CachedRequestData(req_ids=, resumed_req_ids=, new_token_ids=, all_token_ids={}, new_block_ids=, num_computed_tokens=, num_output_tokens=), num_scheduled_tokens={chatcmpl-50cfb5507c4c401590f28cb0cfcc65da: 65536}, total_num_scheduled_tokens=65536, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[4116], finished_req_ids=, free_encoder_mm_hashes=, pending_structured_output_tokens=false, kv_connector_metadata=null, ec_connector_metadata=null)
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [dump_input.py:81] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, step_counter=0, current_wave=0, kv_cache_usage=0.0745250085841822, prefix_cache_stats=PrefixCacheStats(reset=False, requests=1, queries=83338, hits=320, preempted_requests=0, preempted_queries=0, preempted_hits=0), connector_prefix_cache_stats=None, spec_decoding_stats=None, kv_connector_stats=None, waiting_lora_adapters={}, running_lora_adapters={})
(EngineCore_DP0 pid=18) [2025-12-12 01:44:15] ERROR _base.py:342: exception calling callback for <FutureWrapper at 0x79603c00c560 state=finished raised TimeoutError>
(EngineCore_DP0 pid=18) Traceback (most recent call last):
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 335, in get_response
(EngineCore_DP0 pid=18) status, result = mq.dequeue(
(EngineCore_DP0 pid=18) ^^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/shm_broadcast.py”, line 571, in dequeue
(EngineCore_DP0 pid=18) with self.acquire_read(timeout, cancel, indefinite) as buf:
(EngineCore_DP0 pid=18) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/lib/python3.12/contextlib.py”, line 137, in __enter__
(EngineCore_DP0 pid=18) return next(self.gen)
(EngineCore_DP0 pid=18) ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/shm_broadcast.py”, line 495, in acquire_read
(EngineCore_DP0 pid=18) raise TimeoutError
(EngineCore_DP0 pid=18) TimeoutError
(EngineCore_DP0 pid=18)
(EngineCore_DP0 pid=18) The above exception was the direct cause of the following exception:
(EngineCore_DP0 pid=18)
(EngineCore_DP0 pid=18) Traceback (most recent call last):
(EngineCore_DP0 pid=18) File “/usr/lib/python3.12/concurrent/futures/_base.py”, line 340, in _invoke_callbacks
(EngineCore_DP0 pid=18) callback(self)
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py”, line 322, in callback
(EngineCore_DP0 pid=18) result = f.result()
(EngineCore_DP0 pid=18) ^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 79, in result
(EngineCore_DP0 pid=18) return super().result()
(EngineCore_DP0 pid=18) ^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/lib/python3.12/concurrent/futures/_base.py”, line 449, in result
(EngineCore_DP0 pid=18) return self.__get_result()
(EngineCore_DP0 pid=18) ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/lib/python3.12/concurrent/futures/_base.py”, line 401, in __get_result
(EngineCore_DP0 pid=18) raise self._exception
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 83, in wait_for_response
(EngineCore_DP0 pid=18) response = self.aggregate(get_response())
(EngineCore_DP0 pid=18) ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 339, in get_response
(EngineCore_DP0 pid=18) raise TimeoutError(f"RPC call to {method} timed out.") from e
(EngineCore_DP0 pid=18) TimeoutError: RPC call to execute_model timed out.
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [dump_input.py:72] Dumping input data for V1 LLM engine (v0.11.2) with config: model=‘/models/Qwen3-VL-235B-A22B-Instruct’, speculative_config=None, tokenizer=‘/models/Qwen3-VL-235B-A22B-Instruct’, skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend=‘auto’, disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=‘’, reasoning_parser_plugin=‘’, enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=Delphi, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={‘level’: None, ‘mode’: <CompilationMode.VLLM_COMPILE: 3>, ‘debug_dump_path’: None, ‘cache_dir’: ‘’, ‘compile_cache_save_format’: ‘binary’, ‘backend’: ‘inductor’, ‘custom_ops’: [‘none’], ‘splitting_ops’: [‘vllm::unified_attention’, ‘vllm::unified_attention_with_output’, ‘vllm::unified_mla_attention’, ‘vllm::unified_mla_attention_with_output’, ‘vllm::mamba_mixer2’, ‘vllm::mamba_mixer’, ‘vllm::short_conv’, ‘vllm::linear_attention’, ‘vllm::plamo2_mamba_mixer’, ‘vllm::gdn_attention_core’, ‘vllm::kda_attention’, ‘vllm::sparse_attn_indexer’], ‘compile_mm_encoder’: False, ‘use_inductor’: None, ‘compile_sizes’: , ‘inductor_compile_config’: {‘enable_auto_functionalized_v2’: False, ‘combo_kernels’: True, ‘benchmark_combo_kernel’: True}, ‘inductor_passes’: {}, ‘cudagraph_mode’: <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, ‘cudagraph_num_of_warmups’: 1, ‘cudagraph_capture_sizes’: [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64], ‘cudagraph_copy_inputs’: False, ‘cudagraph_specialize_lora’: True, ‘use_inductor_graph_partition’: False, ‘pass_config’: {}, ‘max_cudagraph_capture_size’: 64, ‘local_cache_dir’: None},
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [dump_input.py:79] Dumping scheduler output for model execution: SchedulerOutput(scheduled_new_reqs=[NewRequestData(req_id=chatcmpl-50cfb5507c4c401590f28cb0cfcc65da,prompt_token_ids_len=83338,mm_features=,sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.7, top_p=0.8, top_k=20, min_p=0.0, seed=None, stop=, stop_token_ids=[151643], bad_words=, include_stop_str_in_output=False, ignore_eos=False, max_tokens=47734, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None),block_ids=([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 41991, 41992, 41993, 41994, 41995, 41996, 46068, 46069, 46070, 46071, 46072, 46073, 46074, 46075, 46076, 46077, 46078, 46079, 46080, 46081, 46082, 46083, 46084, 46085, 46086],),num_computed_tokens=320,lora_request=None,prompt_embeds_shape=None)], scheduled_cached_reqs=CachedRequestData(req_ids=, resumed_req_ids=, new_token_ids=, all_token_ids={}, new_block_ids=, num_computed_tokens=, num_output_tokens=), num_scheduled_tokens={chatcmpl-50cfb5507c4c401590f28cb0cfcc65da: 65536}, total_num_scheduled_tokens=65536, scheduled_spec_decode_tokens={}, scheduled_encoder_inputs={}, num_common_prefix_blocks=[4116], finished_req_ids=, free_encoder_mm_hashes=, pending_structured_output_tokens=false, kv_connector_metadata=null, ec_connector_metadata=null)
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [dump_input.py:81] Dumping scheduler stats: SchedulerStats(num_running_reqs=1, num_waiting_reqs=0, step_counter=0, current_wave=0, kv_cache_usage=0.0745250085841822, prefix_cache_stats=PrefixCacheStats(reset=False, requests=0, queries=0, hits=0, preempted_requests=0, preempted_queries=0, preempted_hits=0), connector_prefix_cache_stats=None, spec_decoding_stats=None, kv_connector_stats=None, waiting_lora_adapters={}, running_lora_adapters={})
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] EngineCore encountered a fatal error.
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] Traceback (most recent call last):
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 335, in get_response
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] status, result = mq.dequeue(
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/shm_broadcast.py”, line 571, in dequeue
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] with self.acquire_read(timeout, cancel, indefinite) as buf:
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/lib/python3.12/contextlib.py”, line 137, in __enter__
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] return next(self.gen)
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/shm_broadcast.py”, line 495, in acquire_read
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] raise TimeoutError
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] TimeoutError
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844]
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] The above exception was the direct cause of the following exception:
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844]
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] Traceback (most recent call last):
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py”, line 835, in run_engine_core
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] engine_core.run_busy_loop()
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py”, line 862, in run_busy_loop
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] self._process_engine_step()
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py”, line 891, in _process_engine_step
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] outputs, model_executed = self.step_fn()
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py”, line 437, in step_with_batch_queue
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] model_output = future.result()
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 79, in result
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] return super().result()
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/lib/python3.12/concurrent/futures/_base.py”, line 449, in result
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] return self.__get_result()
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/lib/python3.12/concurrent/futures/_base.py”, line 401, in __get_result
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] raise self._exception
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 83, in wait_for_response
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] response = self.aggregate(get_response())
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] ^^^^^^^^^^^^^^
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py”, line 339, in get_response
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] raise TimeoutError(f"RPC call to {method} timed out.") from e
(EngineCore_DP0 pid=18) ERROR 12-12 01:44:15 [core.py:844] TimeoutError: RPC call to sample_tokens timed out.
(Worker_TP1_EP1 pid=53) INFO 12-12 01:44:15 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_TP2_EP2 pid=54) INFO 12-12 01:44:15 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_TP5_EP5 pid=57) INFO 12-12 01:44:15 [multiproc_executor.py:702] Parent process exited, terminating worker
(Worker_TP6_EP6 pid=58) INFO 12-12 01:44:15 [multiproc_executor.py:702] Parent process exited, terminating worker
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] AsyncLLM output_handler failed.
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] Traceback (most recent call last):
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py”, line 477, in output_handler
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] outputs = await engine_core.get_output_async()
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py”, line 883, in get_output_async
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] raise self._format_exception(outputs) from None
(ApiServer_1 pid=20) ERROR 12-12 01:44:15 [async_llm.py:525] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
[... the same AsyncLLM output_handler traceback is repeated by ApiServer_2, ApiServer_0 and ApiServer_3 ...]
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] Error in chat completion stream generator.
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] Traceback (most recent call last):
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] File “/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/serving_chat.py”, line 611, in chat_completion_stream_generator
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] async for res in result_generator:
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py”, line 423, in generate
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] out = q.get_nowait() or await q.get()
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] ^^^^^^^^^^^^^
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/output_processor.py”, line 70, in get
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] raise output
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py”, line 477, in output_handler
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] outputs = await engine_core.get_output_async()
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] File “/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py”, line 883, in get_output_async
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] raise self._format_exception(outputs) from None
(ApiServer_2 pid=21) ERROR 12-12 01:44:15 [serving_chat.py:1278] vllm.v1.engine.exceptions.EngineDeadError: EngineCore encountered an issue. See stack trace (above) for the root cause.
[... the same chat completion stream generator traceback is repeated by ApiServer_0 and ApiServer_3 ...]