root@a49e15233991:/data-new/app/Qwen2.5-72B/0123/v091/L20# export VLLM_LOGGING_LEVEL=DEBUG
root@a49e15233991:/data-new/app/Qwen2.5-72B/0123/v091/L20#
root@a49e15233991:/data-new/app/Qwen2.5-72B/0123/v091/L20# python3 -m vllm.entrypoints.openai.api_server --host 0.0.0.0 \
--port 8000 \
--model /data-new/models/Qwen2.5-72B \
--served-model-name /data-new/models/Qwen2.5-72B --trust-remote-code \
--gpu-memory-utilization 0.9 \
-tp 4
DEBUG 08-20 17:11:31 [init.py:31] No plugins for group vllm.platform_plugins found.
DEBUG 08-20 17:11:31 [init.py:35] Checking if TPU platform is available.
DEBUG 08-20 17:11:31 [init.py:45] TPU platform is not available because: No module named ‘libtpu’
DEBUG 08-20 17:11:31 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:31 [init.py:72] Confirmed CUDA platform is available.
DEBUG 08-20 17:11:31 [init.py:100] Checking if ROCm platform is available.
DEBUG 08-20 17:11:31 [init.py:114] ROCm platform is not available because: No module named ‘amdsmi’
DEBUG 08-20 17:11:31 [init.py:121] Checking if HPU platform is available.
DEBUG 08-20 17:11:31 [init.py:128] HPU platform is not available because habana_frameworks is not found.
DEBUG 08-20 17:11:31 [init.py:138] Checking if XPU platform is available.
DEBUG 08-20 17:11:31 [init.py:148] XPU platform is not available because: No module named ‘intel_extension_for_pytorch’
DEBUG 08-20 17:11:31 [init.py:155] Checking if CPU platform is available.
DEBUG 08-20 17:11:31 [init.py:177] Checking if Neuron platform is available.
DEBUG 08-20 17:11:31 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:31 [init.py:72] Confirmed CUDA platform is available.
INFO 08-20 17:11:31 [init.py:244] Automatically detected platform cuda.
DEBUG 08-20 17:11:32 [utils.py:150] Setting VLLM_WORKER_MULTIPROC_METHOD to ‘spawn’
DEBUG 08-20 17:11:32 [init.py:39] Available plugins for group vllm.general_plugins:
DEBUG 08-20 17:11:32 [init.py:41] - lora_filesystem_resolver → vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
DEBUG 08-20 17:11:32 [init.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 08-20 17:11:33 [api_server.py:1287] vLLM API server version 0.9.1
INFO 08-20 17:11:33 [cli_args.py:309] non-default args: {‘host’: ‘0.0.0.0’, ‘model’: ‘/data-new/models/Qwen2.5-72B’, ‘trust_remote_code’: True, ‘served_model_name’: [‘/data-new/models/Qwen2.5-72B’], ‘tensor_parallel_size’: 4}
INFO 08-20 17:11:39 [config.py:823] This model supports multiple tasks: {‘embed’, ‘generate’, ‘reward’, ‘score’, ‘classify’}. Defaulting to ‘generate’.
DEBUG 08-20 17:11:39 [arg_utils.py:1600] Setting max_num_batched_tokens to 2048 for OPENAI_API_SERVER usage context.
DEBUG 08-20 17:11:39 [arg_utils.py:1607] Setting max_num_seqs to 256 for OPENAI_API_SERVER usage context.
INFO 08-20 17:11:40 [config.py:1946] Defaulting to use mp for distributed inference
INFO 08-20 17:11:40 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=2048.
WARNING 08-20 17:11:42 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: https://github.com/NVIDIA/nccl/issues/1234
DEBUG 08-20 17:11:44 [init.py:31] No plugins for group vllm.platform_plugins found.
DEBUG 08-20 17:11:44 [init.py:35] Checking if TPU platform is available.
DEBUG 08-20 17:11:44 [init.py:45] TPU platform is not available because: No module named ‘libtpu’
DEBUG 08-20 17:11:44 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:45 [init.py:72] Confirmed CUDA platform is available.
DEBUG 08-20 17:11:45 [init.py:100] Checking if ROCm platform is available.
DEBUG 08-20 17:11:45 [init.py:114] ROCm platform is not available because: No module named ‘amdsmi’
DEBUG 08-20 17:11:45 [init.py:121] Checking if HPU platform is available.
DEBUG 08-20 17:11:45 [init.py:128] HPU platform is not available because habana_frameworks is not found.
DEBUG 08-20 17:11:45 [init.py:138] Checking if XPU platform is available.
DEBUG 08-20 17:11:45 [init.py:148] XPU platform is not available because: No module named ‘intel_extension_for_pytorch’
DEBUG 08-20 17:11:45 [init.py:155] Checking if CPU platform is available.
DEBUG 08-20 17:11:45 [init.py:177] Checking if Neuron platform is available.
DEBUG 08-20 17:11:45 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:45 [init.py:72] Confirmed CUDA platform is available.
INFO 08-20 17:11:45 [init.py:244] Automatically detected platform cuda.
INFO 08-20 17:11:47 [core.py:455] Waiting for init message from front-end.
DEBUG 08-20 17:11:47 [utils.py:547] HELLO from local core engine process 0.
DEBUG 08-20 17:11:47 [core.py:463] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=[‘ipc:///tmp/f85c2ed1-c67c-4a02-842b-015f14fb9d67’], outputs=[‘ipc:///tmp/7e0e9a93-0af4-427a-9ce1-071e2ce62a5f’], coordinator_input=None, coordinator_output=None), parallel_config={‘data_parallel_master_ip’: ‘127.0.0.1’, ‘data_parallel_master_port’: 0, ‘data_parallel_size’: 1})
DEBUG 08-20 17:11:47 [init.py:39] Available plugins for group vllm.general_plugins:
DEBUG 08-20 17:11:47 [init.py:41] - lora_filesystem_resolver → vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
DEBUG 08-20 17:11:47 [init.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 08-20 17:11:47 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='/data-new/models/Qwen2.5-72B', speculative_config=None, tokenizer='/data-new/models/Qwen2.5-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/data-new/models/Qwen2.5-72B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":["none"],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[512,504,496,488,480,472,464,456,448,440,432,424,416,408,400,392,384,376,368,360,352,344,336,328,320,312,304,296,288,280,272,264,256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":512,"local_cache_dir":null}
WARNING 08-20 17:11:47 [multiproc_worker_utils.py:307] Reducing Torch parallelism from 76 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
DEBUG 08-20 17:11:47 [shm_broadcast.py:243] Binding to ipc:///tmp/66ab1609-f3f2-42b7-ad51-4223f25f81a0
INFO 08-20 17:11:47 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, ‘psm_3a74c7b1’), local_subscribe_addr=‘ipc:///tmp/66ab1609-f3f2-42b7-ad51-4223f25f81a0’, remote_subscribe_addr=None, remote_addr_ipv6=False)
WARNING 08-20 17:11:48 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: https://github.com/NVIDIA/nccl/issues/1234
WARNING 08-20 17:11:48 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: https://github.com/NVIDIA/nccl/issues/1234
WARNING 08-20 17:11:48 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: https://github.com/NVIDIA/nccl/issues/1234
WARNING 08-20 17:11:48 [env_override.py:17] NCCL_CUMEM_ENABLE is set to 0, skipping override. This may increase memory overhead with cudagraph+allreduce: https://github.com/NVIDIA/nccl/issues/1234
DEBUG 08-20 17:11:50 [init.py:31] No plugins for group vllm.platform_plugins found.
DEBUG 08-20 17:11:50 [init.py:35] Checking if TPU platform is available.
DEBUG 08-20 17:11:50 [init.py:45] TPU platform is not available because: No module named ‘libtpu’
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:100] Checking if ROCm platform is available.
DEBUG 08-20 17:11:50 [init.py:114] ROCm platform is not available because: No module named ‘amdsmi’
DEBUG 08-20 17:11:50 [init.py:121] Checking if HPU platform is available.
DEBUG 08-20 17:11:50 [init.py:128] HPU platform is not available because habana_frameworks is not found.
DEBUG 08-20 17:11:50 [init.py:138] Checking if XPU platform is available.
DEBUG 08-20 17:11:50 [init.py:148] XPU platform is not available because: No module named ‘intel_extension_for_pytorch’
DEBUG 08-20 17:11:50 [init.py:155] Checking if CPU platform is available.
DEBUG 08-20 17:11:50 [init.py:177] Checking if Neuron platform is available.
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
INFO 08-20 17:11:50 [init.py:244] Automatically detected platform cuda.
DEBUG 08-20 17:11:50 [init.py:31] No plugins for group vllm.platform_plugins found.
DEBUG 08-20 17:11:50 [init.py:35] Checking if TPU platform is available.
DEBUG 08-20 17:11:50 [init.py:45] TPU platform is not available because: No module named ‘libtpu’
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:100] Checking if ROCm platform is available.
DEBUG 08-20 17:11:50 [init.py:114] ROCm platform is not available because: No module named ‘amdsmi’
DEBUG 08-20 17:11:50 [init.py:121] Checking if HPU platform is available.
DEBUG 08-20 17:11:50 [init.py:128] HPU platform is not available because habana_frameworks is not found.
DEBUG 08-20 17:11:50 [init.py:138] Checking if XPU platform is available.
DEBUG 08-20 17:11:50 [init.py:148] XPU platform is not available because: No module named ‘intel_extension_for_pytorch’
DEBUG 08-20 17:11:50 [init.py:155] Checking if CPU platform is available.
DEBUG 08-20 17:11:50 [init.py:177] Checking if Neuron platform is available.
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
INFO 08-20 17:11:50 [init.py:244] Automatically detected platform cuda.
DEBUG 08-20 17:11:50 [init.py:31] No plugins for group vllm.platform_plugins found.
DEBUG 08-20 17:11:50 [init.py:35] Checking if TPU platform is available.
DEBUG 08-20 17:11:50 [init.py:45] TPU platform is not available because: No module named ‘libtpu’
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:100] Checking if ROCm platform is available.
DEBUG 08-20 17:11:50 [init.py:114] ROCm platform is not available because: No module named ‘amdsmi’
DEBUG 08-20 17:11:50 [init.py:121] Checking if HPU platform is available.
DEBUG 08-20 17:11:50 [init.py:128] HPU platform is not available because habana_frameworks is not found.
DEBUG 08-20 17:11:50 [init.py:138] Checking if XPU platform is available.
DEBUG 08-20 17:11:50 [init.py:148] XPU platform is not available because: No module named ‘intel_extension_for_pytorch’
DEBUG 08-20 17:11:50 [init.py:155] Checking if CPU platform is available.
DEBUG 08-20 17:11:50 [init.py:177] Checking if Neuron platform is available.
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
INFO 08-20 17:11:50 [init.py:244] Automatically detected platform cuda.
DEBUG 08-20 17:11:50 [init.py:31] No plugins for group vllm.platform_plugins found.
DEBUG 08-20 17:11:50 [init.py:35] Checking if TPU platform is available.
DEBUG 08-20 17:11:50 [init.py:45] TPU platform is not available because: No module named ‘libtpu’
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:72] Confirmed CUDA platform is available.
DEBUG 08-20 17:11:50 [init.py:100] Checking if ROCm platform is available.
DEBUG 08-20 17:11:50 [init.py:114] ROCm platform is not available because: No module named ‘amdsmi’
DEBUG 08-20 17:11:50 [init.py:121] Checking if HPU platform is available.
DEBUG 08-20 17:11:50 [init.py:128] HPU platform is not available because habana_frameworks is not found.
DEBUG 08-20 17:11:50 [init.py:138] Checking if XPU platform is available.
DEBUG 08-20 17:11:50 [init.py:148] XPU platform is not available because: No module named ‘intel_extension_for_pytorch’
DEBUG 08-20 17:11:50 [init.py:155] Checking if CPU platform is available.
DEBUG 08-20 17:11:50 [init.py:177] Checking if Neuron platform is available.
DEBUG 08-20 17:11:50 [init.py:52] Checking if CUDA platform is available.
DEBUG 08-20 17:11:51 [init.py:72] Confirmed CUDA platform is available.
INFO 08-20 17:11:51 [init.py:244] Automatically detected platform cuda.
DEBUG 08-20 17:11:52 [init.py:39] Available plugins for group vllm.general_plugins:
DEBUG 08-20 17:11:52 [init.py:41] - lora_filesystem_resolver → vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
DEBUG 08-20 17:11:52 [init.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama.LlamaModel’>: [‘input_ids’, ‘positions’, ‘intermediate_tensors’, ‘inputs_embeds’]
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama_eagle3.LlamaModel’>: [‘input_ids’, ‘positions’, ‘hidden_states’]
WARNING 08-20 17:11:53 [utils.py:2737] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7fb93e06d760>
DEBUG 08-20 17:11:53 [config.py:4677] enabled custom ops: Counter()
DEBUG 08-20 17:11:53 [config.py:4679] disabled custom ops: Counter()
(VllmWorker rank=2 pid=582) DEBUG 08-20 17:11:53 [shm_broadcast.py:313] Connecting to ipc:///tmp/66ab1609-f3f2-42b7-ad51-4223f25f81a0
(VllmWorker rank=2 pid=582) DEBUG 08-20 17:11:53 [shm_broadcast.py:243] Binding to ipc:///tmp/803fe796-7b50-4ed3-b1df-7e7e6abb558d
(VllmWorker rank=2 pid=582) INFO 08-20 17:11:53 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, ‘psm_2ca1a7e7’), local_subscribe_addr=‘ipc:///tmp/803fe796-7b50-4ed3-b1df-7e7e6abb558d’, remote_subscribe_addr=None, remote_addr_ipv6=False)
DEBUG 08-20 17:11:53 [init.py:39] Available plugins for group vllm.general_plugins:
DEBUG 08-20 17:11:53 [init.py:41] - lora_filesystem_resolver → vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
DEBUG 08-20 17:11:53 [init.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
DEBUG 08-20 17:11:53 [init.py:39] Available plugins for group vllm.general_plugins:
DEBUG 08-20 17:11:53 [init.py:41] - lora_filesystem_resolver → vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
DEBUG 08-20 17:11:53 [init.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama.LlamaModel’>: [‘input_ids’, ‘positions’, ‘intermediate_tensors’, ‘inputs_embeds’]
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama_eagle3.LlamaModel’>: [‘input_ids’, ‘positions’, ‘hidden_states’]
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama.LlamaModel’>: [‘input_ids’, ‘positions’, ‘intermediate_tensors’, ‘inputs_embeds’]
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama_eagle3.LlamaModel’>: [‘input_ids’, ‘positions’, ‘hidden_states’]
(VllmWorker rank=2 pid=582) DEBUG 08-20 17:11:53 [parallel_state.py:918] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39641 backend=nccl
WARNING 08-20 17:11:53 [utils.py:2737] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7ffab1ecee70>
DEBUG 08-20 17:11:53 [config.py:4677] enabled custom ops: Counter()
DEBUG 08-20 17:11:53 [config.py:4679] disabled custom ops: Counter()
(VllmWorker rank=0 pid=580) DEBUG 08-20 17:11:53 [shm_broadcast.py:313] Connecting to ipc:///tmp/66ab1609-f3f2-42b7-ad51-4223f25f81a0
(VllmWorker rank=0 pid=580) DEBUG 08-20 17:11:53 [shm_broadcast.py:243] Binding to ipc:///tmp/2751cfa5-a07a-4696-b5b9-cc26dedfaa88
(VllmWorker rank=0 pid=580) INFO 08-20 17:11:53 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, ‘psm_86752b18’), local_subscribe_addr=‘ipc:///tmp/2751cfa5-a07a-4696-b5b9-cc26dedfaa88’, remote_subscribe_addr=None, remote_addr_ipv6=False)
WARNING 08-20 17:11:53 [utils.py:2737] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f80998dbec0>
DEBUG 08-20 17:11:53 [config.py:4677] enabled custom ops: Counter()
DEBUG 08-20 17:11:53 [config.py:4679] disabled custom ops: Counter()
(VllmWorker rank=3 pid=583) DEBUG 08-20 17:11:53 [shm_broadcast.py:313] Connecting to ipc:///tmp/66ab1609-f3f2-42b7-ad51-4223f25f81a0
DEBUG 08-20 17:11:53 [init.py:39] Available plugins for group vllm.general_plugins:
DEBUG 08-20 17:11:53 [init.py:41] - lora_filesystem_resolver → vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
DEBUG 08-20 17:11:53 [init.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
(VllmWorker rank=3 pid=583) DEBUG 08-20 17:11:53 [shm_broadcast.py:243] Binding to ipc:///tmp/7c48f4e9-75f4-49d7-ad10-c1dc40291d4d
(VllmWorker rank=3 pid=583) INFO 08-20 17:11:53 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, ‘psm_e7b220b6’), local_subscribe_addr=‘ipc:///tmp/7c48f4e9-75f4-49d7-ad10-c1dc40291d4d’, remote_subscribe_addr=None, remote_addr_ipv6=False)
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama.LlamaModel’>: [‘input_ids’, ‘positions’, ‘intermediate_tensors’, ‘inputs_embeds’]
DEBUG 08-20 17:11:53 [decorators.py:110] Inferred dynamic dimensions for forward method of <class ‘vllm.model_executor.models.llama_eagle3.LlamaModel’>: [‘input_ids’, ‘positions’, ‘hidden_states’]
WARNING 08-20 17:11:53 [utils.py:2737] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7fad21568b00>
DEBUG 08-20 17:11:53 [config.py:4677] enabled custom ops: Counter()
DEBUG 08-20 17:11:53 [config.py:4679] disabled custom ops: Counter()
(VllmWorker rank=1 pid=581) DEBUG 08-20 17:11:53 [shm_broadcast.py:313] Connecting to ipc:///tmp/66ab1609-f3f2-42b7-ad51-4223f25f81a0
(VllmWorker rank=0 pid=580) DEBUG 08-20 17:11:53 [parallel_state.py:918] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39641 backend=nccl
(VllmWorker rank=1 pid=581) DEBUG 08-20 17:11:53 [shm_broadcast.py:243] Binding to ipc:///tmp/32902809-7db0-497d-a4f3-59cfa109bfcd
(VllmWorker rank=1 pid=581) INFO 08-20 17:11:53 [shm_broadcast.py:289] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, ‘psm_e6d6d90e’), local_subscribe_addr=‘ipc:///tmp/32902809-7db0-497d-a4f3-59cfa109bfcd’, remote_subscribe_addr=None, remote_addr_ipv6=False)
(VllmWorker rank=3 pid=583) DEBUG 08-20 17:11:53 [parallel_state.py:918] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39641 backend=nccl
(VllmWorker rank=1 pid=581) DEBUG 08-20 17:11:54 [parallel_state.py:918] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39641 backend=nccl
(VllmWorker rank=1 pid=581) INFO 08-20 17:11:54 [utils.py:1126] Found nccl from library libnccl.so.2
(VllmWorker rank=1 pid=581) INFO 08-20 17:11:54 [pynccl.py:70] vLLM is using nccl==2.26.2
(VllmWorker rank=3 pid=583) INFO 08-20 17:11:54 [utils.py:1126] Found nccl from library libnccl.so.2
(VllmWorker rank=3 pid=583) INFO 08-20 17:11:54 [pynccl.py:70] vLLM is using nccl==2.26.2
(VllmWorker rank=2 pid=582) INFO 08-20 17:11:54 [utils.py:1126] Found nccl from library libnccl.so.2
(VllmWorker rank=0 pid=580) INFO 08-20 17:11:54 [utils.py:1126] Found nccl from library libnccl.so.2
(VllmWorker rank=2 pid=582) INFO 08-20 17:11:54 [pynccl.py:70] vLLM is using nccl==2.26.2
(VllmWorker rank=0 pid=580) INFO 08-20 17:11:54 [pynccl.py:70] vLLM is using nccl==2.26.2
DEBUG 08-20 17:11:57 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.
DEBUG 08-20 17:12:07 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.
DEBUG 08-20 17:12:17 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.
DEBUG 08-20 17:12:27 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.
DEBUG 08-20 17:12:37 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.
DEBUG 08-20 17:12:47 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.
DEBUG 08-20 17:12:57 [utils.py:485] Waiting for 1 local, 0 remote core engine proc(s) to start.