Problem Description:
When I use swift rollout to launch a vLLM server with data-parallel-size = 2, I get an error like:
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
    answer = run_method(self.driver_worker, method, args, kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/utils/__init__.py", line 2736, in run_method
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker_base.py", line 606, in init_device
    self.worker.init_device()  # type: ignore
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 193, in init_device
    init_worker_distributed_environment(self.vllm_config, self.rank,
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 534, in init_worker_distributed_environment
    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/parallel_state.py", line 1095, in ensure_model_parallel_initialized
    initialize_model_parallel(tensor_model_parallel_size,
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/parallel_state.py", line 1061, in initialize_model_parallel
    _DP = init_model_parallel_group(group_ranks,
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/parallel_state.py", line 832, in init_model_parallel_group
    return GroupCoordinator(
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/parallel_state.py", line 255, in __init__
    self.device_communicator = device_comm_cls(
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/device_communicators/cuda_communicator.py", line 50, in __init__
    self.pynccl_comm = PyNcclCommunicator(
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/device_communicators/pynccl.py", line 100, in __init__
    self.comm: ncclComm_t = self.nccl.ncclCommInitRank(
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/device_communicators/pynccl_wrapper.py", line 286, in ncclCommInitRank
    self.NCCL_CHECK(self._funcs["ncclCommInitRank"](ctypes.byref(comm),
  File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/device_communicators/pynccl_wrapper.py", line 257, in NCCL_CHECK
    raise RuntimeError(f"NCCL error: {error_str}")
RuntimeError: NCCL error: invalid usage (run with NCCL_DEBUG=WARN for details)
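As the message suggests, more detail can be collected by enabling NCCL's own debug output before the engines start. These are standard NCCL environment variables, not anything specific to vLLM or ms-swift; a minimal sketch:

import os

# Standard NCCL environment variables; must be set before the NCCL communicators are created.
os.environ["NCCL_DEBUG"] = "WARN"         # or "INFO" for full initialization logs
os.environ["NCCL_DEBUG_SUBSYS"] = "INIT"  # optional: focus on communicator setup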
I read ms-swift's source code and then reproduced the error with the script below:
from multiprocessing import Pipe, Process
import os

from vllm import EngineArgs, LLMEngine

DP = 2
master_port = 29503
connections = []
processes = []


def llm_worker(data_parallel_rank: int, master_port: int, connection) -> None:
    # Set the environment variables vLLM requires for data parallelism
    os.environ['VLLM_DP_RANK'] = str(data_parallel_rank)
    os.environ['VLLM_DP_RANK_LOCAL'] = str(data_parallel_rank)
    os.environ['VLLM_DP_SIZE'] = str(DP)
    os.environ['VLLM_DP_MASTER_PORT'] = str(master_port)
    kwargs = {
        'model': '/path/to/qwen3_1.7b', 'dtype': 'float16', 'gpu_memory_utilization': 0.9,
        'tensor_parallel_size': 1, 'pipeline_parallel_size': 1, 'max_model_len': None,
        'max_num_seqs': 256, 'disable_log_stats': True, 'disable_custom_all_reduce': True,
        'enforce_eager': False, 'trust_remote_code': True, 'enable_prefix_caching': False,
        'distributed_executor_backend': None, 'device': 'cuda:0', 'seed': None,
        'quantization': None,
        'worker_extension_cls': 'trl.scripts.vllm_serve.WeightSyncWorkerExtension',
        'enable_chunked_prefill': False, 'enable_sleep_mode': False,
    }
    args = EngineArgs(**kwargs)
    engine = LLMEngine.from_engine_args(args)
    # engine = SwiftRolloutDeploy.get_infer_engine(args, template=args.get_template(None))


# One process per data-parallel rank
for data_parallel_rank in range(DP):
    parent_conn, child_conn = Pipe()
    process = Process(target=llm_worker, args=(data_parallel_rank, master_port, child_conn))
    process.start()
    connections.append(parent_conn)
    processes.append(process)

for process in processes:
    process.join()
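To rule out a broken NCCL installation on this machine (rather than something in the vLLM/ms-swift code path), a plain two-process torch.distributed all_reduce over NCCL can serve as a sanity check. The snippet below is my own sketch, not taken from either project, and port 29504 is arbitrary:

import os
from multiprocessing import Process

import torch
import torch.distributed as dist


def nccl_worker(rank: int, world_size: int) -> None:
    # Plain NCCL all_reduce across two GPUs, independent of vLLM
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29504"  # arbitrary free port
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    t = torch.ones(1, device=f"cuda:{rank}")
    dist.all_reduce(t)  # expect 2.0 on both ranks
    print(f"rank {rank}: {t.item()}")
    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = 2
    procs = [Process(target=nccl_worker, args=(r, world_size)) for r in range(world_size)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()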
The error above only occurs when DP is set to 2; if I set DP to 1, the script finishes successfully.
Environment:
deepspeed==0.17.2
transformers==4.52.4
vllm==0.9.2
torch==2.7.0
GPU: 8 * V100
NCCL: 2.26 (from the Python package nvidia-nccl-cu12==2.26.2)
CUDA: 12.6 (from the Python package nvidia-cuda-cupti-cu12==12.6.80)
OS: Ubuntu 22.04
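In case it helps with triage, the versions above can be re-checked with a small snippet like this (my own helper, not part of vLLM or ms-swift):

from importlib.metadata import version

import torch

# Prints the installed versions of the packages listed in the environment section.
for pkg in ("deepspeed", "transformers", "vllm", "torch",
            "nvidia-nccl-cu12", "nvidia-cuda-cupti-cu12"):
    print(pkg, version(pkg))
print("torch CUDA:", torch.version.cuda, "GPUs visible:", torch.cuda.device_count())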