For comparison, I tried to run another model with similar parameters:
vllm serve /mnt/nfs/models/Llama-2-7B-Chat-GPTQ/ --tensor-parallel-size 4 --host 0.0.0.0 --port 8000 --distributed-executor-backend ray --gpu-memory-utilization 0.9 --max-model-len 4096 --max-num-seqs 8 --max-num-batched-tokens 4096 --block-size 16 --dtype half
RunLLM, can you help me figure out why this fails? The startup errors out during model loading; the full log is below:
Error executing method 'load_model'. This might cause deadlock in distributed execution. [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] Traceback (most recent call last): [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/worker/worker_base.py", line 614, in execute_method [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] return run_method(self, method, args, kwargs) [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/utils/__init__.py", line 2736, in run_method [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] return func(*args, **kwargs) [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/model_loader/base_loader.py", line 38, in load_model [repeated 6x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.model_runner.load_model() [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.model = get_model(vllm_config=self.vllm_config) [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/model_loader/__init__.py", line 59, in get_model [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] return loader.load_model(vllm_config=vllm_config, [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] model = initialize_model(vllm_config=vllm_config, [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 4x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] return model_class(vllm_config=vllm_config, prefix=prefix) [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py", line 1223, in __init__ [repeated 12x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.model = self._init_model(vllm_config=vllm_config, [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 567, in _init_model [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] return LlamaModel(vllm_config=vllm_config, [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.start_layer, self.end_layer, self.layers = make_layers( [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/models/utils.py", line 640, in make_layers [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}")) [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 348, in <lambda> [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] lambda prefix: layer_type(config=config, [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.mlp = LlamaMLP( [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.down_proj = RowParallelLinear( [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ^^^^^^^^^^^^^^^^^^ [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] self.quant_method.create_weights( [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/gptq.py", line 148, in create_weights [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] raise ValueError( [repeated 2x across cluster]
(RayWorkerWrapper pid=79744, ip=192.168.13.143) ERROR 07-23 09:14:10 [worker_base.py:622] ValueError: The input size is not aligned with the quantized weight shape. This can be caused by too large tensor parallel size. [repeated 2x across cluster]
[rank0]:[W723 09:14:12.981959726 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
Traceback (most recent call last):
File "/home/deepseek-fb/vllm_env/bin/vllm", line 8, in <module>
sys.exit(main())
^^^^^^
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/entrypoints/cli/main.py", line 65, in main
args.dispatch_function(args)
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/entrypoints/cli/serve.py", line 55, in cmd
uvloop.run(run_server(args))
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/uvloop/__init__.py", line 109, in run
return __asyncio.run(
^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/uvloop/__init__.py", line 61, in wrapper
return await main
^^^^^^^^^^
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1431, in run_server
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1451, in run_server_worker
async with build_async_engine_client(args, client_config) as engine_client:
File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 158, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/home/deepseek-fb/vllm_env/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 291, in build_async_engine_client_from_engine_args
raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.