vllm serve /dfs/data/model/bge-m3/ --task embed --trust-remote-code --max-model-len 8192
Starting the server with the above command in a CPU-only environment fails with:
Traceback (most recent call last):
  File "/dfs/data/miniconda/envs/Qwen2-New/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 53, in main
    args.dispatch_function(args)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.py", line 27, in cmd
    uvloop.run(run_server(args))
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
    return loop.run_until_complete(wrapper())
  File "uvloop/loop.pyx", line 1517, in uvloop.loop.Loop.run_until_complete
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 1078, in run_server
    async with build_async_engine_client(args) as engine_client:
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 146, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 166, in build_async_engine_client_from_engine_args
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/engine/arg_utils.py", line 1098, in create_engine_config
    device_config = DeviceConfig(device=self.device)
  File "<string>", line 4, in __init__
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/config.py", line 2119, in __post_init__
    raise RuntimeError(
RuntimeError: Failed to infer device type, please set the environment variable VLLM_LOGGING_LEVEL=DEBUG to turn on verbose logging to help debug the issue.
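The RuntimeError itself suggests enabling verbose logging, so I reran the same command with the environment variable it names:

VLLM_LOGGING_LEVEL=DEBUG vllm serve /dfs/data/model/bge-m3/ --task embed --trust-remote-code --max-model-len 8192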
Adding the --device flag still raises an error:
Traceback (most recent call last):
  File "/dfs/data/miniconda/envs/Qwen2-New/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 53, in main
    args.dispatch_function(args)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.py", line 27, in cmd
    uvloop.run(run_server(args))
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
    return loop.run_until_complete(wrapper())
  File "uvloop/loop.pyx", line 1517, in uvloop.loop.Loop.run_until_complete
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
    return await main
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 1078, in run_server
    async with build_async_engine_client(args) as engine_client:
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 146, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/contextlib.py", line 199, in __aenter__
    return await anext(self.gen)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 166, in build_async_engine_client_from_engine_args
    vllm_config = engine_args.create_engine_config(usage_context=usage_context)
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/engine/arg_utils.py", line 1273, in create_engine_config
    config = VllmConfig(
  File "<string>", line 19, in __init__
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/config.py", line 3848, in __post_init__
    self.model_config.verify_async_output_proc(self.parallel_config,
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/config.py", line 902, in verify_async_output_proc
    if not current_platform.is_async_output_supported(self.enforce_eager):
  File "/dfs/data/miniconda/envs/Qwen2-New/lib/python3.10/site-packages/vllm/platforms/interface.py", line 214, in is_async_output_supported
    raise NotImplementedError
NotImplementedError
Does vLLM support inference or server startup for small models on CPU?
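For reference, the NotImplementedError above is raised in vllm/platforms/interface.py, i.e. the base Platform interface, which suggests vLLM never resolved a concrete CPU platform. If I understand correctly, the standard PyPI wheel is built for CUDA, and CPU serving requires the dedicated CPU-backend build. A quick diagnostic (just a sketch, using the current_platform object that appears in the traceback) to print which platform class the installed build resolves to:

python -c "from vllm.platforms import current_platform; print(current_platform.__class__.__name__)"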