The following error log continues from my previous comment, because the forum cannot send more than 50000 words in a single post:
(EngineCore_0 pid=6321) Process EngineCore_0:
(EngineCore_0 pid=6321) Traceback (most recent call last):
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/jit/cpp_ext.py", line 199, in run_ninja
(EngineCore_0 pid=6321) subprocess.run(
(EngineCore_0 pid=6321) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/subprocess.py", line 571, in run
(EngineCore_0 pid=6321) raise CalledProcessError(retcode, process.args,
(EngineCore_0 pid=6321) subprocess.CalledProcessError: Command '['ninja', '-v', '-C', '/home/admin/.cache/flashinfer/80/cached_ops', '-f', '/home/admin/.cache/flashinfer/80/cached_ops/sampling/build.ninja']' returned non-zero exit status 1.
(EngineCore_0 pid=6321)
(EngineCore_0 pid=6321) The above exception was the direct cause of the following exception:
(EngineCore_0 pid=6321)
(EngineCore_0 pid=6321) Traceback (most recent call last):
(EngineCore_0 pid=6321) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_0 pid=6321) self.run()
(EngineCore_0 pid=6321) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/multiprocessing/process.py", line 108, in run
(EngineCore_0 pid=6321) self._target(*self._args, **self._kwargs)
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 704, in run_engine_core
(EngineCore_0 pid=6321) raise e
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 691, in run_engine_core
(EngineCore_0 pid=6321) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 492, in __init__
(EngineCore_0 pid=6321) super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 89, in __init__
(EngineCore_0 pid=6321) self._initialize_kv_caches(vllm_config)
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 179, in _initialize_kv_caches
(EngineCore_0 pid=6321) self.model_executor.determine_available_memory())
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/executor/abstract.py", line 76, in determine_available_memory
(EngineCore_0 pid=6321) output = self.collective_rpc("determine_available_memory")
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
(EngineCore_0 pid=6321) answer = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/utils/__init__.py", line 3007, in run_method
(EngineCore_0 pid=6321) return func(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(EngineCore_0 pid=6321) return func(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_worker.py", line 244, in determine_available_memory
(EngineCore_0 pid=6321) self.model_runner.profile_run()
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2627, in profile_run
(EngineCore_0 pid=6321) output = self._dummy_sampler_run(last_hidden_states)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
(EngineCore_0 pid=6321) return func(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2470, in _dummy_sampler_run
(EngineCore_0 pid=6321) raise e
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2460, in _dummy_sampler_run
(EngineCore_0 pid=6321) sampler_output = self.sampler(logits=logits,
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
(EngineCore_0 pid=6321) return self._call_impl(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
(EngineCore_0 pid=6321) return forward_call(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/sample/sampler.py", line 68, in forward
(EngineCore_0 pid=6321) sampled = self.sample(logits, sampling_metadata)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/sample/sampler.py", line 135, in sample
(EngineCore_0 pid=6321) random_sampled = self.topk_topp_sampler(
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
(EngineCore_0 pid=6321) return self._call_impl(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
(EngineCore_0 pid=6321) return forward_call(*args, **kwargs)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/sample/ops/topk_topp_sampler.py", line 109, in forward_cuda
(EngineCore_0 pid=6321) return flashinfer_sample(logits.contiguous(), k, p, generators)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/sample/ops/topk_topp_sampler.py", line 295, in flashinfer_sample
(EngineCore_0 pid=6321) next_token_ids = flashinfer.sampling.top_k_top_p_sampling_from_logits(
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/sampling.py", line 980, in top_k_top_p_sampling_from_logits
(EngineCore_0 pid=6321) masked_logits = top_k_mask_logits(logits, top_k)
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/sampling.py", line 1300, in top_k_mask_logits
(EngineCore_0 pid=6321) return get_sampling_module().top_k_mask_logits(
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/sampling.py", line 47, in get_sampling_module
(EngineCore_0 pid=6321) module = gen_sampling_module().build_and_load()
(EngineCore_0 pid=6321) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/jit/core.py", line 147, in build_and_load
(EngineCore_0 pid=6321) self.build(verbose, need_lock=False)
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/jit/core.py", line 127, in build
(EngineCore_0 pid=6321) run_ninja(jit_env.FLASHINFER_JIT_DIR, self.ninja_path, verbose)
(EngineCore_0 pid=6321) File "/home/admin/.venv/lib/python3.11/site-packages/flashinfer/jit/cpp_ext.py", line 211, in run_ninja
(EngineCore_0 pid=6321) raise RuntimeError(msg) from e
(EngineCore_0 pid=6321) RuntimeError: Ninja build failed. Ninja output:
(EngineCore_0 pid=6321) ninja: Entering directory `/home/admin/.cache/flashinfer/80/cached_ops'
(EngineCore_0 pid=6321) [1/4] /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output sampling/renorm.cuda.o.d -DTORCH_EXTENSION_NAME=sampling -DTORCH_API_INCLUDE_EXTENSION_H -DPy_LIMITED_API=0x03090000 -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -D_GLIBCXX_USE_CXX11_ABI=1 -isystem /home/admin/.pyenv/versions/3.11.9/include/python3.11 -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/tools/util/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/spdlog/include --compiler-options=-fPIC --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -O3 -std=c++17 --threads=24 -use_fast_math -DFLASHINFER_ENABLE_F16 -DFLASHINFER_ENABLE_BF16 -DFLASHINFER_ENABLE_FP8_E4M3 -DFLASHINFER_ENABLE_FP8_E5M2 -DNDEBUG -c /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc/renorm.cu -o sampling/renorm.cuda.o
(EngineCore_0 pid=6321) FAILED: [code=1] sampling/renorm.cuda.o
(EngineCore_0 pid=6321) /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output sampling/renorm.cuda.o.d -DTORCH_EXTENSION_NAME=sampling -DTORCH_API_INCLUDE_EXTENSION_H -DPy_LIMITED_API=0x03090000 -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -D_GLIBCXX_USE_CXX11_ABI=1 -isystem /home/admin/.pyenv/versions/3.11.9/include/python3.11 -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/tools/util/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/spdlog/include --compiler-options=-fPIC --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -O3 -std=c++17 --threads=24 -use_fast_math -DFLASHINFER_ENABLE_F16 -DFLASHINFER_ENABLE_BF16 -DFLASHINFER_ENABLE_FP8_E4M3 -DFLASHINFER_ENABLE_FP8_E5M2 -DNDEBUG -c /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc/renorm.cu -o sampling/renorm.cuda.o
(EngineCore_0 pid=6321) In file included from /usr/include/crt/math_functions.h:10551,
(EngineCore_0 pid=6321) from /usr/include/crt/common_functions.h:303,
(EngineCore_0 pid=6321) from /usr/include/cuda_runtime.h:118,
(EngineCore_0 pid=6321) from <command-line>:
(EngineCore_0 pid=6321) /usr/include/c++/11/cmath:45:15: fatal error: math.h: No such file or directory
(EngineCore_0 pid=6321) 45 | #include_next <math.h>
(EngineCore_0 pid=6321) | ^~~~~~~~
(EngineCore_0 pid=6321) compilation terminated.
(EngineCore_0 pid=6321) In file included from /usr/include/crt/math_functions.h:10551,
(EngineCore_0 pid=6321) from /usr/include/crt/common_functions.h:303,
(EngineCore_0 pid=6321) from /usr/include/cuda_runtime.h:118,
(EngineCore_0 pid=6321) from <command-line>:
(EngineCore_0 pid=6321) /usr/include/c++/11/cmath:45:15: fatal error: math.h: No such file or directory
(EngineCore_0 pid=6321) 45 | #include_next <math.h>
(EngineCore_0 pid=6321) | ^~~~~~~~
(EngineCore_0 pid=6321) compilation terminated.
(EngineCore_0 pid=6321) fatal : Could not open input file /tmp/tmpxft_0000193d_00000000-7_renorm.cpp1.ii
(EngineCore_0 pid=6321) [2/4] /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output sampling/sampling.cuda.o.d -DTORCH_EXTENSION_NAME=sampling -DTORCH_API_INCLUDE_EXTENSION_H -DPy_LIMITED_API=0x03090000 -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -D_GLIBCXX_USE_CXX11_ABI=1 -isystem /home/admin/.pyenv/versions/3.11.9/include/python3.11 -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/tools/util/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/spdlog/include --compiler-options=-fPIC --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -O3 -std=c++17 --threads=24 -use_fast_math -DFLASHINFER_ENABLE_F16 -DFLASHINFER_ENABLE_BF16 -DFLASHINFER_ENABLE_FP8_E4M3 -DFLASHINFER_ENABLE_FP8_E5M2 -DNDEBUG -c /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc/sampling.cu -o sampling/sampling.cuda.o
(EngineCore_0 pid=6321) FAILED: [code=1] sampling/sampling.cuda.o
(EngineCore_0 pid=6321) /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output sampling/sampling.cuda.o.d -DTORCH_EXTENSION_NAME=sampling -DTORCH_API_INCLUDE_EXTENSION_H -DPy_LIMITED_API=0x03090000 -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -D_GLIBCXX_USE_CXX11_ABI=1 -isystem /home/admin/.pyenv/versions/3.11.9/include/python3.11 -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/tools/util/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/spdlog/include --compiler-options=-fPIC --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -O3 -std=c++17 --threads=24 -use_fast_math -DFLASHINFER_ENABLE_F16 -DFLASHINFER_ENABLE_BF16 -DFLASHINFER_ENABLE_FP8_E4M3 -DFLASHINFER_ENABLE_FP8_E5M2 -DNDEBUG -c /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc/sampling.cu -o sampling/sampling.cuda.o
(EngineCore_0 pid=6321) In file included from /usr/include/crt/math_functions.h:10551,
(EngineCore_0 pid=6321) from /usr/include/crt/common_functions.h:303,
(EngineCore_0 pid=6321) from /usr/include/cuda_runtime.h:118,
(EngineCore_0 pid=6321) from <command-line>:
(EngineCore_0 pid=6321) /usr/include/c++/11/cmath:45:15: fatal error: math.h: No such file or directory
(EngineCore_0 pid=6321) 45 | #include_next <math.h>
(EngineCore_0 pid=6321) | ^~~~~~~~
(EngineCore_0 pid=6321) compilation terminated.
(EngineCore_0 pid=6321) In file included from /usr/include/crt/math_functions.h:10551,
(EngineCore_0 pid=6321) from /usr/include/crt/common_functions.h:303,
(EngineCore_0 pid=6321) from /usr/include/cuda_runtime.h:118,
(EngineCore_0 pid=6321) from <command-line>:
(EngineCore_0 pid=6321) /usr/include/c++/11/cmath:45:15: fatal error: math.h: No such file or directory
(EngineCore_0 pid=6321) 45 | #include_next <math.h>
(EngineCore_0 pid=6321) | ^~~~~~~~
(EngineCore_0 pid=6321) compilation terminated.
(EngineCore_0 pid=6321) fatal : Could not open input file /tmp/tmpxft_0000193c_00000000-7_sampling.cpp1.ii
(EngineCore_0 pid=6321) [3/4] /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output sampling/flashinfer_sampling_ops.cuda.o.d -DTORCH_EXTENSION_NAME=sampling -DTORCH_API_INCLUDE_EXTENSION_H -DPy_LIMITED_API=0x03090000 -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -D_GLIBCXX_USE_CXX11_ABI=1 -isystem /home/admin/.pyenv/versions/3.11.9/include/python3.11 -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/tools/util/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/spdlog/include --compiler-options=-fPIC --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -O3 -std=c++17 --threads=24 -use_fast_math -DFLASHINFER_ENABLE_F16 -DFLASHINFER_ENABLE_BF16 -DFLASHINFER_ENABLE_FP8_E4M3 -DFLASHINFER_ENABLE_FP8_E5M2 -DNDEBUG -c /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc/flashinfer_sampling_ops.cu -o sampling/flashinfer_sampling_ops.cuda.o
(EngineCore_0 pid=6321) FAILED: [code=1] sampling/flashinfer_sampling_ops.cuda.o
(EngineCore_0 pid=6321) /usr/bin/nvcc --generate-dependencies-with-compile --dependency-output sampling/flashinfer_sampling_ops.cuda.o.d -DTORCH_EXTENSION_NAME=sampling -DTORCH_API_INCLUDE_EXTENSION_H -DPy_LIMITED_API=0x03090000 -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1016\" -D_GLIBCXX_USE_CXX11_ABI=1 -isystem /home/admin/.pyenv/versions/3.11.9/include/python3.11 -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include -isystem /home/admin/.venv/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /usr/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/cutlass/tools/util/include -isystem /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/spdlog/include --compiler-options=-fPIC --expt-relaxed-constexpr -gencode=arch=compute_80,code=sm_80 -O3 -std=c++17 --threads=24 -use_fast_math -DFLASHINFER_ENABLE_F16 -DFLASHINFER_ENABLE_BF16 -DFLASHINFER_ENABLE_FP8_E4M3 -DFLASHINFER_ENABLE_FP8_E5M2 -DNDEBUG -c /home/admin/.venv/lib/python3.11/site-packages/flashinfer/data/csrc/flashinfer_sampling_ops.cu -o sampling/flashinfer_sampling_ops.cuda.o
(EngineCore_0 pid=6321) In file included from /usr/include/crt/math_functions.h:10551,
(EngineCore_0 pid=6321) from /usr/include/crt/common_functions.h:303,
(EngineCore_0 pid=6321) from /usr/include/cuda_runtime.h:118,
(EngineCore_0 pid=6321) from <command-line>:
(EngineCore_0 pid=6321) /usr/include/c++/11/cmath:45:15: fatal error: math.h: No such file or directory
(EngineCore_0 pid=6321) 45 | #include_next <math.h>
(EngineCore_0 pid=6321) | ^~~~~~~~
(EngineCore_0 pid=6321) compilation terminated.
(EngineCore_0 pid=6321) In file included from /usr/include/crt/math_functions.h:10551,
(EngineCore_0 pid=6321) from /usr/include/crt/common_functions.h:303,
(EngineCore_0 pid=6321) from /usr/include/cuda_runtime.h:118,
(EngineCore_0 pid=6321) from <command-line>:
(EngineCore_0 pid=6321) /usr/include/c++/11/cmath:45:15: fatal error: math.h: No such file or directory
(EngineCore_0 pid=6321) 45 | #include_next <math.h>
(EngineCore_0 pid=6321) | ^~~~~~~~
(EngineCore_0 pid=6321) compilation terminated.
(EngineCore_0 pid=6321) fatal : Could not open input file /tmp/tmpxft_0000193e_00000000-7_flashinfer_sampling_ops.cpp1.ii
(EngineCore_0 pid=6321) ninja: build stopped: subcommand failed.
(EngineCore_0 pid=6321)
[rank0]:[W820 04:27:31.206065088 ProcessGroupNCCL.cpp:1479] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(APIServer pid=6216) Traceback (most recent call last):
(APIServer pid=6216) File "/home/admin/.venv/bin/vllm", line 10, in <module>
(APIServer pid=6216) sys.exit(main())
(APIServer pid=6216) ^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/entrypoints/cli/main.py", line 54, in main
(APIServer pid=6216) args.dispatch_function(args)
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/entrypoints/cli/serve.py", line 50, in cmd
(APIServer pid=6216) uvloop.run(run_server(args))
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/uvloop/__init__.py", line 105, in run
(APIServer pid=6216) return runner.run(wrapper())
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/asyncio/runners.py", line 118, in run
(APIServer pid=6216) return self._loop.run_until_complete(task)
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=6216) return await main
(APIServer pid=6216) ^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 1850, in run_server
(APIServer pid=6216) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 1870, in run_server_worker
(APIServer pid=6216) async with build_async_engine_client(
(APIServer pid=6216) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/contextlib.py", line 210, in __aenter__
(APIServer pid=6216) return await anext(self.gen)
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 178, in build_async_engine_client
(APIServer pid=6216) async with build_async_engine_client_from_engine_args(
(APIServer pid=6216) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/contextlib.py", line 210, in __aenter__
(APIServer pid=6216) return await anext(self.gen)
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/entrypoints/openai/api_server.py", line 220, in build_async_engine_client_from_engine_args
(APIServer pid=6216) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/utils/__init__.py", line 1557, in inner
(APIServer pid=6216) return fn(*args, **kwargs)
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 174, in from_vllm_config
(APIServer pid=6216) return cls(
(APIServer pid=6216) ^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/async_llm.py", line 120, in __init__
(APIServer pid=6216) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
(APIServer pid=6216) return AsyncMPClient(*client_args)
(APIServer pid=6216) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 767, in __init__
(APIServer pid=6216) super().__init__(
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 446, in __init__
(APIServer pid=6216) with launch_core_engines(vllm_config, executor_class,
(APIServer pid=6216) File "/home/admin/.pyenv/versions/3.11.9/lib/python3.11/contextlib.py", line 144, in __exit__
(APIServer pid=6216) next(self.gen)
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 706, in launch_core_engines
(APIServer pid=6216) wait_for_engine_startup(
(APIServer pid=6216) File "/home/admin/.venv/lib/python3.11/site-packages/vllm/v1/engine/utils.py", line 759, in wait_for_engine_startup
(APIServer pid=6216) raise RuntimeError("Engine core initialization failed. "
(APIServer pid=6216) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}