I am trying to run the benchmark in benchmarks/disagg_benchmark, but it seems that the requests are being sent and never received. Once the buffer is full, I get the error below.
Error:
Traceback (most recent call last):
  File "/opt/conda/envs/py_3.12/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/opt/conda/envs/py_3.12/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "/var/lib/jenkins/vllm/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py", line 181, in drop_select_handler
    raise e
  File "/var/lib/jenkins/vllm/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py", line 140, in drop_select_handler
    signal = self.signal_pipe.recv_tensor()
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/lib/jenkins/vllm/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py", line 269, in recv_tensor
    raise e
  File "/var/lib/jenkins/vllm/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py", line 262, in recv_tensor
    tensor = future.result()
             ^^^^^^^^^^^^^^^
  File "/opt/conda/envs/py_3.12/lib/python3.12/concurrent/futures/_base.py", line 456, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/envs/py_3.12/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
  File "/opt/conda/envs/py_3.12/lib/python3.12/concurrent/futures/thread.py", line 59, in run
    result = self.fn(*self.args, **self.kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/lib/jenkins/vllm/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py", line 192, in _recv_impl
    metadata = self._recv_metadata()
               ^^^^^^^^^^^^^^^^^^^^^
  File "/var/lib/jenkins/vllm/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py", line 167, in _recv_metadata
    return self.group.recv_obj(self.target_rank_for_recv)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/lib/jenkins/vllm/vllm/distributed/utils.py", line 169, in recv_obj
    self.store.get(
torch.distributed.DistStoreError: wait timeout after 300000ms, keys: /send_to/0/0