- sampler output, one per model forward pass, along with an indicator of
- whether the torch tensors in the sampler output need to be transposed
- later in the sampler_output_to_torch logic.
- For the multi-step worker, this indicator shall be True.
- """
- self._raise_if_unsupported(execute_model_req)
- # Expand the batch for sequences with a bonus token.
- # Perform a forward pass on the expanded batch and filter the
- # response to retain only the original sequences' responses.
- expanded_request, indices_of_seq_with_bonus_tokens =\
- self._expand_execute_model_request(
- execute_model_req, seq_ids_with_bonus_token_in_last_step)
- # Run model sample_len times.
- model_outputs: List[SamplerOutput] = []
- if current_platform.is_cuda_alike() and isinstance(
- self.model_runner, TP1DraftModelRunner
- ) and self.model_runner.supports_gpu_multi_step(expanded_request):
- # Here we run the draft_model_runner with multi-step prepare
- # on the GPU directly