`vllm.v1.outputs` ¶

Classes:

AsyncModelRunnerOutput –
LogprobsTensors –
ModelRunnerOutput –
RoutedExpertsLists –

CPU-side routed experts, the form :meth:RoutedExpertsManager.store_batch consumes.
RoutedExpertsTensors –

Device-side snapshot of routed experts data, pending async D2H.

Functions:

make_empty_encoder_model_runner_output –

Create a ModelRunnerOutput stub that contains the correct per-request bookkeeping but no generated data yet.

`AsyncModelRunnerOutput` ¶

Bases: ABC

Methods:

get_output –

Get the ModelRunnerOutput for this async output.

Source code in vllm/v1/outputs.py

class AsyncModelRunnerOutput(ABC):
    """Abstract interface for a model-runner result obtained asynchronously.

    Concrete subclasses implement :meth:`get_output`.
    """

    @abstractmethod
    def get_output(self) -> ModelRunnerOutput:
        """Return the ModelRunnerOutput backing this async output.

        The call blocks until the results are ready; this may include
        copying device tensors to the host.
        Call this method only once per AsyncModelRunnerOutput.
        """
        ...

`get_output()` `abstractmethod` ¶

Get the ModelRunnerOutput for this async output.

This is a blocking call that waits until the results are ready, which might involve copying device tensors to the host. This method should only be called once per AsyncModelRunnerOutput.

Source code in vllm/v1/outputs.py

@abstractmethod
def get_output(self) -> ModelRunnerOutput:
    """Return the ModelRunnerOutput backing this async output.

    The call blocks until the results are ready; this may include
    copying device tensors to the host.
    Call this method only once per AsyncModelRunnerOutput.
    """
    ...

`LogprobsTensors` ¶

Bases: NamedTuple

Methods:

empty_cpu –

Create empty LogprobsTensors on CPU.
filter –

Filter the logprobs tensors with the given bool mask.

Source code in vllm/v1/outputs.py

class LogprobsTensors(NamedTuple):
    # [num_reqs x num_generated_tokens, max_num_logprobs + 1]
    logprob_token_ids: torch.Tensor
    # [num_reqs x num_generated_tokens, max_num_logprobs + 1]
    logprobs: torch.Tensor
    # [num_reqs x num_generated_tokens]
    selected_token_ranks: torch.Tensor
    # [num_reqs]
    cu_num_generated_tokens: list[int] | None = None

    def tolists(self, cu_num_generated_tokens: list[int] | None = None):
        return LogprobsLists(
            self.logprob_token_ids.cpu().numpy(),
            self.logprobs.cpu().numpy(),
            self.selected_token_ranks.cpu().numpy(),
            cu_num_generated_tokens
            if cu_num_generated_tokens is not None
            else self.cu_num_generated_tokens,
        )

    def to_cpu_nonblocking(self) -> "LogprobsTensors":
        if self.logprob_token_ids.device.type == "cpu":
            return self
        return LogprobsTensors(
            self.logprob_token_ids.to("cpu", non_blocking=True),
            self.logprobs.to("cpu", non_blocking=True),
            self.selected_token_ranks.to("cpu", non_blocking=True),
            self.cu_num_generated_tokens,
        )

    def filter(self, mask: torch.Tensor) -> "LogprobsTensors":
        """Filter the logprobs tensors with the given bool mask."""
        assert self.cu_num_generated_tokens is None, (
            "filter can't be used with cu_num_generated_tokens"
        )
        return LogprobsTensors(
            self.logprob_token_ids[mask],
            self.logprobs[mask],
            self.selected_token_ranks[mask],
        )

    @staticmethod
    def empty_cpu(
        num_positions: int, num_tokens_per_position: int
    ) -> "LogprobsTensors":
        """Create empty LogprobsTensors on CPU."""

        logprob_token_ids = torch.empty(
            (num_positions, num_tokens_per_position), dtype=torch.int32, device="cpu"
        )
        logprobs = torch.empty_like(logprob_token_ids, dtype=torch.float32)
        selected_token_ranks = torch.empty(
            num_positions, dtype=torch.int32, device="cpu"
        )
        return LogprobsTensors(
            logprob_token_ids=logprob_token_ids,
            logprobs=logprobs,
            selected_token_ranks=selected_token_ranks,
        )

`empty_cpu(num_positions, num_tokens_per_position)` `staticmethod` ¶

Create empty LogprobsTensors on CPU.

Source code in vllm/v1/outputs.py

@staticmethod
def empty_cpu(
    num_positions: int, num_tokens_per_position: int
) -> "LogprobsTensors":
    """Allocate uninitialized LogprobsTensors on the CPU.

    Token ids and logprobs are shaped
    ``(num_positions, num_tokens_per_position)`` (int32 / float32);
    ranks are shaped ``(num_positions,)`` (int32).
    """
    shape = (num_positions, num_tokens_per_position)
    token_ids = torch.empty(shape, dtype=torch.int32, device="cpu")
    token_logprobs = torch.empty(shape, dtype=torch.float32, device="cpu")
    ranks = torch.empty((num_positions,), dtype=torch.int32, device="cpu")
    return LogprobsTensors(
        logprob_token_ids=token_ids,
        logprobs=token_logprobs,
        selected_token_ranks=ranks,
    )

`filter(mask)` ¶

Filter the logprobs tensors with the given bool mask.

Source code in vllm/v1/outputs.py

def filter(self, mask: torch.Tensor) -> "LogprobsTensors":
    """Select the rows of every tensor picked out by the bool ``mask``.

    Only valid while ``cu_num_generated_tokens`` is ``None``; the
    result carries no ``cu_num_generated_tokens`` either.
    """
    assert self.cu_num_generated_tokens is None, (
        "filter can't be used with cu_num_generated_tokens"
    )
    kept_token_ids = self.logprob_token_ids[mask]
    kept_logprobs = self.logprobs[mask]
    kept_ranks = self.selected_token_ranks[mask]
    return LogprobsTensors(kept_token_ids, kept_logprobs, kept_ranks)

`ModelRunnerOutput` `dataclass` ¶

Methods:

with_kv_conn_output_only –

Return ModelRunnerOutput containing the provided KVConnectorOutput, otherwise empty.

Source code in vllm/v1/outputs.py

@dataclass
class ModelRunnerOutput:
    """Per-step results handed back by the model runner."""

    # [num_reqs]
    req_ids: list[str]
    # req_id -> index
    req_id_to_index: dict[str, int]

    # num_reqs x num_generated_tokens
    # num_generated_tokens is the number of tokens
    # generated in the current step. It can be different for
    # each request due to speculative/jump decoding.
    sampled_token_ids: list[list[int]] = field(default_factory=list)

    # Shapes of the array members of LogprobsLists, in order
    # (presumably token ids, logprobs, ranks; verify against LogprobsLists):
    # [num_reqs, max_num_logprobs + 1]
    # [num_reqs, max_num_logprobs + 1]
    # [num_reqs]
    logprobs: LogprobsLists | None = None

    # req_id -> (token_ids, logprobs, ranks)
    # [prompt_len, num_prompt_logprobs]
    # [prompt_len, num_prompt_logprobs]
    # [prompt_len]
    prompt_logprobs_dict: dict[str, LogprobsTensors | None] = field(
        default_factory=dict
    )

    # [num_reqs, hidden_size]
    pooler_output: list[torch.Tensor | None] | None = None

    kv_connector_output: KVConnectorOutput | None = None

    ec_connector_output: ECConnectorOutput | None = None

    # req_id -> num_nans_in_logits
    num_nans_in_logits: dict[str, int] | None = None

    # information related to cudagraph execution
    cudagraph_stats: CUDAGraphStat | None = None

    # Per-step routed experts data captured by the worker.
    # ``routing_data`` shape: (num_scheduled_tokens, num_layers,
    #                         num_experts_per_tok); expert IDs as uint8/uint16.
    # ``slot_mapping`` shape: (num_scheduled_tokens,); physical KV-cache
    #                         slot for each row of routing_data.
    # ``num_scheduled_tokens`` is step-level (total across all requests
    # in this step), not per-request. The scheduler persists this into
    # its slot buffer via ``slot_buffer[slot_mapping] = routing_data``.
    # ``None`` when ``enable_return_routed_experts`` is off.
    routed_experts: RoutedExpertsLists | None = None

    @staticmethod
    def with_kv_conn_output_only(
        kv_connector_output: KVConnectorOutput | None,
    ) -> "ModelRunnerOutput":
        """Return a ModelRunnerOutput carrying only ``kv_connector_output``.

        Args:
            kv_connector_output: Connector output to attach, or ``None``.

        Returns:
            ``EMPTY_MODEL_RUNNER_OUTPUT`` itself (not a copy) when
            ``kv_connector_output`` is ``None`` or reports ``is_empty()``;
            otherwise a shallow copy of ``EMPTY_MODEL_RUNNER_OUTPUT`` whose
            ``kv_connector_output`` field is set to the argument. A
            ModelRunnerOutput is always returned, never ``None``.
        """
        if kv_connector_output is None or kv_connector_output.is_empty():
            # Nothing to attach: hand back the module-level empty output.
            return EMPTY_MODEL_RUNNER_OUTPUT
        # Copy first so the module-level empty output is not rebound.
        output = copy(EMPTY_MODEL_RUNNER_OUTPUT)
        output.kv_connector_output = kv_connector_output
        return output

`with_kv_conn_output_only(kv_connector_output)` `staticmethod` ¶

Return ModelRunnerOutput containing the provided KVConnectorOutput, otherwise empty. Returns EMPTY_MODEL_RUNNER_OUTPUT (not None) if kv_connector_output is passed as None or is empty.

Source code in vllm/v1/outputs.py

@staticmethod
def with_kv_conn_output_only(
    kv_connector_output: KVConnectorOutput | None,
) -> "ModelRunnerOutput":
    """Return a ModelRunnerOutput carrying only ``kv_connector_output``.

    Args:
        kv_connector_output: Connector output to attach, or ``None``.

    Returns:
        ``EMPTY_MODEL_RUNNER_OUTPUT`` itself (not a copy) when
        ``kv_connector_output`` is ``None`` or reports ``is_empty()``;
        otherwise a shallow copy of ``EMPTY_MODEL_RUNNER_OUTPUT`` whose
        ``kv_connector_output`` field is set to the argument. A
        ModelRunnerOutput is always returned, never ``None``.
    """
    if kv_connector_output is None or kv_connector_output.is_empty():
        # Nothing to attach: hand back the module-level empty output.
        return EMPTY_MODEL_RUNNER_OUTPUT
    # Copy first so the module-level empty output is not rebound.
    output = copy(EMPTY_MODEL_RUNNER_OUTPUT)
    output.kv_connector_output = kv_connector_output
    return output

`RoutedExpertsLists` ¶

Bases: NamedTuple

CPU-side routed experts, the form :meth:RoutedExpertsManager.store_batch consumes.

Batched per scheduler step: the leading dim is the number of tokens scheduled across all requests in this step (total_num_scheduled_tokens), not per-request tokens. slot_mapping[i] tells the scheduler which physical KV-cache slot row i of routing_data belongs to.

Source code in vllm/v1/outputs.py

class RoutedExpertsLists(NamedTuple):
    """Routed experts held on the CPU as numpy arrays.

    This is the form that :meth:`RoutedExpertsManager.store_batch` consumes.

    The data is batched per scheduler step: the leading dimension counts
    every token scheduled in the step across all requests
    (``total_num_scheduled_tokens``), not the tokens of a single request.
    Row ``i`` of ``routing_data`` belongs to the physical KV-cache slot
    given by ``slot_mapping[i]``.
    """

    # (num_scheduled_tokens, num_layers, num_experts_per_tok)
    routing_data: np.ndarray
    # (num_scheduled_tokens,)
    slot_mapping: np.ndarray

`RoutedExpertsTensors` ¶

Bases: NamedTuple

Device-side snapshot of routed experts data, pending async D2H.

Produced by :class:GPUModelRunner at the end of each async-scheduled step. The copy stream waits on the default stream, then issues non-blocking D2H via :meth:to_cpu_nonblocking into a pinned CPU buffer; :class:AsyncGPUModelRunnerOutput.get_output synchronizes the copy before the scheduler reads it.

Sliced to total_num_scheduled_tokens (step-level, across all requests — NOT per-request). Both routing_data and slot_mapping must be private clones when sourced from shared capturer / prepare-input buffers, so the next forward pass / _prepare_inputs on the default stream does not race with a D2H still pending on the copy stream.

Methods:

to_cpu_nonblocking –

Issue non-blocking D2H on the current stream.
tolists –

Convert to the numpy-backed form consumed by the scheduler.

Source code in vllm/v1/outputs.py

class RoutedExpertsTensors(NamedTuple):
    """Routed experts data snapshotted on the device, awaiting async D2H.

    :class:`GPUModelRunner` produces one at the end of each
    async-scheduled step. The copy stream waits on the default stream and
    then issues a non-blocking D2H via :meth:`to_cpu_nonblocking` into a
    pinned CPU buffer; :class:`AsyncGPUModelRunnerOutput.get_output`
    synchronizes the copy before the scheduler reads it.

    The tensors are sliced to ``total_num_scheduled_tokens`` (step-level,
    across all requests — NOT per-request). When they come from shared
    capturer / prepare-input buffers, both ``routing_data`` and
    ``slot_mapping`` must be private clones so that the next forward
    pass / ``_prepare_inputs`` on the default stream cannot race with a
    D2H still pending on the copy stream.
    """

    # (num_scheduled_tokens, num_layers, num_experts_per_tok)
    routing_data: torch.Tensor
    # (num_scheduled_tokens,)
    slot_mapping: torch.Tensor

    def to_cpu_nonblocking(self) -> "RoutedExpertsTensors":
        """Start a non-blocking D2H copy on the current stream.

        Returns ``self`` unchanged when ``routing_data`` is already on the
        CPU (only that tensor's device is inspected).

        NOTE: ``non_blocking=True`` gives real overlap only if the CPU
        target is pinned. The fallback used here allocates a fresh
        pageable CPU tensor on each call, which silently degrades to a
        synchronous copy; that is acceptable because the sync happens on
        the dedicated copy stream, not the default stream.
        """
        if self.routing_data.device.type != "cpu":
            host_routing_data = self.routing_data.to("cpu", non_blocking=True)
            host_slot_mapping = self.slot_mapping.to("cpu", non_blocking=True)
            return RoutedExpertsTensors(
                routing_data=host_routing_data,
                slot_mapping=host_slot_mapping,
            )
        return self

    def tolists(self) -> "RoutedExpertsLists":
        """Produce the numpy-backed form that the scheduler consumes.

        ``.cpu()`` does nothing for tensors that are already on the CPU,
        so the post-D2H case is cheap; on raw device tensors it blocks
        synchronously, a path that is only reached in tests.
        """
        routing_data_np = self.routing_data.cpu().numpy()
        slot_mapping_np = self.slot_mapping.cpu().numpy()
        return RoutedExpertsLists(routing_data_np, slot_mapping_np)

`to_cpu_nonblocking()` ¶

Issue non-blocking D2H on the current stream.

NOTE: non_blocking=True only delivers true overlap when the CPU target is pinned. The current fallback here allocates a new pageable CPU tensor per call, which silently degrades to a synchronous copy; acceptable because the sync happens on the dedicated copy stream, not the default stream.

Source code in vllm/v1/outputs.py

def to_cpu_nonblocking(self) -> "RoutedExpertsTensors":
    """Start a non-blocking D2H copy on the current stream.

    Returns ``self`` unchanged when ``routing_data`` is already on the
    CPU (only that tensor's device is inspected).

    NOTE: ``non_blocking=True`` gives real overlap only if the CPU
    target is pinned. The fallback used here allocates a fresh
    pageable CPU tensor on each call, which silently degrades to a
    synchronous copy; that is acceptable because the sync happens on
    the dedicated copy stream, not the default stream.
    """
    if self.routing_data.device.type != "cpu":
        host_routing_data = self.routing_data.to("cpu", non_blocking=True)
        host_slot_mapping = self.slot_mapping.to("cpu", non_blocking=True)
        return RoutedExpertsTensors(
            routing_data=host_routing_data,
            slot_mapping=host_slot_mapping,
        )
    return self

`tolists()` ¶

Convert to the numpy-backed form consumed by the scheduler.

.cpu() is a no-op when the tensor is already on CPU, so this is cheap for the post-D2H case; for raw device tensors it will synchronously block, which is only reached in tests.

Source code in vllm/v1/outputs.py

def tolists(self) -> "RoutedExpertsLists":
    """Produce the numpy-backed form that the scheduler consumes.

    ``.cpu()`` does nothing for tensors that are already on the CPU,
    so the post-D2H case is cheap; on raw device tensors it blocks
    synchronously, a path that is only reached in tests.
    """
    routing_data_np = self.routing_data.cpu().numpy()
    slot_mapping_np = self.slot_mapping.cpu().numpy()
    return RoutedExpertsLists(routing_data_np, slot_mapping_np)

`make_empty_encoder_model_runner_output(scheduler_output)` ¶

Create a ModelRunnerOutput stub that contains the correct per-request bookkeeping but no generated data yet.

Source code in vllm/v1/outputs.py

def make_empty_encoder_model_runner_output(
    scheduler_output: "SchedulerOutput",
) -> ModelRunnerOutput:
    """
    Create a ModelRunnerOutput stub that contains the correct
    per-request bookkeeping but no generated data yet.

    Args:
        scheduler_output: Scheduler output whose ``num_scheduled_tokens``
            mapping is keyed by request id.

    Returns:
        ``EMPTY_MODEL_RUNNER_OUTPUT`` when ``num_scheduled_tokens`` is
        empty; otherwise a new ModelRunnerOutput with one entry per
        scheduled request, in the mapping's iteration order.
    """
    num_scheduled_tokens = scheduler_output.num_scheduled_tokens
    if not num_scheduled_tokens:
        return EMPTY_MODEL_RUNNER_OUTPUT

    # Materialize the keys so we get a deterministic, indexable sequence
    req_ids: list[str] = list(num_scheduled_tokens)

    # Give every request its own contiguous index
    req_id_to_index: dict[str, int] = {rid: idx for idx, rid in enumerate(req_ids)}

    # One single-element list ``[0]`` per request, i.e. a placeholder token
    # id of 0 rather than an empty list.
    # NOTE(review): this comment previously said "one empty list per
    # request", which does not match the code; confirm that ``[0]`` is
    # intended before changing either.
    sampled_token_ids: list[list[int]] = [[0] for _ in req_ids]

    # Pooler outputs are not available yet ⇒ use None placeholders
    pooler_output: list[torch.Tensor | None] = [None] * len(req_ids)

    return ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index=req_id_to_index,
        sampled_token_ids=sampled_token_ids,
        pooler_output=pooler_output,
    )

vllm.v1.outputs ¶

AsyncModelRunnerOutput ¶

get_output() abstractmethod ¶

LogprobsTensors ¶

empty_cpu(num_positions, num_tokens_per_position) staticmethod ¶

filter(mask) ¶

ModelRunnerOutput dataclass ¶

with_kv_conn_output_only(kv_connector_output) staticmethod ¶

RoutedExpertsLists ¶

RoutedExpertsTensors ¶

to_cpu_nonblocking() ¶

tolists() ¶

make_empty_encoder_model_runner_output(scheduler_output) ¶

`vllm.v1.outputs` ¶

`AsyncModelRunnerOutput` ¶

`get_output()` `abstractmethod` ¶

`LogprobsTensors` ¶

`empty_cpu(num_positions, num_tokens_per_position)` `staticmethod` ¶

`filter(mask)` ¶

`ModelRunnerOutput` `dataclass` ¶

`with_kv_conn_output_only(kv_connector_output)` `staticmethod` ¶

`RoutedExpertsLists` ¶

`RoutedExpertsTensors` ¶

`to_cpu_nonblocking()` ¶

`tolists()` ¶

`make_empty_encoder_model_runner_output(scheduler_output)` ¶