`vllm.v1.attention.ops.common` ¶

Classes:

CPTritonContext –

The CPTritonContext is used to avoid recompilation of the Triton JIT.

Functions:

correct_attn_out –

Correct the attention output using the all-gathered lses.
cp_lse_ag_out_ar –

cp_attn_out: [ B, H, D ]
cp_lse_ag_out_rs –

cp_attn_out: [ B, H, D ]
pack_seq_triton –

Pack sequences of different lengths into a batched tensor.
unpack_seq_triton –

Unpack a packed decode query tensor back to the original format.

`CPTritonContext` ¶

The CPTritonContext is used to avoid recompilation of the Triton JIT.

Source code in vllm/v1/attention/ops/common.py

class CPTritonContext:
    """Cache compiled Triton kernels so repeated launches skip the JIT path.

    The first launch for a given kernel and set of constexpr arguments goes
    through ``kernel[grid](...)``; whatever that call returns is remembered,
    and later launches with the same kernel and constants are issued through
    the remembered object using only the regular (non-constexpr) arguments.
    """

    def __init__(self):
        # Most recently used compiled kernel. Kept under its original name so
        # code that inspects this attribute keeps working.
        self.inner_kernel = None
        # Compiled kernels keyed by (kernel, sorted constexpr items).
        self._compiled = {}

    def call_kernel(self, kernel, grid, *regular_args, **const_args):
        """Launch ``kernel`` on ``grid``, reusing a cached compilation.

        Args:
            kernel: Object supporting ``kernel[grid](*args, **kwargs)``.
            grid: Launch grid forwarded to the subscript.
            *regular_args: Runtime arguments passed on every launch.
            **const_args: Constexpr arguments; they are only passed on the
                first launch for a given combination and select which cached
                compilation is reused afterwards.
        """
        try:
            key = (kernel, tuple(sorted(const_args.items())))
            hash(key)
        except TypeError:
            # Unhashable constants cannot be used as a cache key; always take
            # the regular launch path for them instead of guessing.
            key = None

        compiled = self._compiled.get(key) if key is not None else None
        if compiled is None:
            # First launch for this (kernel, constants) pair. Previously a
            # single slot was shared by all constants, so a context reused
            # with e.g. a different HEAD_DIM launched the wrong compilation.
            compiled = kernel[grid](*regular_args, **const_args)
            if key is not None:
                self._compiled[key] = compiled
        else:
            compiled[grid](*regular_args)
        self.inner_kernel = compiled

`_correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, vlse_ptr, outputs_stride_B, outputs_stride_H, outputs_stride_D, lses_stride_N, lses_stride_B, lses_stride_H, lse_idx, HEAD_DIM, N_ROUNDED, IS_BASE_E)` ¶

Apply the all-gathered lses to correct each local rank's attention output. We still need to perform a cross-rank reduction to obtain the final attention output.

Parameters:

outputs_ptr ¶
(PointerType) –

Pointer to input tensor of shape [ B, H, D ]
lses_ptr ¶
(PointerType) –

Pointer to input tensor of shape [ N, B, H ]
new_output_ptr ¶
(PointerType) –

Pointer to output tensor of shape [ B, H, D ]
vlse_ptr ¶
(PointerType) –

Pointer to output tensor of shape [ B, H ]

Source code in vllm/v1/attention/ops/common.py

@triton.jit
def _correct_attn_cp_out_kernel(
    outputs_ptr,
    new_output_ptr,
    lses_ptr,
    vlse_ptr,
    outputs_stride_B,
    outputs_stride_H,
    outputs_stride_D,
    lses_stride_N,
    lses_stride_B,
    lses_stride_H,
    lse_idx,
    HEAD_DIM: tl.constexpr,
    N_ROUNDED: tl.constexpr,
    IS_BASE_E: tl.constexpr,
):
    """
    Apply the all-gathered lses to correct each local rank's attention
    output. We still need to perform a cross-rank reduction to obtain the
    final attention output.

    Each program handles one (batch, head) pair: program axis 0 is used as
    the batch index and axis 1 as the head index. It combines the
    ``N_ROUNDED`` lse entries for that pair into one log-sum-exp, stores it,
    and rescales the pair's output row by
    ``exp(lses[lse_idx] - combined_lse)`` (``exp2`` when ``IS_BASE_E`` is
    false).

    Args:
        outputs_ptr (triton.PointerType):
            Pointer to input tensor of shape [ B, H, D ]
        new_output_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H, D ]. It is addressed
            with the same strides as ``outputs_ptr``.
        lses_ptr (triton.PointerType):
            Pointer to input tensor of shape [ N, B, H ]
        vlse_ptr (triton.PointerType):
            Pointer to output tensor of shape [ B, H ]. It is addressed with
            ``lses_stride_B`` / ``lses_stride_H``.
        outputs_stride_B, outputs_stride_H, outputs_stride_D:
            Element strides of the outputs tensor.
        lses_stride_N, lses_stride_B, lses_stride_H:
            Element strides of the lses tensor.
        lse_idx:
            Index along the first axis of ``lses`` whose entry is used to
            scale this program's output row.
        HEAD_DIM (tl.constexpr):
            Number of elements loaded/stored along the last output axis.
        N_ROUNDED (tl.constexpr):
            Number of lse entries loaded along the first axis of ``lses``.
        IS_BASE_E (tl.constexpr):
            Use ``exp``/``log`` when true, ``exp2``/``log2`` otherwise.

    NOTE(review): the loads below are unmasked, so ``N_ROUNDED`` and
    ``HEAD_DIM`` must match the real extents; ``tl.arange`` is documented to
    need a power-of-two range — confirm callers satisfy this.
    """
    # int64 indices so the stride multiplications below are done in 64 bits.
    batch_idx = tl.program_id(axis=0).to(tl.int64)
    head_idx = tl.program_id(axis=1).to(tl.int64)
    d_offsets = tl.arange(0, HEAD_DIM)
    num_n_offsets = tl.arange(0, N_ROUNDED)

    # shape = [N]
    lse_offsets = (
        num_n_offsets * lses_stride_N
        + batch_idx * lses_stride_B
        + head_idx * lses_stride_H
    )

    # calc final lse
    lse = tl.load(lses_ptr + lse_offsets)
    # NaN (lse != lse) and +inf entries are replaced by -inf so they add
    # nothing to the sum of exponentials.
    lse = tl.where((lse != lse) | (lse == float("inf")), -float("inf"), lse)
    lse_max = tl.max(lse, axis=0)
    # If every entry is -inf, use 0 as the shift to avoid (-inf) - (-inf).
    lse_max = tl.where(lse_max == -float("inf"), 0, lse_max)
    # Max-shifted log-sum-exp: subtract the max, reduce, then add it back.
    lse -= lse_max
    if IS_BASE_E:
        lse_exp = tl.exp(lse)
        lse_acc = tl.sum(lse_exp, axis=0)
        lse = tl.log(lse_acc)
    else:
        lse_exp = tl.exp2(lse)
        lse_acc = tl.sum(lse_exp, axis=0)
        lse = tl.log2(lse_acc)
    lse += lse_max

    # The combined lse is written using the B/H strides of `lses`.
    lse_offsets = batch_idx * lses_stride_B + head_idx * lses_stride_H
    tl.store(vlse_ptr + lse_offsets, lse)

    # shape = [D]
    output_offsets = (
        batch_idx * outputs_stride_B
        + head_idx * outputs_stride_H
        + d_offsets * outputs_stride_D
    )

    # correct output
    lse_offset = (
        lse_idx * lses_stride_N + batch_idx * lses_stride_B + head_idx * lses_stride_H
    )
    # Raw (unsanitized) lse entry selected by lse_idx.
    lse_tmp = tl.load(lses_ptr + lse_offset)
    lse_finally = lse_tmp - lse
    # NaN / +inf differences become -inf, which makes the factor below 0.
    lse_finally = tl.where(
        (lse_finally != lse_finally) | (lse_finally == float("inf")),
        -float("inf"),
        lse_finally,
    )
    factor = tl.exp(lse_finally) if IS_BASE_E else tl.exp2(lse_finally)
    output = tl.load(outputs_ptr + output_offsets)
    output = output * factor

    # Stored at the same offsets the input row was read from.
    tl.store(new_output_ptr + output_offsets, output)

`_cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=None, is_lse_base_on_e=True)` ¶

cp_attn_out: [ B, H, D ] cp_attn_lse: [ B, H ]

Source code in vllm/v1/attention/ops/common.py

def _cp_lse_common(
    cp_attn_out: torch.Tensor,
    cp_attn_lse: torch.Tensor,
    cp_group: GroupCoordinator,
    ctx: CPTritonContext | None = None,
    is_lse_base_on_e=True,
):
    """All-gather the per-rank lses and rescale the local attention output.

    Args:
        cp_attn_out: [ B, H, D ]
        cp_attn_lse: [ B, H ]
        cp_group: group whose ``world_size``, ``rank_in_group`` and
            ``all_gather`` are used to collect the lses.
        ctx: optional Triton context; a fresh one is created when omitted.
        is_lse_base_on_e: forwarded to ``correct_attn_out``.

    Returns:
        Tuple ``(out, lse)``. With a single-rank group the inputs are
        returned unchanged; otherwise the pair produced by
        ``correct_attn_out``.
    """
    if cp_group.world_size == 1:
        # Always return a pair: callers unpack `out, lse = ...`, which broke
        # (or silently split the tensor along dim 0) when only the output
        # tensor was returned here.
        return cp_attn_out, cp_attn_lse

    if ctx is None:
        ctx = CPTritonContext()

    cp_attn_lse = cp_attn_lse.contiguous()
    # Gather along dim 0, then expose the rank axis explicitly: [ N, B, H ].
    lses = cp_group.all_gather(cp_attn_lse, dim=0).reshape(
        (cp_group.world_size,) + cp_attn_lse.shape
    )
    out, lse = correct_attn_out(
        cp_attn_out,
        lses,
        cp_group.rank_in_group,
        ctx,
        is_lse_base_on_e=is_lse_base_on_e,
    )
    return out, lse

`correct_attn_out(out, lses, cp_rank, ctx, is_lse_base_on_e=True)` ¶

Correct the attention output using the all-gathered lses.

Parameters:

out ¶
(Tensor) –

Tensor of shape [ B, H, D ]
lses ¶
(Tensor) –

Tensor of shape [ N, B, H ]
cp_rank ¶
(int) –

Current rank in the context-parallel group
ctx ¶
(CPTritonContext) –

Triton context to avoid recompilation

Returns:

tuple[Tensor, Tensor] –

Tuple of (out, lse) with corrected attention and final log-sum-exp.

Source code in vllm/v1/attention/ops/common.py

def correct_attn_out(
    out: torch.Tensor,
    lses: torch.Tensor,
    cp_rank: int,
    ctx: CPTritonContext,
    is_lse_base_on_e: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Rescale ``out`` in place from the all-gathered lses.

    Args:
        out: Tensor of shape [ B, H, D ]; [ B, 1, H, D ] is also accepted.
        lses: Tensor of shape [ N, B, H ], optionally carrying one extra
            size-1 dimension (last, or at position 1).
        cp_rank: Current rank in the context-parallel group; passed to the
            kernel as the index of this rank's entry in ``lses``.
        ctx: Triton context to avoid recompilation. ``None`` is tolerated
            and replaced by a fresh context.
        is_lse_base_on_e: Passed to the kernel as ``IS_BASE_E``.

    Returns:
        Tuple of (out, lse) with corrected attention and final log-sum-exp.
        ``out`` is the (possibly squeezed) input, which the kernel receives
        as both source and destination.
    """
    ctx = CPTritonContext() if ctx is None else ctx

    # Collapse the optional singleton dimension of `out`.
    if out.ndim == 4 and out.shape[1] == 1:
        out = out.squeeze(1)
    assert out.ndim == 3, f"expected out [B,H,D] or [B,1,H,D], got {tuple(out.shape)}"

    # Collapse at most one singleton dimension of `lses`, preferring the
    # trailing one.
    if lses.ndim == 4:
        if lses.shape[-1] == 1:
            lses = lses.squeeze(-1)
        elif lses.shape[1] == 1:
            lses = lses.squeeze(1)
    assert lses.ndim == 3, (
        f"expected lses [N,B,H] (optionally with a 1-sized extra dim), "
        f"got {tuple(lses.shape)}"
    )

    batch, heads, head_dim = out.shape
    num_entries = lses.shape[0]

    out_stride_b, out_stride_h, out_stride_d = out.stride()
    lse_stride_n, lse_stride_b, lse_stride_h = lses.stride()

    # The kernel addresses the lse output with the B/H strides of `lses`, so
    # the buffer is allocated with exactly that layout (this matters when
    # `lses` is a non-contiguous view).
    merged_lse = torch.empty_strided(
        (batch, heads),
        (lse_stride_b, lse_stride_h),
        device=lses.device,
        dtype=lses.dtype,
    )

    # One program per (batch, head) pair; `out` is both input and output.
    ctx.call_kernel(
        _correct_attn_cp_out_kernel,
        (batch, heads, 1),
        out,
        out,
        lses,
        merged_lse,
        out_stride_b,
        out_stride_h,
        out_stride_d,
        lse_stride_n,
        lse_stride_b,
        lse_stride_h,
        cp_rank,
        HEAD_DIM=head_dim,
        N_ROUNDED=num_entries,
        IS_BASE_E=is_lse_base_on_e,
    )
    return out, merged_lse

`cp_lse_ag_out_ar(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True)` ¶

cp_attn_out: [ B, H, D ] cp_attn_lse: [ B, H ]

Source code in vllm/v1/attention/ops/common.py

def cp_lse_ag_out_ar(
    cp_attn_out: torch.Tensor,
    cp_attn_lse: torch.Tensor,
    cp_group: GroupCoordinator,
    ctx: CPTritonContext | None = None,
    return_lse: bool = False,
    is_lse_base_on_e=True,
):
    """Correct the local attention output with gathered lses, then all-reduce.

    Args:
        cp_attn_out: [ B, H, D ]
        cp_attn_lse: [ B, H ]
        cp_group: group used for the lse gather and the output all-reduce.
        ctx: optional Triton context reused across calls.
        return_lse: when true, also return the lse.
        is_lse_base_on_e: forwarded to the lse correction.

    Returns:
        ``out`` or, when ``return_lse`` is set, ``(out, lse)``.
    """
    if cp_group.world_size == 1:
        # A single rank has nothing to merge or reduce. Handling it here also
        # avoids unpacking a helper result that may not be a pair.
        return (cp_attn_out, cp_attn_lse) if return_lse else cp_attn_out

    out, lse = _cp_lse_common(
        cp_attn_out, cp_attn_lse, cp_group, ctx=ctx, is_lse_base_on_e=is_lse_base_on_e
    )
    out = cp_group.all_reduce(out)

    if return_lse:
        return out, lse
    return out

`cp_lse_ag_out_rs(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True)` ¶

cp_attn_out: [ B, H, D ] cp_attn_lse: [ B, H ]

Source code in vllm/v1/attention/ops/common.py

def cp_lse_ag_out_rs(
    cp_attn_out: torch.Tensor,
    cp_attn_lse: torch.Tensor,
    cp_group: GroupCoordinator,
    ctx: CPTritonContext | None = None,
    return_lse: bool = False,
    is_lse_base_on_e=True,
):
    """Correct the local attention output with gathered lses, then
    reduce-scatter it along the head dimension (dim 1).

    Args:
        cp_attn_out: [ B, H, D ]
        cp_attn_lse: [ B, H ]
        cp_group: group used for the lse gather and the output
            reduce-scatter.
        ctx: optional Triton context reused across calls.
        return_lse: when true, also return this rank's slice of the lse.
        is_lse_base_on_e: forwarded to the lse correction.

    Returns:
        ``out`` or, when ``return_lse`` is set, ``(out, lse)`` where ``lse``
        is restricted to this rank's ``H // world_size`` heads.
    """
    if cp_group.world_size == 1:
        # A single rank keeps every head, so both tensors pass through
        # unchanged. Handling it here also avoids unpacking a helper result
        # that may not be a pair.
        return (cp_attn_out, cp_attn_lse) if return_lse else cp_attn_out

    out, lse = _cp_lse_common(
        cp_attn_out, cp_attn_lse, cp_group, ctx=ctx, is_lse_base_on_e=is_lse_base_on_e
    )
    out = cp_group.reduce_scatter(out, dim=1)

    if return_lse:
        # Keep only the contiguous block of heads belonging to this rank.
        cp_num_heads = lse.shape[1] // cp_group.world_size
        cp_rank = cp_group.rank_in_group
        lse = lse[:, cp_num_heads * cp_rank : cp_num_heads * (cp_rank + 1)]
        return out, lse
    return out

`pack_seq_triton(x, lengths, pad_value=-float('inf'), block_t=64, block_d=64)` ¶

Pack sequences of different lengths into a batched tensor.

Supports float dtypes (any, via fp32 pad) and torch.uint8 (exact-byte pad — e.g. MXFP4 packed nibbles or ue8m0 scale bytes). For uint8 inputs pad_value must be an integer in [0, 255].

Parameters:

x ¶
(Tensor) –

[N, ...] — input tensor where N is total number of tokens.
lengths ¶
(Tensor) –

[B] — sequence lengths for each batch.
pad_value ¶
(float | int, default: -float('inf') ) –

value to use for padding. Defaults to -inf which is only sensible for float dtypes; pass 0 (or any byte) for uint8 inputs.
block_t ¶
(int, default: 64 ) –

block size for time dimension.
block_d ¶
(int, default: 64 ) –

block size for feature dimension.

Returns:

packed ( Tensor ) –

[B, Lmax, ...] — packed tensor.

Source code in vllm/v1/attention/ops/common.py

def pack_seq_triton(
    x: torch.Tensor,
    lengths: torch.Tensor,
    pad_value: float | int = -float("inf"),
    block_t: int = 64,
    block_d: int = 64,
) -> torch.Tensor:
    """Pack sequences of different lengths into a batched tensor.

    Supports float dtypes (any, via fp32 pad) and ``torch.uint8`` (exact-byte
    pad — e.g. MXFP4 packed nibbles or ue8m0 scale bytes). For uint8 inputs
    ``pad_value`` must be an integer in ``[0, 255]``.

    Args:
        x: [N, ...] — input tensor where N is total number of tokens.
        lengths: [B] — sequence lengths for each batch.
        pad_value: value to use for padding. Defaults to ``-inf`` which is
            only sensible for float dtypes; pass ``0`` (or any byte) for
            uint8 inputs.
        block_t: block size for time dimension.
        block_d: block size for feature dimension.

    Returns:
        packed: [B, Lmax, ...] — packed tensor.
    """
    is_uint8 = x.dtype == torch.uint8
    if is_uint8:
        assert isinstance(pad_value, int) and 0 <= pad_value <= 255, (
            f"uint8 pack requires an integer pad in [0, 255], got {pad_value!r}"
        )
        pad_constexpr: int | float = int(pad_value)
    else:
        pad_constexpr = float(pad_value)

    # Handle multi-dimensional input by reshaping to (N, -1)
    original_shape = x.shape
    if len(original_shape) > 2:
        N = original_shape[0]
        x_reshaped = x.reshape(N, -1)
        D = x_reshaped.shape[1]
    else:
        N, D = x.shape
        x_reshaped = x

    B = lengths.numel()
    Lmax = int(lengths.max().item())

    out = torch.empty((B, Lmax, D), device=x.device, dtype=x.dtype)

    grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d))
    _pack_seq_kernel[grid](
        x_reshaped,
        out,
        lengths.int(),
        N,
        D,
        Lmax,
        PAD_VALUE=pad_constexpr,
        PAD_IS_UINT8=is_uint8,
        BLOCK_T=block_t,
        BLOCK_D=block_d,
        num_warps=4,
        num_stages=2,
    )

    if len(original_shape) > 2:
        out = out.reshape((B, Lmax) + original_shape[1:])

    return out

`unpack_seq_triton(packed_tensor, lengths, block_t=64, block_d=64)` ¶

Unpack a packed decode query tensor back to the original format. Efficient Triton implementation.

Parameters:

packed_tensor ¶
(Tensor) –

[B, Lmax, ...] - packed tensor from pack_seq_triton
lengths ¶
(Tensor) –

[B] - sequence lengths for each batch
block_t ¶
(int, default: 64 ) –

block size for time dimension
block_d ¶
(int, default: 64 ) –

block size for feature dimension

Returns:

unpacked_tensor ( Tensor ) –

[N, ...] where N = sum(lengths)

Source code in vllm/v1/attention/ops/common.py

def unpack_seq_triton(
    packed_tensor: torch.Tensor,
    lengths: torch.Tensor,
    block_t: int = 64,
    block_d: int = 64,
) -> torch.Tensor:
    """
    Unpack a packed decode query tensor back to the original format.
    Efficient Triton implementation.

    Args:
        packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton.
            A 2-D [B, Lmax] tensor is treated as having no feature
            dimensions.
        lengths: [B] - sequence lengths for each batch
        block_t: block size for time dimension
        block_d: block size for feature dimension

    Returns:
        unpacked_tensor: [N, ...] where N = sum(lengths)
    """
    # Flatten all trailing dimensions into one feature axis. D is computed
    # explicitly because reshape(B, Lmax, -1) is ambiguous when B or Lmax is
    # 0, and so that 2-D inputs (D == 1) are handled as well.
    original_shape = packed_tensor.shape
    B, Lmax = original_shape[:2]
    D = 1
    for dim in original_shape[2:]:
        D *= dim
    # No strides are passed to the kernel, so hand it a contiguous
    # (B, Lmax, D) buffer (presumably it indexes with fixed row pitches —
    # confirm against the kernel). No-op for contiguous inputs.
    packed_reshaped = packed_tensor.reshape(B, Lmax, D).contiguous()

    # Calculate total number of elements
    N = int(lengths.sum().item())

    out = torch.empty((N, D), device=packed_tensor.device, dtype=packed_tensor.dtype)

    # Nothing to write for an empty output; skip the launch entirely.
    if out.numel() > 0:
        grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d))
        _unpack_seq_triton_kernel[grid](
            packed_reshaped,
            out,
            lengths.int(),
            B,
            Lmax,
            D,
            BLOCK_T=block_t,
            BLOCK_D=block_d,
            num_warps=4,
            num_stages=2,
        )

    # Reshape output back to original dimensions (except first dimension)
    if len(original_shape) != 3:
        out = out.reshape((N,) + original_shape[2:])

    return out

`vllm.v1.attention.ops.common` ¶

`CPTritonContext` ¶

`_correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, vlse_ptr, outputs_stride_B, outputs_stride_H, outputs_stride_D, lses_stride_N, lses_stride_B, lses_stride_H, lse_idx, HEAD_DIM, N_ROUNDED, IS_BASE_E)` ¶

`outputs_ptr` ¶

`lses_ptr` ¶

`new_output_ptr` ¶

`vlse_ptr` ¶

`_cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=None, is_lse_base_on_e=True)` ¶

`correct_attn_out(out, lses, cp_rank, ctx, is_lse_base_on_e=True)` ¶

`out` ¶

`lses` ¶

`cp_rank` ¶

`ctx` ¶

`cp_lse_ag_out_ar(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True)` ¶

`cp_lse_ag_out_rs(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True)` ¶

`pack_seq_triton(x, lengths, pad_value=-float('inf'), block_t=64, block_d=64)` ¶

`x` ¶

`lengths` ¶

`pad_value` ¶

`block_t` ¶

`block_d` ¶

`unpack_seq_triton(packed_tensor, lengths, block_t=64, block_d=64)` ¶

`packed_tensor` ¶

`lengths` ¶

`block_t` ¶

`block_d` ¶

vllm.v1.attention.ops.common ¶

CPTritonContext ¶

_correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, vlse_ptr, outputs_stride_B, outputs_stride_H, outputs_stride_D, lses_stride_N, lses_stride_B, lses_stride_H, lse_idx, HEAD_DIM, N_ROUNDED, IS_BASE_E) ¶

outputs_ptr ¶

lses_ptr ¶

new_output_ptr ¶

vlse_ptr ¶

_cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=None, is_lse_base_on_e=True) ¶

correct_attn_out(out, lses, cp_rank, ctx, is_lse_base_on_e=True) ¶

out ¶

lses ¶

cp_rank ¶

ctx ¶

cp_lse_ag_out_ar(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True) ¶

cp_lse_ag_out_rs(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True) ¶

pack_seq_triton(x, lengths, pad_value=-float('inf'), block_t=64, block_d=64) ¶

x ¶

lengths ¶

pad_value ¶

block_t ¶

block_d ¶

unpack_seq_triton(packed_tensor, lengths, block_t=64, block_d=64) ¶

packed_tensor ¶

lengths ¶

block_t ¶

block_d ¶

`vllm.v1.attention.ops.common` ¶

`CPTritonContext` ¶

`_correct_attn_cp_out_kernel(outputs_ptr, new_output_ptr, lses_ptr, vlse_ptr, outputs_stride_B, outputs_stride_H, outputs_stride_D, lses_stride_N, lses_stride_B, lses_stride_H, lse_idx, HEAD_DIM, N_ROUNDED, IS_BASE_E)` ¶

`outputs_ptr` ¶

`lses_ptr` ¶

`new_output_ptr` ¶

`vlse_ptr` ¶

`_cp_lse_common(cp_attn_out, cp_attn_lse, cp_group, ctx=None, is_lse_base_on_e=True)` ¶

`correct_attn_out(out, lses, cp_rank, ctx, is_lse_base_on_e=True)` ¶

`out` ¶

`lses` ¶

`cp_rank` ¶

`ctx` ¶

`cp_lse_ag_out_ar(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True)` ¶

`cp_lse_ag_out_rs(cp_attn_out, cp_attn_lse, cp_group, ctx=None, return_lse=False, is_lse_base_on_e=True)` ¶

`pack_seq_triton(x, lengths, pad_value=-float('inf'), block_t=64, block_d=64)` ¶

`x` ¶

`lengths` ¶

`pad_value` ¶

`block_t` ¶

`block_d` ¶

`unpack_seq_triton(packed_tensor, lengths, block_t=64, block_d=64)` ¶

`packed_tensor` ¶

`lengths` ¶

`block_t` ¶

`block_d` ¶