`vllm.models.deepseek_v4.common.ops` ¶

Modules:

cache_utils –

Triton kernels for DeepseekV4 paged K-cache management and sparse-attention index
fused_compress_quant_cache –

Fused compressor + FP8/MXFP4 UE8M0 quantization + KV cache insert kernels.
fused_indexer_q –
fused_inv_rope_fp8_quant –

Fused inverse RoPE + block-scaled FP8 quantization kernel for DeepseekV4 attention.
fused_mtp_input_rmsnorm –

Fused MTP-input RMSNorm: enorm (with mask-zero at position 0) + hnorm.
save_partial_states –

Functions:

build_flashinfer_mixed_sparse_indices –

Build the FlashInfer DSV4 sparse-index matrix for decode-first batches.
compute_global_topk_indices_and_lens –

Map local topk indices to global KV cache slots and count valid entries.
fused_indexer_q_rope_quant –

Fused RoPE + quantize Q for the sparse indexer.
mtp_shared_head_rmsnorm –

RMSNorm for MTP's SharedHead.norm, on (T, H) bf16 input.
quantize_and_insert_k_cache –

Quantize K tensor and insert into paged K cache.

`build_flashinfer_mixed_sparse_indices(decode_swa_indices, decode_compressed_indices, decode_compressed_topk_lens, prefill_topk_indices, query_start_loc, seq_lens, token_to_req_indices, swa_block_table, swa_block_size, compressed_block_table, compressed_block_size, window_size, compress_ratio, topk, decode_compressed_indices_are_local=False, decode_is_valid_token=None)` ¶

Build the FlashInfer DSV4 sparse-index matrix for decode-first batches.

Produces sparse_indices of shape [num_tokens, window_size + padded_topk] (the first window_size columns are SWA slot ids, the rest are compressed/top-k slot ids) and sparse_topk_lens (active length per token). Decode tokens read precomputed SWA/compressed indices; prefill tokens derive their SWA window from the position and translate local compressed indices to global slots via the block tables.

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

def build_flashinfer_mixed_sparse_indices(
    decode_swa_indices: torch.Tensor,
    decode_compressed_indices: torch.Tensor | None,
    decode_compressed_topk_lens: torch.Tensor | None,
    prefill_topk_indices: torch.Tensor,
    query_start_loc: torch.Tensor,
    seq_lens: torch.Tensor,
    token_to_req_indices: torch.Tensor,
    swa_block_table: torch.Tensor,
    swa_block_size: int,
    compressed_block_table: torch.Tensor | None,
    compressed_block_size: int,
    window_size: int,
    compress_ratio: int,
    topk: int,
    decode_compressed_indices_are_local: bool = False,
    decode_is_valid_token: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Build the FlashInfer DSV4 sparse-index matrix for decode-first batches.

    Produces ``sparse_indices`` of shape ``[num_tokens, window_size +
    padded_topk]`` (the first ``window_size`` columns are SWA slot ids, the rest
    are compressed/top-k slot ids) and ``sparse_topk_lens`` (active length per
    token). Decode tokens read precomputed SWA/compressed indices; prefill tokens
    derive their SWA window from the position and translate local compressed
    indices to global slots via the block tables.
    """
    assert decode_swa_indices.dtype == torch.int32
    assert decode_swa_indices.dim() == 2
    assert decode_swa_indices.shape[-1] == window_size
    if decode_compressed_topk_lens is not None:
        assert decode_compressed_topk_lens.dtype == torch.int32
    assert prefill_topk_indices.dtype == torch.int32
    assert prefill_topk_indices.dim() == 2
    assert query_start_loc.dtype == torch.int32
    assert seq_lens.dtype == torch.int32
    assert token_to_req_indices.dtype == torch.int32
    assert swa_block_table.dtype == torch.int32

    num_decode_tokens = decode_swa_indices.shape[0]
    num_prefill_tokens = prefill_topk_indices.shape[0]
    num_tokens = num_decode_tokens + num_prefill_tokens
    assert token_to_req_indices.shape[0] >= num_tokens
    if decode_compressed_topk_lens is not None:
        assert decode_compressed_topk_lens.shape[0] >= num_decode_tokens

    decode_compressed_topk = 0
    if decode_compressed_indices is None:
        decode_compressed_indices = prefill_topk_indices
    else:
        assert decode_compressed_indices.dtype == torch.int32
        assert decode_compressed_indices.dim() == 2
        assert decode_compressed_indices.shape[0] == num_decode_tokens
        decode_compressed_topk = decode_compressed_indices.shape[-1]
    if decode_compressed_topk > 0 and decode_compressed_indices_are_local:
        assert decode_is_valid_token is not None
        assert decode_is_valid_token.dtype == torch.bool
        assert decode_is_valid_token.shape[0] >= num_decode_tokens
    else:
        decode_is_valid_token = token_to_req_indices

    if compressed_block_table is None:
        compressed_block_table = swa_block_table
    assert compressed_block_table.dtype == torch.int32
    has_decode_compressed_lens = decode_compressed_topk_lens is not None
    if decode_compressed_topk_lens is None:
        decode_compressed_topk_lens = token_to_req_indices

    # The FlashInfer TRTLLM-gen sparse-MLA kernels require every per-token topk
    # index row to start on a 16-byte boundary: the kernel loads the compressed
    # indices with 128-bit (16-byte) vectorized loads, so a misaligned row would
    # fault or read across rows. 16 bytes = 4 int32 indices, so round the topk
    # width (and hence the row stride, since the SWA columns are fixed-width) up
    # to a multiple of 4. The extra columns are filled with -1 (invalid) and bounded
    # by ``sparse_topk_lens``, so padding never changes the attention result.
    padded_topk = max(topk, decode_compressed_topk)
    padded_topk = (padded_topk + 3) // 4 * 4
    sparse_indices = torch.empty(
        (num_tokens, window_size + padded_topk),
        dtype=torch.int32,
        device=decode_swa_indices.device,
    )
    sparse_topk_lens = torch.empty(
        num_tokens, dtype=torch.int32, device=decode_swa_indices.device
    )
    if num_tokens == 0:
        return sparse_indices, sparse_topk_lens

    window_block_size = triton.next_power_of_2(max(window_size, 1))
    topk_block_size = triton.next_power_of_2(max(padded_topk, 1))
    max_block_size = max(window_block_size, topk_block_size)
    num_warps = 4 if max_block_size >= 256 else 1

    _build_flashinfer_mixed_sparse_indices_kernel[(num_tokens,)](
        sparse_indices,
        sparse_indices.stride(0),
        sparse_topk_lens,
        decode_swa_indices,
        decode_swa_indices.stride(0),
        decode_compressed_indices,
        decode_compressed_indices.stride(0),
        decode_compressed_topk_lens,
        decode_is_valid_token,
        prefill_topk_indices,
        prefill_topk_indices.stride(0),
        query_start_loc,
        seq_lens,
        token_to_req_indices,
        swa_block_table,
        swa_block_table.stride(0),
        swa_block_size,
        compressed_block_table,
        compressed_block_table.stride(0),
        compressed_block_size,
        NUM_DECODE_TOKENS=num_decode_tokens,
        WINDOW_SIZE=window_size,
        COMPRESS_RATIO=compress_ratio,
        TOP_K=topk,
        PADDED_TOP_K=padded_topk,
        PREFILL_TOPK_STRIDE=prefill_topk_indices.shape[-1],
        DECODE_COMPRESSED_TOPK=decode_compressed_topk,
        DECODE_COMPRESSED_INDICES_ARE_LOCAL=decode_compressed_indices_are_local,
        HAS_DECODE_COMPRESSED_LENS=has_decode_compressed_lens,
        WINDOW_BLOCK_SIZE=window_block_size,
        TOPK_BLOCK_SIZE=topk_block_size,
        num_warps=num_warps,
    )
    return sparse_indices, sparse_topk_lens

`compute_global_topk_indices_and_lens(topk_indices, token_to_req_indices, block_table, block_size, is_valid_token)` ¶

Map local topk indices to global KV cache slots and count valid entries.

Fuses three operations into a single kernel: 1. Block-table lookup (local index → global slot id) 2. Valid-entry counting (topk_lens per token) 3. Masking padding tokens to length 0

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

def compute_global_topk_indices_and_lens(
    topk_indices: torch.Tensor,
    token_to_req_indices: torch.Tensor,
    block_table: torch.Tensor,
    block_size: int,
    is_valid_token: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Map local topk indices to global KV cache slots and count valid entries.

    Fuses three operations into a single kernel:
    1. Block-table lookup (local index → global slot id)
    2. Valid-entry counting (topk_lens per token)
    3. Masking padding tokens to length 0
    """
    num_tokens = topk_indices.shape[0]
    global_topk_indices = torch.empty_like(topk_indices)
    topk_lens = torch.empty(num_tokens, dtype=torch.int32, device=topk_indices.device)
    _compute_global_topk_indices_and_lens_kernel[(num_tokens,)](
        global_topk_indices,
        global_topk_indices.stride(0),
        topk_lens,
        topk_indices,
        topk_indices.stride(0),
        topk_indices.shape[-1],
        token_to_req_indices,
        block_table,
        block_table.stride(0),
        block_size,
        is_valid_token,
        TRITON_BLOCK_SIZE=1024,
    )
    return global_topk_indices, topk_lens

`fused_indexer_q_rope_quant(positions, index_q, index_q_cos_sin_cache, index_weights, index_weights_softmax_scale, index_weights_head_scale, use_fp4=False)` ¶

Fused RoPE + quantize Q for the sparse indexer.

Weight-fold semantics (important — the two paths differ):

FP8 path (use_fp4=False, default): q_fp8 : (T, H, HEAD_DIM) float8_e4m3fn, per-token-per-head scalar scale (NOT stored — folded into weights below) weights_out = weights * q_scale * softmax_scale * head_scale Rationale: a single per-token q_scale is a scalar the downstream FP8 logits kernel would otherwise multiply in. Folding it into weights avoids emitting a separate tensor and is free for the logits kernel.

MXFP4 path (use_fp4=True): q_packed : (T, H, HEAD_DIM // 2) uint8 (2 E2M1 nibbles per byte) q_scale : (T, H, HEAD_DIM // MXFP4_BLOCK_SIZE) uint8 ue8m0 bytes weights_out = weights * softmax_scale * head_scale Rationale: MXFP4 has PER-BLOCK (32-element) scales that live with the Q values — they cannot be folded into a per-token weight scalar, so weights carries only the softmax and head scales.

Returns (q_quant, weights_out) where q_quant is either a Tensor (FP8) or a (values, scales) tuple (MXFP4). This matches the union type accepted by SparseAttnIndexer.forward_*.

Source code in vllm/models/deepseek_v4/common/ops/fused_indexer_q.py

def fused_indexer_q_rope_quant(
    positions: torch.Tensor,
    index_q: torch.Tensor,
    index_q_cos_sin_cache: torch.Tensor,
    # Index weights
    index_weights: torch.Tensor,
    index_weights_softmax_scale: float,
    index_weights_head_scale: float,
    use_fp4: bool = False,
) -> tuple[
    torch.Tensor | tuple[torch.Tensor, torch.Tensor],
    torch.Tensor,
]:
    """Fused RoPE + quantize Q for the sparse indexer.

    Weight-fold semantics (important — the two paths differ):

    FP8 path (use_fp4=False, default):
        q_fp8      : (T, H, HEAD_DIM) float8_e4m3fn, per-token-per-head
                     scalar scale (NOT stored — folded into weights below)
        weights_out = weights * q_scale * softmax_scale * head_scale
        Rationale: a single per-token q_scale is a scalar the downstream FP8
        logits kernel would otherwise multiply in. Folding it into `weights`
        avoids emitting a separate tensor and is free for the logits kernel.

    MXFP4 path (use_fp4=True):
        q_packed   : (T, H, HEAD_DIM // 2) uint8 (2 E2M1 nibbles per byte)
        q_scale    : (T, H, HEAD_DIM // MXFP4_BLOCK_SIZE) uint8 ue8m0 bytes
        weights_out = weights * softmax_scale * head_scale
        Rationale: MXFP4 has PER-BLOCK (32-element) scales that live with
        the Q values — they cannot be folded into a per-token weight
        scalar, so `weights` carries only the softmax and head scales.

    Returns (q_quant, weights_out) where q_quant is either a Tensor (FP8) or
    a (values, scales) tuple (MXFP4). This matches the union type accepted
    by `SparseAttnIndexer.forward_*`.
    """
    assert positions.ndim == 1
    assert index_q.ndim == 3
    assert index_q_cos_sin_cache.ndim == 2

    num_tokens = positions.shape[0]
    num_index_q_heads = index_q.shape[1]
    index_q_head_dim = index_q.shape[2]

    index_weights_out = torch.empty_like(index_weights, dtype=torch.float32)

    if use_fp4:
        assert index_q_head_dim % MXFP4_BLOCK_SIZE == 0, (
            f"head_dim={index_q_head_dim} must be a multiple of MXFP4 block "
            f"size {MXFP4_BLOCK_SIZE}"
        )
        num_scale_blocks = index_q_head_dim // MXFP4_BLOCK_SIZE
        index_q_packed = torch.empty(
            (num_tokens, num_index_q_heads, index_q_head_dim // 2),
            dtype=torch.uint8,
            device=index_q.device,
        )
        index_q_scale = torch.empty(
            (num_tokens, num_index_q_heads, num_scale_blocks),
            dtype=torch.uint8,
            device=index_q.device,
        )
        if has_cutedsl():
            # lazily import, otherwise some tests fail due to CUDA driver init failure.
            from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
                fused_indexer_q_rope_quant_mxfp4_cutedsl,
            )

            fused_indexer_q_rope_quant_mxfp4_cutedsl(
                positions,
                index_q,
                index_q_cos_sin_cache,
                index_weights,
                index_weights_softmax_scale,
                index_weights_head_scale,
                index_q_packed,
                index_q_scale,
                index_weights_out,
            )
        else:
            _fused_indexer_q_rope_mxfp4_kernel[(num_tokens, num_index_q_heads)](
                positions,
                index_q,
                index_q.stride(0),
                index_q.stride(1),
                index_q_cos_sin_cache,
                index_q_cos_sin_cache.stride(0),
                index_q_cos_sin_cache.shape[-1] // 2,
                index_q_packed,
                index_q_packed.stride(0),
                index_q_packed.stride(1),
                index_q_scale,
                index_q_scale.stride(0),
                index_q_scale.stride(1),
                index_q_head_dim,
                MXFP4_BLOCK_SIZE,
                index_weights,
                index_weights.stride(0),
                index_weights_softmax_scale,
                index_weights_head_scale,
                index_weights_out,
                index_weights_out.stride(0),
                num_warps=1,  # TODO: Tune this
            )

        # Values stay uint8 (2 E2M1 nibbles per byte). Scales are 4 ue8m0
        # bytes per (token, head) reinterpreted as one int32, then squeezed
        # from (T, H, 1) to (T, H) to match DeepGEMM's expected q_sf rank
        # (prefill wants 2-D (seq_len, num_heads); decode reshapes this to
        # 3-D (batch, next_n, num_heads)).
        return (
            index_q_packed,
            index_q_scale.view(torch.int32).squeeze(-1),
        ), index_weights_out

    index_q_fp8 = torch.empty_like(index_q, dtype=torch.float8_e4m3fn)
    if has_cutedsl():
        # lazily import, otherwise some tests fail due to CUDA driver init failure.
        from vllm.models.deepseek_v4.nvidia.ops.fused_indexer_q_cutedsl import (
            fused_indexer_q_rope_quant_fp8_cutedsl,
        )

        fused_indexer_q_rope_quant_fp8_cutedsl(
            positions,
            index_q,
            index_q_cos_sin_cache,
            index_weights,
            index_weights_softmax_scale,
            index_weights_head_scale,
            index_q_fp8,
            index_weights_out,
        )
    else:
        _fused_indexer_q_rope_quant_kernel[(num_tokens, num_index_q_heads)](
            positions,
            index_q,
            index_q.stride(0),
            index_q.stride(1),
            index_q_cos_sin_cache,
            index_q_cos_sin_cache.stride(0),
            index_q_cos_sin_cache.shape[-1] // 2,
            index_q_fp8,
            index_q_fp8.stride(0),
            index_q_fp8.stride(1),
            index_q_head_dim,
            index_weights,
            index_weights.stride(0),
            index_weights_softmax_scale,
            index_weights_head_scale,
            index_weights_out,
            index_weights_out.stride(0),
            num_warps=1,  # TODO: Tune this
        )
    return index_q_fp8, index_weights_out

`mtp_shared_head_rmsnorm(hidden_states, weight, eps)` ¶

RMSNorm for MTP's SharedHead.norm, on (T, H) bf16 input.

Uses the same _rmsnorm_row body as fused_mtp_input_rmsnorm so the MTP draft path runs one consistent RMSNorm implementation end to end.

Source code in vllm/models/deepseek_v4/common/ops/fused_mtp_input_rmsnorm.py

def mtp_shared_head_rmsnorm(
    hidden_states: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
) -> torch.Tensor:
    """RMSNorm for MTP's SharedHead.norm, on (T, H) bf16 input.

    Uses the same ``_rmsnorm_row`` body as ``fused_mtp_input_rmsnorm`` so the
    MTP draft path runs one consistent RMSNorm implementation end to end.
    """
    assert hidden_states.ndim == 2
    assert hidden_states.is_contiguous()
    assert weight.is_contiguous()
    num_tokens, hidden = hidden_states.shape
    out = torch.empty_like(hidden_states)
    if num_tokens == 0:
        return out
    block_size = triton.next_power_of_2(hidden)
    _mtp_shared_head_rmsnorm_kernel[(num_tokens,)](
        hidden_states,
        weight,
        out,
        eps,
        HIDDEN=hidden,
        BLOCK_SIZE=block_size,
    )
    return out

`quantize_and_insert_k_cache(k, k_cache, slot_mapping, block_size=64, is_ue8m0=True)` ¶

Quantize K tensor and insert into paged K cache.

K Cache block layout (block_size=64 tokens): - First 64 * 576 = 36864 bytes: Token data - Each token: 448 bytes (fp8) + 128 bytes (bf16) - Next 64 * 8 = 512 bytes: Scales - Each token: 8 bytes (uint8 scales, 7 real + 1 padding) - Padded to multiple of 576

Source code in vllm/models/deepseek_v4/common/ops/cache_utils.py

def quantize_and_insert_k_cache(
    k: torch.Tensor,  # [num_tokens, 512] bf16
    k_cache: torch.Tensor,  # [num_blocks, block_bytes] uint8
    slot_mapping: torch.Tensor,  # [num_tokens] int64
    block_size: int = 64,
    is_ue8m0: bool = True,
):
    """
    Quantize K tensor and insert into paged K cache.

    K Cache block layout (block_size=64 tokens):
    - First 64 * 576 = 36864 bytes: Token data
      - Each token: 448 bytes (fp8) + 128 bytes (bf16)
    - Next 64 * 8 = 512 bytes: Scales
      - Each token: 8 bytes (uint8 scales, 7 real + 1 padding)
    - Padded to multiple of 576
    """
    assert k.dim() == 2 and k.shape[1] == 512, (
        f"K must be [num_tokens, 512], got {k.shape}"
    )
    assert k.dtype == torch.bfloat16, f"K must be bf16, got {k.dtype}"
    assert is_ue8m0, "Only support ue8m0 quantization."

    # NOTE: When using DP, slot_mapping.shape[0] can be less than k.shape[0] due to
    # padding. Always use slot_mapping.shape[0] as the token count.
    num_tokens = slot_mapping.shape[0]
    block_stride = k_cache.stride(0)  # bytes per block

    TOKEN_FP8_DIM = 448
    TOKEN_BF16_DIM = 64
    TOKEN_SCALE_DIM = 8
    QUANT_BLOCK_SIZE = 64
    FP8_MAX = 448.0
    TOKEN_DATA_SIZE = TOKEN_FP8_DIM + TOKEN_BF16_DIM * 2

    grid = (num_tokens,)

    quantize_and_insert_k_kernel[grid](
        k,
        slot_mapping,
        k_cache,
        num_tokens,
        input_dim=512,
        fp8_dim=TOKEN_FP8_DIM,
        bf16_dim=TOKEN_BF16_DIM,
        scale_dim=TOKEN_SCALE_DIM,
        quant_block=QUANT_BLOCK_SIZE,
        cache_block_size=block_size,
        token_data_size=TOKEN_DATA_SIZE,
        block_stride=block_stride,
        fp8_max=FP8_MAX,
        n_quant_blocks=8,
    )

vllm.models.deepseek_v4.common.ops ¶

compute_global_topk_indices_and_lens(topk_indices, token_to_req_indices, block_table, block_size, is_valid_token) ¶

fused_indexer_q_rope_quant(positions, index_q, index_q_cos_sin_cache, index_weights, index_weights_softmax_scale, index_weights_head_scale, use_fp4=False) ¶

mtp_shared_head_rmsnorm(hidden_states, weight, eps) ¶

quantize_and_insert_k_cache(k, k_cache, slot_mapping, block_size=64, is_ue8m0=True) ¶

`vllm.models.deepseek_v4.common.ops` ¶

`compute_global_topk_indices_and_lens(topk_indices, token_to_req_indices, block_table, block_size, is_valid_token)` ¶

`fused_indexer_q_rope_quant(positions, index_q, index_q_cos_sin_cache, index_weights, index_weights_softmax_scale, index_weights_head_scale, use_fp4=False)` ¶

`mtp_shared_head_rmsnorm(hidden_states, weight, eps)` ¶

`quantize_and_insert_k_cache(k, k_cache, slot_mapping, block_size=64, is_ue8m0=True)` ¶