`vllm.v1.attention.ops.rocm_aiter_mla_sparse` ¶

Functions:

fp8_mqa_logits_torch –

Compute FP8 MQA logits for a single sequence without KV paging.
rocm_fp8_mqa_logits –

Compute FP8 MQA logits for a single sequence without KV paging.
rocm_fp8_paged_mqa_logits –

Compute FP8 MQA logits using paged KV-cache.
rocm_inv_rope_einsum –

Reference inverse-RoPE + WO_A einsum path used on ROCm.

`fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)` ¶

Compute FP8 MQA logits for a single sequence without KV paging.

Parameters:

q ¶
(Tensor) –

Query tensor of shape [M, H, D]. Cast to torch.float8_e4m3fn by the caller.
kv ¶
(tuple[Tensor, Tensor]) –

Tuple (k_fp8, k_scales) where k_fp8 has shape [N, D] with dtype torch.float8_e4m3fn and k_scales has shape [N] (or [N, 1]) with dtype torch.float32.
weights ¶
(Tensor) –

weights of shape [M, H], dtype torch.float32.
cu_seqlen_ks ¶
(Tensor) –

Start indices (inclusive) for valid K per query position, shape [M], dtype int32.
cu_seqlen_ke ¶
(Tensor) –

End indices (exclusive) for valid K per query position, shape [M], dtype int32.

Returns:

Tensor –

Logits tensor of shape [M, N], dtype torch.float32.

Source code in vllm/v1/attention/ops/rocm_aiter_mla_sparse.py

def fp8_mqa_logits_torch(
    q: torch.Tensor,
    kv: tuple[torch.Tensor, torch.Tensor],
    weights: torch.Tensor,
    cu_seqlen_ks: torch.Tensor,
    cu_seqlen_ke: torch.Tensor,
) -> torch.Tensor:
    """Compute FP8 MQA logits for a single sequence without KV paging.

    Pure-PyTorch implementation: `q` and `k_fp8` are upcast to bfloat16,
    contracted per head, dequantized with the per-key scale, passed through
    ReLU, weighted per (query, head) and summed over heads. Keys outside
    `[cu_seqlen_ks[m], cu_seqlen_ke[m])` are set to `-inf` for query `m`.

    Args:
        q: Query tensor of shape [M, H, D]. Cast to
            `torch.float8_e4m3fn` by the caller.
        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or
            [N, 1]) with dtype `torch.float32`.
        weights: weights of shape [M, H], dtype `torch.float32`.
        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
            shape [M], dtype int32.
        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
            shape [M], dtype int32.

    Returns:
        Logits tensor of shape [M, N], dtype `torch.float32`.
    """
    k_fp8, scale = kv
    seq_len_kv = k_fp8.shape[0]
    k = k_fp8.to(torch.bfloat16)
    q = q.to(torch.bfloat16)

    # The score tensor below is laid out [H, M, N], so the per-key scale must
    # broadcast along the LAST axis. A [N, 1] scale would otherwise line up
    # with the query axis (error when M != N, silently wrong when M == N).
    scale = scale.reshape(-1)

    # Key positions as a [1, N] row, compared against per-query [M, 1] bounds.
    kv_positions = torch.arange(0, seq_len_kv, device=q.device)[None, :]
    in_window = (kv_positions >= cu_seqlen_ks[:, None]) & (
        kv_positions < cu_seqlen_ke[:, None]
    )

    score = torch.einsum("mhd,nd->hmn", q, k).float() * scale
    # weights [M, H] -> [H, M, 1] so it broadcasts over keys; sum removes H.
    logits = (score.relu() * weights.unsqueeze(-1).transpose(0, 1)).sum(dim=0)
    return logits.masked_fill(~in_window, float("-inf"))

`rocm_fp8_mqa_logits(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)` ¶

Compute FP8 MQA logits for a single sequence without KV paging.

Parameters:

q ¶
(Tensor) –

Query tensor of shape [M, H, D]. Cast to torch.float8_e4m3fn by the caller.
kv ¶
(tuple[Tensor, Tensor]) –

Tuple (k_fp8, k_scales) where k_fp8 has shape [N, D] with dtype torch.float8_e4m3fn and k_scales has shape [N] (or [N, 1]) with dtype torch.float32.
weights ¶
(Tensor) –

weights of shape [M, H], dtype torch.float32.
cu_seqlen_ks ¶
(Tensor) –

Start indices (inclusive) for valid K per query position, shape [M], dtype int32.
cu_seqlen_ke ¶
(Tensor) –

End indices (exclusive) for valid K per query position, shape [M], dtype int32.

Returns:

Tensor –

Logits tensor of shape [M, N], dtype torch.float32.

Source code in vllm/v1/attention/ops/rocm_aiter_mla_sparse.py

def rocm_fp8_mqa_logits(
    q: torch.Tensor,
    kv: tuple[torch.Tensor, torch.Tensor],
    weights: torch.Tensor,
    cu_seqlen_ks: torch.Tensor,
    cu_seqlen_ke: torch.Tensor,
) -> torch.Tensor:
    """Compute FP8 MQA logits for a single sequence without KV paging.

    Dispatches to the AITER `fp8_mqa_logits` kernel when AITER ops are
    enabled and `mqa_logits_module()` returns a module; otherwise falls back
    to `fp8_mqa_logits_torch`.

    Args:
        q: Query tensor of shape [M, H, D]. Cast to
            `torch.float8_e4m3fn` by the caller.
        kv: Tuple `(k_fp8, k_scales)` where `k_fp8` has shape [N, D] with
            dtype `torch.float8_e4m3fn` and `k_scales` has shape [N] (or
            [N, 1]) with dtype `torch.float32`.
        weights: weights of shape [M, H], dtype `torch.float32`.
        cu_seqlen_ks: Start indices (inclusive) for valid K per query position,
            shape [M], dtype int32.
        cu_seqlen_ke: End indices (exclusive) for valid K per query position,
            shape [M], dtype int32.

    Returns:
        Logits tensor of shape [M, N], dtype `torch.float32`.
    """
    # TODO(ganyi): Temporary workaround; remove the module check and the
    # reference path once aiter merges this kernel into main.
    from vllm._aiter_ops import rocm_aiter_ops

    # The kernel module is only looked up when AITER ops are switched on.
    kernel_module = mqa_logits_module() if rocm_aiter_ops.is_enabled() else None

    if kernel_module is None:
        return fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)

    # The AITER kernel takes the key tensor and its scales as separate args.
    aiter_kernel = kernel_module.fp8_mqa_logits
    k_fp8, scale = kv
    return aiter_kernel(q, k_fp8, scale, weights, cu_seqlen_ks, cu_seqlen_ke)

`rocm_fp8_paged_mqa_logits(q_fp8, kv_cache_fp8, weights, context_lens, block_tables, schedule_metadata, max_model_len)` ¶

Compute FP8 MQA logits using paged KV-cache.

Parameters:

q_fp8 ¶
(Tensor) –

Query tensor of shape [B, next_n, H, D]. Cast to torch.float8_e4m3fn by the caller.
kv_cache_fp8 ¶
(Tensor) –

Paged KV-cache in packed FP8+scale layout with shape [num_blocks, block_size, 1, D+4], dtype torch.uint8. The last 4 bytes per (block,pos) store the float dequant scale.
weights ¶
(Tensor) –

Tensor of shape [B * next_n, H], dtype torch.float32.
context_lens ¶
(Tensor) –

Tensor of shape [B], dtype int32; effective context length for each batch element.
block_tables ¶
(Tensor) –

Tensor of shape [B, max_blocks], dtype int32; maps logical block indices to physical blocks in the paged cache.
schedule_metadata ¶
(Tensor) –

Returned by get_paged_mqa_logits_metadata. Accepted for interface compatibility but not read by any code path of this ROCm implementation.
max_model_len ¶
(int) –

Maximum sequence length used to size the logits output.

Returns:

Tensor –

Logits tensor of shape [B * next_n, max_model_len], dtype torch.float32.

Source code in vllm/v1/attention/ops/rocm_aiter_mla_sparse.py

def rocm_fp8_paged_mqa_logits(
    q_fp8: torch.Tensor,
    kv_cache_fp8: torch.Tensor,
    weights: torch.Tensor,
    context_lens: torch.Tensor,
    block_tables: torch.Tensor,
    schedule_metadata: torch.Tensor,
    max_model_len: int,
) -> torch.Tensor:
    """Compute FP8 MQA logits using paged KV-cache.

    Three paths are possible:

    * AITER enabled and `paged_mqa_logits_module()` returns a module, on
      gfx942/gfx950: the fused `deepgemm_fp8_paged_mqa_logits` kernel writes
      straight into a `[B * next_n, max_model_len]` buffer.
    * AITER module available on other architectures: the
      `deepgemm_fp8_paged_mqa_logits_stage1` kernel fills a
      `[H, B * next_n, max_model_len]` buffer, which is then summed over its
      leading dimension.
    * Otherwise: the `fp8_paged_mqa_logits_torch` fallback.

    Both AITER paths pre-fill their output buffer with `-inf` before the
    kernel launch.

    Args:
        q_fp8: Query tensor of shape [B, next_n, H, D]. Cast to
            `torch.float8_e4m3fn` by the caller.
        kv_cache_fp8: Paged KV-cache in packed FP8+scale layout with shape
            [num_blocks, block_size, 1, D+4], dtype `torch.uint8`. The last
            4 bytes per (block,pos) store the `float` dequant scale.
        weights: Tensor of shape [B * next_n, H], dtype `torch.float32`.
        context_lens: Tensor of shape [B], dtype int32; effective context length
            for each batch element.
        block_tables: Tensor of shape [B, max_blocks], dtype int32; maps logical
            block indices to physical blocks in the paged cache.
        schedule_metadata: Returned by `get_paged_mqa_logits_metadata`.
            Accepted but not read by any code path in this function.
        max_model_len: Maximum sequence length used to size the logits output.

    Returns:
        Logits tensor of shape [B * next_n, max_model_len], dtype
        `torch.float32`.
    """
    from vllm._aiter_ops import rocm_aiter_ops

    aiter_module = (
        paged_mqa_logits_module() if rocm_aiter_ops.is_enabled() else None
    )
    if aiter_module is None:
        return fp8_paged_mqa_logits_torch(
            q_fp8, kv_cache_fp8, weights, context_lens, block_tables, max_model_len
        )

    batch_size, next_n, heads, _ = q_fp8.shape
    num_queries = batch_size * next_n

    if _ON_GFX942 or _ON_GFX950:
        block_size = kv_cache_fp8.shape[1]
        # NOTE(review): the output is backed by workspace-manager memory;
        # presumably it can be reused by later calls - confirm lifetime.
        (out_logits,) = current_workspace_manager().get_simultaneous(
            ((num_queries, max_model_len), torch.float32),
        )
        out_logits.fill_(float("-inf"))
        aiter_module.deepgemm_fp8_paged_mqa_logits(
            q_fp8,
            kv_cache_fp8,
            weights,
            out_logits,
            context_lens,
            block_tables,
            max_model_len,
            ChunkK=256,
            Preshuffle=block_size > 1,
            KVBlockSize=block_size,
            WavePerEU=2,
        )
        return out_logits

    # Two-stage path: the kernel fills a buffer with a leading `heads`
    # dimension, and the head reduction is done here in PyTorch.
    (out_qk,) = current_workspace_manager().get_simultaneous(
        ((heads, num_queries, max_model_len), torch.float32),
    )
    out_qk.fill_(float("-inf"))
    aiter_module.deepgemm_fp8_paged_mqa_logits_stage1(
        q_fp8,
        kv_cache_fp8,
        weights,
        out_qk,
        context_lens,
        block_tables,
        max_model_len,
        ChunkQ=heads,
    )
    return out_qk.sum(dim=0)

`rocm_inv_rope_einsum(rotary_emb, o, positions, rope_head_dim, n_local_groups, o_lora_rank, wo_a)` ¶

Reference inverse-RoPE + WO_A einsum path used on ROCm.

Source code in vllm/v1/attention/ops/rocm_aiter_mla_sparse.py

def rocm_inv_rope_einsum(
    rotary_emb: torch.nn.Module,
    o: torch.Tensor,
    positions: torch.Tensor,
    rope_head_dim: int,
    n_local_groups: int,
    o_lora_rank: int,
    wo_a: torch.nn.Module,
) -> torch.Tensor:
    """Reference inverse-RoPE + WO_A einsum path used on ROCm.

    Runs `_apply_inv_rope_ref` on `o`, casts the result to bfloat16 and views
    it as `[o.shape[0], n_local_groups, hidden_dim]`. `wo_a.weight` is viewed
    as `[n_local_groups, o_lora_rank, hidden_dim]`. If `wo_a` has a
    `weight_scale_inv` attribute, the weight is upcast to float32, multiplied
    by the scales produced by `_expand_2d_block_scales`, and cast to bfloat16.
    Otherwise it is cast to bfloat16 directly.

    Returns:
        The `"tgd,grd->tgr"` contraction of the two tensors, i.e. a tensor of
        shape `[o.shape[0], n_local_groups, o_lora_rank]`.
    """
    num_tokens = o.shape[0]
    unrotated = _apply_inv_rope_ref(rotary_emb, o, positions, rope_head_dim)
    grouped_o = unrotated.to(torch.bfloat16).view(num_tokens, n_local_groups, -1)

    hidden_dim = grouped_o.shape[-1]
    grouped_weight = wo_a.weight.view(n_local_groups, o_lora_rank, hidden_dim)

    if not hasattr(wo_a, "weight_scale_inv"):
        # No scale tensor on the module: only a dtype cast is needed.
        projection = grouped_weight.to(torch.bfloat16)
    else:
        # Scale in float32, then drop back to bfloat16 for the einsum.
        scale_inv = wo_a.weight_scale_inv
        expanded_scale = _expand_2d_block_scales(
            scale_inv.view(n_local_groups, -1, scale_inv.shape[-1]),
            o_lora_rank,
            hidden_dim,
        )
        projection = (grouped_weight.to(torch.float32) * expanded_scale).to(
            torch.bfloat16
        )

    return torch.einsum("tgd,grd->tgr", grouped_o, projection)

`vllm.v1.attention.ops.rocm_aiter_mla_sparse` ¶

`fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)` ¶

`q` ¶

`kv` ¶

`weights` ¶

`cu_seqlen_ks` ¶

`cu_seqlen_ke` ¶

`rocm_fp8_mqa_logits(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)` ¶

`q` ¶

`kv` ¶

`weights` ¶

`cu_seqlen_ks` ¶

`cu_seqlen_ke` ¶

`rocm_fp8_paged_mqa_logits(q_fp8, kv_cache_fp8, weights, context_lens, block_tables, schedule_metadata, max_model_len)` ¶

`q_fp8` ¶

`kv_cache_fp8` ¶

`weights` ¶

`context_lens` ¶

`block_tables` ¶

`schedule_metadata` ¶

`max_model_len` ¶

`rocm_inv_rope_einsum(rotary_emb, o, positions, rope_head_dim, n_local_groups, o_lora_rank, wo_a)` ¶

vllm.v1.attention.ops.rocm_aiter_mla_sparse ¶

fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke) ¶

q ¶

kv ¶

weights ¶

cu_seqlen_ks ¶

cu_seqlen_ke ¶

rocm_fp8_mqa_logits(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke) ¶

q ¶

kv ¶

weights ¶

cu_seqlen_ks ¶

cu_seqlen_ke ¶

rocm_fp8_paged_mqa_logits(q_fp8, kv_cache_fp8, weights, context_lens, block_tables, schedule_metadata, max_model_len) ¶

q_fp8 ¶

kv_cache_fp8 ¶

weights ¶

context_lens ¶

block_tables ¶

schedule_metadata ¶

max_model_len ¶

rocm_inv_rope_einsum(rotary_emb, o, positions, rope_head_dim, n_local_groups, o_lora_rank, wo_a) ¶

`vllm.v1.attention.ops.rocm_aiter_mla_sparse` ¶

`fp8_mqa_logits_torch(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)` ¶

`q` ¶

`kv` ¶

`weights` ¶

`cu_seqlen_ks` ¶

`cu_seqlen_ke` ¶

`rocm_fp8_mqa_logits(q, kv, weights, cu_seqlen_ks, cu_seqlen_ke)` ¶

`q` ¶

`kv` ¶

`weights` ¶

`cu_seqlen_ks` ¶

`cu_seqlen_ke` ¶

`rocm_fp8_paged_mqa_logits(q_fp8, kv_cache_fp8, weights, context_lens, block_tables, schedule_metadata, max_model_len)` ¶

`q_fp8` ¶

`kv_cache_fp8` ¶

`weights` ¶

`context_lens` ¶

`block_tables` ¶

`schedule_metadata` ¶

`max_model_len` ¶

`rocm_inv_rope_einsum(rotary_emb, o, positions, rope_head_dim, n_local_groups, o_lora_rank, wo_a)` ¶