`vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe` ¶

Utility helpers for the NVFP4 + FlashInfer fused-MoE path.

Functions:

reorder_w1w3_to_w3w1 –

Re-order the concatenated [w1, w3] tensors to [w3, w1]

`interleave_linear_and_gate(x, group_size=64, dim=-1)` ¶

Interleave gate and linear weight rows for CuteDSL wrapper.

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py

def interleave_linear_and_gate(
    x: torch.Tensor,
    group_size: int = 64,
    dim: int = -1,
) -> torch.Tensor:
    """Interleave the two halves of ``x`` along ``dim`` in fixed-size groups.

    The dimension is treated as ``[first_half, second_half]``. Each half is
    cut into chunks of ``group_size`` entries, and the output alternates
    chunks from the two halves: ``first[0], second[0], first[1], ...``.
    For example, with ``group_size=1`` a row ``[a0, a1, a2, a3]`` becomes
    ``[a0, a2, a1, a3]``. This is used to interleave gate and linear weight
    rows for the CuteDSL wrapper.

    Args:
        x: Tensor to rearrange.
        group_size: Number of consecutive entries kept together per chunk.
        dim: Dimension to interleave along; negative values are allowed.

    Returns:
        A tensor with the same shape as ``x`` and the entries of ``dim``
        interleaved as described above.

    Raises:
        AssertionError: If the size of ``dim`` is not a multiple of
            ``2 * group_size``.
    """
    sizes = x.size()
    # Normalise negative dims so that `dim + 1` below addresses the right axis.
    dim = dim % x.dim()
    assert sizes[dim] % (group_size * 2) == 0, (
        f"dim {dim} size {sizes[dim]} must be divisible by {group_size * 2}"
    )

    # Split `dim` into (half, group index, position within group).
    groups_per_half = sizes[dim] // (group_size * 2)
    split_shape = (*sizes[:dim], 2, groups_per_half, group_size, *sizes[dim + 1 :])
    halves_then_groups = x.view(split_shape)

    # Swapping the half and group axes makes the halves alternate per group
    # once the result is flattened back to the original shape.
    groups_then_halves = halves_then_groups.transpose(dim, dim + 1)
    return groups_then_halves.contiguous().view(sizes)

`is_flashinfer_fp4_cutlass_moe_available()` ¶

Return True when FlashInfer CUTLASS NV-FP4 kernels can be used.

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py

def is_flashinfer_fp4_cutlass_moe_available() -> bool:
    """Report whether the FlashInfer CUTLASS NV-FP4 MoE kernels can be used.

    The checks run in this order and stop at the first one that fails:

    1. ``envs.VLLM_USE_FLASHINFER_MOE_FP4`` is set.
    2. ``has_flashinfer_cutlass_fused_moe()`` reports the kernels as present.
    3. The current platform is CUDA.
    4. The current platform has device capability 100.

    Returns:
        A truthy value when every check passes, otherwise the falsy result
        of the first failing check.
    """
    # Each guard returns the operand itself so the result is exactly what a
    # chained `and` expression would produce.
    opted_in = envs.VLLM_USE_FLASHINFER_MOE_FP4
    if not opted_in:
        return opted_in

    kernels_present = has_flashinfer_cutlass_fused_moe()
    if not kernels_present:
        return kernels_present

    on_cuda = current_platform.is_cuda()
    if not on_cuda:
        return on_cuda

    return current_platform.has_device_capability(100)

`prepare_nvfp4_moe_layer_for_flashinfer_cutedsl(layer, w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale, w2_scale_2, a2_scale)` ¶

Prepare weights for the CuteDSL wrapper-based NvFP4 MoE backend.

Swaps the two halves of w13 (and its scales), interleaves the w13 gate/linear rows, and converts the weight scale factors to the MMA layout expected by CuteDslMoEWrapper.

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py

def prepare_nvfp4_moe_layer_for_flashinfer_cutedsl(
    layer: "RoutedExperts",
    w13: torch.Tensor,
    w13_scale: torch.Tensor,
    w13_scale_2: torch.Tensor,
    a13_scale: torch.Tensor,
    w2: torch.Tensor,
    w2_scale: torch.Tensor,
    w2_scale_2: torch.Tensor,
    a2_scale: torch.Tensor,
) -> tuple[
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
    torch.Tensor,
]:
    """Prepare NvFP4 MoE weights for the CuteDSL wrapper-based backend.

    Steps performed, in order:

    1. Reduce each activation scale to its global maximum (as float32) and
       broadcast it to one entry per expert.
    2. Swap the two halves of ``w13`` and ``w13_scale`` along dim 1.
    3. Interleave the rows of ``w13`` and ``w13_scale`` along dim 1 in
       groups of 64 via ``interleave_linear_and_gate``.
    4. Convert ``w13_scale`` and ``w2_scale`` to the MMA layout expected by
       ``CuteDslMoEWrapper`` (swizzle first, then
       ``convert_sf_to_mma_layout``).

    Args:
        layer: Not used by this function.
        w13: Fused w1/w3 expert weights; dim 0 is taken as the expert count.
        w13_scale: Block scale factors for ``w13``.
        w13_scale_2: Returned unchanged.
        a13_scale: Activation scale for the w13 projection.
        w2: Returned unchanged.
        w2_scale: Block scale factors for ``w2``.
        w2_scale_2: Returned unchanged.
        a2_scale: Activation scale for the w2 projection.

    Returns:
        The tuple ``(w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale,
        w2_scale_2, a2_scale)`` with the transformations above applied.
    """
    from flashinfer.cute_dsl.utils import convert_sf_to_mma_layout

    def _scale_to_mma_layout(block_scale: torch.Tensor) -> torch.Tensor:
        # Scale factors go linear → swizzled → MMA layout.
        swizzled = swizzle_blockscale(block_scale)
        n_groups, rows_padded, sf_cols_padded = swizzled.shape
        flattened = swizzled.reshape(n_groups * rows_padded, sf_cols_padded)
        # `k` is passed as the padded scale-column count times the
        # 16-element scale-factor vector size.
        return convert_sf_to_mma_layout(
            flattened,
            m=rows_padded,
            k=sf_cols_padded * 16,
            num_groups=n_groups,
            sf_vec_size=16,
        )

    # Global activation scaling factors: one shared maximum, viewed per expert.
    expert_count = w13.shape[0]
    a13_global = a13_scale.max().to(torch.float32)
    a13_scale = a13_global.expand(expert_count)
    a2_global = a2_scale.max().to(torch.float32)
    a2_scale = a2_global.expand(expert_count)

    # Swap the two halves along dim 1 for both the weights and their scales.
    midpoint = w13.shape[1] // 2
    w13 = torch.cat((w13[:, midpoint:], w13[:, :midpoint]), dim=1)
    w13_scale = torch.cat(
        (w13_scale[:, midpoint:], w13_scale[:, :midpoint]), dim=1
    )

    # Interleave up/gate rows for the w13 weights and scales.
    w13 = interleave_linear_and_gate(w13, group_size=64, dim=1)
    w13_scale = interleave_linear_and_gate(w13_scale, group_size=64, dim=1)

    w13_scale = _scale_to_mma_layout(w13_scale)
    w2_scale = _scale_to_mma_layout(w2_scale)

    return (
        w13,
        w13_scale,
        w13_scale_2,
        a13_scale,
        w2,
        w2_scale,
        w2_scale_2,
        a2_scale,
    )

`reorder_w1w3_to_w3w1(weight, scale, dim=-2)` ¶

Re-order the concatenated [w1, w3] tensors to [w3, w1]

Source code in vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py

def reorder_w1w3_to_w3w1(
    weight: torch.Tensor, scale: torch.Tensor, dim: int = -2
) -> tuple[torch.Tensor, torch.Tensor]:
    """Swap the two halves of ``weight`` and ``scale`` along ``dim``.

    Both tensors are treated as a concatenation ``[w1, w3]`` along ``dim``
    and are returned re-ordered as ``[w3, w1]``.

    Args:
        weight: Tensor holding ``[w1, w3]`` along ``dim``. Its size in
            ``dim`` determines the split point used for both tensors.
        scale: Tensor split at the same point as ``weight``.
        dim: Dimension along which the halves are concatenated.

    Returns:
        ``(weight, scale)`` with the halves swapped, both contiguous.

    Raises:
        AssertionError: If ``weight`` has an odd size in ``dim``.
    """
    size = weight.size(dim)
    assert size % 2 == 0, f"Expected even size in dim {dim}, got {size}"
    half_len = size // 2

    def _swap_halves(tensor: torch.Tensor) -> torch.Tensor:
        # Unpacking into exactly two names also enforces that `tensor`
        # splits into exactly two pieces of `half_len` along `dim`.
        leading, trailing = tensor.split(half_len, dim=dim)
        return torch.cat([trailing, leading], dim=dim).contiguous()

    return _swap_halves(weight), _swap_halves(scale)

vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe ¶

interleave_linear_and_gate(x, group_size=64, dim=-1) ¶

is_flashinfer_fp4_cutlass_moe_available() ¶

prepare_nvfp4_moe_layer_for_flashinfer_cutedsl(layer, w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale, w2_scale_2, a2_scale) ¶

reorder_w1w3_to_w3w1(weight, scale, dim=-2) ¶

`vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe` ¶

`interleave_linear_and_gate(x, group_size=64, dim=-1)` ¶

`is_flashinfer_fp4_cutlass_moe_available()` ¶

`prepare_nvfp4_moe_layer_for_flashinfer_cutedsl(layer, w13, w13_scale, w13_scale_2, a13_scale, w2, w2_scale, w2_scale_2, a2_scale)` ¶

`reorder_w1w3_to_w3w1(weight, scale, dim=-2)` ¶