Skip to content

vllm.model_executor.layers.fused_moe

Modules:

Classes:

Functions:

BatchedDeepGemmExperts

Bases: FusedMoEExpertsModular

Methods:

Source code in vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
class BatchedDeepGemmExperts(mk.FusedMoEExpertsModular):
    def __init__(
        self,
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
        max_num_tokens: int,
        num_dispatchers: int,
    ):
        """
        max_num_tokens: Maximum number of tokens from a DP Rank
        num_dispatchers: The number of DP dispatchers.
        quant_config: Quantization configuration
        """
        super().__init__(
            moe_config=moe_config,
            quant_config=quant_config,
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
        )
        assert self.block_shape == get_mk_alignment_for_contiguous_layout()
        assert self.quant_config.use_fp8_w8a8

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.BatchedExperts

    @staticmethod
    def _supports_current_device() -> bool:
        return is_deep_gemm_supported()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return False

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        SUPPORTED_W_A = [(kFp8Static128BlockSym, kFp8Dynamic128Sym)]
        return (weight_key, activation_key) in SUPPORTED_W_A

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        return activation == MoEActivation.SILU

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        return True

    def supports_packed_ue8m0_act_scales(self) -> bool:
        """
        DeepGemm supports packed ue8m0 activation scales format in devices == sm100
        """
        return (
            is_deep_gemm_e8m0_used()
            and current_platform.is_device_capability_family(100)
        )

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # Let PrepareAndFinalize::finalize() decide the impl.
        return TopKWeightAndReduceDelegate()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        # FIXME (varun): We should be able to dispatch only from the leader
        # DP ranks in the case of TP > 1. At the moment, all the Ranks
        # end up sending their tokens. This needs to be fixed.
        assert self.num_dispatchers is not None
        assert self.max_num_tokens is not None
        num_dispatchers = self.num_dispatchers
        num_experts = local_num_experts
        max_num_tokens = M if self.max_num_tokens is None else self.max_num_tokens
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace13 = (num_experts, max_num_tokens * num_dispatchers, max(K, N))
        workspace2 = (num_experts, max_num_tokens * num_dispatchers, activation_out_dim)
        output = (num_experts, max_num_tokens * num_dispatchers, K)
        return (workspace13, workspace2, output)

    def estimate_expected_m(
        self, global_num_experts: int, max_tokens_per_expert: int, topk: int
    ) -> int:
        dp_meta = (
            get_forward_context().dp_metadata
            if is_forward_context_available()
            else None
        )
        if dp_meta is None:
            logger.warning_once(
                "DPMetadata unavailable. Defaulting expected_m to "
                f"{max_tokens_per_expert}.",
            )
            return max_tokens_per_expert

        total_num_tokens = dp_meta.num_tokens_across_dp_cpu.sum().item()
        total_num_tokens_replicated = total_num_tokens * topk

        # Assume even load balancing
        assert global_num_experts != 0
        estimate = round_up(int(total_num_tokens_replicated // global_num_experts), 16)
        # clamp estimate
        estimate = max(estimate, 16)
        estimate = min(max_tokens_per_expert, estimate)
        return estimate

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        assert expert_tokens_meta is not None
        expert_num_tokens = expert_tokens_meta.expert_num_tokens

        assert hidden_states.ndim == 3
        assert self.block_shape is not None

        a1q = hidden_states
        _, N, K = w1.size()

        assert w2.size(1) == K

        E, max_num_tokens, N, K, _ = self.moe_problem_size(
            hidden_states, w1, w2, topk_ids
        )

        workspace1 = _resize_cache(workspace13, (E, max_num_tokens, N))

        expected_m = self.estimate_expected_m(
            global_num_experts=global_num_experts,
            max_tokens_per_expert=max_num_tokens,
            topk=topk_ids.size(-1),
        )

        fp8_m_grouped_gemm_nt_masked(
            (a1q, a1q_scale),
            (w1, self.w1_scale),
            workspace1,
            expert_num_tokens,
            expected_m,
        )

        quant_scale_fmt = DeepGemmQuantScaleFMT.from_oracle()
        a2q, a2q_scale = persistent_masked_m_silu_mul_quant(
            workspace1,
            expert_num_tokens,
            quant_scale_fmt=quant_scale_fmt,
        )

        fp8_m_grouped_gemm_nt_masked(
            (a2q, a2q_scale),
            (w2, self.w2_scale),
            output,
            expert_num_tokens,
            expected_m,
        )

__init__(moe_config, quant_config, max_num_tokens, num_dispatchers)

max_num_tokens: Maximum number of tokens from a DP Rank num_dispatchers: The number of DP dispatchers. quant_config: Quantization configuration

Source code in vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
def __init__(
    self,
    moe_config: FusedMoEConfig,
    quant_config: FusedMoEQuantConfig,
    max_num_tokens: int,
    num_dispatchers: int,
):
    """
    max_num_tokens: Maximum number of tokens from a DP Rank
    num_dispatchers: The number of DP dispatchers.
    quant_config: Quantization configuration
    """
    super().__init__(
        moe_config=moe_config,
        quant_config=quant_config,
        max_num_tokens=max_num_tokens,
        num_dispatchers=num_dispatchers,
    )
    assert self.block_shape == get_mk_alignment_for_contiguous_layout()
    assert self.quant_config.use_fp8_w8a8

supports_packed_ue8m0_act_scales()

DeepGemm supports packed ue8m0 activation scales format in devices == sm100

Source code in vllm/model_executor/layers/fused_moe/experts/batched_deep_gemm_moe.py
def supports_packed_ue8m0_act_scales(self) -> bool:
    """
    DeepGemm supports packed ue8m0 activation scales format in devices == sm100
    """
    return (
        is_deep_gemm_e8m0_used()
        and current_platform.is_device_capability_family(100)
    )

BatchedTritonExperts

Bases: FusedMoEExpertsModular

A Triton based MoE expert class that operates on expert batched format, i.e. E x max_num_tokens x K. This is the format that the batched dispatch/combine kernels use.

Source code in vllm/model_executor/layers/fused_moe/experts/fused_batched_moe.py
class BatchedTritonExperts(mk.FusedMoEExpertsModular):
    """
    A Triton based MoE expert class that operates on expert batched format,
    i.e. E x max_num_tokens x K.  This is the format that the batched
    dispatch/combine kernels use.
    """

    def __init__(
        self,
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
        max_num_tokens: int,
        num_dispatchers: int,
    ):
        super().__init__(
            moe_config=moe_config,
            quant_config=quant_config,
            max_num_tokens=max_num_tokens,
            num_dispatchers=num_dispatchers,
        )
        assert not self.quant_config.use_int8_w8a8, "NYI"
        assert not self.quant_config.use_int8_w8a16, "NYI"
        assert not self.quant_config.use_int4_w4a16, "NYI"
        assert self.quant_config.ocp_mx_scheme is None, "NYI"

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.BatchedExperts

    @staticmethod
    def _supports_current_device() -> bool:
        return current_platform.is_cuda_alike()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return True

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        p = current_platform
        if p.is_rocm():
            from vllm.platforms.rocm import on_gfx9

            is_rocm_on_gfx9 = on_gfx9()
        else:
            is_rocm_on_gfx9 = False

        device_supports_fp8 = is_rocm_on_gfx9 or (
            p.is_cuda() and p.has_device_capability((8, 9))
        )

        supported: list[tuple[QuantKey | None, QuantKey | None]] = [(None, None)]
        if device_supports_fp8:
            supported += [
                (kFp8Static128BlockSym, kFp8Dynamic128Sym),
                (kFp8StaticChannelSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8StaticTensorSym),
                (kFp8StaticTensorSym, kFp8DynamicTensorSym),
            ]
        return (weight_key, activation_key) in supported

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        return activation in [
            MoEActivation.SILU,
            MoEActivation.GELU,
            MoEActivation.GELU_TANH,
            MoEActivation.SWIGLUOAI,
            MoEActivation.SILU_NO_MUL,
            MoEActivation.GELU_NO_MUL,
            MoEActivation.GELU_TANH_NO_MUL,
            MoEActivation.RELU2_NO_MUL,
        ]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # Let PrepareAndFinalize::finalize() decide the impl.
        return TopKWeightAndReduceDelegate()

    def activation(
        self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        gemm1_clamp_limit = self.quant_config.gemm1_clamp_limit
        if activation == MoEActivation.SILU and gemm1_clamp_limit is not None:
            swiglu_limit_func(output, input, float(gemm1_clamp_limit))
            return

        super().activation(activation, output, input)

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        assert self.num_dispatchers is not None
        assert self.max_num_tokens is not None
        num_dp = self.num_dispatchers
        num_experts = local_num_experts
        max_num_tokens = self.max_num_tokens
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
        workspace2 = (num_experts, max_num_tokens * num_dp, activation_out_dim)
        output = (num_experts, max_num_tokens * num_dp, K)
        return (workspace13, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        # Check constraints.
        if self.quant_config.use_int4_w4a16:
            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
        else:
            assert hidden_states.size(-1) == w1.size(2), (
                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
            )

        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
        assert hidden_states.dtype in [
            torch.float32,
            torch.float16,
            torch.bfloat16,
            torch.float8_e4m3fn,
            torch.float8_e4m3fnuz,
        ]
        assert expert_tokens_meta is not None

        expert_num_tokens = expert_tokens_meta.expert_num_tokens

        E, max_num_tokens, N, K, top_k_num = self.moe_problem_size(
            hidden_states, w1, w2, topk_ids
        )

        assert w1.size(0) == E
        assert w2.size(0) == E

        config_dtype = self.quant_config.config_name(hidden_states.dtype)

        config = try_get_optimal_moe_config(
            w1.size(),
            w2.size(),
            top_k_num,
            config_dtype,
            max_num_tokens,
            block_shape=self.block_shape,
        )

        if hidden_states.dtype == torch.bfloat16:
            compute_type = tl.bfloat16
        elif hidden_states.dtype == torch.float16:
            compute_type = tl.float16
        elif hidden_states.dtype == torch.float32:
            compute_type = tl.float32
        elif hidden_states.dtype == current_platform.fp8_dtype():
            compute_type = tl.bfloat16
        else:
            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")

        # We can reuse the memory between these because by the time we need
        # cache3, we're done with cache1
        intermediate_cache1 = _resize_cache(workspace13, (E, max_num_tokens, N))
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        intermediate_cache2 = _resize_cache(
            workspace2, (E, max_num_tokens, activation_out_dim)
        )

        # TODO(bnell): should this be done for any quantized type?
        if self.quant_config.use_fp8_w8a8:
            intermediate_cache1.fill_(0)

        a1q_scale = normalize_batched_scales_shape(a1q_scale, E)

        # MM1
        invoke_moe_batched_triton_kernel(
            A=hidden_states,
            B=w1,
            C=intermediate_cache1,
            expert_num_tokens=expert_num_tokens,
            compute_type=compute_type,
            A_scale=a1q_scale,
            B_scale=self.w1_scale,
            B_zp=self.w1_zp,
            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
            use_int8_w8a16=self.quant_config.use_int8_w8a16,
            use_int4_w4a16=self.quant_config.use_int4_w4a16,
            config=config,
            per_act_token_quant=self.per_act_token_quant,
            block_shape=self.block_shape,
        )

        intermediate_cache2.fill_(0)

        # TODO (bnell): use triton utility from batched deep gemm.
        self.activation(
            activation,
            intermediate_cache2.view(-1, activation_out_dim),
            intermediate_cache1.view(-1, N),
        )

        qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
            intermediate_cache2,
            a2_scale,
            max_num_tokens,
            E,
            N,
            expert_num_tokens,
            self.quant_dtype,
            self.per_act_token_quant,
            self.block_shape,
        )

        invoke_moe_batched_triton_kernel(
            A=qintermediate_cache2,
            B=w2,
            C=output,
            expert_num_tokens=expert_num_tokens,
            compute_type=compute_type,
            A_scale=a2q_scale,
            B_scale=self.w2_scale,
            B_zp=self.w2_zp,
            use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
            use_int8_w8a16=self.quant_config.use_int8_w8a16,
            use_int4_w4a16=self.quant_config.use_int4_w4a16,
            config=config,
            per_act_token_quant=self.per_act_token_quant,
            block_shape=self.block_shape,
        )

CutlassBatchedExpertsFp8

Bases: CutlassExpertsFp8Base

Batched CUTLASS FP8 fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py
class CutlassBatchedExpertsFp8(CutlassExpertsFp8Base):
    """Batched CUTLASS FP8 fused MoE expert implementation."""

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        # BATCHED activation format works with EP because
        # expert_map is not used to identify experts (the
        # info is encoded/managed by the P/F logic).
        return True

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.BatchedExperts

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        return self.out_dtype if self.out_dtype is not None else act_dtype

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        num_dp = self.num_dispatchers
        assert num_dp is not None
        experts_per_worker = self.moe_config.num_local_experts
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace1 = (experts_per_worker, M * num_dp, max(N, K))
        workspace2 = (
            experts_per_worker,
            M * num_dp,
            max(activation_out_dim, K),
        )
        output = (experts_per_worker, M, K)
        return (workspace1, workspace2, output)

CutlassExpertsFp8

Bases: CutlassExpertsFp8Base

CUTLASS FP8 fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py
class CutlassExpertsFp8(CutlassExpertsFp8Base):
    """CUTLASS FP8 fused MoE expert implementation."""

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        # CutlassExpertsFp8 does not support expert map, which is
        # needed for STANDARD activation format kernels in DP/EP mode.
        # Note that the BATCHED activation format does not use
        # the expert map for identifying experts.
        return not (
            moe_parallel_config.use_fi_nvl_two_sided_kernels
            or moe_parallel_config.use_deepep_ht_kernels
            or moe_parallel_config.use_fi_nvl_one_sided_kernels
        )

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        # topk weights and reduction are fused in moe_unpermute cuda kernel
        return TopKWeightAndReduceNoOP()

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        return self.out_dtype if self.out_dtype is not None else act_dtype

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace1 = (M * topk, max(N, K))
        workspace2 = (M * topk, max(activation_out_dim, K))
        output = (M, K)
        return (workspace1, workspace2, output)

DeepGemmExperts

Bases: FusedMoEExpertsModular

DeepGemm-based fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/deep_gemm_moe.py
class DeepGemmExperts(mk.FusedMoEExpertsModular):
    """DeepGemm-based fused MoE expert implementation."""

    def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig):
        super().__init__(moe_config=moe_config, quant_config=quant_config)
        assert quant_config.block_shape == get_mk_alignment_for_contiguous_layout()
        assert quant_config.quant_dtype == torch.float8_e4m3fn
        assert not quant_config.per_act_token_quant
        assert not quant_config.per_out_ch_quant

        self.gemm1_clamp_limit = quant_config.gemm1_clamp_limit

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_current_device() -> bool:
        return is_deep_gemm_supported()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return False

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        SUPPORTED_W_A = [
            (kFp8Static128BlockSym, kFp8Dynamic128Sym),
        ]
        return (weight_key, activation_key) in SUPPORTED_W_A

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        return activation in [MoEActivation.SILU, MoEActivation.SWIGLUSTEP]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        # NOTE(rob): discovered an IMA with this combination. Needs investigation.
        return not (
            moe_parallel_config.use_fi_nvl_two_sided_kernels
            or moe_parallel_config.use_fi_nvl_one_sided_kernels
        )

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceNoOP()

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        assert self.block_shape is not None
        block_m = self.block_shape[0]
        M_sum = compute_aligned_M(
            M, topk, local_num_experts, block_m, expert_tokens_meta
        )
        assert M_sum % block_m == 0

        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace1 = (M_sum, max(activation_out_dim, K))
        workspace2 = (M_sum, max(N, K))
        output = (M, K)
        return (workspace1, workspace2, output)

    def _act_mul_quant(
        self, input: torch.Tensor, output: torch.Tensor, activation: MoEActivation
    ) -> tuple[torch.Tensor, torch.Tensor]:
        assert self.block_shape is not None
        block_k = self.block_shape[1]
        scale_fmt = DeepGemmQuantScaleFMT.from_oracle()

        M_sum, N = input.size()
        activation_out_dim = self.adjust_N_for_activation(N, activation)

        # 1. DeepGemm UE8M0: fused SiLU+mul+clamp+quant+pack
        if scale_fmt == DeepGemmQuantScaleFMT.UE8M0:
            if activation == MoEActivation.SILU:
                return fused_silu_mul_fp8_quant_packed(
                    input=input,
                    output_q=output,
                    group_size=block_k,
                    clamp_limit=self.gemm1_clamp_limit,
                )
            act_out = torch.empty(
                (M_sum, activation_out_dim), dtype=input.dtype, device=input.device
            )
            self.activation(activation, act_out, input)
            a2q, a2q_scale = per_token_group_quant_fp8_packed_for_deepgemm(
                act_out,
                block_k,
                out_q=output,
            )
            return a2q, a2q_scale

        # 2. Hopper / non‑E8M0: prefer the fused SiLU+mul+quant kernel
        if activation == MoEActivation.SILU:
            use_ue8m0 = scale_fmt == DeepGemmQuantScaleFMT.FLOAT32_CEIL_UE8M0
            return silu_mul_per_token_group_quant_fp8_colmajor(
                input=input,
                output=output,
                use_ue8m0=use_ue8m0,
                clamp_limit=self.gemm1_clamp_limit,
            )

        # 3. fallback path for non-SiLU activations in non‑UE8M0 cases.
        act_out = torch.empty(
            (M_sum, activation_out_dim), dtype=input.dtype, device=input.device
        )
        self.activation(activation, act_out, input)
        return per_token_group_quant_fp8(
            act_out, block_k, column_major_scales=True, out_q=output
        )

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        assert a1q_scale is not None
        assert a2_scale is None
        assert self.block_shape is not None
        assert self.w1_scale is not None
        assert self.w2_scale is not None

        a1q = hidden_states
        _, N, K = w1.size()

        local_num_experts = w1.size(0)
        if global_num_experts == -1:
            global_num_experts = local_num_experts

        assert w2.size(1) == K

        M_sum = compute_aligned_M(
            M=topk_ids.size(0),
            num_topk=topk_ids.size(1),
            local_num_experts=local_num_experts,
            alignment=get_mk_alignment_for_contiguous_layout()[0],
            expert_tokens_meta=expert_tokens_meta,
        )

        a1q_perm = _resize_cache(
            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, K)
        )
        a1q, a1q_scale, expert_ids, inv_perm = deepgemm_moe_permute(
            aq=a1q,
            aq_scale=a1q_scale,
            topk_ids=topk_ids,
            local_num_experts=local_num_experts,
            expert_map=expert_map,
            expert_tokens_meta=expert_tokens_meta,
            aq_out=a1q_perm,
        )
        assert a1q.size(0) == M_sum

        mm1_out = _resize_cache(workspace2, (M_sum, N))
        m_grouped_fp8_gemm_nt_contiguous(
            (a1q, a1q_scale), (w1, self.w1_scale), mm1_out, expert_ids
        )

        activation_out_dim = self.adjust_N_for_activation(N, activation)
        quant_out = _resize_cache(
            workspace13.view(dtype=torch.float8_e4m3fn), (M_sum, activation_out_dim)
        )
        a2q, a2q_scale = self._act_mul_quant(
            input=mm1_out.view(-1, N), output=quant_out, activation=activation
        )

        mm2_out = _resize_cache(workspace2, (M_sum, K))
        m_grouped_fp8_gemm_nt_contiguous(
            (a2q, a2q_scale), (w2, self.w2_scale), mm2_out, expert_ids
        )

        if apply_router_weight_on_input:
            topk_weights = torch.ones_like(topk_weights)

        deepgemm_unpermute_and_reduce(
            a=mm2_out,
            topk_ids=topk_ids,
            topk_weights=topk_weights,
            inv_perm=inv_perm,
            expert_map=expert_map,
            output=output,
        )

FusedMoEActivationFormat

Bases: Enum

The standard activation format (num_tokens, hidden dim).

Attributes:

  • Standard

    The batched experts format (num experts, max tokens per expert, hidden dim)

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
class FusedMoEActivationFormat(Enum):
    """
    The standard activation format (num_tokens, hidden dim).
    """

    Standard = ("standard",)
    """
    The batched experts format (num experts, max tokens per expert, hidden dim)
    """
    BatchedExperts = ("batched_experts",)

Standard = ('standard',) class-attribute instance-attribute

The batched experts format (num experts, max tokens per expert, hidden dim)

FusedMoEExpertsModular

Bases: FusedMoEExperts

An abstract base class for the [Permute-Experts-Unpermute] step described above.

Methods:

  • adjust_N_for_activation

    Calculate the output dimension for the activation function.

  • apply

    This function computes the intermediate result of a Mixture of Experts

  • moe_problem_size

    Extract the MoE problem size from the given tensor arguments:

  • workspace_dtype

    Workspace type: The dtype to use for the workspace tensors.

  • workspace_shapes

    Compute the shapes for the temporary and final outputs of the two gemms

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
class FusedMoEExpertsModular(FusedMoEExperts):
    """
    An abstract base class for the [Permute-Experts-Unpermute] step described
        above.
    """

    @staticmethod
    def is_monolithic() -> bool:
        return False

    def moe_problem_size(
        self,
        a1: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_ids: torch.Tensor,
    ) -> tuple[int, int, int, int, int]:
        """
        Extract the MoE problem size from the given tensor arguments:
        - a: The hidden states, input to the MoE layer.
        - w1: The first set of expert weights.
        - w2: The second set of expert weights.
        - topk_ids: The topk ids.

        Note: extracting the problem shape from the weight and activation
        tensors is not obvious.  It needs to be done this way specifically
        due to subtle issues with particular kernels, e.g. the int4 kernels
        divide the trailing dimension by two, so it's not "correct" to
        extract N or K from the trailing dimension of w1 or w2.  Similarly,
        some kernels transpose the weights, so this needs to be kept in mind.

        Note: This implementation covers most cases. However, if experts
        require a specialized implementation, like MarlinExperts, they are free
        to override this function.
        """
        assert len(w1.shape) == 3 and len(w2.shape) == 3
        E, N, _ = w1.shape
        K = a1.size(-1)

        if a1.dim() == 2:
            # Make sure we are using the correct a1 (pre-permute).
            assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
            M = a1.size(0)
        else:
            assert a1.dim() == 3
            assert a1.size(0) == E, f"{a1.size(0)} == {E}"
            M = a1.size(1)  # This is max_num_tokens

        assert topk_ids.dim() == 2
        topk = topk_ids.size(1)

        return E, M, N, K, topk

    def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
        """
        Workspace type: The dtype to use for the workspace tensors.
        """
        return act_dtype

    @abstractmethod
    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        """
        Compute the shapes for the temporary and final outputs of the two gemms
        and activation in the fused expert function.  Since the gemms are
        independent, the workspace for the first gemm can be shared with the
        workspace for the last gemm.

        Inputs:
        - M: number of tokens.
        - N: Row (or column) dimension of expert weights.
        - K: hidden dimension
        - topk: The number of top-k experts to select.
        - global_num_experts: global number of experts.
        - local_num_experts: local number of experts due to DP/EP.
        - expert_tokens_meta: number of tokens per expert metadata for batched
                              format.

        Returns a tuple of:
        - workspace13 shape tuple: must be large enough to hold the
          result of either expert gemm.
        - workspace2 shape tuple: must be large enough to hold the
          result of the activation function.
        - output shape tuple: must be exact size of the final gemm output.
        - Note: workspace shapes can be 0 if the workspace is not needed.
          But in order for activation chunking to work, the first dimension
          of each tuple must be the number of tokens when the shape is
          not 0.
        """
        raise NotImplementedError

    @staticmethod
    def adjust_N_for_activation(N: int, activation: MoEActivation) -> int:
        """
        Calculate the output dimension for the activation function.

        For *_no_mul activations (e.g. relu2_no_mul),
        there's no gate/up split, so output size equals input size (N).

        For regular gated activations (e.g., silu, gelu, swigluoai),
        output size is N // 2 due to gate × activation(up) multiplication.

        Args:
            N: The intermediate size (width of w1/w3 weights).
            activation: The activation function enum.

        Returns:
            The output dimension after activation.
        """
        return N if not activation.is_gated else N // 2

    def activation(
        self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        apply_moe_activation(activation, output, input)

    @abstractmethod
    def finalize_weight_and_reduce_impl(self) -> TopKWeightAndReduce:
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ) -> None:
        """
        This function computes the intermediate result of a Mixture of Experts
        (MoE) layer using two sets of weights, w1 and w2.

        Parameters:
        - output: (torch.Tensor): The unweighted, unreduced output tensor.
        - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE
          layer.
        - w1 (torch.Tensor): The first set of expert weights.
        - w2 (torch.Tensor): The second set of expert weights.
        - topk_weights: A map of row to expert weights. Some implementations
          choose to do weight application.
        - topk_ids (torch.Tensor): A map of row to expert id.
        - activation (str): The activation function to apply after the first
          MoE layer.
        - global_num_experts (int): The total number of experts in the global
          expert space.
        - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
          from the global expert space to the local expert space of the expert
          parallel shard.
        - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be
          used for a1.  Result of quantization from prepare/finalize and not
          from the FusedMoEQuantConfig.
        - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs
          must be large enough to hold output of either MoE gemm.
        - workspace2 (torch.Tensor): A scratch tensor used for the activation
          function.
        - expert_tokens_meta (Optional[ExpertTokensMetadata]) - An optional
          ExpertTokensMetadata object containing gpu/cpu tensors
          as big as the number of local experts with the information about the
          number of tokens assigned to each local expert.
        - apply_router_weight_on_input: True if router weights are already
          applied on the input. This is relevant if the implementation
          chooses to do weight application.
        """
        raise NotImplementedError

adjust_N_for_activation(N, activation) staticmethod

Calculate the output dimension for the activation function.

For *_no_mul activations (e.g. relu2_no_mul), there's no gate/up split, so output size equals input size (N).

For regular gated activations (e.g., silu, gelu, swigluoai), output size is N // 2 due to gate × activation(up) multiplication.

Parameters:

  • N

    (int) –

    The intermediate size (width of w1/w3 weights).

  • activation

    (MoEActivation) –

    The activation function enum.

Returns:

  • int

    The output dimension after activation.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
@staticmethod
def adjust_N_for_activation(N: int, activation: MoEActivation) -> int:
    """
    Calculate the output dimension for the activation function.

    For *_no_mul activations (e.g. relu2_no_mul),
    there's no gate/up split, so output size equals input size (N).

    For regular gated activations (e.g., silu, gelu, swigluoai),
    output size is N // 2 due to gate × activation(up) multiplication.

    Args:
        N: The intermediate size (width of w1/w3 weights).
        activation: The activation function enum.

    Returns:
        The output dimension after activation.
    """
    return N if not activation.is_gated else N // 2

apply(output, hidden_states, w1, w2, topk_weights, topk_ids, activation, global_num_experts, expert_map, a1q_scale, a2_scale, workspace13, workspace2, expert_tokens_meta, apply_router_weight_on_input) abstractmethod

This function computes the intermediate result of a Mixture of Experts (MoE) layer using two sets of weights, w1 and w2.

Parameters: - output: (torch.Tensor): The unweighted, unreduced output tensor. - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE layer. - w1 (torch.Tensor): The first set of expert weights. - w2 (torch.Tensor): The second set of expert weights. - topk_weights: A map of row to expert weights. Some implementations choose to do weight application. - topk_ids (torch.Tensor): A map of row to expert id. - activation (str): The activation function to apply after the first MoE layer. - global_num_experts (int): The total number of experts in the global expert space. - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be used for a1. Result of quantization from prepare/finalize and not from the FusedMoEQuantConfig. - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs must be large enough to hold output of either MoE gemm. - workspace2 (torch.Tensor): A scratch tensor used for the activation function. - expert_tokens_meta (Optional[ExpertTokensMetadata]) - An optional ExpertTokensMetadata object containing gpu/cpu tensors as big as the number of local experts with the information about the number of tokens assigned to each local expert. - apply_router_weight_on_input: True if router weights are already applied on the input. This is relevant if the implementation chooses to do weight application.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
@abstractmethod
def apply(
    self,
    output: torch.Tensor,
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation,
    global_num_experts: int,
    expert_map: torch.Tensor | None,
    a1q_scale: torch.Tensor | None,
    a2_scale: torch.Tensor | None,
    workspace13: torch.Tensor,
    workspace2: torch.Tensor,
    expert_tokens_meta: ExpertTokensMetadata | None,
    apply_router_weight_on_input: bool,
) -> None:
    """
    This function computes the intermediate result of a Mixture of Experts
    (MoE) layer using two sets of weights, w1 and w2.

    Parameters:
    - output: (torch.Tensor): The unweighted, unreduced output tensor.
    - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE
      layer.
    - w1 (torch.Tensor): The first set of expert weights.
    - w2 (torch.Tensor): The second set of expert weights.
    - topk_weights: A map of row to expert weights. Some implementations
      choose to do weight application.
    - topk_ids (torch.Tensor): A map of row to expert id.
    - activation (str): The activation function to apply after the first
      MoE layer.
    - global_num_experts (int): The total number of experts in the global
      expert space.
    - expert_map (Optional[torch.Tensor]):  A tensor mapping expert indices
      from the global expert space to the local expert space of the expert
      parallel shard.
    - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be
      used for a1.  Result of quantization from prepare/finalize and not
      from the FusedMoEQuantConfig.
    - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs
      must be large enough to hold output of either MoE gemm.
    - workspace2 (torch.Tensor): A scratch tensor used for the activation
      function.
    - expert_tokens_meta (Optional[ExpertTokensMetadata]) - An optional
      ExpertTokensMetadata object containing gpu/cpu tensors
      as big as the number of local experts with the information about the
      number of tokens assigned to each local expert.
    - apply_router_weight_on_input: True if router weights are already
      applied on the input. This is relevant if the implementation
      chooses to do weight application.
    """
    raise NotImplementedError

moe_problem_size(a1, w1, w2, topk_ids)

Extract the MoE problem size from the given tensor arguments: - a: The hidden states, input to the MoE layer. - w1: The first set of expert weights. - w2: The second set of expert weights. - topk_ids: The topk ids.

Note: extracting the problem shape from the weight and activation tensors is not obvious. It needs to be done this way specifically due to subtle issues with particular kernels, e.g. the int4 kernels divide the trailing dimension by two, so it's not "correct" to extract N or K from the trailing dimension of w1 or w2. Similarly, some kernels transpose the weights, so this needs to be kept in mind.

Note: This implementation covers most cases. However, if experts require a specialized implementation, like MarlinExperts, they are free to override this function.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
def moe_problem_size(
    self,
    a1: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_ids: torch.Tensor,
) -> tuple[int, int, int, int, int]:
    """
    Extract the MoE problem size from the given tensor arguments:
    - a: The hidden states, input to the MoE layer.
    - w1: The first set of expert weights.
    - w2: The second set of expert weights.
    - topk_ids: The topk ids.

    Note: extracting the problem shape from the weight and activation
    tensors is not obvious.  It needs to be done this way specifically
    due to subtle issues with particular kernels, e.g. the int4 kernels
    divide the trailing dimension by two, so it's not "correct" to
    extract N or K from the trailing dimension of w1 or w2.  Similarly,
    some kernels transpose the weights, so this needs to be kept in mind.

    Note: This implementation covers most cases. However, if experts
    require a specialized implementation, like MarlinExperts, they are free
    to override this function.
    """
    assert len(w1.shape) == 3 and len(w2.shape) == 3
    E, N, _ = w1.shape
    K = a1.size(-1)

    if a1.dim() == 2:
        # Make sure we are using the correct a1 (pre-permute).
        assert topk_ids.size(0) == a1.size(0), f"{topk_ids.size(0)} != {a1.size(0)}"
        M = a1.size(0)
    else:
        assert a1.dim() == 3
        assert a1.size(0) == E, f"{a1.size(0)} == {E}"
        M = a1.size(1)  # This is max_num_tokens

    assert topk_ids.dim() == 2
    topk = topk_ids.size(1)

    return E, M, N, K, topk

workspace_dtype(act_dtype)

Workspace type: The dtype to use for the workspace tensors.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype:
    """
    Workspace type: The dtype to use for the workspace tensors.
    """
    return act_dtype

workspace_shapes(M, N, K, topk, global_num_experts, local_num_experts, expert_tokens_meta, activation) abstractmethod

Compute the shapes for the temporary and final outputs of the two gemms and activation in the fused expert function. Since the gemms are independent, the workspace for the first gemm can be shared with the workspace for the last gemm.

Inputs: - M: number of tokens. - N: Row (or column) dimension of expert weights. - K: hidden dimension - topk: The number of top-k experts to select. - global_num_experts: global number of experts. - local_num_experts: local number of experts due to DP/EP. - expert_tokens_meta: number of tokens per expert metadata for batched format.

Returns a tuple of: - workspace13 shape tuple: must be large enough to hold the result of either expert gemm. - workspace2 shape tuple: must be large enough to hold the result of the activation function. - output shape tuple: must be exact size of the final gemm output. - Note: workspace shapes can be 0 if the workspace is not needed. But in order for activation chunking to work, the first dimension of each tuple must be the number of tokens when the shape is not 0.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
@abstractmethod
def workspace_shapes(
    self,
    M: int,
    N: int,
    K: int,
    topk: int,
    global_num_experts: int,
    local_num_experts: int,
    expert_tokens_meta: ExpertTokensMetadata | None,
    activation: MoEActivation,
) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
    """
    Compute the shapes for the temporary and final outputs of the two gemms
    and activation in the fused expert function.  Since the gemms are
    independent, the workspace for the first gemm can be shared with the
    workspace for the last gemm.

    Inputs:
    - M: number of tokens.
    - N: Row (or column) dimension of expert weights.
    - K: hidden dimension
    - topk: The number of top-k experts to select.
    - global_num_experts: global number of experts.
    - local_num_experts: local number of experts due to DP/EP.
    - expert_tokens_meta: number of tokens per expert metadata for batched
                          format.

    Returns a tuple of:
    - workspace13 shape tuple: must be large enough to hold the
      result of either expert gemm.
    - workspace2 shape tuple: must be large enough to hold the
      result of the activation function.
    - output shape tuple: must be exact size of the final gemm output.
    - Note: workspace shapes can be 0 if the workspace is not needed.
      But in order for activation chunking to work, the first dimension
      of each tuple must be the number of tokens when the shape is
      not 0.
    """
    raise NotImplementedError

FusedMoEMethodBase

Bases: QuantizeMethodBase

Methods:

Attributes:

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
class FusedMoEMethodBase(QuantizeMethodBase):
    def __init__(self, moe: FusedMoEConfig):
        super().__init__()
        self.moe: FusedMoEConfig = moe
        self.moe_quant_config: FusedMoEQuantConfig | None = None
        self.moe_kernel: mk.FusedMoEKernel | None = None

    @property
    def supports_internal_mk(self) -> bool:
        # NOTE(rob): temporary attribute to indicate support for
        # completed migration to the new internal MK interface.
        return self.moe_kernel is not None

    @property
    def mk_can_overlap_shared_experts(self) -> bool:
        # NOTE(rob): temporary attribute to indicate support for
        # completed migration to the new internal MK interface.
        return (
            self.moe_kernel is not None and self.moe_kernel.can_overlap_shared_experts
        )

    @abstractmethod
    def create_weights(
        self,
        layer: "RoutedExperts",
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        raise NotImplementedError

    def uses_weight_scale_2_pattern(self) -> bool:
        """
        Returns True if this quantization method uses 'weight_scale_2' pattern
        for per-tensor weight scales (e.g., FP4 variants), False otherwise.

        This method should be overridden by subclasses that use the
        'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
        """
        return False

    def maybe_roundup_sizes(
        self,
        hidden_size: int,
        intermediate_size_per_partition: int,
        act_dtype: torch.dtype,
        moe_parallel_config: FusedMoEParallelConfig,
    ) -> tuple[int, int]:
        """
        Given layer hidden size and intermediate size per partition and MoE
        configurations, round up hidden_size and intermediate_size_per_partition
        if necessary.

        Args:
            hidden_size: Layer hidden-size
            intermediate_size_per_partition: Intermediate size per partition for
                the layer.
            act_dtype: Data type of the layer activations.
            moe_parallel_config: Fused MoE parallelization strategy configuration.

        Return:
            A tuple of (rounded_hidden_size, rounded_intermediate_size_per_partition),
            where:
                - rounded_hidden_size is the possibly rounded up hidden size.
                - rounded_intermediate_size_per_partition is the possibly rounded
                  up intermediate size per partition.
        """
        from .all2all_utils import maybe_roundup_layer_hidden_size

        return maybe_roundup_layer_hidden_size(
            hidden_size, act_dtype, moe_parallel_config
        ), intermediate_size_per_partition

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ) -> FusedMoEPrepareAndFinalizeModular | None:
        from .all2all_utils import maybe_make_prepare_finalize

        pf = maybe_make_prepare_finalize(
            self.moe, self.moe_quant_config, routing_tables
        )
        assert pf is None or isinstance(pf, FusedMoEPrepareAndFinalizeModular)
        return pf

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalizeModular,
        layer: "RoutedExperts",
    ) -> FusedMoEExpertsModular:
        # based on the all2all implementation, select the appropriate
        # gemm implementation
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic. This function should not be called."
        )

    @abstractmethod
    def get_fused_moe_quant_config(
        self, layer: "RoutedExperts"
    ) -> FusedMoEQuantConfig | None:
        raise NotImplementedError

    @property
    def topk_indices_dtype(self) -> torch.dtype | None:
        if self.moe_kernel is not None:
            return self.moe_kernel.prepare_finalize.topk_indices_dtype()
        return None

    @property
    def skip_forward_padding(self) -> bool:
        """Whether to skip the padding in the forward before applying the moe method."""
        return False

    @property
    def has_unpadded_output(self) -> bool:
        """
        Indicates that the hidden_states output might be the unpadded
        hidden_states shape rather than the full padded shape.
        """
        return False

    @property
    def supports_eplb(self) -> bool:
        return False

    @property
    def method_name(self) -> str:
        return self.__class__.__name__

    @property
    def is_monolithic(self) -> bool:
        if self.moe_kernel is None:
            if hasattr(self, "experts_cls"):
                return self.experts_cls.is_monolithic()
            else:
                return False
        return self.moe_kernel.is_monolithic

    def apply(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: "SharedExperts | None",
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        """
        Apply the MoE operation using modular kernels.

        Args:
            layer: RoutedExperts instance containing weight parameters
            x: Input tensor
            topk_weights: Expert weights from router
            topk_ids: Selected expert IDs from router
            shared_experts_input: Input for shared experts (if any)

        Returns:
            Output tensor from routed experts
        """
        raise NotImplementedError

    def apply_monolithic(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        router_logits: torch.Tensor,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Apply the MoE operation using monolithic kernels.

        Args:
            layer: RoutedExperts instance containing weight parameters
            x: Input tensor
            router_logits: Router logits (routing done internally)

        Returns:
            Output tensor from routed experts
        """
        raise NotImplementedError

has_unpadded_output property

Indicates that the hidden_states output might be the unpadded hidden_states shape rather than the full padded shape.

skip_forward_padding property

Whether to skip the padding in the forward before applying the moe method.

apply(layer, x, topk_weights, topk_ids, shared_experts, shared_experts_input)

Apply the MoE operation using modular kernels.

Parameters:

  • layer

    (RoutedExperts) –

    RoutedExperts instance containing weight parameters

  • x

    (Tensor) –

    Input tensor

  • topk_weights

    (Tensor) –

    Expert weights from router

  • topk_ids

    (Tensor) –

    Selected expert IDs from router

  • shared_experts_input

    (Tensor | None) –

    Input for shared experts (if any)

Returns:

  • Tensor

    Output tensor from routed experts

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def apply(
    self,
    layer: "RoutedExperts",
    x: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    shared_experts: "SharedExperts | None",
    shared_experts_input: torch.Tensor | None,
) -> torch.Tensor:
    """
    Apply the MoE operation using modular kernels.

    Args:
        layer: RoutedExperts instance containing weight parameters
        x: Input tensor
        topk_weights: Expert weights from router
        topk_ids: Selected expert IDs from router
        shared_experts_input: Input for shared experts (if any)

    Returns:
        Output tensor from routed experts
    """
    raise NotImplementedError

apply_monolithic(layer, x, router_logits, input_ids=None)

Apply the MoE operation using monolithic kernels.

Parameters:

  • layer

    (RoutedExperts) –

    RoutedExperts instance containing weight parameters

  • x

    (Tensor) –

    Input tensor

  • router_logits

    (Tensor) –

    Router logits (routing done internally)

Returns:

  • Tensor

    Output tensor from routed experts

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def apply_monolithic(
    self,
    layer: "RoutedExperts",
    x: torch.Tensor,
    router_logits: torch.Tensor,
    input_ids: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Apply the MoE operation using monolithic kernels.

    Args:
        layer: RoutedExperts instance containing weight parameters
        x: Input tensor
        router_logits: Router logits (routing done internally)

    Returns:
        Output tensor from routed experts
    """
    raise NotImplementedError

maybe_roundup_sizes(hidden_size, intermediate_size_per_partition, act_dtype, moe_parallel_config)

Given layer hidden size and intermediate size per partition and MoE configurations, round up hidden_size and intermediate_size_per_partition if necessary.

Parameters:

  • hidden_size

    (int) –

    Layer hidden-size

  • intermediate_size_per_partition

    (int) –

    Intermediate size per partition for the layer.

  • act_dtype

    (dtype) –

    Data type of the layer activations.

  • moe_parallel_config

    (FusedMoEParallelConfig) –

    Fused MoE parallelization strategy configuration.

Return

A tuple of (rounded_hidden_size, rounded_intermediate_size_per_partition), where: - rounded_hidden_size is the possibly rounded up hidden size. - rounded_intermediate_size_per_partition is the possibly rounded up intermediate size per partition.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def maybe_roundup_sizes(
    self,
    hidden_size: int,
    intermediate_size_per_partition: int,
    act_dtype: torch.dtype,
    moe_parallel_config: FusedMoEParallelConfig,
) -> tuple[int, int]:
    """
    Given layer hidden size and intermediate size per partition and MoE
    configurations, round up hidden_size and intermediate_size_per_partition
    if necessary.

    Args:
        hidden_size: Layer hidden-size
        intermediate_size_per_partition: Intermediate size per partition for
            the layer.
        act_dtype: Data type of the layer activations.
        moe_parallel_config: Fused MoE parallelization strategy configuration.

    Return:
        A tuple of (rounded_hidden_size, rounded_intermediate_size_per_partition),
        where:
            - rounded_hidden_size is the possibly rounded up hidden size.
            - rounded_intermediate_size_per_partition is the possibly rounded
              up intermediate size per partition.
    """
    from .all2all_utils import maybe_roundup_layer_hidden_size

    return maybe_roundup_layer_hidden_size(
        hidden_size, act_dtype, moe_parallel_config
    ), intermediate_size_per_partition

uses_weight_scale_2_pattern()

Returns True if this quantization method uses 'weight_scale_2' pattern for per-tensor weight scales (e.g., FP4 variants), False otherwise.

This method should be overridden by subclasses that use the 'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.

Source code in vllm/model_executor/layers/fused_moe/fused_moe_method_base.py
def uses_weight_scale_2_pattern(self) -> bool:
    """
    Returns True if this quantization method uses 'weight_scale_2' pattern
    for per-tensor weight scales (e.g., FP4 variants), False otherwise.

    This method should be overridden by subclasses that use the
    'weight_scale_2' pattern instead of the standard 'weight_scale' pattern.
    """
    return False

FusedMoEParallelConfig dataclass

Methods:

  • make

    Determine MoE parallel configuration. Based on the input tp_size_,

  • make_no_parallel

    For usage in CI/CD and testing.

Source code in vllm/model_executor/layers/fused_moe/config.py
@dataclass
class FusedMoEParallelConfig:
    tp_size: int
    pcp_size: int
    dp_size: int
    ep_size: int
    tp_rank: int
    pcp_rank: int
    dp_rank: int
    ep_rank: int
    sp_size: int

    use_ep: bool  # whether to use EP or not
    all2all_backend: str  # all2all backend for MoE communication
    enable_eplb: bool  # whether to enable expert load balancing

    @property
    def is_sequence_parallel(self) -> bool:
        return self.sp_size > 1

    @property
    def use_all2all_kernels(self):
        return self.dp_size > 1 and self.use_ep

    @property
    def use_deepep_ht_kernels(self):
        return (
            self.use_all2all_kernels
            and self.all2all_backend == "deepep_high_throughput"
        )

    @property
    def use_deepep_ll_kernels(self):
        return self.use_all2all_kernels and self.all2all_backend == "deepep_low_latency"

    @property
    def use_fi_nvl_two_sided_kernels(self):
        return self.use_all2all_kernels and (
            self.all2all_backend == "flashinfer_all2allv"
            or self.all2all_backend == "flashinfer_nvlink_two_sided"
        )

    @property
    def use_fi_nvl_one_sided_kernels(self):
        return (
            self.use_all2all_kernels
            and self.all2all_backend == "flashinfer_nvlink_one_sided"
        )

    @property
    def use_batched_activation_format(self):
        return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels

    @property
    def needs_round_robin_routing_tables(self):
        return self.use_deepep_ll_kernels or self.use_nixl_ep_kernels

    @property
    def use_ag_rs_all2all_kernels(self):
        return (
            self.use_all2all_kernels
            and self.all2all_backend == "allgather_reducescatter"
        )

    @property
    def use_mori_kernels(self):
        return self.use_all2all_kernels and self.all2all_backend in (
            "mori_high_throughput",
            "mori_low_latency",
        )

    @property
    def use_nixl_ep_kernels(self):
        return self.use_all2all_kernels and self.all2all_backend == "nixl_ep"

    @property
    def use_deepep_v2_kernels(self):
        return self.use_all2all_kernels and self.all2all_backend == "deepep_v2"

    @staticmethod
    def flatten_tp_across_dp_and_pcp(
        tp_size: int, dp_size: int, dp_rank: int, pcp_size: int, pcp_rank: int
    ) -> tuple[int, int]:
        tp_rank = 0 if tp_size == 1 else get_tensor_model_parallel_rank()
        # There are actually dp_size * pcp_size * tp_size devices.
        # Update tp_size and tp_rank so we shard across all devices.
        flatten_tp_size = dp_size * pcp_size * tp_size
        flatten_tp_rank = dp_rank * pcp_size * tp_size + pcp_rank * tp_size + tp_rank
        return flatten_tp_size, flatten_tp_rank

    @staticmethod
    def make(
        tp_size_: int,
        pcp_size_: int,
        dp_size_: int,
        sp_size_: int,
        vllm_parallel_config: ParallelConfig,
    ) -> "FusedMoEParallelConfig":
        """
        Determine MoE parallel configuration. Based on the input `tp_size_`,
        `dp_size_` and vllm's parallel config, determine what
        level's of parallelism to use in the fused moe layer.

        Args:
            tp_size_ (int): `tp_size` passed into the FusedMoE constructor.
            pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor.
            dp_size_ (int): `dp_size` passed into the FusedMoE constructor.
            vllm_parallel_config (ParallelConfig): vLLM's parallel config
                object which contains the `enable_expert_parallel` flag.

        Examples:
            When there is no parallelism requested,
            i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes
            unaltered and the ranks set to 0.

            Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or
            `tp_size_` is non trivial.

            Note that PCP serves the same function as DP here.

            When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different
            devices:

            - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
                legend : {size, rank}
            - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
            - Comment : Tensors are sharded across 2 devices.

            When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different
                devices:

            - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
            - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
            - Comment: There are 2 engine instances and the tensors are sharded
                across 2 decvices.

            When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different
                devices:

            - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
            - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
            - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
            - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
            - Comment: There are 2 engine instances and the tensors are sharded
                across 4 devices.

            When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different
                devices:

            - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
            - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
            - Comment: The experts are split between the 2 devices.

            When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different
                devices:

            - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
            - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
            - Comment: There are 2 engine instances and the experts are split
                between the 2 devices.

            When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different
                devices:

            - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
            - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
            - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
            - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
            - Comment: There are 2 engine instances and the experts are split
                between the 4 devices.
        """
        use_ep = (
            dp_size_ * pcp_size_ * tp_size_ > 1
            and vllm_parallel_config.enable_expert_parallel
        )

        dp_size = dp_size_
        dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
        pcp_size = pcp_size_
        pcp_rank = get_pcp_group().rank_in_group if pcp_size > 1 else 0
        tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
            tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank
        )

        if not use_ep:
            return FusedMoEParallelConfig(
                tp_size=tp_size,
                tp_rank=tp_rank,
                pcp_size=pcp_size,
                pcp_rank=pcp_rank,
                dp_size=dp_size,
                dp_rank=dp_rank,
                ep_size=1,
                ep_rank=0,
                sp_size=sp_size_,
                use_ep=False,
                all2all_backend=vllm_parallel_config.all2all_backend,
                enable_eplb=vllm_parallel_config.enable_eplb,
            )
        # DP + EP / TP + EP / DP + TP + EP
        assert use_ep
        # In EP, each device owns a set of experts fully. There is no tensor
        # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that.
        ep_size = tp_size
        ep_rank = tp_rank
        return FusedMoEParallelConfig(
            tp_size=1,
            tp_rank=0,
            pcp_size=pcp_size,
            pcp_rank=pcp_rank,
            dp_size=dp_size,
            dp_rank=dp_rank,
            ep_size=ep_size,
            ep_rank=ep_rank,
            sp_size=sp_size_,
            use_ep=True,
            all2all_backend=vllm_parallel_config.all2all_backend,
            enable_eplb=vllm_parallel_config.enable_eplb,
        )

    @classmethod
    def make_no_parallel(cls) -> "FusedMoEParallelConfig":
        """For usage in CI/CD and testing."""
        return FusedMoEParallelConfig(
            tp_size=1,
            tp_rank=0,
            pcp_size=1,
            pcp_rank=0,
            dp_size=1,
            dp_rank=0,
            ep_size=1,
            ep_rank=0,
            sp_size=1,
            use_ep=False,
            all2all_backend="allgather_reducescatter",
            enable_eplb=False,
        )

make(tp_size_, pcp_size_, dp_size_, sp_size_, vllm_parallel_config) staticmethod

Determine MoE parallel configuration. Based on the input tp_size_, dp_size_ and vllm's parallel config, determine what level's of parallelism to use in the fused moe layer.

Parameters:

  • tp_size_

    (int) –

    tp_size passed into the FusedMoE constructor.

  • pcp_size_

    (int) –

    pcp_size passed into the FusedMoE constructor.

  • dp_size_

    (int) –

    dp_size passed into the FusedMoE constructor.

  • vllm_parallel_config

    (ParallelConfig) –

    vLLM's parallel config object which contains the enable_expert_parallel flag.

Examples:

When there is no parallelism requested, i.e. tp_size_ = pcp_size_ = dp_size_ = 1, we simply return the sizes unaltered and the ranks set to 0.

Expert Parallelism is considered only when either dp_size_, pcp_size_ or tp_size_ is non trivial.

Note that PCP serves the same function as DP here.

When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different devices:

  • device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // legend : {size, rank}
  • device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
  • Comment : Tensors are sharded across 2 devices.

When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different devices:

  • device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
  • device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
  • Comment: There are 2 engine instances and the tensors are sharded across 2 decvices.

When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different devices:

  • device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
  • device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
  • device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
  • device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
  • Comment: There are 2 engine instances and the tensors are sharded across 4 devices.

When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different devices:

  • device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
  • device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
  • Comment: The experts are split between the 2 devices.

When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different devices:

  • device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
  • device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
  • Comment: There are 2 engine instances and the experts are split between the 2 devices.

When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different devices:

  • device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
  • device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
  • device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
  • device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
  • Comment: There are 2 engine instances and the experts are split between the 4 devices.
Source code in vllm/model_executor/layers/fused_moe/config.py
@staticmethod
def make(
    tp_size_: int,
    pcp_size_: int,
    dp_size_: int,
    sp_size_: int,
    vllm_parallel_config: ParallelConfig,
) -> "FusedMoEParallelConfig":
    """
    Determine MoE parallel configuration. Based on the input `tp_size_`,
    `dp_size_` and vllm's parallel config, determine what
    level's of parallelism to use in the fused moe layer.

    Args:
        tp_size_ (int): `tp_size` passed into the FusedMoE constructor.
        pcp_size_ (int): `pcp_size` passed into the FusedMoE constructor.
        dp_size_ (int): `dp_size` passed into the FusedMoE constructor.
        vllm_parallel_config (ParallelConfig): vLLM's parallel config
            object which contains the `enable_expert_parallel` flag.

    Examples:
        When there is no parallelism requested,
        i.e. `tp_size_` = `pcp_size_` = `dp_size_` = 1, we simply return the sizes
        unaltered and the ranks set to 0.

        Expert Parallelism is considered only when either `dp_size_`, `pcp_size_` or
        `tp_size_` is non trivial.

        Note that PCP serves the same function as DP here.

        When TP = 2, DP(PCP) = 1 and EP = False, the configuration on different
        devices:

        - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} //
            legend : {size, rank}
        - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0}
        - Comment : Tensors are sharded across 2 devices.

        When TP = 1, DP(PCP) = 2 and EP = False, the configuration on different
            devices:

        - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0}
        - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0}
        - Comment: There are 2 engine instances and the tensors are sharded
            across 2 decvices.

        When TP = 2, DP(PCP) = 2 and EP = False, the configuration on different
            devices:

        - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0}
        - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0}
        - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0}
        - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0}
        - Comment: There are 2 engine instances and the tensors are sharded
            across 4 devices.

        When, TP = 2, DP(PCP) = 1 and EP = True, the configuration on different
            devices:

        - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0}
        - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1}
        - Comment: The experts are split between the 2 devices.

        When, TP = 1, DP(PCP) = 2 and EP = True, the configuration on different
            devices:

        - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0}
        - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1}
        - Comment: There are 2 engine instances and the experts are split
            between the 2 devices.

        When TP = 2, DP(PCP) = 2 and EP = True, the configuration on different
            devices:

        - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0}
        - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1}
        - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2}
        - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3}
        - Comment: There are 2 engine instances and the experts are split
            between the 4 devices.
    """
    use_ep = (
        dp_size_ * pcp_size_ * tp_size_ > 1
        and vllm_parallel_config.enable_expert_parallel
    )

    dp_size = dp_size_
    dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
    pcp_size = pcp_size_
    pcp_rank = get_pcp_group().rank_in_group if pcp_size > 1 else 0
    tp_size, tp_rank = FusedMoEParallelConfig.flatten_tp_across_dp_and_pcp(
        tp_size_, dp_size_, dp_rank, pcp_size_, pcp_rank
    )

    if not use_ep:
        return FusedMoEParallelConfig(
            tp_size=tp_size,
            tp_rank=tp_rank,
            pcp_size=pcp_size,
            pcp_rank=pcp_rank,
            dp_size=dp_size,
            dp_rank=dp_rank,
            ep_size=1,
            ep_rank=0,
            sp_size=sp_size_,
            use_ep=False,
            all2all_backend=vllm_parallel_config.all2all_backend,
            enable_eplb=vllm_parallel_config.enable_eplb,
        )
    # DP + EP / TP + EP / DP + TP + EP
    assert use_ep
    # In EP, each device owns a set of experts fully. There is no tensor
    # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that.
    ep_size = tp_size
    ep_rank = tp_rank
    return FusedMoEParallelConfig(
        tp_size=1,
        tp_rank=0,
        pcp_size=pcp_size,
        pcp_rank=pcp_rank,
        dp_size=dp_size,
        dp_rank=dp_rank,
        ep_size=ep_size,
        ep_rank=ep_rank,
        sp_size=sp_size_,
        use_ep=True,
        all2all_backend=vllm_parallel_config.all2all_backend,
        enable_eplb=vllm_parallel_config.enable_eplb,
    )

make_no_parallel() classmethod

For usage in CI/CD and testing.

Source code in vllm/model_executor/layers/fused_moe/config.py
@classmethod
def make_no_parallel(cls) -> "FusedMoEParallelConfig":
    """For usage in CI/CD and testing."""
    return FusedMoEParallelConfig(
        tp_size=1,
        tp_rank=0,
        pcp_size=1,
        pcp_rank=0,
        dp_size=1,
        dp_rank=0,
        ep_size=1,
        ep_rank=0,
        sp_size=1,
        use_ep=False,
        all2all_backend="allgather_reducescatter",
        enable_eplb=False,
    )

FusedMoEPrepareAndFinalizeModular

Bases: FusedMoEPrepareAndFinalize

An abstract base class for the [Quantize-Prepare] and [Finalize] steps described above for the Modular case.

Methods:

  • finalize

    Perform any combine plus apply weights and perform a reduction on the

  • finalize_async

    Perform any combine plus apply weights and perform a reduction on the

  • prepare

    Perform any quantization (and/or) dispatching needed for this kernel.

  • prepare_async

    Perform any quantization (and/or) dispatching needed for this kernel

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
class FusedMoEPrepareAndFinalizeModular(FusedMoEPrepareAndFinalize):
    """
    An abstract base class for the [Quantize-Prepare] and [Finalize] steps
    described above for the Modular case.
    """

    @abstractmethod
    def prepare(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
        defer_input_quant: bool,
    ) -> PrepareResultType:
        """
        Perform any quantization (and/or) dispatching needed for this kernel.
        - a1: The (unquantized) input to the MoE layer.
        - topk_ids: The topk ids.
        - topk_weights: The topk weights.
        - num_experts: The total number of experts in the global expert space.
        - expert_map: A tensor mapping expert indices from the global expert
          space to the local expert space of the expert parallel shard.
        - apply_router_weight_on_input: When True, apply the weights to the
          activations, before quantization + dispatching.
        - quant_config: Quantization info provided by the fused experts.
        - defer_input_quant: Runtime parameter indicating whether or not to
          defer input quantization to the FusedMoEExpertsModular
          in cases where the compute kernel expects unquantized inputs

        Returns a tuple of:
        - quantized + dispatched a.
        - Optional quantized + dispatched a1_scales.
        - Optional ExpertTokensMetadata containing gpu/cpu tensors
          as big as the number of local experts with the information about the
          number of tokens assigned to each local expert.
        - Optional dispatched expert topk IDs
        - Optional dispatched expert topk weight
        """
        raise NotImplementedError

    def prepare_async(
        self,
        a1: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        num_experts: int,
        expert_map: torch.Tensor | None,
        apply_router_weight_on_input: bool,
        quant_config: FusedMoEQuantConfig,
        defer_input_quant: bool,
    ) -> tuple[Callable, ReceiverType] | ReceiverType:
        """
        Perform any quantization (and/or) dispatching needed for this kernel
        but do not wait for results from other workers.
        - a1: The (unquantized) input to the MoE layer.
        - a1_scale: Optional scales for a1
        - a2_scale: Optional scales for the second MoE gemm.  Required to make
          sure the quantization is consistent for both gemms.
        - topk_ids: The topk ids.
        - topk_weights: The topk weights.
        - num_experts: The total number of experts in the global expert space.
        - expert_map: A tensor mapping expert indices from the global expert
          space to the local expert space of the expert parallel shard.
        - apply_router_weight_on_input: When True, apply the weights to the
          activations, before quantization + dispatching.
        - defer_input_quant: Runtime parameter indicating whether or not to
          defer input quantization to the FusedMoEExpertsModular
          in cases where the compute kernel expects unquantized inputs

        Returns a callback or a hook callback pair that when invoked waits for
        results from other workers and has the same return signature as
        `prepare`, if a hook is returned this is more lightweight check that
        the recv is complete without doing extra work (used by DBO, will be
        refactored in the very near future)

        e.g.

        ret = obj.prepare_async(...)

        if isinstance(ret, tuple):
            hook, receiver = ret
            hook()

        if hook is not None:
        a, a_scales, expert_meta, topk_ids, topk_weights = receiver()

        is equivalent to:

        a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)
        """
        raise NotImplementedError

    @abstractmethod
    def finalize(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: TopKWeightAndReduce,
    ) -> None:
        """
        Perform any combine plus apply weights and perform a reduction on the
        fused experts output.
        - output: The output tensor, written in place.  Must be (M, K) shape.
        - fused_expert_output: The unweighted, unreduced output of the fused
          experts, it will have (M, topk, K) shape.
        - topk_weights: The weights to be applied to the fused_experts_output.
        - topk_ids: The topk_ids.
        - apply_router_weight_on_input: When False, apply the weights to
          fused_expert_output.
        - weight_and_reduce_impl: An optional TopKWeightAndReduce
          implementation.
        """
        raise NotImplementedError

    def finalize_async(
        self,
        output: torch.Tensor,
        fused_expert_output: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        apply_router_weight_on_input: bool,
        weight_and_reduce_impl: TopKWeightAndReduce,
    ) -> tuple[Callable, Callable] | Callable:
        """
        Perform any combine plus apply weights and perform a reduction on the
        fused experts output but do not wait for results from other workers.
        - output: The output tensor, written in place.  Must be (M, K) shape.
        - fused_expert_output: The unweighted, unreduced output of the fused
          experts, it will have (M, topk, K) shape.
        - topk_weights: The weights to be applied to the fused_experts_output.
        - topk_ids: The topk_ids.
        - apply_router_weight_on_input: When False, apply the weights to
          fused_expert_output.
        - weight_and_reduce_impl: An optional TopKWeightAndReduce
          implementation.

        Returns a callback or a hook callback pair that when invoked waits for
        results from other workers and has the same return signature as
        `finalize`, if a hook is returned this is more lightweight check that
        the recv is complete without doing extra work (used by DBO, will be
        refactored in the very near future)

        ret = obj.finalize_async(output, ...)
        ... output not valid yet ...
        if isinstance(ret, tuple):
            hook, receiver = ret
            hook()
        receiver()
        ... output valid here ...

        is equivalent to:

        obj.finalize(output, ...)
        """
        raise NotImplementedError

finalize(output, fused_expert_output, topk_weights, topk_ids, apply_router_weight_on_input, weight_and_reduce_impl) abstractmethod

Perform any combine plus apply weights and perform a reduction on the fused experts output. - output: The output tensor, written in place. Must be (M, K) shape. - fused_expert_output: The unweighted, unreduced output of the fused experts, it will have (M, topk, K) shape. - topk_weights: The weights to be applied to the fused_experts_output. - topk_ids: The topk_ids. - apply_router_weight_on_input: When False, apply the weights to fused_expert_output. - weight_and_reduce_impl: An optional TopKWeightAndReduce implementation.

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
@abstractmethod
def finalize(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: TopKWeightAndReduce,
) -> None:
    """
    Perform any combine plus apply weights and perform a reduction on the
    fused experts output.
    - output: The output tensor, written in place.  Must be (M, K) shape.
    - fused_expert_output: The unweighted, unreduced output of the fused
      experts, it will have (M, topk, K) shape.
    - topk_weights: The weights to be applied to the fused_experts_output.
    - topk_ids: The topk_ids.
    - apply_router_weight_on_input: When False, apply the weights to
      fused_expert_output.
    - weight_and_reduce_impl: An optional TopKWeightAndReduce
      implementation.
    """
    raise NotImplementedError

finalize_async(output, fused_expert_output, topk_weights, topk_ids, apply_router_weight_on_input, weight_and_reduce_impl)

Perform any combine plus apply weights and perform a reduction on the fused experts output but do not wait for results from other workers. - output: The output tensor, written in place. Must be (M, K) shape. - fused_expert_output: The unweighted, unreduced output of the fused experts, it will have (M, topk, K) shape. - topk_weights: The weights to be applied to the fused_experts_output. - topk_ids: The topk_ids. - apply_router_weight_on_input: When False, apply the weights to fused_expert_output. - weight_and_reduce_impl: An optional TopKWeightAndReduce implementation.

Returns a callback or a hook callback pair that when invoked waits for results from other workers and has the same return signature as finalize, if a hook is returned this is more lightweight check that the recv is complete without doing extra work (used by DBO, will be refactored in the very near future)

ret = obj.finalize_async(output, ...) ... output not valid yet ... if isinstance(ret, tuple): hook, receiver = ret hook() receiver() ... output valid here ...

is equivalent to:

obj.finalize(output, ...)

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
def finalize_async(
    self,
    output: torch.Tensor,
    fused_expert_output: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    apply_router_weight_on_input: bool,
    weight_and_reduce_impl: TopKWeightAndReduce,
) -> tuple[Callable, Callable] | Callable:
    """
    Perform any combine plus apply weights and perform a reduction on the
    fused experts output but do not wait for results from other workers.
    - output: The output tensor, written in place.  Must be (M, K) shape.
    - fused_expert_output: The unweighted, unreduced output of the fused
      experts, it will have (M, topk, K) shape.
    - topk_weights: The weights to be applied to the fused_experts_output.
    - topk_ids: The topk_ids.
    - apply_router_weight_on_input: When False, apply the weights to
      fused_expert_output.
    - weight_and_reduce_impl: An optional TopKWeightAndReduce
      implementation.

    Returns a callback or a hook callback pair that when invoked waits for
    results from other workers and has the same return signature as
    `finalize`, if a hook is returned this is more lightweight check that
    the recv is complete without doing extra work (used by DBO, will be
    refactored in the very near future)

    ret = obj.finalize_async(output, ...)
    ... output not valid yet ...
    if isinstance(ret, tuple):
        hook, receiver = ret
        hook()
    receiver()
    ... output valid here ...

    is equivalent to:

    obj.finalize(output, ...)
    """
    raise NotImplementedError

prepare(a1, topk_weights, topk_ids, num_experts, expert_map, apply_router_weight_on_input, quant_config, defer_input_quant) abstractmethod

Perform any quantization (and/or) dispatching needed for this kernel. - a1: The (unquantized) input to the MoE layer. - topk_ids: The topk ids. - topk_weights: The topk weights. - num_experts: The total number of experts in the global expert space. - expert_map: A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - apply_router_weight_on_input: When True, apply the weights to the activations, before quantization + dispatching. - quant_config: Quantization info provided by the fused experts. - defer_input_quant: Runtime parameter indicating whether or not to defer input quantization to the FusedMoEExpertsModular in cases where the compute kernel expects unquantized inputs

Returns a tuple of: - quantized + dispatched a. - Optional quantized + dispatched a1_scales. - Optional ExpertTokensMetadata containing gpu/cpu tensors as big as the number of local experts with the information about the number of tokens assigned to each local expert. - Optional dispatched expert topk IDs - Optional dispatched expert topk weight

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
@abstractmethod
def prepare(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
    defer_input_quant: bool,
) -> PrepareResultType:
    """
    Perform any quantization (and/or) dispatching needed for this kernel.
    - a1: The (unquantized) input to the MoE layer.
    - topk_ids: The topk ids.
    - topk_weights: The topk weights.
    - num_experts: The total number of experts in the global expert space.
    - expert_map: A tensor mapping expert indices from the global expert
      space to the local expert space of the expert parallel shard.
    - apply_router_weight_on_input: When True, apply the weights to the
      activations, before quantization + dispatching.
    - quant_config: Quantization info provided by the fused experts.
    - defer_input_quant: Runtime parameter indicating whether or not to
      defer input quantization to the FusedMoEExpertsModular
      in cases where the compute kernel expects unquantized inputs

    Returns a tuple of:
    - quantized + dispatched a.
    - Optional quantized + dispatched a1_scales.
    - Optional ExpertTokensMetadata containing gpu/cpu tensors
      as big as the number of local experts with the information about the
      number of tokens assigned to each local expert.
    - Optional dispatched expert topk IDs
    - Optional dispatched expert topk weight
    """
    raise NotImplementedError

prepare_async(a1, topk_weights, topk_ids, num_experts, expert_map, apply_router_weight_on_input, quant_config, defer_input_quant)

Perform any quantization (and/or) dispatching needed for this kernel but do not wait for results from other workers. - a1: The (unquantized) input to the MoE layer. - a1_scale: Optional scales for a1 - a2_scale: Optional scales for the second MoE gemm. Required to make sure the quantization is consistent for both gemms. - topk_ids: The topk ids. - topk_weights: The topk weights. - num_experts: The total number of experts in the global expert space. - expert_map: A tensor mapping expert indices from the global expert space to the local expert space of the expert parallel shard. - apply_router_weight_on_input: When True, apply the weights to the activations, before quantization + dispatching. - defer_input_quant: Runtime parameter indicating whether or not to defer input quantization to the FusedMoEExpertsModular in cases where the compute kernel expects unquantized inputs

Returns a callback or a hook callback pair that when invoked waits for results from other workers and has the same return signature as prepare, if a hook is returned this is more lightweight check that the recv is complete without doing extra work (used by DBO, will be refactored in the very near future)

e.g.

ret = obj.prepare_async(...)

if isinstance(ret, tuple): hook, receiver = ret hook()

if hook is not None: a, a_scales, expert_meta, topk_ids, topk_weights = receiver()

is equivalent to:

a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)

Source code in vllm/model_executor/layers/fused_moe/modular_kernel.py
def prepare_async(
    self,
    a1: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    num_experts: int,
    expert_map: torch.Tensor | None,
    apply_router_weight_on_input: bool,
    quant_config: FusedMoEQuantConfig,
    defer_input_quant: bool,
) -> tuple[Callable, ReceiverType] | ReceiverType:
    """
    Perform any quantization (and/or) dispatching needed for this kernel
    but do not wait for results from other workers.
    - a1: The (unquantized) input to the MoE layer.
    - a1_scale: Optional scales for a1
    - a2_scale: Optional scales for the second MoE gemm.  Required to make
      sure the quantization is consistent for both gemms.
    - topk_ids: The topk ids.
    - topk_weights: The topk weights.
    - num_experts: The total number of experts in the global expert space.
    - expert_map: A tensor mapping expert indices from the global expert
      space to the local expert space of the expert parallel shard.
    - apply_router_weight_on_input: When True, apply the weights to the
      activations, before quantization + dispatching.
    - defer_input_quant: Runtime parameter indicating whether or not to
      defer input quantization to the FusedMoEExpertsModular
      in cases where the compute kernel expects unquantized inputs

    Returns a callback or a hook callback pair that when invoked waits for
    results from other workers and has the same return signature as
    `prepare`, if a hook is returned this is more lightweight check that
    the recv is complete without doing extra work (used by DBO, will be
    refactored in the very near future)

    e.g.

    ret = obj.prepare_async(...)

    if isinstance(ret, tuple):
        hook, receiver = ret
        hook()

    if hook is not None:
    a, a_scales, expert_meta, topk_ids, topk_weights = receiver()

    is equivalent to:

    a, a_scales, expert_meta, topk_ids, topk_weights = obj.prepare(...)
    """
    raise NotImplementedError

FusedMoEQuantConfig dataclass

The FusedMoEQuantConfig contains all the quantization parameters for a single FusedMoEMethodBase operation. It consists of four FusedMoEQuantDescs, one for each activation and set of weights.

Each FusedMoEMethodBase must implement a get_fused_moe_quant_config method to construct a FusedMoEQuantConfig for use with that class.

FusedMoEQuant configs are only used for modular kernels, fused_experts (from fused_moe.py), cutlass_moe_fp[48], rocm_aiter_fused_experts and triton_kernel_moe_forward. Other MoE methods can ignore the FusedMoEQuantConfig (for now) and hardcode it to None.

There are currently some restrictions on what can be expressed: - Most MoE ops only support similar quantization strategies for each parameter, e.g. both weights must have the same GroupShape and both activations must share the same GroupShape. One exception to this is the cutlass moe which allows per channel quantization on the outputs. Note: this restrictions are not always rigorously checked. - Not all fused MoE functions support all the parameters, e.g. zero points, global scales, alphas and biases are not universally supported. - Fully general GroupShapes are not allowed. Activations only support per token, per tensor or K-blocked. - Weights are not required to have a GroupShape since they have already been quantized.

Other notes: - PrecisionConfigs are specific to GPT OSS Triton. - As a follow up it would probably make sense to subclass FusedMoEQuantDesc or FusedMoEQuantConfig for particular FusedMoEMethodBase subclasses so that only the required quantization parameters are used/stored.

Methods:

  • batched_scale_shape

    Construct the proper activation batched scale shape for this

  • config_name

    Return a string used to construct the filename that contains the

  • make

    General builder function for a FusedMoEQuantConfig.

  • scale_shape

    Construct the proper activation scale shape for this

Source code in vllm/model_executor/layers/fused_moe/config.py
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
@dataclass
class FusedMoEQuantConfig:
    """
    The FusedMoEQuantConfig contains all the quantization parameters for
    a single FusedMoEMethodBase operation.  It consists of four
    FusedMoEQuantDescs, one for each activation and set of weights.

    Each FusedMoEMethodBase must implement a get_fused_moe_quant_config
    method to construct a FusedMoEQuantConfig for use with that class.

    FusedMoEQuant configs are only used for modular kernels, fused_experts
    (from fused_moe.py), cutlass_moe_fp[48], rocm_aiter_fused_experts and
    triton_kernel_moe_forward.  Other MoE methods can ignore the
    FusedMoEQuantConfig (for now) and hardcode it to None.

    There are currently some restrictions on what can be expressed:
    - Most MoE ops only support similar quantization strategies for
      each parameter, e.g. both weights must have the same GroupShape
      and both activations must share the same GroupShape.  One exception to
      this is the cutlass moe which allows per channel quantization on the
      outputs.  Note: this restrictions are not always rigorously checked.
    - Not all fused MoE functions support all the parameters, e.g. zero points,
      global scales, alphas and biases are not universally supported.
    - Fully general GroupShapes are not allowed.  Activations only support
      per token, per tensor or K-blocked.
    - Weights are not required to have a GroupShape since they have already
      been quantized.

    Other notes:
    - PrecisionConfigs are specific to GPT OSS Triton.
    - As a follow up it would probably make sense to subclass FusedMoEQuantDesc
      or FusedMoEQuantConfig for particular FusedMoEMethodBase subclasses
      so that only the required quantization parameters are used/stored.
    """

    # TODO(bnell) make sure a1_scales/a2_scales don't interfere with chunking
    _a1: FusedMoEQuantDesc
    _a2: FusedMoEQuantDesc
    _w1: FusedMoEQuantDesc
    _w2: FusedMoEQuantDesc
    is_scale_swizzled: bool = True

    # MXFP4-specific TRTLLM parameters for SwiGLU activation clamping.
    # These correspond to gemm1_alpha, gemm1_beta, gemm1_clamp_limit
    # in TrtLlmMxfp4ExpertsBase.
    gemm1_alpha: float | None = None
    gemm1_beta: float | None = None
    gemm1_clamp_limit: float | None = None

    mx_alignment: int = 0

    def __post_init__(self):
        assert not self.per_act_token_quant or self.block_shape is None, (
            "illegal quantization"
        )

    #
    # Convenience accessors for various properties.
    #

    @property
    def quant_dtype(self) -> torch.dtype | str | None:
        return self._a1.dtype

    @property
    def weight_quant_dtype(self) -> torch.dtype | str | None:
        return self._w1.dtype

    @property
    def is_quantized(self) -> bool:
        return self.quant_dtype is not None

    @property
    def is_per_act_token(self) -> bool:
        return self._a1.shape == GroupShape.PER_TOKEN

    @property
    def per_act_token_quant(self) -> bool:
        return self._a1.shape == GroupShape.PER_TOKEN

    @property
    def per_out_ch_quant(self) -> bool:
        return self._w1.shape == GroupShape.PER_TOKEN

    @property
    def is_per_tensor(self) -> bool:
        return self._a1.shape == GroupShape.PER_TENSOR

    @property
    def block_shape(self) -> list[int] | None:
        if (
            self._a1.shape is not None
            and self._a1.shape != GroupShape.PER_TENSOR
            and self._a1.shape != GroupShape.PER_TOKEN
        ):
            return [self._a1.shape.row, self._a1.shape.col]
        else:
            return None

    @property
    def is_block_quantized(self) -> bool:
        return self.block_shape is not None

    @property
    def a1_scale(self) -> torch.Tensor | None:
        assert self._a1.scale is None or isinstance(self._a1.scale, torch.Tensor)
        return self._a1.scale

    @property
    def a1_gscale(self) -> torch.Tensor | None:
        return self._a1.alpha_or_gscale

    @property
    def a2_scale(self) -> torch.Tensor | None:
        assert self._a2.scale is None or isinstance(self._a2.scale, torch.Tensor)
        return self._a2.scale

    @property
    def a2_gscale(self) -> torch.Tensor | None:
        return self._a2.alpha_or_gscale

    @property
    def w1_scale(self) -> torch.Tensor | None:
        assert self._w1.scale is None or isinstance(self._w1.scale, torch.Tensor)
        return self._w1.scale

    @property
    def w1_zp(self) -> torch.Tensor | None:
        return self._w1.zp

    @property
    def w1_bias(self) -> torch.Tensor | None:
        return self._w1.bias

    @property
    def w1_precision(self) -> "PrecisionConfig | None":
        assert self._w1.scale is None or isinstance(self._w1.scale, PrecisionConfig)
        return self._w1.scale

    @property
    def g1_alphas(self) -> torch.Tensor | None:
        return self._w1.alpha_or_gscale

    @property
    def w2_scale(self) -> torch.Tensor | None:
        assert self._w2.scale is None or isinstance(self._w2.scale, torch.Tensor)
        return self._w2.scale

    @property
    def w2_zp(self) -> torch.Tensor | None:
        return self._w2.zp

    @property
    def w2_bias(self) -> torch.Tensor | None:
        return self._w2.bias

    @property
    def w2_precision(self) -> "PrecisionConfig | None":
        assert self._w2.scale is None or isinstance(self._w2.scale, PrecisionConfig)
        return self._w2.scale

    @property
    def g2_alphas(self) -> torch.Tensor | None:
        return self._w2.alpha_or_gscale

    @property
    def use_fp8_w8a8(self) -> bool:
        return self.quant_dtype == current_platform.fp8_dtype()

    @property
    def use_int8_w8a8(self) -> bool:
        return self.quant_dtype == torch.int8

    @property
    def use_int8_w8a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == torch.int8

    @property
    def use_fp8_w8a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == current_platform.fp8_dtype()

    @property
    def use_int4_w4a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == "int4"

    @property
    def use_nvfp4_w4a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == "nvfp4"

    @property
    def ocp_mx_scheme(self) -> str | None:
        if not hasattr(self, "_ocp_mx_scheme"):
            if (self._a1.dtype is not None and not isinstance(self._a1.dtype, str)) or (
                self._w1.dtype is not None and not isinstance(self._w1.dtype, str)
            ):
                self._ocp_mx_scheme = None
            else:
                ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype(
                    self._a1.dtype, self._w1.dtype
                )

                if ocp_mx_scheme is not None:
                    ocp_mx_scheme = ocp_mx_scheme.value

                self._ocp_mx_scheme = ocp_mx_scheme

        return self._ocp_mx_scheme

    @property
    def use_mxfp4_w4a16(self) -> bool:
        return self._a1.dtype is None and self._w1.dtype == "mxfp4"

    @property
    def use_mxfp4_w4a4(self) -> bool:
        return self._a1.dtype == "mxfp4" and self._w1.dtype == "mxfp4"

    @property
    def use_nvfp4_w4a4(self) -> bool:
        return self.quant_dtype == "nvfp4"

    @property
    def use_mxfp4_w4a8(self) -> bool:
        return self._a1.dtype == "fp8" and self._w1.dtype == "mxfp4"

    def config_name(self, dtype: torch.dtype) -> str | None:
        """
        Return a string used to construct the filename that contains the
        tuning info for a particular quantization scheme.  See
        try_get_optimal_moe_config in fused_moe.py.
        """
        return _get_config_dtype_str(
            use_fp8_w8a8=self.use_fp8_w8a8,
            use_fp8_w8a16=self.use_fp8_w8a16,
            use_int8_w8a16=self.use_int8_w8a16,
            use_int4_w4a16=self.use_int4_w4a16,
            ocp_mx_scheme=self.ocp_mx_scheme,
            dtype=dtype,
        )

    def scale_shape(
        self,
        max_tokens: int,
        hidden_dim: int,
    ) -> tuple[int, int] | None:
        """
        Construct the proper activation scale shape for this
        config.
        """
        if self.is_quantized:
            if self.is_block_quantized:
                assert self.block_shape is not None
                _, block_k = self.block_shape
                k_tiles = cdiv(hidden_dim, block_k)
                return (max_tokens, k_tiles)
            elif self.is_per_act_token:
                return (max_tokens, 1)
            else:
                return (1, 1)
        else:
            return None

    def batched_scale_shape(
        self,
        num_experts: int,
        max_tokens: int,
        hidden_dim: int,
    ) -> tuple[int, int, int] | None:
        """
        Construct the proper activation batched scale shape for this
        config, e.g. (num experts, *scale_shape).
        """
        if self.is_quantized:
            scale_shape = self.scale_shape(max_tokens, hidden_dim)
            assert scale_shape is not None
            return (num_experts, *scale_shape)
        else:
            return None

    @staticmethod
    def make(
        quant_dtype: torch.dtype | str | None = None,
        per_act_token_quant: bool = False,
        per_out_ch_quant: bool = False,
        block_shape: list[int] | None = None,
        w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
        w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
        a1_scale: torch.Tensor | None = None,
        a2_scale: torch.Tensor | None = None,
        g1_alphas: torch.Tensor | None = None,
        g2_alphas: torch.Tensor | None = None,
        a1_gscale: torch.Tensor | None = None,
        a2_gscale: torch.Tensor | None = None,
        w1_bias: torch.Tensor | None = None,
        w2_bias: torch.Tensor | None = None,
        w1_zp: torch.Tensor | None = None,
        w2_zp: torch.Tensor | None = None,
        weight_dtype: torch.dtype | str | None = None,
        is_scale_swizzled: bool = True,
        gemm1_alpha: float | None = None,
        gemm1_beta: float | None = None,
        gemm1_clamp_limit: float | None = None,
    ) -> "FusedMoEQuantConfig":
        """
        General builder function for a FusedMoEQuantConfig.
        - quant_dtype: Optional quantization type. None if activations are
          unquantized or quantized prior to calling.  Note: "nvfp4", "mxfp4",
          "mxfp6_e3m2", "mxfp6_e2m3" are the only valid string values
          for quant_dtype.
        - per_act_token_quant: Activations have per token quantization.
        - per_out_ch_quant: Outputs have per channel quantization. (only
          for cutlass).
        - block_shape: Optional block size for block-wise quantization.
          Incompatible with per_act_token and per_out_ch quant.
        - w1_scale: Optional scale to be used for w1.
        - w2_scale: Optional scale to be used for w2.
        - a1_scale: Optional scale to be used for a1.
        - a2_scale: Optional scale to be used for a2.
        - g1_alphas: Optional global quantization scales for w1 (for nvfp4).
                     Optional per-channel scales for w1 (for W4A8 FP8).
                     Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
        - g2_alphas: Optional global quantization scales for w2 (for nvfp4).
                     Optional per-channel scales for w2 (for W4A8 FP8).
                     Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
        - a1_gscale: Optional global quantization scales for a1 (1.0 /a2_scale).
        - a2_gscale: Optional global quantization scales for a2 (1.0 /a2_scale).

        - w1_bias: Optional biases for w1 (GPT OSS Triton).
        - w2_bias: Optional biases for w1 (GPT OSS Triton).
        - w1_zp: Optional w1 zero points for int4/int8 quantization.
        - w2_zp: Optional w2 zero points for int4/int8 quantization.
        - is_scale_swizzled: Whether the activation scale-factor layout is
          swizzled. Pass through to the underlying quantization kernel for
          dtypes that distinguish layouts (nvfp4, mxfp8). Defaults to True.
        - gemm1_alpha: Optional MXFP4 TRTLLM SwiGLU alpha parameter.
        - gemm1_beta: Optional MXFP4 TRTLLM SwiGLU beta parameter.
        - gemm1_clamp_limit: Optional MXFP4 TRTLLM SwiGLU clamp limit.
        """
        assert not isinstance(quant_dtype, str) or quant_dtype in {
            "nvfp4",
            "mxfp4",
            "mxfp6_e3m2",
            "mxfp6_e2m3",
            "mxfp8",
        }
        assert not isinstance(weight_dtype, str) or weight_dtype in {
            "nvfp4",
            "mxfp4",
            "mxfp6_e3m2",
            "mxfp6_e2m3",
            "int4",
            "mxfp8",
        }

        if weight_dtype is None:
            weight_dtype = quant_dtype

        a_shape, w_shape = _quant_flags_to_group_shape(
            quant_dtype, per_act_token_quant, per_out_ch_quant, block_shape
        )
        quant_config = FusedMoEQuantConfig(
            _a1=FusedMoEQuantDesc(quant_dtype, a_shape, a1_scale, a1_gscale),
            _a2=FusedMoEQuantDesc(quant_dtype, a_shape, a2_scale, a2_gscale),
            _w1=FusedMoEQuantDesc(
                weight_dtype, w_shape, w1_scale, g1_alphas, w1_zp, w1_bias
            ),
            _w2=FusedMoEQuantDesc(
                weight_dtype, w_shape, w2_scale, g2_alphas, w2_zp, w2_bias
            ),
            is_scale_swizzled=is_scale_swizzled,
            gemm1_alpha=gemm1_alpha,
            gemm1_beta=gemm1_beta,
            gemm1_clamp_limit=gemm1_clamp_limit,
        )
        assert quant_config.per_act_token_quant == per_act_token_quant
        assert quant_config.per_out_ch_quant == per_out_ch_quant
        assert quant_config.block_shape == block_shape
        return quant_config

batched_scale_shape(num_experts, max_tokens, hidden_dim)

Construct the proper activation batched scale shape for this config, e.g. (num experts, *scale_shape).

Source code in vllm/model_executor/layers/fused_moe/config.py
def batched_scale_shape(
    self,
    num_experts: int,
    max_tokens: int,
    hidden_dim: int,
) -> tuple[int, int, int] | None:
    """
    Construct the proper activation batched scale shape for this
    config, e.g. (num experts, *scale_shape).
    """
    if self.is_quantized:
        scale_shape = self.scale_shape(max_tokens, hidden_dim)
        assert scale_shape is not None
        return (num_experts, *scale_shape)
    else:
        return None

config_name(dtype)

Return a string used to construct the filename that contains the tuning info for a particular quantization scheme. See try_get_optimal_moe_config in fused_moe.py.

Source code in vllm/model_executor/layers/fused_moe/config.py
def config_name(self, dtype: torch.dtype) -> str | None:
    """
    Return a string used to construct the filename that contains the
    tuning info for a particular quantization scheme.  See
    try_get_optimal_moe_config in fused_moe.py.
    """
    return _get_config_dtype_str(
        use_fp8_w8a8=self.use_fp8_w8a8,
        use_fp8_w8a16=self.use_fp8_w8a16,
        use_int8_w8a16=self.use_int8_w8a16,
        use_int4_w4a16=self.use_int4_w4a16,
        ocp_mx_scheme=self.ocp_mx_scheme,
        dtype=dtype,
    )

make(quant_dtype=None, per_act_token_quant=False, per_out_ch_quant=False, block_shape=None, w1_scale=None, w2_scale=None, a1_scale=None, a2_scale=None, g1_alphas=None, g2_alphas=None, a1_gscale=None, a2_gscale=None, w1_bias=None, w2_bias=None, w1_zp=None, w2_zp=None, weight_dtype=None, is_scale_swizzled=True, gemm1_alpha=None, gemm1_beta=None, gemm1_clamp_limit=None) staticmethod

General builder function for a FusedMoEQuantConfig. - quant_dtype: Optional quantization type. None if activations are unquantized or quantized prior to calling. Note: "nvfp4", "mxfp4", "mxfp6_e3m2", "mxfp6_e2m3" are the only valid string values for quant_dtype. - per_act_token_quant: Activations have per token quantization. - per_out_ch_quant: Outputs have per channel quantization. (only for cutlass). - block_shape: Optional block size for block-wise quantization. Incompatible with per_act_token and per_out_ch quant. - w1_scale: Optional scale to be used for w1. - w2_scale: Optional scale to be used for w2. - a1_scale: Optional scale to be used for a1. - a2_scale: Optional scale to be used for a2. - g1_alphas: Optional global quantization scales for w1 (for nvfp4). Optional per-channel scales for w1 (for W4A8 FP8). Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8). - g2_alphas: Optional global quantization scales for w2 (for nvfp4). Optional per-channel scales for w2 (for W4A8 FP8). Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8). - a1_gscale: Optional global quantization scales for a1 (1.0 /a2_scale). - a2_gscale: Optional global quantization scales for a2 (1.0 /a2_scale).

  • w1_bias: Optional biases for w1 (GPT OSS Triton).
  • w2_bias: Optional biases for w1 (GPT OSS Triton).
  • w1_zp: Optional w1 zero points for int4/int8 quantization.
  • w2_zp: Optional w2 zero points for int4/int8 quantization.
  • is_scale_swizzled: Whether the activation scale-factor layout is swizzled. Pass through to the underlying quantization kernel for dtypes that distinguish layouts (nvfp4, mxfp8). Defaults to True.
  • gemm1_alpha: Optional MXFP4 TRTLLM SwiGLU alpha parameter.
  • gemm1_beta: Optional MXFP4 TRTLLM SwiGLU beta parameter.
  • gemm1_clamp_limit: Optional MXFP4 TRTLLM SwiGLU clamp limit.
Source code in vllm/model_executor/layers/fused_moe/config.py
@staticmethod
def make(
    quant_dtype: torch.dtype | str | None = None,
    per_act_token_quant: bool = False,
    per_out_ch_quant: bool = False,
    block_shape: list[int] | None = None,
    w1_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
    w2_scale: Union[torch.Tensor, "PrecisionConfig", None] = None,
    a1_scale: torch.Tensor | None = None,
    a2_scale: torch.Tensor | None = None,
    g1_alphas: torch.Tensor | None = None,
    g2_alphas: torch.Tensor | None = None,
    a1_gscale: torch.Tensor | None = None,
    a2_gscale: torch.Tensor | None = None,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    w1_zp: torch.Tensor | None = None,
    w2_zp: torch.Tensor | None = None,
    weight_dtype: torch.dtype | str | None = None,
    is_scale_swizzled: bool = True,
    gemm1_alpha: float | None = None,
    gemm1_beta: float | None = None,
    gemm1_clamp_limit: float | None = None,
) -> "FusedMoEQuantConfig":
    """
    General builder function for a FusedMoEQuantConfig.
    - quant_dtype: Optional quantization type. None if activations are
      unquantized or quantized prior to calling.  Note: "nvfp4", "mxfp4",
      "mxfp6_e3m2", "mxfp6_e2m3" are the only valid string values
      for quant_dtype.
    - per_act_token_quant: Activations have per token quantization.
    - per_out_ch_quant: Outputs have per channel quantization. (only
      for cutlass).
    - block_shape: Optional block size for block-wise quantization.
      Incompatible with per_act_token and per_out_ch quant.
    - w1_scale: Optional scale to be used for w1.
    - w2_scale: Optional scale to be used for w2.
    - a1_scale: Optional scale to be used for a1.
    - a2_scale: Optional scale to be used for a2.
    - g1_alphas: Optional global quantization scales for w1 (for nvfp4).
                 Optional per-channel scales for w1 (for W4A8 FP8).
                 Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
    - g2_alphas: Optional global quantization scales for w2 (for nvfp4).
                 Optional per-channel scales for w2 (for W4A8 FP8).
                 Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8).
    - a1_gscale: Optional global quantization scales for a1 (1.0 /a2_scale).
    - a2_gscale: Optional global quantization scales for a2 (1.0 /a2_scale).

    - w1_bias: Optional biases for w1 (GPT OSS Triton).
    - w2_bias: Optional biases for w1 (GPT OSS Triton).
    - w1_zp: Optional w1 zero points for int4/int8 quantization.
    - w2_zp: Optional w2 zero points for int4/int8 quantization.
    - is_scale_swizzled: Whether the activation scale-factor layout is
      swizzled. Pass through to the underlying quantization kernel for
      dtypes that distinguish layouts (nvfp4, mxfp8). Defaults to True.
    - gemm1_alpha: Optional MXFP4 TRTLLM SwiGLU alpha parameter.
    - gemm1_beta: Optional MXFP4 TRTLLM SwiGLU beta parameter.
    - gemm1_clamp_limit: Optional MXFP4 TRTLLM SwiGLU clamp limit.
    """
    assert not isinstance(quant_dtype, str) or quant_dtype in {
        "nvfp4",
        "mxfp4",
        "mxfp6_e3m2",
        "mxfp6_e2m3",
        "mxfp8",
    }
    assert not isinstance(weight_dtype, str) or weight_dtype in {
        "nvfp4",
        "mxfp4",
        "mxfp6_e3m2",
        "mxfp6_e2m3",
        "int4",
        "mxfp8",
    }

    if weight_dtype is None:
        weight_dtype = quant_dtype

    a_shape, w_shape = _quant_flags_to_group_shape(
        quant_dtype, per_act_token_quant, per_out_ch_quant, block_shape
    )
    quant_config = FusedMoEQuantConfig(
        _a1=FusedMoEQuantDesc(quant_dtype, a_shape, a1_scale, a1_gscale),
        _a2=FusedMoEQuantDesc(quant_dtype, a_shape, a2_scale, a2_gscale),
        _w1=FusedMoEQuantDesc(
            weight_dtype, w_shape, w1_scale, g1_alphas, w1_zp, w1_bias
        ),
        _w2=FusedMoEQuantDesc(
            weight_dtype, w_shape, w2_scale, g2_alphas, w2_zp, w2_bias
        ),
        is_scale_swizzled=is_scale_swizzled,
        gemm1_alpha=gemm1_alpha,
        gemm1_beta=gemm1_beta,
        gemm1_clamp_limit=gemm1_clamp_limit,
    )
    assert quant_config.per_act_token_quant == per_act_token_quant
    assert quant_config.per_out_ch_quant == per_out_ch_quant
    assert quant_config.block_shape == block_shape
    return quant_config

scale_shape(max_tokens, hidden_dim)

Construct the proper activation scale shape for this config.

Source code in vllm/model_executor/layers/fused_moe/config.py
def scale_shape(
    self,
    max_tokens: int,
    hidden_dim: int,
) -> tuple[int, int] | None:
    """
    Construct the proper activation scale shape for this
    config.
    """
    if self.is_quantized:
        if self.is_block_quantized:
            assert self.block_shape is not None
            _, block_k = self.block_shape
            k_tiles = cdiv(hidden_dim, block_k)
            return (max_tokens, k_tiles)
        elif self.is_per_act_token:
            return (max_tokens, 1)
        else:
            return (1, 1)
    else:
        return None

FusedMoERouter

Bases: ABC

FusedMoERouter is an abstract class that provides a 'select_experts' method that is used for routing hidden states based on router logits.

Methods:

  • select_experts

    Route the input hidden states to the top-k experts based on the

Source code in vllm/model_executor/layers/fused_moe/router/fused_moe_router.py
class FusedMoERouter(ABC):
    """
    FusedMoERouter is an abstract class that provides a 'select_experts'
    method that is used for routing hidden states based on router logits.
    """

    def __init__(self, eplb_state: EplbLayerState | None = None):
        self._routing_replay_out: torch.Tensor | None = None
        self.eplb_state = eplb_state

    @abstractmethod
    def set_capture_fn(
        self,
        capture_fn: Callable[[torch.Tensor], None] | None,
    ) -> None:
        raise NotImplementedError

    @property
    @abstractmethod
    def routing_method_type(self) -> RoutingMethodType:
        raise NotImplementedError

    @abstractmethod
    def _select_experts(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        topk_indices_dtype: torch.dtype | None = None,
        *,
        input_ids: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        raise NotImplementedError

    def select_experts(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        topk_indices_dtype: torch.dtype | None = None,
        *,
        input_ids: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Route the input hidden states to the top-k experts based on the
        router logits.

        Returns:
            (topk_weights, topk_ids)
            (tuple[torch.Tensor, torch.Tensor]):
            The weights and expert ids computation result.

            **Compatibility**: When EPLB is not enabled, the returned ids are
            equivalent to global logical ids, so should be compatible with
            plain MoE implementations without redundant experts.
        """

        topk_weights, topk_ids = self._select_experts(
            hidden_states,
            router_logits,
            topk_indices_dtype=topk_indices_dtype,
            input_ids=input_ids,
        )

        # Write routing data for non-monolithic path (Triton, etc.)
        # (set by bind_routing_capture_to_model during capturer init)
        if self._routing_replay_out is not None:
            self._routing_replay_out[: topk_ids.shape[0]].copy_(
                topk_ids.to(torch.int16)
            )

        return topk_weights, topk_ids

select_experts(hidden_states, router_logits, topk_indices_dtype=None, *, input_ids=None)

Route the input hidden states to the top-k experts based on the router logits.

Returns:

Source code in vllm/model_executor/layers/fused_moe/router/fused_moe_router.py
def select_experts(
    self,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    topk_indices_dtype: torch.dtype | None = None,
    *,
    input_ids: torch.Tensor | None = None,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Route the input hidden states to the top-k experts based on the
    router logits.

    Returns:
        (topk_weights, topk_ids)
        (tuple[torch.Tensor, torch.Tensor]):
        The weights and expert ids computation result.

        **Compatibility**: When EPLB is not enabled, the returned ids are
        equivalent to global logical ids, so should be compatible with
        plain MoE implementations without redundant experts.
    """

    topk_weights, topk_ids = self._select_experts(
        hidden_states,
        router_logits,
        topk_indices_dtype=topk_indices_dtype,
        input_ids=input_ids,
    )

    # Write routing data for non-monolithic path (Triton, etc.)
    # (set by bind_routing_capture_to_model during capturer init)
    if self._routing_replay_out is not None:
        self._routing_replay_out[: topk_ids.shape[0]].copy_(
            topk_ids.to(torch.int16)
        )

    return topk_weights, topk_ids

GateLinear

Bases: ReplicatedLinear

MoE gate linear layer with multi-tier GEMM dispatch:

  1. DSV3 specialized kernel (SM90+, fp32 out, M<=16, H=7168, E=256/384)
  2. fp32 specialized kernel (SM90+, bf16/fp32 in, fp32 out, M<=32, H=3072, E=256)
  3. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 weight + fp32 out_dtype)
  4. F.linear via ReplicatedLinear (ultimate fallback)

The out_dtype attribute is mutable and can be set after init (e.g. when the required dtype depends on the expert quantization method which is only known later).

Methods:

  • set_out_dtype

    Set output dtype for the router logits after init.

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py
@PluggableLayer.register("gate_linear")
class GateLinear(ReplicatedLinear):
    """MoE gate linear layer with multi-tier GEMM dispatch:

    1. DSV3 specialized kernel (SM90+, fp32 out, M<=16, H=7168, E=256/384)
    2. fp32 specialized kernel  (SM90+, bf16/fp32 in, fp32 out,
       M<=32, H=3072, E=256)
    3. cuBLAS bf16×bf16→fp32 (SM90+ + bf16 weight + fp32 out_dtype)
    4. F.linear via ReplicatedLinear (ultimate fallback)

    The ``out_dtype`` attribute is mutable and can be set after init
    (e.g. when the required dtype depends on the expert quantization
    method which is only known later).
    """

    # Dimensions supported by the DSV3 specialized kernel
    DSV3_SUPPORTED_NUM_EXPERTS = [256, 384]
    DSV3_SUPPORTED_HIDDEN_SIZES = [7168]

    # Dimensions supported by the fp32 specialized kernel
    FP32_SUPPORTED_NUM_EXPERTS = [256]
    FP32_SUPPORTED_HIDDEN_SIZES = [3072]
    FP32_MAX_TOKENS = 32

    def __init__(
        self,
        input_size: int,
        output_size: int,
        bias: bool = False,
        out_dtype: torch.dtype | None = None,
        params_dtype: torch.dtype | None = None,
        force_fp32_compute: bool = False,
        prefix: str = "",
    ):
        is_hopper_or_blackwell = current_platform.is_device_capability(
            (9, 0)
        ) or current_platform.is_device_capability_family(100)
        can_use_specialized_kernels = (
            current_platform.is_cuda() and is_hopper_or_blackwell and not bias
        )

        # If fp32 compute is required and no specialized kernel is available,
        # store weights in fp32 so the fallback linear path computes in fp32.
        if force_fp32_compute and not can_use_specialized_kernels:
            params_dtype = torch.float32

        super().__init__(
            input_size,
            output_size,
            bias=bias,
            params_dtype=params_dtype,
            quant_config=None,
            prefix=prefix,
        )
        self.out_dtype = out_dtype

        # DSV3 specialized kernel eligibility (SM90+, exact dims)
        self.allow_specialized_router_gemm = can_use_specialized_kernels
        self.allow_dsv3_router_gemm = (
            self.allow_specialized_router_gemm
            and output_size in self.DSV3_SUPPORTED_NUM_EXPERTS
            and input_size in self.DSV3_SUPPORTED_HIDDEN_SIZES
        )

        # fp32 specialized kernel eligibility (SM90+, exact dims, fp32 weight)
        self.allow_fp32_router_gemm = (
            not bias
            and self.weight.dtype == torch.float32
            and current_platform.is_cuda()
            and is_hopper_or_blackwell
            and output_size in self.FP32_SUPPORTED_NUM_EXPERTS
            and input_size in self.FP32_SUPPORTED_HIDDEN_SIZES
        )

        # cuBLAS bf16→fp32 eligibility
        self.allow_cublas_router_gemm = (
            self.allow_specialized_router_gemm
            and self.weight.dtype == torch.bfloat16
            and self.out_dtype == torch.float32
        )

    def set_out_dtype(self, out_dtype: torch.dtype) -> None:
        """Set output dtype for the router logits after init.

        Useful when the required dtype depends on the expert quantization
        method which is only known after the gate is constructed.
        """
        if self.out_dtype is not None:
            raise ValueError("out_dtype has already been set")
        self.out_dtype = out_dtype

        if (
            not self.allow_cublas_router_gemm
            and self.allow_specialized_router_gemm
            and out_dtype == torch.float32
        ):
            self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16

    def forward(
        self, x: torch.Tensor
    ) -> torch.Tensor | tuple[torch.Tensor, Parameter | None]:
        # Tier 1: DSV3 specialized kernel
        if self.allow_dsv3_router_gemm and x.shape[0] <= 16:
            output = ops.dsv3_router_gemm(
                hidden_states=x,
                router_weight=self.weight,
                output_dtype=self.out_dtype,
            )
            return output, None

        # Tier 2: fp32 specialized kernel (H=3072, E=256, M<=32)
        # Dispatch is wrapped in a custom op so that torch.compile/CUDA-graph
        # capture does not freeze the runtime num_tokens branch.
        if self.allow_fp32_router_gemm and x.dtype in (
            torch.float32,
            torch.bfloat16,
        ):
            output = torch.ops.vllm.fp32_router_gemm_dispatch(x, self.weight)
            return output, None

        # Tier 3: cuBLAS bf16→fp32
        if self.allow_cublas_router_gemm and x.dtype == torch.bfloat16:
            output = torch.mm(x, self.weight.T, out_dtype=torch.float32)
            return output, None

        # Tier 4: F.linear (ReplicatedLinear)
        if self.out_dtype is not None and x.dtype != self.weight.dtype:
            x = x.to(self.weight.dtype)
        output, output_bias = super().forward(x)
        if self.out_dtype is not None and output.dtype != self.out_dtype:
            output = output.to(self.out_dtype)
        return output, output_bias

set_out_dtype(out_dtype)

Set output dtype for the router logits after init.

Useful when the required dtype depends on the expert quantization method which is only known after the gate is constructed.

Source code in vllm/model_executor/layers/fused_moe/router/gate_linear.py
def set_out_dtype(self, out_dtype: torch.dtype) -> None:
    """Set output dtype for the router logits after init.

    Useful when the required dtype depends on the expert quantization
    method which is only known after the gate is constructed.
    """
    if self.out_dtype is not None:
        raise ValueError("out_dtype has already been set")
    self.out_dtype = out_dtype

    if (
        not self.allow_cublas_router_gemm
        and self.allow_specialized_router_gemm
        and out_dtype == torch.float32
    ):
        self.allow_cublas_router_gemm = self.weight.dtype == torch.bfloat16

GroupedTopk

Bases: CustomOp

GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model.

Source code in vllm/model_executor/layers/fused_moe/router/grouped_topk_router.py
@CustomOp.register("grouped_topk")
class GroupedTopk(CustomOp):
    """GroupedTopk used by the Deepseek-V2 and Deepseek-V3 model."""

    # --8<-- [end:grouped_topk]

    def __init__(
        self,
        topk: int,
        renormalize: bool,
        num_expert_group: int = 0,
        topk_group: int = 0,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        num_fused_shared_experts: int = 0,
    ) -> None:
        super().__init__()
        self.native_impl = grouped_topk
        self.topk = topk
        self.renormalize = renormalize
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.scoring_func = scoring_func
        self.routed_scaling_factor = routed_scaling_factor
        self.num_fused_shared_experts = num_fused_shared_experts

    def forward_native(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        e_score_correction_bias: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        return self.native_impl(
            hidden_states,
            gating_output,
            self.topk,
            self.renormalize,
            self.num_expert_group,
            self.topk_group,
            self.scoring_func,
            self.routed_scaling_factor,
            e_score_correction_bias,
        )

    def forward_cuda(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        e_score_correction_bias: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        return self.forward_native(
            hidden_states, gating_output, e_score_correction_bias
        )

    def forward_hip(
        self,
        hidden_states: torch.Tensor,
        gating_output: torch.Tensor,
        e_score_correction_bias: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if rocm_aiter_ops.is_fused_moe_enabled():
            if not rocm_aiter_ops.is_fusion_moe_shared_experts_enabled():
                assert self.num_fused_shared_experts == 0
            return rocm_aiter_grouped_topk(
                hidden_states,
                gating_output,
                self.topk,
                self.renormalize,
                self.num_expert_group,
                self.topk_group,
                self.scoring_func,
                self.routed_scaling_factor,
                e_score_correction_bias,
                self.num_fused_shared_experts,
            )
        else:
            return self.forward_native(
                hidden_states, gating_output, e_score_correction_bias
            )

MoEActivation

Bases: Enum

Activation functions for MoE layers.

Methods:

  • from_str

    Parse from string for backward compatibility.

  • without_mul

    Get the non-gated variant of this activation.

Attributes:

  • custom_op_name (str) –

    Maps to the CustomOp name of activations

  • is_gated (bool) –

    Returns True if activation expects gate*activation(up) pattern.

Source code in vllm/model_executor/layers/fused_moe/activation.py
class MoEActivation(Enum):
    """Activation functions for MoE layers."""

    # Gated activations (gate * activation(up)) expect input of shape [..., 2*d]
    # and produce output of shape [..., d]
    SILU = "silu"
    GELU = "gelu"
    GELU_TANH = "gelu_tanh"
    RELU2 = "relu2"
    SWIGLUOAI = "swigluoai"
    SWIGLUSTEP = "swiglustep"

    # Non-gated activations (no mul with gate) expect input of shape [..., d]
    # and produce output of shape [..., d].
    # NOTE: Non-gated activations require the "_no_mul" suffix to be present.
    SILU_NO_MUL = "silu_no_mul"
    GELU_NO_MUL = "gelu_no_mul"
    GELU_TANH_NO_MUL = "gelu_tanh_no_mul"
    RELU2_NO_MUL = "relu2_no_mul"

    @property
    def is_gated(self) -> bool:
        """Returns True if activation expects gate*activation(up) pattern.

        Gated activations expect input tensor with 2x the output size,
        where the first half is the gate and second half is the up projection.
        """
        return not self.value.endswith("_no_mul")

    @property
    def custom_op_name(self) -> str:
        """Maps to the CustomOp name of activations
        in vllm/model_executor/layers/activation.py."""
        return _CUSTOM_OP_NAMES[self]

    def without_mul(self) -> "MoEActivation":
        """Get the non-gated variant of this activation.

        For activations that have a _no_mul variant, returns that variant.
        For activations without a _no_mul variant (or already _no_mul),
        returns self.
        """
        return _WITHOUT_MUL.get(self, self)

    @classmethod
    def from_str(cls, s: str) -> "MoEActivation":
        """Parse from string for backward compatibility."""
        s = _STR_ALIASES.get(s, s)
        for member in cls:
            if member.value == s:
                return member
        valid = [m.value for m in cls]
        raise ValueError(f"Unknown MoE activation: {s!r}. Valid activations: {valid}")

custom_op_name property

Maps to the CustomOp name of activations in vllm/model_executor/layers/activation.py.

is_gated property

Returns True if activation expects gate*activation(up) pattern.

Gated activations expect input tensor with 2x the output size, where the first half is the gate and second half is the up projection.

from_str(s) classmethod

Parse from string for backward compatibility.

Source code in vllm/model_executor/layers/fused_moe/activation.py
@classmethod
def from_str(cls, s: str) -> "MoEActivation":
    """Parse from string for backward compatibility."""
    s = _STR_ALIASES.get(s, s)
    for member in cls:
        if member.value == s:
            return member
    valid = [m.value for m in cls]
    raise ValueError(f"Unknown MoE activation: {s!r}. Valid activations: {valid}")

without_mul()

Get the non-gated variant of this activation.

For activations that have a _no_mul variant, returns that variant. For activations without a _no_mul variant (or already _no_mul), returns self.

Source code in vllm/model_executor/layers/fused_moe/activation.py
def without_mul(self) -> "MoEActivation":
    """Get the non-gated variant of this activation.

    For activations that have a _no_mul variant, returns that variant.
    For activations without a _no_mul variant (or already _no_mul),
    returns self.
    """
    return _WITHOUT_MUL.get(self, self)

MoERunner

Bases: MoERunnerInterface

Standard MoE runner implementation for executing Mixture of Experts layers.

This is the primary concrete implementation of MoE execution logic, providing comprehensive support for standard MoE operations. It handles: - Expert routing and token dispatching using various routing strategies - Shared experts computation with optional parallel execution using CUDA streams - Tensor model parallel and expert parallel operations - Multiple quantization methods and optimized kernel selection - Both monolithic and decomposed expert execution paths - Integration with various parallel execution modes (TP, EP, DP)

The runner orchestrates the complete MoE forward pass including routing tokens to experts, executing expert computations in parallel, and combining results. It supports advanced features like overlapped execution of shared experts, optimized kernels for different parallel configurations, and seamless integration with vLLM's distributed execution framework.

Eventually, this class may be split into more specialized implementations for different configurations (e.g., with/without shared experts, gates, etc.).

Methods:

Attributes:

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
class MoERunner(MoERunnerInterface):
    """
    Standard MoE runner implementation for executing Mixture of Experts layers.

    This is the primary concrete implementation of MoE execution logic, providing
    comprehensive support for standard MoE operations. It handles:
    - Expert routing and token dispatching using various routing strategies
    - Shared experts computation with optional parallel execution using CUDA streams
    - Tensor model parallel and expert parallel operations
    - Multiple quantization methods and optimized kernel selection
    - Both monolithic and decomposed expert execution paths
    - Integration with various parallel execution modes (TP, EP, DP)

    The runner orchestrates the complete MoE forward pass including routing tokens
    to experts, executing expert computations in parallel, and combining results.
    It supports advanced features like overlapped execution of shared experts,
    optimized kernels for different parallel configurations, and seamless
    integration with vLLM's distributed execution framework.

    Eventually, this class may be split into more specialized implementations
    for different configurations (e.g., with/without shared experts, gates, etc.).
    """

    def __init__(
        self,
        layer_name: str,
        moe_config: FusedMoEConfig,
        router: FusedMoERouter,
        routed_experts: RoutedExperts,
        enable_dbo: bool = False,
        gate: torch.nn.Module | None = None,
        shared_experts: torch.nn.Module | None = None,
        shared_expert_gate: torch.nn.Module | None = None,
        routed_input_transform: torch.nn.Module | None = None,
        routed_output_transform: torch.nn.Module | None = None,
        routed_scaling_factor: float = 1.0,
    ):
        super().__init__()
        self.moe_config = moe_config
        self.router = router
        self.routed_input_transform = routed_input_transform
        self.routed_output_transform = routed_output_transform
        self.routed_scaling_factor = routed_scaling_factor
        self.gate = gate
        self.shared_expert_gate = shared_expert_gate
        self.routed_experts = routed_experts
        self.enable_dbo = enable_dbo

        # When both gates are present and FSE is enabled, fuse their
        # weight matrices into [num_experts + num_shared, hidden] so one
        # F.linear produces combined logits. The topk kernel can then
        # apply routing softmax and shared expert activation (sigmoid)
        # in a single launch.
        self._fse_fuse_gate = gate is not None and shared_expert_gate is not None
        self._combined_gate_weight: torch.Tensor | None = None

        self._shared_experts: SharedExperts | None = None
        if shared_experts is not None:
            can_overlap = lambda: self._quant_method.mk_can_overlap_shared_experts
            self._shared_experts = SharedExperts(
                shared_experts,
                moe_config=moe_config,
                enable_dbo=enable_dbo,
                mk_can_overlap_shared_experts=can_overlap,
            )

        # Needed for string -> MoERunner layer lookup in custom ops.
        self.layer_name = layer_name

        self._forward_entry = self._select_forward()

        # For smuggling this layer into the fused moe custom op
        register_layer_for_moe_forward_op(get_current_vllm_config(), self)

    def _select_forward(self) -> Callable:
        if current_platform.is_tpu() or current_platform.is_cpu():
            # TODO: Once the OOM issue for the TPU backend is resolved, we
            # will switch to using the moe_forward custom op.
            # Note: CPU doesn't require wrapped _forward_impl.
            return _moe_forward if self._shared_experts is None else _moe_forward_shared

        return (
            torch.ops.vllm.moe_forward
            if self._shared_experts is None
            else torch.ops.vllm.moe_forward_shared
        )

    @property
    def shared_experts(self) -> SharedExperts | None:
        return self._shared_experts

    @property
    def is_internal_router(self) -> bool:
        return self.gate is not None

    # TODO(bnell): Temporary hack. Get rid of this.
    def _replace_quant_method(self, quant_method: FusedMoEMethodBase):
        self.routed_experts._replace_quant_method(quant_method)

    # TODO(bnell): Hack for elastic_ep. Get rid of this
    def _set_moe_config(self, new_moe_config: FusedMoEConfig):
        self.moe_config = new_moe_config
        self.routed_experts._set_moe_config(new_moe_config)
        if self._shared_experts is not None:
            self._shared_experts._set_moe_config(new_moe_config)

    def _maybe_fuse_gate_weights(self):
        """Fuse router and shared expert gate weights on first call.

        Cannot be done at __init__ because gate weights are loaded after
        module construction (via weight_loader). Called once from
        _forward_impl before the first forward pass.
        """
        if self._combined_gate_weight is None:
            assert self.gate is not None and self.shared_expert_gate is not None
            self._combined_gate_weight = torch.cat(
                [self.gate.weight, self.shared_expert_gate.weight],
                dim=0,
            )

    @property
    def _quant_method(self) -> FusedMoEMethodBase:
        return self.routed_experts.quant_method

    def apply_routed_input_transform(
        self, hidden_states: torch.Tensor
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Apply transform for routed experts (e.g., latent projection).

        This is called by FusedMoE.forward_native. The original hidden_states
        is saved separately so shared experts get [S, hidden_size] while
        routed experts get the transformed [S, moe_latent_size].

        Returns (possibly transformed) hidden states and the input for shared
        experts (or None if there are no shared experts).
        """
        if self.routed_input_transform is not None:
            result = self.routed_input_transform(hidden_states)
            # ReplicatedLinear returns (output, extra_bias) tuple.
            # We only need the output tensor; extra_bias is not used here.
            if isinstance(result, tuple):
                return result[0], hidden_states
            return result, hidden_states

        return (
            hidden_states,
            hidden_states if self._shared_experts is not None else None,
        )

    def apply_routed_output_transform(
        self,
        fused_output: torch.Tensor,
    ) -> torch.Tensor:
        """Apply transform to routed expert output (e.g., latent to full dim).

        Used by latent MoE models (e.g., NemotronH) where routed experts
        operate in a compressed latent space and need projection back to
        the full hidden dimension before combining with shared expert output.
        """
        if self.routed_output_transform is not None:
            r = self.routed_output_transform(fused_output)
            fused_output = r[0] if isinstance(r, tuple) else r
        return fused_output

    def _maybe_apply_routed_scale_to_output(
        self,
        shared_output: torch.Tensor | None,
        fused_output: torch.Tensor,
    ) -> tuple[torch.Tensor | None, torch.Tensor]:
        """Apply routed_scaling_factor to the output with FP16 overflow
        protection.

        Scale the fused expert output by routed_scaling_factor. For FP16,
        avoid overflow by dividing shared_output by the scale instead
        (the decoder layer compensates with matching divisions).
        """
        if self.routed_scaling_factor != 1.0:
            if fused_output.dtype != torch.float16 or shared_output is None:
                fused_output *= self.routed_scaling_factor
            elif shared_output is not None:
                shared_output *= 1.0 / self.routed_scaling_factor
        return shared_output, fused_output

    @property
    def _fused_output_is_reduced(self) -> bool:
        return (
            self._quant_method.moe_kernel is not None
            and self._quant_method.moe_kernel.output_is_reduced()
        )

    def _maybe_reduce_shared_expert_output(
        self,
        shared_output: torch.Tensor | None,
    ) -> torch.Tensor | None:
        """All-reduce shared expert output when the combine kernel already
        reduced fused output.

        * If the combine kernel does the reduction for fused_output, reduce
          shared_output separately. O.w, reduce fused_output+shared_output later.
        * If we have SP (TP=N, DP=M, EP), there is a separate AG step handled
          in the model.
        """
        if (
            shared_output is not None
            and not self.moe_config.is_sequence_parallel
            and self._fused_output_is_reduced
        ):
            shared_output = tensor_model_parallel_all_reduce(shared_output)
        return shared_output

    def _maybe_reduce_final_output(
        self,
        states: torch.Tensor,
        trunc_size: int | None,
    ) -> torch.Tensor:
        """All-reduce the combined output if needed.

        This is the "late" all-reduce path. When neither fused nor shared
        output was individually reduced, the combined sum is all-reduced
        here. Skipped when sequence-parallel is active (SP handles its
        own reduction) or when the early path already reduced both outputs.
        """
        # We don't need to reduce the final output if:
        # - We are not running with TP or DP
        # - The MK already reduced the fused output itself.
        if (
            not self.moe_config.is_sequence_parallel
            and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1)
            and not self._fused_output_is_reduced
        ):
            states = tensor_model_parallel_all_reduce(states)

        return states[..., :trunc_size] if trunc_size is not None else states

    def _encode_layer_name(self) -> str | LayerName:
        if _USE_LAYERNAME:
            return LayerName(self.layer_name)
        # Can be unavailable or None in unittests
        if (
            is_forward_context_available()
            and get_forward_context().all_moe_layers is not None
        ):
            return "from_forward_context"
        return self.layer_name

    def _maybe_pad_hidden_states(
        self,
        shared_experts_input: torch.Tensor | None,
        hidden_states: torch.Tensor,
    ) -> tuple[torch.Tensor, int | None, int | None]:
        """Pad hidden_states to moe_config.hidden_dim and compute the
        original dimension for later truncation.

        For latent MoE, the routed hidden_states may be smaller than
        hidden_dim. Padding ensures uniform tensor sizes through the
        fused MoE kernel. The returned trunc_size is used by
        _maybe_reduce_final_output to strip the padding from the result.
        """
        shared_experts_hidden_dim = (
            shared_experts_input.shape[-1] if shared_experts_input is not None else 0
        )
        transformed_hidden_dim: int | None = hidden_states.shape[-1]
        if (
            not self._quant_method.skip_forward_padding
            and self.moe_config.hidden_dim != transformed_hidden_dim
        ):
            assert transformed_hidden_dim is not None
            hidden_states = F.pad(
                hidden_states,
                (0, self.moe_config.hidden_dim - transformed_hidden_dim),
                mode="constant",
                value=0.0,
            )

        # Truncation sizes for stripping kernel padding from the output.
        # None means no truncation needed (no padding was applied).
        #
        # Two truncation points exist in forward():
        #   pre_xform:  applied to fused_output BEFORE routed_output_transform
        #   post_xform: applied to the final result AFTER all-reduce
        #
        # Latent MoE with shared experts (NemotronH):
        #   - pre_xform strips padding from the latent dim so
        #     routed_output_transform receives the correct input size
        #   - post_xform truncates to shared_experts_hidden_dim (full hidden)
        #     after shared + routed outputs are combined and all-reduced
        #
        # Standard MoE / MoE without transforms (GPT-OSS, Mixtral):
        #   - pre_xform is None (no early truncation)
        #   - post_xform strips padding after all-reduce (or None if unpadded)
        if transformed_hidden_dim == hidden_states.shape[-1]:
            transformed_hidden_dim = None

        if self.routed_output_transform is not None and shared_experts_hidden_dim > 0:
            pre_xform_trunc_size = transformed_hidden_dim
            post_xform_trunc_size = shared_experts_hidden_dim
        else:
            pre_xform_trunc_size = None
            post_xform_trunc_size = transformed_hidden_dim

        return hidden_states, pre_xform_trunc_size, post_xform_trunc_size

    def _maybe_apply_shared_experts(
        self,
        shared_experts_input: torch.Tensor | None,
        order: SharedExpertsOrder,
    ):
        if self._shared_experts is not None:
            assert shared_experts_input is not None
            self._shared_experts(shared_experts_input, order)

    def _apply_quant_method(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        shared_experts_input: torch.Tensor | None,
        input_ids: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor | None, torch.Tensor]:
        """Run expert routing and the fused MoE kernel via the quant method.

        Orchestrates shared expert execution (before/after), expert selection
        via the router, and the actual fused MoE computation. Returns
        (shared_expert_output, fused_expert_output).
        """
        self._maybe_apply_shared_experts(
            shared_experts_input, SharedExpertsOrder.NO_OVERLAP
        )

        if self.routed_experts.quant_method.is_monolithic:
            # Monolithic kernels: pass router_logits to routed_experts
            fused_out = self.routed_experts.forward_monolithic(
                x=hidden_states,
                router_logits=router_logits,
                input_ids=input_ids,
            )
        else:
            # Modular kernels: select experts first, then call routed_experts
            topk_weights, topk_ids = self.router.select_experts(
                hidden_states=hidden_states,
                router_logits=router_logits,
                topk_indices_dtype=self._quant_method.topk_indices_dtype,
                input_ids=input_ids,
            )

            fused_out = self.routed_experts.forward_modular(
                x=hidden_states,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                shared_experts=self._shared_experts,
                shared_experts_input=shared_experts_input,
            )

        self._maybe_apply_shared_experts(
            shared_experts_input,
            SharedExpertsOrder.MULTI_STREAM_OVERLAPPED,
        )

        return (
            self._shared_experts.output if self._shared_experts is not None else None,
            fused_out,
        )

    def _sequence_parallel_context(self):
        """Return a context manager for sequence-parallel token
        redistribution.

        When sequence parallelism is active, returns a context that handles
        local size tracking for proper token scatter/gather. Otherwise
        returns a no-op context.
        """
        ctx = get_forward_context()
        return (
            ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size)
            if ctx.dp_metadata
            else nullcontext()
        )

    def _maybe_sync_shared_experts_stream(
        self,
        shared_experts_input: torch.Tensor | None,
    ):
        # If router/gate provided, then apply it here.
        # (Note: This code runs only when "overlapped mode" is on to allow
        #        parallel execution of shared experts with the FusedMoE via
        #        separate cuda stream)
        if self._shared_experts is not None:
            assert shared_experts_input is not None
            self._shared_experts.maybe_sync_shared_experts_stream(shared_experts_input)

    def _maybe_add_zero_expert_output(
        self,
        result: torch.Tensor,
    ) -> torch.Tensor:
        """Add the zero expert's contribution to the final result.

        When a ZeroExpertRouter is used, it computes a bias-like output
        from the "zero expert" that is added to the combined routed+shared
        expert output.
        """
        if isinstance(self.router, ZeroExpertRouter):
            zero_expert_output = self.router.zero_expert_output
            assert zero_expert_output is not None
            result = result + zero_expert_output
        return result

    def forward(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Invoke the fused moe layer.

        Input:
        - hidden_states
        - router_logits

        Output:
        - The new hidden_states.

        Calling sequence
        - forward
          - self._forward_entry (_moe_forward or _moe_forward_shared custom op)
            - _forward_impl

        Note: The existence of _moe_forward and _moe_forward_shared custom ops are due
        to the following reason:
        1. pytorch cannot handle union types in custom op signatures so
           _moe_forward and _moe_forward_shared must be split.
        """

        # Apply transform for routed experts (e.g., latent projection
        # for latent MoE)
        hidden_states, shared_experts_input = self.apply_routed_input_transform(
            hidden_states
        )

        # Record before `_maybe_pad_hidden_states` pads activations to match
        # `moe_config.hidden_dim`, e.g. after `align_trtllm_fp4_moe_hidden_dim_for_fi`
        # so routed output can be trimmed before
        # shared+routed add / latent up proj if needed.

        hidden_states, og_hidden_dim_pre_xform, og_hidden_dim_post_xform = (
            self._maybe_pad_hidden_states(
                shared_experts_input,
                hidden_states,
            )
        )

        result = self._forward_entry(
            hidden_states,
            router_logits,
            shared_experts_input,
            input_ids,
            self._encode_layer_name(),
            self.moe_config.hidden_dim_unpadded
            if self._quant_method.has_unpadded_output
            else 0,
        )

        #
        # Note: there are two all-reduce points below. They are mutually
        # exclusive, controlled by _fused_output_is_reduced
        #  - When True: the combine kernel already reduced fused_output,
        #    so we reduce shared_output here to match, then skip the
        #    all-reduce in _maybe_reduce_final_output.
        #  - When False: neither output is reduced yet, so we combine
        #    them first and all-reduce the sum in _maybe_reduce_final_output.

        # Extract outputs from result
        shared_output, fused_output = _unpack(result)

        if og_hidden_dim_pre_xform is not None:
            fused_output = fused_output[..., :og_hidden_dim_pre_xform]

        # If combine kernel already reduced fused, reduce shared to match.
        # See note above re: the two all-reduce points.
        shared_output = self._maybe_reduce_shared_expert_output(shared_output)

        shared_output, fused_output = self._maybe_apply_routed_scale_to_output(
            shared_output, fused_output
        )

        # Apply output transform (e.g. latent -> full dim)
        fused_output = self.apply_routed_output_transform(fused_output)

        if shared_output is not None:
            result = shared_output + fused_output
        else:
            result = fused_output

        result = self._maybe_reduce_final_output(result, og_hidden_dim_post_xform)

        return self._maybe_add_zero_expert_output(result)

    @property
    def do_naive_dispatch_combine(self) -> bool:
        return (
            self.moe_config.dp_size > 1 and not self._quant_method.supports_internal_mk
        )

    def _maybe_dispatch(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # For naive dispatch/combine Dp/Ep, dispatch the hidden states and
        # router logits to all experts.
        # NOTE: this will be removed once all kernels are migrated into the
        # MoEKernel framework.
        if self.do_naive_dispatch_combine:
            result = get_ep_group().dispatch_router_logits(
                hidden_states,
                router_logits,
                self.moe_config.is_sequence_parallel,
            )
            assert len(result) == 2
            hidden_states, router_logits = result

        # NOTE: Similar with DP, PCP also needs dispatch and combine. For
        # simplicity, AgRsAll2All was added separately for PCP here. Maybe
        # we should modify All2AllManager abstraction to better support PCP.
        if self.moe_config.pcp_size > 1:
            hidden_states = get_pcp_group().all_gather(
                hidden_states,
                dim=0,
            )
            router_logits = get_pcp_group().all_gather(
                router_logits,
                dim=0,
            )

        return hidden_states, router_logits

    def _maybe_combine(
        self,
        shared_output: torch.Tensor | None,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | tuple[torch.Tensor | None, torch.Tensor]:
        if self.do_naive_dispatch_combine:
            hidden_states = get_ep_group().combine(
                hidden_states, self.moe_config.is_sequence_parallel
            )

        if self.moe_config.pcp_size > 1:
            hidden_states = get_pcp_group().reduce_scatter(
                hidden_states,
                dim=0,
            )

        if self.shared_experts is not None:
            assert shared_output is not None
            return shared_output, hidden_states
        else:
            return hidden_states

    def _forward_impl(
        self,
        hidden_states: torch.Tensor,
        router_logits: torch.Tensor,
        shared_experts_input: torch.Tensor | None,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
        """Entry point called by the custom op to run the MoE computation.

        Handles pre-dispatch setup (gate application, external shared expert
        triggering, quant config init) then performs the following steps
        within the sequence-parallel context.

        - Performs expert routing
        - fused MoE kernel execution
        - shared expert computation.

        Returns a single tensor of combined fused and shared output (if present).
        """
        # TODO(bnell): this can be removed after MK migration is complete.
        self.routed_experts._ensure_moe_quant_config_init()

        # Sync aux and main stream for shared expert multi-stream overlap.
        self._maybe_sync_shared_experts_stream(shared_experts_input)

        # If the Runner holds the gate, apply it after the stream sync,
        # so it can run overlapped with the
        # NOTE: in future PR, MoE runner will always hold the gate.
        if self.gate is not None:
            if self._fse_fuse_gate:
                self._maybe_fuse_gate_weights()
                router_logits = F.linear(hidden_states, self._combined_gate_weight)
            else:
                router_logits, _ = self.gate(hidden_states)

        with self._sequence_parallel_context():
            # TODO(bnell): parts of the dispatch/combine steps will go away once
            # #32567 lands and the remaining kernels are made MKs.  The PCP
            # code will probably remain
            hidden_states, router_logits = self._maybe_dispatch(
                hidden_states,
                router_logits,
            )

            shared_output, hidden_states = self._apply_quant_method(
                hidden_states=hidden_states,
                router_logits=router_logits,
                shared_experts_input=shared_experts_input,
                input_ids=input_ids,
            )

            return self._maybe_combine(
                shared_output,
                hidden_states,
            )

    #########################################################
    #
    # Old methods from FusedMoE layer. Remove when possible.
    #
    #########################################################

    # Note: maybe_init_modular_kernel should only be called by
    # prepare_communication_buffer_for_model.
    # This is called after all weight loading and post-processing, so it
    # should be safe to swap out the quant_method.
    def maybe_init_modular_kernel(self) -> None:
        # NOTE(rob): WIP refactor. For quant methods that own the MK
        # we create the MK during process_weights_after_loading.
        if (
            self.routed_experts.quant_method.supports_internal_mk
            or self.routed_experts.quant_method.is_monolithic
        ):
            return None

        self.routed_experts._ensure_moe_quant_config_init()
        # routing_tables only needed for round-robin expert placement with
        # DeepEP all2all backend.
        routing_tables = self._expert_routing_tables()

        if isinstance(self.routed_experts.quant_method, FusedMoEModularMethod):
            base_quant_method = self.routed_experts.quant_method.old_quant_method
        else:
            base_quant_method = self.routed_experts.quant_method

        prepare_finalize = base_quant_method.maybe_make_prepare_finalize(
            routing_tables=routing_tables
        )
        if prepare_finalize is not None:
            logger.debug(
                "%s for %s(%s)", prepare_finalize.__class__.__name__, self, id(self)
            )
            self._replace_quant_method(
                FusedMoEModularMethod.make(
                    self.routed_experts,
                    base_quant_method,
                    prepare_finalize,
                )
            )

    #
    # Properties
    #

    @property
    def layer_id(self):
        # Delayed import to avoid circular dependency
        from vllm.model_executor.models.utils import extract_layer_index

        return extract_layer_index(self.layer_name)

    #
    # Attributes still needed by models
    #

    @property
    def is_monolithic(self) -> bool:
        return self.routed_experts.quant_method.is_monolithic

    @property
    def activation(self) -> MoEActivation:
        return self.routed_experts.activation

    #
    # Expert maps
    #

    @property
    def expert_map_manager(self):
        """Forward to routed_experts.expert_map_manager for backward compatibility."""
        return self.routed_experts.expert_map_manager

    @property
    def expert_placement_strategy(self) -> ExpertPlacementStrategy:
        return self.expert_map_manager.placement_strategy

    @property
    def expert_global_to_physical(self) -> torch.Tensor | None:
        tables = self.expert_map_manager.routing_tables
        return tables[0] if tables else None

    @property
    def expert_physical_to_global(self) -> torch.Tensor | None:
        """Routing table: physical expert ID to global expert ID."""
        tables = self.expert_map_manager.routing_tables
        return tables[1] if tables else None

    @property
    def expert_local_to_global(self) -> torch.Tensor | None:
        """Routing table: local expert ID to global expert ID."""
        tables = self.expert_map_manager.routing_tables
        return tables[2] if tables else None

    @property
    def expert_map(self) -> torch.Tensor | None:
        return self.routed_experts.expert_map

    def _expert_routing_tables(
        self,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None:
        return self.routed_experts._expert_routing_tables()

    def update_expert_map(self):
        self.routed_experts.update_expert_map()

    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
        """Map global expert ID to local expert ID."""
        return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id)

    def get_expert_weights(self) -> Iterable[torch.Tensor]:
        return self.routed_experts.get_expert_weights()

    #
    # EPLB
    #

    @property
    def eplb_state(self) -> EplbLayerState | None:
        return self.router.eplb_state

    def set_eplb_state(
        self,
        moe_layer_idx: int,
        expert_load_view: torch.Tensor,
        logical_to_physical_map: torch.Tensor,
        logical_replica_count: torch.Tensor,
    ) -> None:
        """
        Register the EPLB state in this layer.

        This is used later in forward pass, where we get the expert mapping
        and record the load metrics in `expert_load_view`.
        """
        if self.router.eplb_state is not None:
            self.router.eplb_state.set_layer_state(
                moe_layer_idx,
                expert_load_view,
                logical_to_physical_map,
                logical_replica_count,
            )

expert_local_to_global property

Routing table: local expert ID to global expert ID.

expert_map_manager property

Forward to routed_experts.expert_map_manager for backward compatibility.

expert_physical_to_global property

Routing table: physical expert ID to global expert ID.

_apply_quant_method(hidden_states, router_logits, shared_experts_input, input_ids=None)

Run expert routing and the fused MoE kernel via the quant method.

Orchestrates shared expert execution (before/after), expert selection via the router, and the actual fused MoE computation. Returns (shared_expert_output, fused_expert_output).

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _apply_quant_method(
    self,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    shared_experts_input: torch.Tensor | None,
    input_ids: torch.Tensor | None = None,
) -> tuple[torch.Tensor | None, torch.Tensor]:
    """Run expert routing and the fused MoE kernel via the quant method.

    Orchestrates shared expert execution (before/after), expert selection
    via the router, and the actual fused MoE computation. Returns
    (shared_expert_output, fused_expert_output).
    """
    self._maybe_apply_shared_experts(
        shared_experts_input, SharedExpertsOrder.NO_OVERLAP
    )

    if self.routed_experts.quant_method.is_monolithic:
        # Monolithic kernels: pass router_logits to routed_experts
        fused_out = self.routed_experts.forward_monolithic(
            x=hidden_states,
            router_logits=router_logits,
            input_ids=input_ids,
        )
    else:
        # Modular kernels: select experts first, then call routed_experts
        topk_weights, topk_ids = self.router.select_experts(
            hidden_states=hidden_states,
            router_logits=router_logits,
            topk_indices_dtype=self._quant_method.topk_indices_dtype,
            input_ids=input_ids,
        )

        fused_out = self.routed_experts.forward_modular(
            x=hidden_states,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            shared_experts=self._shared_experts,
            shared_experts_input=shared_experts_input,
        )

    self._maybe_apply_shared_experts(
        shared_experts_input,
        SharedExpertsOrder.MULTI_STREAM_OVERLAPPED,
    )

    return (
        self._shared_experts.output if self._shared_experts is not None else None,
        fused_out,
    )

_forward_impl(hidden_states, router_logits, shared_experts_input, input_ids=None)

Entry point called by the custom op to run the MoE computation.

Handles pre-dispatch setup (gate application, external shared expert triggering, quant config init) then performs the following steps within the sequence-parallel context.

  • Performs expert routing
  • fused MoE kernel execution
  • shared expert computation.

Returns a single tensor of combined fused and shared output (if present).

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _forward_impl(
    self,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    shared_experts_input: torch.Tensor | None,
    input_ids: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    """Entry point called by the custom op to run the MoE computation.

    Handles pre-dispatch setup (gate application, external shared expert
    triggering, quant config init) then performs the following steps
    within the sequence-parallel context.

    - Performs expert routing
    - fused MoE kernel execution
    - shared expert computation.

    Returns a single tensor of combined fused and shared output (if present).
    """
    # TODO(bnell): this can be removed after MK migration is complete.
    self.routed_experts._ensure_moe_quant_config_init()

    # Sync aux and main stream for shared expert multi-stream overlap.
    self._maybe_sync_shared_experts_stream(shared_experts_input)

    # If the Runner holds the gate, apply it after the stream sync,
    # so it can run overlapped with the
    # NOTE: in future PR, MoE runner will always hold the gate.
    if self.gate is not None:
        if self._fse_fuse_gate:
            self._maybe_fuse_gate_weights()
            router_logits = F.linear(hidden_states, self._combined_gate_weight)
        else:
            router_logits, _ = self.gate(hidden_states)

    with self._sequence_parallel_context():
        # TODO(bnell): parts of the dispatch/combine steps will go away once
        # #32567 lands and the remaining kernels are made MKs.  The PCP
        # code will probably remain
        hidden_states, router_logits = self._maybe_dispatch(
            hidden_states,
            router_logits,
        )

        shared_output, hidden_states = self._apply_quant_method(
            hidden_states=hidden_states,
            router_logits=router_logits,
            shared_experts_input=shared_experts_input,
            input_ids=input_ids,
        )

        return self._maybe_combine(
            shared_output,
            hidden_states,
        )

_map_global_expert_id_to_local_expert_id(expert_id)

Map global expert ID to local expert ID.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
    """Map global expert ID to local expert ID."""
    return self.routed_experts._map_global_expert_id_to_local_expert_id(expert_id)

_maybe_add_zero_expert_output(result)

Add the zero expert's contribution to the final result.

When a ZeroExpertRouter is used, it computes a bias-like output from the "zero expert" that is added to the combined routed+shared expert output.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _maybe_add_zero_expert_output(
    self,
    result: torch.Tensor,
) -> torch.Tensor:
    """Add the zero expert's contribution to the final result.

    When a ZeroExpertRouter is used, it computes a bias-like output
    from the "zero expert" that is added to the combined routed+shared
    expert output.
    """
    if isinstance(self.router, ZeroExpertRouter):
        zero_expert_output = self.router.zero_expert_output
        assert zero_expert_output is not None
        result = result + zero_expert_output
    return result

_maybe_apply_routed_scale_to_output(shared_output, fused_output)

Apply routed_scaling_factor to the output with FP16 overflow protection.

Scale the fused expert output by routed_scaling_factor. For FP16, avoid overflow by dividing shared_output by the scale instead (the decoder layer compensates with matching divisions).

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _maybe_apply_routed_scale_to_output(
    self,
    shared_output: torch.Tensor | None,
    fused_output: torch.Tensor,
) -> tuple[torch.Tensor | None, torch.Tensor]:
    """Apply routed_scaling_factor to the output with FP16 overflow
    protection.

    Scale the fused expert output by routed_scaling_factor. For FP16,
    avoid overflow by dividing shared_output by the scale instead
    (the decoder layer compensates with matching divisions).
    """
    if self.routed_scaling_factor != 1.0:
        if fused_output.dtype != torch.float16 or shared_output is None:
            fused_output *= self.routed_scaling_factor
        elif shared_output is not None:
            shared_output *= 1.0 / self.routed_scaling_factor
    return shared_output, fused_output

_maybe_fuse_gate_weights()

Fuse router and shared expert gate weights on first call.

Cannot be done at init because gate weights are loaded after module construction (via weight_loader). Called once from _forward_impl before the first forward pass.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _maybe_fuse_gate_weights(self):
    """Fuse router and shared expert gate weights on first call.

    Cannot be done at __init__ because gate weights are loaded after
    module construction (via weight_loader). Called once from
    _forward_impl before the first forward pass.
    """
    if self._combined_gate_weight is None:
        assert self.gate is not None and self.shared_expert_gate is not None
        self._combined_gate_weight = torch.cat(
            [self.gate.weight, self.shared_expert_gate.weight],
            dim=0,
        )

_maybe_pad_hidden_states(shared_experts_input, hidden_states)

Pad hidden_states to moe_config.hidden_dim and compute the original dimension for later truncation.

For latent MoE, the routed hidden_states may be smaller than hidden_dim. Padding ensures uniform tensor sizes through the fused MoE kernel. The returned trunc_size is used by _maybe_reduce_final_output to strip the padding from the result.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _maybe_pad_hidden_states(
    self,
    shared_experts_input: torch.Tensor | None,
    hidden_states: torch.Tensor,
) -> tuple[torch.Tensor, int | None, int | None]:
    """Pad hidden_states to moe_config.hidden_dim and compute the
    original dimension for later truncation.

    For latent MoE, the routed hidden_states may be smaller than
    hidden_dim. Padding ensures uniform tensor sizes through the
    fused MoE kernel. The returned trunc_size is used by
    _maybe_reduce_final_output to strip the padding from the result.
    """
    shared_experts_hidden_dim = (
        shared_experts_input.shape[-1] if shared_experts_input is not None else 0
    )
    transformed_hidden_dim: int | None = hidden_states.shape[-1]
    if (
        not self._quant_method.skip_forward_padding
        and self.moe_config.hidden_dim != transformed_hidden_dim
    ):
        assert transformed_hidden_dim is not None
        hidden_states = F.pad(
            hidden_states,
            (0, self.moe_config.hidden_dim - transformed_hidden_dim),
            mode="constant",
            value=0.0,
        )

    # Truncation sizes for stripping kernel padding from the output.
    # None means no truncation needed (no padding was applied).
    #
    # Two truncation points exist in forward():
    #   pre_xform:  applied to fused_output BEFORE routed_output_transform
    #   post_xform: applied to the final result AFTER all-reduce
    #
    # Latent MoE with shared experts (NemotronH):
    #   - pre_xform strips padding from the latent dim so
    #     routed_output_transform receives the correct input size
    #   - post_xform truncates to shared_experts_hidden_dim (full hidden)
    #     after shared + routed outputs are combined and all-reduced
    #
    # Standard MoE / MoE without transforms (GPT-OSS, Mixtral):
    #   - pre_xform is None (no early truncation)
    #   - post_xform strips padding after all-reduce (or None if unpadded)
    if transformed_hidden_dim == hidden_states.shape[-1]:
        transformed_hidden_dim = None

    if self.routed_output_transform is not None and shared_experts_hidden_dim > 0:
        pre_xform_trunc_size = transformed_hidden_dim
        post_xform_trunc_size = shared_experts_hidden_dim
    else:
        pre_xform_trunc_size = None
        post_xform_trunc_size = transformed_hidden_dim

    return hidden_states, pre_xform_trunc_size, post_xform_trunc_size

_maybe_reduce_final_output(states, trunc_size)

All-reduce the combined output if needed.

This is the "late" all-reduce path. When neither fused nor shared output was individually reduced, the combined sum is all-reduced here. Skipped when sequence-parallel is active (SP handles its own reduction) or when the early path already reduced both outputs.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _maybe_reduce_final_output(
    self,
    states: torch.Tensor,
    trunc_size: int | None,
) -> torch.Tensor:
    """All-reduce the combined output if needed.

    This is the "late" all-reduce path. When neither fused nor shared
    output was individually reduced, the combined sum is all-reduced
    here. Skipped when sequence-parallel is active (SP handles its
    own reduction) or when the early path already reduced both outputs.
    """
    # We don't need to reduce the final output if:
    # - We are not running with TP or DP
    # - The MK already reduced the fused output itself.
    if (
        not self.moe_config.is_sequence_parallel
        and (self.moe_config.tp_size > 1 or self.moe_config.ep_size > 1)
        and not self._fused_output_is_reduced
    ):
        states = tensor_model_parallel_all_reduce(states)

    return states[..., :trunc_size] if trunc_size is not None else states

_maybe_reduce_shared_expert_output(shared_output)

All-reduce shared expert output when the combine kernel already reduced fused output.

  • If the combine kernel does the reduction for fused_output, reduce shared_output separately. O.w, reduce fused_output+shared_output later.
  • If we have SP (TP=N, DP=M, EP), there is a separate AG step handled in the model.
Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _maybe_reduce_shared_expert_output(
    self,
    shared_output: torch.Tensor | None,
) -> torch.Tensor | None:
    """All-reduce shared expert output when the combine kernel already
    reduced fused output.

    * If the combine kernel does the reduction for fused_output, reduce
      shared_output separately. O.w, reduce fused_output+shared_output later.
    * If we have SP (TP=N, DP=M, EP), there is a separate AG step handled
      in the model.
    """
    if (
        shared_output is not None
        and not self.moe_config.is_sequence_parallel
        and self._fused_output_is_reduced
    ):
        shared_output = tensor_model_parallel_all_reduce(shared_output)
    return shared_output

_sequence_parallel_context()

Return a context manager for sequence-parallel token redistribution.

When sequence parallelism is active, returns a context that handles local size tracking for proper token scatter/gather. Otherwise returns a no-op context.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def _sequence_parallel_context(self):
    """Return a context manager for sequence-parallel token
    redistribution.

    When sequence parallelism is active, returns a context that handles
    local size tracking for proper token scatter/gather. Otherwise
    returns a no-op context.
    """
    ctx = get_forward_context()
    return (
        ctx.dp_metadata.sp_local_sizes(self.moe_config.sp_size)
        if ctx.dp_metadata
        else nullcontext()
    )

apply_routed_input_transform(hidden_states)

Apply transform for routed experts (e.g., latent projection).

This is called by FusedMoE.forward_native. The original hidden_states is saved separately so shared experts get [S, hidden_size] while routed experts get the transformed [S, moe_latent_size].

Returns (possibly transformed) hidden states and the input for shared experts (or None if there are no shared experts).

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def apply_routed_input_transform(
    self, hidden_states: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """Apply transform for routed experts (e.g., latent projection).

    This is called by FusedMoE.forward_native. The original hidden_states
    is saved separately so shared experts get [S, hidden_size] while
    routed experts get the transformed [S, moe_latent_size].

    Returns (possibly transformed) hidden states and the input for shared
    experts (or None if there are no shared experts).
    """
    if self.routed_input_transform is not None:
        result = self.routed_input_transform(hidden_states)
        # ReplicatedLinear returns (output, extra_bias) tuple.
        # We only need the output tensor; extra_bias is not used here.
        if isinstance(result, tuple):
            return result[0], hidden_states
        return result, hidden_states

    return (
        hidden_states,
        hidden_states if self._shared_experts is not None else None,
    )

apply_routed_output_transform(fused_output)

Apply transform to routed expert output (e.g., latent to full dim).

Used by latent MoE models (e.g., NemotronH) where routed experts operate in a compressed latent space and need projection back to the full hidden dimension before combining with shared expert output.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def apply_routed_output_transform(
    self,
    fused_output: torch.Tensor,
) -> torch.Tensor:
    """Apply transform to routed expert output (e.g., latent to full dim).

    Used by latent MoE models (e.g., NemotronH) where routed experts
    operate in a compressed latent space and need projection back to
    the full hidden dimension before combining with shared expert output.
    """
    if self.routed_output_transform is not None:
        r = self.routed_output_transform(fused_output)
        fused_output = r[0] if isinstance(r, tuple) else r
    return fused_output

forward(hidden_states, router_logits, input_ids=None)

Invoke the fused moe layer.

Input: - hidden_states - router_logits

Output: - The new hidden_states.

Calling sequence - forward - self._forward_entry (_moe_forward or _moe_forward_shared custom op) - _forward_impl

Note: The existence of _moe_forward and _moe_forward_shared custom ops are due to the following reason: 1. pytorch cannot handle union types in custom op signatures so _moe_forward and _moe_forward_shared must be split.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def forward(
    self,
    hidden_states: torch.Tensor,
    router_logits: torch.Tensor,
    input_ids: torch.Tensor | None = None,
) -> torch.Tensor:
    """Invoke the fused moe layer.

    Input:
    - hidden_states
    - router_logits

    Output:
    - The new hidden_states.

    Calling sequence
    - forward
      - self._forward_entry (_moe_forward or _moe_forward_shared custom op)
        - _forward_impl

    Note: The existence of _moe_forward and _moe_forward_shared custom ops are due
    to the following reason:
    1. pytorch cannot handle union types in custom op signatures so
       _moe_forward and _moe_forward_shared must be split.
    """

    # Apply transform for routed experts (e.g., latent projection
    # for latent MoE)
    hidden_states, shared_experts_input = self.apply_routed_input_transform(
        hidden_states
    )

    # Record before `_maybe_pad_hidden_states` pads activations to match
    # `moe_config.hidden_dim`, e.g. after `align_trtllm_fp4_moe_hidden_dim_for_fi`
    # so routed output can be trimmed before
    # shared+routed add / latent up proj if needed.

    hidden_states, og_hidden_dim_pre_xform, og_hidden_dim_post_xform = (
        self._maybe_pad_hidden_states(
            shared_experts_input,
            hidden_states,
        )
    )

    result = self._forward_entry(
        hidden_states,
        router_logits,
        shared_experts_input,
        input_ids,
        self._encode_layer_name(),
        self.moe_config.hidden_dim_unpadded
        if self._quant_method.has_unpadded_output
        else 0,
    )

    #
    # Note: there are two all-reduce points below. They are mutually
    # exclusive, controlled by _fused_output_is_reduced
    #  - When True: the combine kernel already reduced fused_output,
    #    so we reduce shared_output here to match, then skip the
    #    all-reduce in _maybe_reduce_final_output.
    #  - When False: neither output is reduced yet, so we combine
    #    them first and all-reduce the sum in _maybe_reduce_final_output.

    # Extract outputs from result
    shared_output, fused_output = _unpack(result)

    if og_hidden_dim_pre_xform is not None:
        fused_output = fused_output[..., :og_hidden_dim_pre_xform]

    # If combine kernel already reduced fused, reduce shared to match.
    # See note above re: the two all-reduce points.
    shared_output = self._maybe_reduce_shared_expert_output(shared_output)

    shared_output, fused_output = self._maybe_apply_routed_scale_to_output(
        shared_output, fused_output
    )

    # Apply output transform (e.g. latent -> full dim)
    fused_output = self.apply_routed_output_transform(fused_output)

    if shared_output is not None:
        result = shared_output + fused_output
    else:
        result = fused_output

    result = self._maybe_reduce_final_output(result, og_hidden_dim_post_xform)

    return self._maybe_add_zero_expert_output(result)

set_eplb_state(moe_layer_idx, expert_load_view, logical_to_physical_map, logical_replica_count)

Register the EPLB state in this layer.

This is used later in forward pass, where we get the expert mapping and record the load metrics in expert_load_view.

Source code in vllm/model_executor/layers/fused_moe/runner/moe_runner.py
def set_eplb_state(
    self,
    moe_layer_idx: int,
    expert_load_view: torch.Tensor,
    logical_to_physical_map: torch.Tensor,
    logical_replica_count: torch.Tensor,
) -> None:
    """
    Register the EPLB state in this layer.

    This is used later in forward pass, where we get the expert mapping
    and record the load metrics in `expert_load_view`.
    """
    if self.router.eplb_state is not None:
        self.router.eplb_state.set_layer_state(
            moe_layer_idx,
            expert_load_view,
            logical_to_physical_map,
            logical_replica_count,
        )

RoutedExperts

Bases: PluggableLayer

Container for routed expert weights and execution logic.

This module owns the expert weight parameters (w13_weight, w2_weight, scales, etc.) and handles: - Loading checkpoint weights into parameters - Executing routed experts via quant_method.apply()

Methods:

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
@PluggableLayer.register("routed_experts")
class RoutedExperts(PluggableLayer):
    """
    Container for routed expert weights and execution logic.

    This module owns the expert weight parameters (w13_weight, w2_weight, scales, etc.)
    and handles:
    - Loading checkpoint weights into parameters
    - Executing routed experts via quant_method.apply()
    """

    def __init__(
        self,
        layer_name: str,
        params_dtype: torch.dtype,
        moe_config: FusedMoEConfig,
        quant_config: QuantizationConfig | None,
        expert_map_manager: ExpertMapManager,
        expert_mapping: list[tuple[str, str, int, str]] | None = None,
        #
        # Extra params that are needed by quant_methods, pass along for now
        # Prefer getting these from other sources, e.g. moe_config or
        # router object
        #
        renormalize: bool = True,
        use_grouped_topk: bool = False,
        num_expert_group: int | None = None,
        topk_group: int | None = None,
        custom_routing_function: Callable | None = None,
        scoring_func: str = "softmax",
        routed_scaling_factor: float = 1.0,
        swiglu_limit: float | None = None,
        e_score_correction_bias: torch.Tensor | None = None,
        apply_router_weight_on_input: bool = False,
    ):
        super().__init__()
        self.layer_name = layer_name
        self.moe_config = moe_config
        self.quant_config = quant_config
        self.expert_mapping = expert_mapping
        self.expert_map_manager = expert_map_manager
        self.hidden_size = moe_config.hidden_dim
        self.global_num_experts = moe_config.num_experts
        self.local_num_experts = moe_config.num_local_experts
        self.params_dtype = params_dtype

        # Register buffers for state_dict compatibility
        self.update_expert_map_info()

        self.rocm_aiter_fmoe_enabled = moe_config.rocm_aiter_fmoe_enabled

        # It would be good to eventually codify these in FusedMoEConfig
        # or some other config.
        self.top_k = self.moe_config.experts_per_token
        self.activation = self.moe_config.activation
        self.renormalize = renormalize
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.custom_routing_function = custom_routing_function
        self.scoring_func = scoring_func
        self.routed_scaling_factor = routed_scaling_factor
        self.swiglu_limit = swiglu_limit
        self.e_score_correction_bias = e_score_correction_bias
        self.apply_router_weight_on_input = apply_router_weight_on_input
        # End random parameters

        self.quant_method = self._get_quant_method(
            self.layer_name,
            self.quant_config,
            self.moe_config,
        )

        # Round up hidden size and update moe_config.
        # TODO: move roundup to _get_quant_method?
        self.hidden_size, self.intermediate_size_per_partition = (
            self.quant_method.maybe_roundup_sizes(
                self.hidden_size,
                self.moe_config.intermediate_size_per_partition,
                self.moe_config.in_dtype,
                self.moe_config.moe_parallel_config,
            )
        )
        self.moe_config.hidden_dim = self.hidden_size
        self.moe_config.intermediate_size_per_partition = (
            self.intermediate_size_per_partition
        )

        if (
            self.moe_config.moe_parallel_config.enable_eplb
            and not self.quant_method.supports_eplb
        ):
            # TODO: Add support for additional quantization methods.
            # The implementation for other quantization methods does not
            # contain essential differences, but the current quant API
            # design causes duplicated work when extending to new
            # quantization methods, so I'm leaving it for now.
            # If you plan to add support for more quantization methods,
            # please refer to the implementation in `Fp8MoEMethod`.
            raise NotImplementedError(
                f"EPLB is not supported {self.quant_method.__class__.__name__}."
            )

        moe_quant_params: dict[str, Any] = {
            "num_experts": moe_config.num_local_experts,
            "hidden_size": self.hidden_size,
            "unpadded_hidden_size": self.moe_config.hidden_dim_unpadded,
            "intermediate_size_per_partition": (
                self.moe_config.intermediate_size_per_partition
            ),
            "params_dtype": params_dtype,
            "weight_loader": self.weight_loader,
            "global_num_experts": moe_config.num_experts,
        }

        # need full intermediate size pre-sharding for WNA16 act order
        if self._needs_intermediate_size_param(self.quant_method):
            moe_quant_params["intermediate_size_full"] = (
                self.moe_config.intermediate_size
            )

        self.quant_method.create_weights(layer=self, **moe_quant_params)

    # TODO(bnell): Temporary hack. Get rid of this.
    def _replace_quant_method(self, quant_method: FusedMoEMethodBase):
        self.quant_method = quant_method

    # TODO(bnell): Hack for elastic_ep. Get rid of this
    def _set_moe_config(self, new_moe_config: FusedMoEConfig):
        self.moe_config = new_moe_config
        self.global_num_experts = new_moe_config.num_experts
        # local experts?

    def _get_quant_method(
        self,
        prefix: str,
        quant_config: QuantizationConfig | None,
        moe_config: FusedMoEConfig,
    ) -> FusedMoEMethodBase:
        """
        Helper method to ensure quant_method is never None and
        of the proper type.
        """
        quant_method = None
        if quant_config is not None:
            quant_method = quant_config.get_quant_method(self, prefix)
        if quant_method is None:
            quant_method = UnquantizedFusedMoEMethod(moe_config)
        assert isinstance(quant_method, FusedMoEMethodBase)
        return quant_method

    # TODO(bnell): make this a method on quant_method
    def _needs_intermediate_size_param(self, quant_method: FusedMoEMethodBase) -> bool:
        return quant_method.__class__.__name__ in (
            "AutoGPTQMoEMethod",
            "CompressedTensorsWNA16MarlinMoEMethod",
            "CompressedTensorsWNA16MoEMethod",
        )

    def _ensure_moe_quant_config_init(self):
        if self.quant_method.moe_quant_config is None:
            # Note: the moe_quant_config can't be constructed until after
            # weight loading post processing.
            self.quant_method.moe_quant_config = (
                self.quant_method.get_fused_moe_quant_config(self)
            )

    @property
    def use_ep(self) -> bool:
        return self.moe_config.moe_parallel_config.use_ep

    @property
    def expert_map(self) -> torch.Tensor | None:
        return (
            self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask
        )

    def update_expert_map_info(self):
        # Update local attributes from ExpertMapManager
        self.local_num_experts = self.expert_map_manager.local_num_experts
        self.expert_placement_strategy = self.expert_map_manager.placement_strategy
        self.register_buffer("_expert_map", self.expert_map_manager.expert_map)
        self.register_buffer("expert_mask", self.expert_map_manager.expert_mask)

        # Get routing tables from ExpertMapManager
        routing_tables = self.expert_map_manager.routing_tables
        if routing_tables is not None:
            # Register routing tables as buffers for this layer
            global_to_physical, physical_to_global, local_global = routing_tables
            self.register_buffer("expert_global_to_physical", global_to_physical)
            self.register_buffer("expert_physical_to_global", physical_to_global)
            self.register_buffer("expert_local_to_global", local_global)

    def _expert_routing_tables(
        self,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None:
        # Return cached routing tables if already registered as buffers
        if hasattr(self, "expert_global_to_physical"):
            return cast(
                tuple[torch.Tensor, torch.Tensor, torch.Tensor],
                (
                    self.expert_global_to_physical,
                    self.expert_physical_to_global,
                    self.expert_local_to_global,
                ),
            )
        return None

    def update_expert_map(self):
        # Update ExpertMapManager with new EP configuration
        # The moe_parallel_config (including ep_size and ep_rank)
        # should already be updated.
        # Note: ExpertMapManager.update() recalculates expert maps and
        # reinitializes routing tables internally.
        self.expert_map_manager.update(
            self.moe_config.moe_parallel_config,
            global_num_experts=self.global_num_experts,
        )

        # Update local attributes from ExpertMapManager
        self.update_expert_map_info()

    def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
        """Map global expert ID to local expert ID."""
        return self.expert_map_manager.map_global_to_local(expert_id)

    #
    # Weight Loading Methods
    #

    def _load_per_tensor_weight_scale(
        self,
        shard_id: str,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        expert_id: int,
    ):
        param_data = param.data
        # for per tensor weight quantization
        if shard_id in ("w1", "w3"):
            # We have to keep the weight scales of w1 and w3 because
            # we need to re-quantize w1/w3 weights after weight loading.
            idx = 0 if shard_id == "w1" else 1
            param_data[expert_id][idx] = loaded_weight
        # If we are in the row parallel case (down_proj)
        elif shard_id == "w2":
            param_data[expert_id] = loaded_weight

    def _load_combined_w13_weight_scale(
        self,
        shard_dim: int,
        loaded_weight: torch.Tensor,
        param: torch.Tensor,
        tp_rank: int,
    ):
        """
        Load w13 weight scales assuming that w1 weight scales and w3 weight
        scales are stored in the same loaded_weight tensor.
        """
        shard_size = param.shape[shard_dim]
        loaded_weight = loaded_weight.narrow(
            shard_dim, shard_size * tp_rank, shard_size
        )
        param.copy_(loaded_weight)

    def _load_model_weight_or_group_weight_scale(
        self,
        shard_dim: int,
        expert_data: torch.Tensor,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        load_full_w2: bool = False,
    ):
        """
        Load grouped weight scales for group quantization or model weights

        Args:
            shard_dim: dimension to shard
            expert_data: parameter for a particular expert
            shard_id: either w1, w2, or w3
            loaded_weight: checkpoint weight to load into the param
            tp_rank: tensor parallel rank
            load_full_w2: whether or not the w2 loaded should be sharded.
        """
        if shard_id == "w2":
            # In the case where we have actorder/g_idx, we do not partition the
            # w2 scales, as indicated by `load_full` argument, for all tp cases
            self._load_w2(
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
                load_full=load_full_w2,
            )
        elif shard_id in ("w1", "w3"):
            self._load_w13(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
            )

    def _load_per_channel_weight_scale(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
    ):
        # for per channel weight quantization
        if shard_id == "w2":
            hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
            expert_data = self._narrow_expert_data_for_padding(
                expert_data,
                loaded_weight,
                hidden_dim=hidden_dim,
                shard_dim=shard_dim,
            )
            expert_data.copy_(loaded_weight)
        elif shard_id in ("w1", "w3"):
            self._load_w13(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
            )

    @staticmethod
    def _get_hidden_dim(shard_dim: int, ndim: int) -> int:
        """Compute the hidden dimension index from the shard (intermediate)
        dimension and tensor rank.

        For 2D weight tensors the two data dims are (0, 1). For 3D tensors
        with an expert dimension at dim 0, they are (1, 2). ``shard_dim``
        occupies one of these; the hidden dimension is the other.
        For 1D tensors (e.g. per-channel scales) returns 0.
        """
        if ndim < 2:
            return 0
        dim_a = ndim - 2
        dim_b = ndim - 1
        if shard_dim == dim_a:
            return dim_b
        if shard_dim == dim_b:
            return dim_a
        raise ValueError(
            f"shard_dim={shard_dim} is not a valid data dimension "
            f"for a {ndim}D tensor (expected {dim_a} or {dim_b})"
        )

    @staticmethod
    def _narrow_expert_data_for_padding(
        expert_data: torch.Tensor,
        loaded_weight: torch.Tensor,
        hidden_dim: int,
        shard_dim: int | None = None,
    ) -> torch.Tensor:
        """Narrow expert_data to match loaded_weight for padded dimensions.

        When backends (e.g., DeepEP) round up hidden_size, weight parameters
        are larger than checkpoint weights. Narrow the padded hidden dimension
        before copying. Similarly, when padding occurs on the shard
        (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension
        as well.

        Args:
            expert_data: The (possibly padded) parameter tensor to narrow.
            loaded_weight: The checkpoint weight tensor with original size.
            hidden_dim: The dimension index corresponding to hidden_size.
                Must be non-negative.
            shard_dim: The dimension index corresponding to the shard
                (intermediate) dimension. Defaults to `None`.
        """
        dims = (hidden_dim,) if shard_dim is None else (hidden_dim, shard_dim)
        if loaded_weight.ndim > 0:
            for dim in dims:
                if (
                    0 <= dim < expert_data.ndim
                    and dim < loaded_weight.ndim
                    and expert_data.shape[dim] > loaded_weight.shape[dim]
                ):
                    expert_data = expert_data.narrow(dim, 0, loaded_weight.shape[dim])
        return expert_data

    def _load_w13(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        shard_id: str,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        load_full: bool = False,
    ):
        # Index the loaded weight for tp sharding.
        # gate_up_proj: "MergedColumnParallel", so tp sharding on output_dim
        if self.moe_config.is_act_and_mul:
            shard_size = expert_data.shape[shard_dim] // 2
        else:
            shard_size = expert_data.shape[shard_dim]
        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
        # and we're not loading the full weight
        if not load_full and loaded_weight.ndim > 0:
            # When the parameter has been padded (e.g. MXFP4 rounding up
            # intermediate_size_per_partition), shard_size is the padded
            # size.  Compute the offset into the checkpoint weight using
            # the *unpadded* per-rank size so that every TP rank lands at
            # the correct slice.
            tp_size = self.moe_config.moe_parallel_config.tp_size
            loaded_per_rank = loaded_weight.shape[shard_dim] // tp_size
            start_offset = loaded_per_rank * tp_rank
            available = loaded_weight.shape[shard_dim] - start_offset
            if available <= 0:
                # If there is no available weight to load for this TP rank
                # (can happen on last TP rank with padding), we can skip
                # loading and return early
                return
            narrow_size = min(loaded_per_rank, available)
            loaded_weight = loaded_weight.narrow(shard_dim, start_offset, narrow_size)
        # Narrow parameter and load.
        # w1, gate_proj: Load into first logical weight of w13.
        if shard_id == "w1":
            expert_data = expert_data.narrow(shard_dim, 0, shard_size)
        # w3, up_proj: Load into second logical weight of w13.
        else:
            assert shard_id == "w3"
            expert_data = expert_data.narrow(shard_dim, shard_size, shard_size)
        hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
        expert_data = self._narrow_expert_data_for_padding(
            expert_data,
            loaded_weight,
            hidden_dim=hidden_dim,
            shard_dim=shard_dim,
        )
        expert_data.copy_(loaded_weight)

    def _load_w2(
        self,
        expert_data: torch.Tensor,
        shard_dim: int,
        loaded_weight: torch.Tensor,
        tp_rank: int,
        load_full: bool = False,
    ):
        # Index the loaded weight for tp sharding.
        # down_proj: "RowParallel" so tp sharding on input_dim
        # Only narrow if the loaded_weight is not a scalar (0-dim tensor)
        # and we're not loading the full weight
        if not load_full and loaded_weight.ndim > 0:
            # Same padding fix as _load_w13: use unpadded per-rank size.
            tp_size = self.moe_config.moe_parallel_config.tp_size
            loaded_per_rank = loaded_weight.shape[shard_dim] // tp_size
            start_offset = loaded_per_rank * tp_rank
            available = loaded_weight.shape[shard_dim] - start_offset
            if available <= 0:
                # If there is no available weight to load for this TP rank
                # (can happen on last TP rank with padding), we can skip
                # loading and return early
                return
            narrow_size = min(loaded_per_rank, available)
            loaded_weight = loaded_weight.narrow(shard_dim, start_offset, narrow_size)
        # w2, down_proj: Load into only logical weight of w2.
        hidden_dim = self._get_hidden_dim(shard_dim, expert_data.ndim)
        expert_data = self._narrow_expert_data_for_padding(
            expert_data,
            loaded_weight,
            hidden_dim=hidden_dim,
            shard_dim=shard_dim,
        )
        expert_data.copy_(loaded_weight)

    def _load_single_value(
        self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, expert_id: int
    ):
        param_data = param.data

        # Input scales can be loaded directly and should be equal.
        param_data[expert_id] = loaded_weight

    def _load_g_idx(
        self,
        shard_id: str,
        expert_data: torch.Tensor,
        shard_dim: int,
        loaded_weight: torch.Tensor,
        tp_rank: int,
    ):
        if shard_id == "w2":
            self._load_w2(
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=tp_rank,
            )
        else:
            assert shard_id in ("w1", "w3")
            expert_data.copy_(loaded_weight)

    @overload
    def weight_loader(
        self,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        shard_id: str,
        expert_id: int,
        return_success: Literal[False],
    ) -> None: ...

    @overload
    def weight_loader(
        self,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        shard_id: str,
        expert_id: int,
        return_success: Literal[True],
    ) -> bool: ...

    def weight_loader(
        self,
        param: torch.nn.Parameter,
        loaded_weight: torch.Tensor,
        weight_name: str,
        shard_id: str,
        expert_id: int,
        return_success: bool = False,
    ) -> bool | None:
        quant_config_name = self.quant_config and self.quant_config.get_name()
        if quant_config_name == "gpt_oss_mxfp4":
            # (FIXME) for gpt-oss all experts are combined
            if "bias" in weight_name:
                dim1 = loaded_weight.shape[1]
                param.data[:, :dim1].copy_(loaded_weight)
            else:
                dim1 = loaded_weight.shape[1]
                dim2 = loaded_weight.shape[2]
                param.data[:, :dim1, :dim2].copy_(loaded_weight)
            return True if return_success else None

        quant_method_name = self.quant_method.__class__.__name__
        global_expert_id = expert_id
        expert_id = self._map_global_expert_id_to_local_expert_id(global_expert_id)

        use_global_sf = (
            getattr(self.quant_method, "use_global_sf", False)
            and "input_scale" in weight_name
        )

        if expert_id == -1 and not use_global_sf:
            # Failed to load this param since it's not local to this rank
            return False if return_success else None
        # Hereafter, `expert_id` is local physical id

        # is_transposed: if the dim to shard the weight
        # should be flipped. Required by GPTQ, compressed-tensors
        # should be whatever dimension intermediate_size_per_partition is
        is_transposed = getattr(param, "is_transposed", False)

        # compressed-tensors checkpoints with packed weights are stored flipped
        # TODO (mgoin): check self.quant_method.quant_config.quant_format
        # against known CompressionFormat enum values that have this quality
        if quant_method_name in (
            "CompressedTensorsWNA16MarlinMoEMethod",
            "CompressedTensorsWNA16MoEMethod",
            "CompressedTensorsWNA16RDNA3MoEMethod",
        ):
            if is_transposed:
                loaded_weight = loaded_weight.t().contiguous()
            else:
                loaded_weight = loaded_weight

        if shard_id not in ("w1", "w2", "w3"):
            raise ValueError(f"shard_id must be ['w1','w2','w3'] but got {shard_id}.")

        # Fetch the dim to shard the parameter/loaded weight
        # based on the shard id. This will be whatever
        # dimension intermediate_size_per_partition is used.
        SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0}

        is_gguf_weight = getattr(param, "is_gguf_weight", False)
        is_gguf_weight_type = getattr(param, "is_gguf_weight_type", False)
        if is_gguf_weight_type:
            param.weight_type = loaded_weight.item()
            param.data.copy_(loaded_weight)
            return True if return_success else None

        # Case for BitsAndBytes
        use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
        if use_bitsandbytes_4bit:
            shard_dim = 0

            expert_data = param.data[expert_id]
            if shard_id == "w2":
                # BnB params are stored as flat packed tensors (e.g.
                # (packed_size, 1)), not in the logical weight layout.
                # Narrowing packed data for hidden-dim padding is not
                # meaningful, so require an exact shape match.
                if expert_data.shape != loaded_weight.shape:
                    raise ValueError(
                        "BitsAndBytes quantization with padded hidden_size "
                        "(e.g., from DeepEP) is not supported. "
                        f"Parameter shape {tuple(expert_data.shape)} != "
                        f"checkpoint shape {tuple(loaded_weight.shape)}"
                    )
                expert_data.copy_(loaded_weight)
            elif shard_id in ("w1", "w3"):
                # BnB stores weights as flat packed tensors.  _load_w13 is
                # still used to split the w1/w3 portions along shard_dim.
                # _narrow_expert_data_for_padding will be a no-op since
                # packed sizes should already match; if DeepEP padding
                # causes a mismatch the copy_() will fail with a clear
                # shape error.
                full_load = True
                self._load_w13(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.moe_config.tp_rank,
                    load_full=full_load,
                )
            return True if return_success else None

        shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
        if is_transposed:
            shard_dim = int(not shard_dim)

        full_load = len(loaded_weight.shape) == 3
        if full_load:
            shard_dim += 1

        # Materialize GGUF UninitializedParameter accounting merged weights
        if is_gguf_weight and isinstance(param, UninitializedParameter):
            # To materialize a tensor, we must have full shape including
            # number of experts, making this portion to require `full_load`.
            assert full_load
            final_shape = list(loaded_weight.shape)
            # w1 and w3 are merged per expert.
            if shard_id in {"w1", "w3"}:
                final_shape[1] *= 2
            final_shape[shard_dim] = final_shape[shard_dim] // self.moe_config.tp_size
            param.materialize(final_shape, dtype=loaded_weight.dtype)

        expert_data = param.data if full_load else param.data[expert_id]

        # Case input scale: input_scale loading is only supported for fp8
        if "input_scale" in weight_name:
            # this is needed for compressed-tensors only
            loaded_weight = loaded_weight.to(param.data.device)

            # ModelOpt NVFP4 stores w13 input scales as two logical shards.
            # The generic assignment below would broadcast w1/w3 into the
            # whole expert row, so the second shard would overwrite the first.
            if (
                "ModelOpt" in quant_method_name
                and param.data.ndim == 2
                and shard_id in ("w1", "w3")
            ):
                scale_expert_id = global_expert_id if use_global_sf else expert_id
                scale_shard_id = 0 if shard_id == "w1" else 1
                param.data[scale_expert_id][scale_shard_id] = loaded_weight.reshape(())
                return True if return_success else None

            if (
                "compressed" in quant_method_name.lower()
                and param.data[expert_id] != 1
                and (param.data[expert_id] - loaded_weight).abs() > 1e-5
            ):
                raise ValueError(
                    "input_scales of w1 and w3 of a layer "
                    f"must be equal. But got {param.data[expert_id]} "
                    f"vs. {loaded_weight}"
                )

            self._load_single_value(
                param=param,
                loaded_weight=loaded_weight,
                expert_id=global_expert_id if use_global_sf else expert_id,
            )
            return True if return_success else None

        # Case g_idx
        if "g_idx" in weight_name:
            self._load_g_idx(
                shard_dim=0,
                shard_id=shard_id,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.moe_config.tp_rank,
            )
            return True if return_success else None

        # TODO @dsikka: ModelOpt should follow the proper MoE loading pattern
        if "ModelOpt" in quant_method_name:
            # Determine per-tensor weight scale patterns based on variant
            # Use the dedicated method instead of brittle string matching
            uses_weight_scale_2 = self.quant_method.uses_weight_scale_2_pattern()
            quant_method = getattr(param, "quant_method", None)

            # Call _load_per_tensor_weight_scale() to load per-tensor (scalar)
            # weights scales.
            # Input scales are always per-tensor.
            # Weight scales: FP4 uses "weight_scale_2" and FP8 uses
            # "weight_scale" for per-tensor scales.
            # NOTE: ModelOpt MXFP8 MoE uses block scales in weight_scale
            # tensors (quant_method=BLOCK), so those must not be treated
            # as per-tensor scalars here.
            is_block_weight_scale = (
                "weight_scale" in weight_name
                and quant_method == FusedMoeWeightScaleSupported.BLOCK.value
            )
            is_per_tensor = (
                "weight_scale_2" in weight_name
                if uses_weight_scale_2
                else "weight_scale" in weight_name
            ) or "input_scale" in weight_name
            is_per_tensor = is_per_tensor and not is_block_weight_scale
            if is_per_tensor:
                self._load_per_tensor_weight_scale(
                    shard_id=shard_id,
                    param=param,
                    loaded_weight=loaded_weight,
                    expert_id=expert_id,
                )
                return True if return_success else None

            # If the weight is w13_weight_scale and w13_weight_scales are
            # combined into single loaded_weight, call
            # _load_combined_w13_weight_scale() to load it.
            # This is checked by comparing the hidden_out dims of the
            # loaded_weight and the param.
            if "w13_weight_scale" in weight_name:
                loaded_weight_hidden_out = loaded_weight.shape[-2]
                param_hidden_out = param.data.shape[-2] * self.moe_config.tp_size
                if loaded_weight_hidden_out == param_hidden_out:
                    self._load_combined_w13_weight_scale(
                        shard_dim=shard_dim,
                        loaded_weight=loaded_weight,
                        param=expert_data,
                        tp_rank=self.moe_config.tp_rank,
                    )
                    return True if return_success else None

            # For other weights, call _load_model_weight_or_group_weight_scale()
            # to load it.
            if "weight" in weight_name:
                self._load_model_weight_or_group_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.moe_config.tp_rank,
                )
            return True if return_success else None

        # Case weight scales, zero_points and offset, weight/input global scales
        if "scale" in weight_name or "zero" in weight_name or "offset" in weight_name:
            # load the weight scales and zp based on the quantization scheme
            # supported weight scales/zp can be found in
            # FusedMoeWeightScaleSupported
            # TODO @dsikka: once hardened, refactor to use vLLM Parameters
            # specific to each case
            quant_method = getattr(param, "quant_method", None)
            if quant_method == FusedMoeWeightScaleSupported.CHANNEL.value:
                self._load_per_channel_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.moe_config.tp_rank,
                )
            elif quant_method in [
                FusedMoeWeightScaleSupported.GROUP.value,
                FusedMoeWeightScaleSupported.BLOCK.value,
            ]:
                self._load_model_weight_or_group_weight_scale(
                    shard_id=shard_id,
                    shard_dim=shard_dim,
                    loaded_weight=loaded_weight,
                    expert_data=expert_data,
                    tp_rank=self.moe_config.tp_rank,
                    load_full_w2=getattr(param, "load_full_w2", False),
                )
            elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value:
                self._load_per_tensor_weight_scale(
                    shard_id=shard_id,
                    param=param,
                    loaded_weight=loaded_weight,
                    expert_id=expert_id,
                )
            else:
                WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
                raise ValueError(
                    f"quant method must be one of {WEIGHT_SCALE_SUPPORTED}"
                )
            return True if return_success else None

        # Case weight_shape
        if "weight_shape" in weight_name:
            # only required by compressed-tensors
            self._load_single_value(
                param=param, loaded_weight=loaded_weight, expert_id=expert_id
            )
            return True if return_success else None

        # Case model weights
        if "weight" in weight_name:
            self._load_model_weight_or_group_weight_scale(
                shard_id=shard_id,
                shard_dim=shard_dim,
                loaded_weight=loaded_weight,
                expert_data=expert_data,
                tp_rank=self.moe_config.tp_rank,
            )
            return True if return_success else None

        return False if return_success else None

    def load_weights(
        self, weights: Iterable[tuple[str, torch.Tensor]]
    ) -> Iterable[str]:
        if (expert_mapping := self.expert_mapping) is None:
            raise ValueError(
                "`self.expert_mapping` must be provided to "
                "load weights using `self.load_weights`."
            )
        for expert_name, loaded_weight in weights:
            qual_name = f"{self.layer_name}.{expert_name}"
            for param_name, weight_name, expert_id, shard_id in expert_mapping:
                if weight_name not in qual_name:
                    continue
                weight_name = qual_name.replace(weight_name, param_name)
                param_name = weight_name.removeprefix(f"{self.layer_name}.")
                param = getattr(self, param_name)
                # Fused expert weights can be identified by their 3D tensors
                if loaded_weight.dim() == 3:
                    # Repurpose expert_id as shard_idx for deconcatenating w1 and w3
                    if shard_id in {"w1", "w3"}:
                        shard_idx = expert_id
                        experts_shard = loaded_weight.chunk(2, dim=1)[shard_idx]
                    else:
                        experts_shard = loaded_weight
                    start = 0
                else:
                    # loaded_weight is a single expert weight, so we add a dummy expert
                    # dimension to unify the loading logic with the fused case
                    experts_shard = loaded_weight.unsqueeze(0)
                    start = expert_id

                # Unified loading logic for fused and non-fused experts
                loaded_experts = experts_shard.unbind()
                for expert_id, loaded_expert in enumerate(loaded_experts, start=start):
                    success = self.weight_loader(
                        param=param,
                        loaded_weight=loaded_expert,
                        weight_name=weight_name,
                        shard_id=shard_id,
                        expert_id=expert_id,
                        return_success=True,
                    )
                    if success:
                        logger.debug(
                            "Loaded expert %d of shard %s into %s for layer %s",
                            expert_id,
                            shard_id,
                            param_name,
                            self.layer_name,
                        )
                        yield param_name

    @staticmethod
    def make_expert_params_mapping(
        model: torch.nn.Module,
        ckpt_gate_proj_name: str,
        ckpt_down_proj_name: str,
        ckpt_up_proj_name: str,
        num_experts: int,
        num_redundant_experts: int = 0,
        routed_experts_prefix: str = "routed_experts",
    ) -> list[tuple[str, str, int, str]]:
        """
        Create expert parameter mapping for weight loading with redundant experts.

        This mapping handles the physical-to-logical expert ID conversion needed
        when loading weights with EPLB redundant experts.

        Args:
            model: The model containing the MoE layer
            ckpt_gate_proj_name: Name of gate projection in checkpoint
            ckpt_down_proj_name: Name of down projection in checkpoint
            ckpt_up_proj_name: Name of up projection in checkpoint
            num_experts: Number of logical (non-redundant) experts
            num_redundant_experts: Number of redundant experts

        Returns:
            List of tuples (param_name, weight_name, expert_id, shard_id)
            where:
            - param_name: Parameter name in the layer
            - weight_name: Weight name in checkpoint
            - expert_id: Physical expert ID
            - shard_id: Shard identifier (w1, w2, w3)
        """
        num_physical_experts = num_experts + num_redundant_experts

        # In the returned mapping:
        # - `expert_id` is the physical expert id
        # - `weight_name` contains the weight name of the logical expert
        # So that we should map the expert id to logical in `weight_name`
        physical_to_logical_map = (
            EplbState.build_initial_global_physical_to_logical_map(
                num_experts, num_redundant_experts
            )
        )

        base_layer = (
            "base_layer."
            if any(".base_layer." in name for name, _ in model.named_parameters())
            else ""
        )

        if routed_experts_prefix != "":
            routed_experts_prefix = f"{routed_experts_prefix}."

        return [
            # (param_name, weight_name, expert_id, shard_id)
            (
                f"experts.{routed_experts_prefix}{base_layer}w13_"
                if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
                else f"experts.{routed_experts_prefix}{base_layer}w2_",
                f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}",
                expert_id,
                shard_id,
            )
            for expert_id in range(num_physical_experts)
            for shard_id, weight_name in [
                ("w1", ckpt_gate_proj_name),
                ("w2", ckpt_down_proj_name),
                ("w3", ckpt_up_proj_name),
            ]
        ]

    def get_expert_weights(self) -> Iterable[torch.Tensor]:
        def _maybe_make_contiguous(
            name: str, p: torch.nn.Parameter
        ) -> torch.nn.Parameter:
            """
            In some cases, the last 2 dimensions (the non-expert dimensions)
            of the weight scale tensor are transposed. This function
            transforms the tensor (view update) so the tensor is contiguous().
            Example: A non-contiguous scale tensor,
              `x` of shape (E, 32, 16) and stride (512, 1, 32) is transformed to
              `x_` of shape (E, 16, 32) and stride (512, 32, 1).
              Note that we specifically use torch.transpose() so `x_` refers
              to the same underlying memory. The tensors `x` and `x_`, pointing
              to the same underlying memory make this transformation safe in the
              context of EPLB. i.e. It is the same memory and just the view
              is different.
            Note: This function handles the "weight_scale" tensors specifically.
            This could however be generalized to handle similar tensors.
            """
            if p.ndim != 3:
                return p
            if p.is_contiguous():
                # Already contiguous. do nothing.
                return p
            # p is non-contiguous. We only handle the case where the last 2
            # dimensions of the scales tensor is transposed. We can handle
            # other cases when they become relevant.
            is_transposed_12 = p.stride(1) == 1 and p.stride(2) != 1
            if "weight_scale" not in name or not is_transposed_12:
                # do nothing.
                return p

            # Do not update the layer parameter as the layer's MoE operations would
            # expect the parameter's tensor to the same shape / stride. Instead,
            # make a new torch.nn.Parameter that is used just in the context of
            # EPLB.
            return torch.nn.Parameter(
                torch.transpose(p.data, 1, 2), requires_grad=False
            )

        weights = list(self.named_parameters())
        weights = [(name, _maybe_make_contiguous(name, p)) for name, p in weights]

        # `w13_input_scale` and `w2_input_scale` are global per-tensor
        # activation scales shared across all experts (e.g. NVFP4).
        # They are broadcast views (stride 0) from .expand() and are
        # not actual expert weights, so exclude them from EPLB.
        NON_EXPERT_WEIGHTS = {
            "e_score_correction_bias",
            "w13_input_scale",
            "w2_input_scale",
            "hash_indices_table",
        }

        # Parameters of non-expert submodules that live inside runner (RoutedExperts).
        # These must be excluded from EPLB weight rearrangement.
        NON_EXPERT_PREFIXES = ()

        assert all(
            weight.is_contiguous()
            for name, weight in weights
            if not name.startswith(NON_EXPERT_PREFIXES)
            and name not in NON_EXPERT_WEIGHTS
        )

        return [
            weight.view(self.local_num_experts, -1)
            for name, weight in weights
            if name not in NON_EXPERT_WEIGHTS
            and weight.shape != torch.Size([])
            and not name.startswith(NON_EXPERT_PREFIXES)
        ]

    #
    # Execution
    #

    def forward_modular(
        self,
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: "SharedExperts | None" = None,
        shared_experts_input: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Execute routed experts using the quantization method's apply function.

        This is called by the runner after router selection (for modular kernels)
        quant_method.apply() which accesses the weights on this RoutedExperts
        instance.

        Args:
            x: Input tensor after any transforms
            topk_weights: Routing weights from router (for modular kernels)
            topk_ids: Selected expert IDs from router (for modular kernels)
            shared_experts: The shared experts (if any)
            shared_experts_input: Input for shared experts (if any)

        Returns:
            Output tensor from routed experts
        """
        assert not self.quant_method.is_monolithic

        # Modular kernels use pre-computed routing
        return self.quant_method.apply(
            layer=self,
            x=x,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            shared_experts=shared_experts,
            shared_experts_input=shared_experts_input,
        )

    def forward_monolithic(
        self,
        x: torch.Tensor,
        router_logits: torch.Tensor | None = None,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """
        Execute routed experts using the quantization method's apply function.

        This is called by the runner after router selection (for modular kernels)
        or with router logits (for monolithic kernels). It delegates to
        quant_method.apply() which accesses the weights on this RoutedExperts
        instance.

        Args:
            x: Input tensor after any transforms
            router_logits: Router logits (for monolithic kernels)
            input_ids: input ids for DeepSeek V4

        Returns:
            Output tensor from routed experts
        """
        assert self.quant_method.is_monolithic

        # Monolithic kernels handle routing internally
        return self.quant_method.apply_monolithic(
            layer=self,
            x=x,
            router_logits=router_logits,
            input_ids=input_ids,
        )

    def forward(
        self,
        *args,
        **kwargs,
    ) -> torch.Tensor:
        raise AssertionError("Call forward_modular or forward_monolithic instead.")

_get_hidden_dim(shard_dim, ndim) staticmethod

Compute the hidden dimension index from the shard (intermediate) dimension and tensor rank.

For 2D weight tensors the two data dims are (0, 1). For 3D tensors with an expert dimension at dim 0, they are (1, 2). shard_dim occupies one of these; the hidden dimension is the other. For 1D tensors (e.g. per-channel scales) returns 0.

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
@staticmethod
def _get_hidden_dim(shard_dim: int, ndim: int) -> int:
    """Compute the hidden dimension index from the shard (intermediate)
    dimension and tensor rank.

    For 2D weight tensors the two data dims are (0, 1). For 3D tensors
    with an expert dimension at dim 0, they are (1, 2). ``shard_dim``
    occupies one of these; the hidden dimension is the other.
    For 1D tensors (e.g. per-channel scales) returns 0.
    """
    if ndim < 2:
        return 0
    dim_a = ndim - 2
    dim_b = ndim - 1
    if shard_dim == dim_a:
        return dim_b
    if shard_dim == dim_b:
        return dim_a
    raise ValueError(
        f"shard_dim={shard_dim} is not a valid data dimension "
        f"for a {ndim}D tensor (expected {dim_a} or {dim_b})"
    )

_get_quant_method(prefix, quant_config, moe_config)

Helper method to ensure quant_method is never None and of the proper type.

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
def _get_quant_method(
    self,
    prefix: str,
    quant_config: QuantizationConfig | None,
    moe_config: FusedMoEConfig,
) -> FusedMoEMethodBase:
    """
    Helper method to ensure quant_method is never None and
    of the proper type.
    """
    quant_method = None
    if quant_config is not None:
        quant_method = quant_config.get_quant_method(self, prefix)
    if quant_method is None:
        quant_method = UnquantizedFusedMoEMethod(moe_config)
    assert isinstance(quant_method, FusedMoEMethodBase)
    return quant_method

_load_combined_w13_weight_scale(shard_dim, loaded_weight, param, tp_rank)

Load w13 weight scales assuming that w1 weight scales and w3 weight scales are stored in the same loaded_weight tensor.

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
def _load_combined_w13_weight_scale(
    self,
    shard_dim: int,
    loaded_weight: torch.Tensor,
    param: torch.Tensor,
    tp_rank: int,
):
    """
    Load w13 weight scales assuming that w1 weight scales and w3 weight
    scales are stored in the same loaded_weight tensor.
    """
    shard_size = param.shape[shard_dim]
    loaded_weight = loaded_weight.narrow(
        shard_dim, shard_size * tp_rank, shard_size
    )
    param.copy_(loaded_weight)

_load_model_weight_or_group_weight_scale(shard_dim, expert_data, shard_id, loaded_weight, tp_rank, load_full_w2=False)

Load grouped weight scales for group quantization or model weights

Parameters:

  • shard_dim

    (int) –

    dimension to shard

  • expert_data

    (Tensor) –

    parameter for a particular expert

  • shard_id

    (str) –

    either w1, w2, or w3

  • loaded_weight

    (Tensor) –

    checkpoint weight to load into the param

  • tp_rank

    (int) –

    tensor parallel rank

  • load_full_w2

    (bool, default: False ) –

    whether or not the w2 loaded should be sharded.

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
def _load_model_weight_or_group_weight_scale(
    self,
    shard_dim: int,
    expert_data: torch.Tensor,
    shard_id: str,
    loaded_weight: torch.Tensor,
    tp_rank: int,
    load_full_w2: bool = False,
):
    """
    Load grouped weight scales for group quantization or model weights

    Args:
        shard_dim: dimension to shard
        expert_data: parameter for a particular expert
        shard_id: either w1, w2, or w3
        loaded_weight: checkpoint weight to load into the param
        tp_rank: tensor parallel rank
        load_full_w2: whether or not the w2 loaded should be sharded.
    """
    if shard_id == "w2":
        # In the case where we have actorder/g_idx, we do not partition the
        # w2 scales, as indicated by `load_full` argument, for all tp cases
        self._load_w2(
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
            load_full=load_full_w2,
        )
    elif shard_id in ("w1", "w3"):
        self._load_w13(
            shard_id=shard_id,
            shard_dim=shard_dim,
            loaded_weight=loaded_weight,
            expert_data=expert_data,
            tp_rank=tp_rank,
        )

_map_global_expert_id_to_local_expert_id(expert_id)

Map global expert ID to local expert ID.

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
    """Map global expert ID to local expert ID."""
    return self.expert_map_manager.map_global_to_local(expert_id)

_narrow_expert_data_for_padding(expert_data, loaded_weight, hidden_dim, shard_dim=None) staticmethod

Narrow expert_data to match loaded_weight for padded dimensions.

When backends (e.g., DeepEP) round up hidden_size, weight parameters are larger than checkpoint weights. Narrow the padded hidden dimension before copying. Similarly, when padding occurs on the shard (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension as well.

Parameters:

  • expert_data

    (Tensor) –

    The (possibly padded) parameter tensor to narrow.

  • loaded_weight

    (Tensor) –

    The checkpoint weight tensor with original size.

  • hidden_dim

    (int) –

    The dimension index corresponding to hidden_size. Must be non-negative.

  • shard_dim

    (int | None, default: None ) –

    The dimension index corresponding to the shard (intermediate) dimension. Defaults to None.

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
@staticmethod
def _narrow_expert_data_for_padding(
    expert_data: torch.Tensor,
    loaded_weight: torch.Tensor,
    hidden_dim: int,
    shard_dim: int | None = None,
) -> torch.Tensor:
    """Narrow expert_data to match loaded_weight for padded dimensions.

    When backends (e.g., DeepEP) round up hidden_size, weight parameters
    are larger than checkpoint weights. Narrow the padded hidden dimension
    before copying. Similarly, when padding occurs on the shard
    (intermediate) dimension (e.g. for MXFP4 GEMM), narrow that dimension
    as well.

    Args:
        expert_data: The (possibly padded) parameter tensor to narrow.
        loaded_weight: The checkpoint weight tensor with original size.
        hidden_dim: The dimension index corresponding to hidden_size.
            Must be non-negative.
        shard_dim: The dimension index corresponding to the shard
            (intermediate) dimension. Defaults to `None`.
    """
    dims = (hidden_dim,) if shard_dim is None else (hidden_dim, shard_dim)
    if loaded_weight.ndim > 0:
        for dim in dims:
            if (
                0 <= dim < expert_data.ndim
                and dim < loaded_weight.ndim
                and expert_data.shape[dim] > loaded_weight.shape[dim]
            ):
                expert_data = expert_data.narrow(dim, 0, loaded_weight.shape[dim])
    return expert_data

forward_modular(x, topk_weights, topk_ids, shared_experts=None, shared_experts_input=None)

Execute routed experts using the quantization method's apply function.

This is called by the runner after router selection (for modular kernels) quant_method.apply() which accesses the weights on this RoutedExperts instance.

Parameters:

  • x

    (Tensor) –

    Input tensor after any transforms

  • topk_weights

    (Tensor) –

    Routing weights from router (for modular kernels)

  • topk_ids

    (Tensor) –

    Selected expert IDs from router (for modular kernels)

  • shared_experts

    (SharedExperts | None, default: None ) –

    The shared experts (if any)

  • shared_experts_input

    (Tensor | None, default: None ) –

    Input for shared experts (if any)

Returns:

  • Tensor

    Output tensor from routed experts

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
def forward_modular(
    self,
    x: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    shared_experts: "SharedExperts | None" = None,
    shared_experts_input: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Execute routed experts using the quantization method's apply function.

    This is called by the runner after router selection (for modular kernels)
    quant_method.apply() which accesses the weights on this RoutedExperts
    instance.

    Args:
        x: Input tensor after any transforms
        topk_weights: Routing weights from router (for modular kernels)
        topk_ids: Selected expert IDs from router (for modular kernels)
        shared_experts: The shared experts (if any)
        shared_experts_input: Input for shared experts (if any)

    Returns:
        Output tensor from routed experts
    """
    assert not self.quant_method.is_monolithic

    # Modular kernels use pre-computed routing
    return self.quant_method.apply(
        layer=self,
        x=x,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        shared_experts=shared_experts,
        shared_experts_input=shared_experts_input,
    )

forward_monolithic(x, router_logits=None, input_ids=None)

Execute routed experts using the quantization method's apply function.

This is called by the runner after router selection (for modular kernels) or with router logits (for monolithic kernels). It delegates to quant_method.apply() which accesses the weights on this RoutedExperts instance.

Parameters:

  • x

    (Tensor) –

    Input tensor after any transforms

  • router_logits

    (Tensor | None, default: None ) –

    Router logits (for monolithic kernels)

  • input_ids

    (Tensor | None, default: None ) –

    input ids for DeepSeek V4

Returns:

  • Tensor

    Output tensor from routed experts

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
def forward_monolithic(
    self,
    x: torch.Tensor,
    router_logits: torch.Tensor | None = None,
    input_ids: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Execute routed experts using the quantization method's apply function.

    This is called by the runner after router selection (for modular kernels)
    or with router logits (for monolithic kernels). It delegates to
    quant_method.apply() which accesses the weights on this RoutedExperts
    instance.

    Args:
        x: Input tensor after any transforms
        router_logits: Router logits (for monolithic kernels)
        input_ids: input ids for DeepSeek V4

    Returns:
        Output tensor from routed experts
    """
    assert self.quant_method.is_monolithic

    # Monolithic kernels handle routing internally
    return self.quant_method.apply_monolithic(
        layer=self,
        x=x,
        router_logits=router_logits,
        input_ids=input_ids,
    )

make_expert_params_mapping(model, ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name, num_experts, num_redundant_experts=0, routed_experts_prefix='routed_experts') staticmethod

Create expert parameter mapping for weight loading with redundant experts.

This mapping handles the physical-to-logical expert ID conversion needed when loading weights with EPLB redundant experts.

Parameters:

  • model

    (Module) –

    The model containing the MoE layer

  • ckpt_gate_proj_name

    (str) –

    Name of gate projection in checkpoint

  • ckpt_down_proj_name

    (str) –

    Name of down projection in checkpoint

  • ckpt_up_proj_name

    (str) –

    Name of up projection in checkpoint

  • num_experts

    (int) –

    Number of logical (non-redundant) experts

  • num_redundant_experts

    (int, default: 0 ) –

    Number of redundant experts

Returns:

Source code in vllm/model_executor/layers/fused_moe/routed_experts.py
@staticmethod
def make_expert_params_mapping(
    model: torch.nn.Module,
    ckpt_gate_proj_name: str,
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
    num_redundant_experts: int = 0,
    routed_experts_prefix: str = "routed_experts",
) -> list[tuple[str, str, int, str]]:
    """
    Create expert parameter mapping for weight loading with redundant experts.

    This mapping handles the physical-to-logical expert ID conversion needed
    when loading weights with EPLB redundant experts.

    Args:
        model: The model containing the MoE layer
        ckpt_gate_proj_name: Name of gate projection in checkpoint
        ckpt_down_proj_name: Name of down projection in checkpoint
        ckpt_up_proj_name: Name of up projection in checkpoint
        num_experts: Number of logical (non-redundant) experts
        num_redundant_experts: Number of redundant experts

    Returns:
        List of tuples (param_name, weight_name, expert_id, shard_id)
        where:
        - param_name: Parameter name in the layer
        - weight_name: Weight name in checkpoint
        - expert_id: Physical expert ID
        - shard_id: Shard identifier (w1, w2, w3)
    """
    num_physical_experts = num_experts + num_redundant_experts

    # In the returned mapping:
    # - `expert_id` is the physical expert id
    # - `weight_name` contains the weight name of the logical expert
    # So that we should map the expert id to logical in `weight_name`
    physical_to_logical_map = (
        EplbState.build_initial_global_physical_to_logical_map(
            num_experts, num_redundant_experts
        )
    )

    base_layer = (
        "base_layer."
        if any(".base_layer." in name for name, _ in model.named_parameters())
        else ""
    )

    if routed_experts_prefix != "":
        routed_experts_prefix = f"{routed_experts_prefix}."

    return [
        # (param_name, weight_name, expert_id, shard_id)
        (
            f"experts.{routed_experts_prefix}{base_layer}w13_"
            if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
            else f"experts.{routed_experts_prefix}{base_layer}w2_",
            f"experts.{physical_to_logical_map[expert_id]}.{weight_name}.{base_layer}",
            expert_id,
            shard_id,
        )
        for expert_id in range(num_physical_experts)
        for shard_id, weight_name in [
            ("w1", ckpt_gate_proj_name),
            ("w2", ckpt_down_proj_name),
            ("w3", ckpt_up_proj_name),
        ]
    ]

TritonExperts

Bases: LoRAExpertsMixin, FusedMoEExpertsModular

Triton-based fused MoE expert implementation.

Source code in vllm/model_executor/layers/fused_moe/experts/triton_moe.py
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
class TritonExperts(LoRAExpertsMixin, mk.FusedMoEExpertsModular):
    """Triton-based fused MoE expert implementation."""

    def __init__(
        self,
        moe_config: FusedMoEConfig,
        quant_config: FusedMoEQuantConfig,
    ):
        # Whether quantized MOE runs natively, or through
        # higher-precision + activation QDQ.
        self.quantization_emulation = False
        super().__init__(moe_config, quant_config)

    @staticmethod
    def activation_format() -> mk.FusedMoEActivationFormat:
        return mk.FusedMoEActivationFormat.Standard

    @staticmethod
    def _supports_current_device() -> bool:
        return current_platform.is_cuda_alike() or current_platform.is_xpu()

    @staticmethod
    def _supports_no_act_and_mul() -> bool:
        return True

    @staticmethod
    def _supports_quant_scheme(
        weight_key: QuantKey | None,
        activation_key: QuantKey | None,
    ) -> bool:
        # INT8 requires at least 7.5 (Turing).
        device_supports_int8 = (
            current_platform.is_cuda()
            and current_platform.has_device_capability((7, 5))
        )

        supported: list[tuple[QuantKey | None, QuantKey | None]] = [(None, None)]
        if device_supports_int8:
            supported.append((kInt8StaticChannelSym, kInt8DynamicTokenSym))
        if current_platform.supports_fp8():
            supported += [
                (kFp8Static128BlockSym, kFp8Dynamic128Sym),
                (kFp8StaticChannelSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8DynamicTokenSym),
                (kFp8StaticTensorSym, kFp8StaticTensorSym),
                (kFp8StaticTensorSym, kFp8DynamicTensorSym),
            ]
        return (weight_key, activation_key) in supported

    @staticmethod
    def _supports_activation(activation: MoEActivation) -> bool:
        return activation in [
            MoEActivation.SILU,
            MoEActivation.GELU,
            MoEActivation.GELU_TANH,
            MoEActivation.SWIGLUOAI,
            MoEActivation.SWIGLUSTEP,
            MoEActivation.SILU_NO_MUL,
            MoEActivation.GELU_NO_MUL,
            MoEActivation.GELU_TANH_NO_MUL,
            MoEActivation.RELU2_NO_MUL,
        ]

    @staticmethod
    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
        return not (
            moe_parallel_config.use_fi_nvl_two_sided_kernels
            or moe_parallel_config.use_fi_nvl_one_sided_kernels
        )

    @staticmethod
    def _supports_batch_invariance():
        return True

    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
        return TopKWeightAndReduceNoOP()

    def activation(
        self, activation: MoEActivation, output: torch.Tensor, input: torch.Tensor
    ) -> None:
        gemm1_clamp_limit = self.quant_config.gemm1_clamp_limit
        if activation == MoEActivation.SILU and gemm1_clamp_limit is not None:
            swiglu_limit_func(output, input, float(gemm1_clamp_limit))
            return

        super().activation(activation, output, input)

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        activation_out_dim = self.adjust_N_for_activation(N, activation)
        workspace1 = (M, topk, max(activation_out_dim, K))
        workspace2 = (M, topk, max(N, K))
        output = (M, K)
        return (workspace1, workspace2, output)

    def apply(
        self,
        output: torch.Tensor,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        activation: MoEActivation,
        global_num_experts: int,
        expert_map: torch.Tensor | None,
        a1q_scale: torch.Tensor | None,
        a2_scale: torch.Tensor | None,
        workspace13: torch.Tensor,
        workspace2: torch.Tensor,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        apply_router_weight_on_input: bool,
    ):
        # Check constraints.
        if self.quant_config.use_int4_w4a16:
            assert hidden_states.size(-1) // 2 == w1.size(2), "Hidden size mismatch"
        else:
            assert hidden_states.size(-1) == w1.size(2), (
                f"Hidden size mismatch {hidden_states.size(-1)} != {w1.size(2)}"
            )

        assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
        assert hidden_states.dim() == 2
        assert w1.stride(-1) == 1, "Stride of last dimension must be 1"
        assert w2.stride(-1) == 1, "Stride of last dimension must be 1"
        assert hidden_states.dtype in [
            torch.float32,
            torch.float16,
            torch.bfloat16,
            torch.float8_e4m3fn,
            torch.float8_e4m3fnuz,
        ]

        E, num_tokens, N, K, top_k_num = self.moe_problem_size(
            hidden_states, w1, w2, topk_ids
        )

        if global_num_experts == -1:
            global_num_experts = E

        config = try_get_optimal_moe_config(
            w1.size(),
            w2.size(),
            top_k_num,
            self.quant_config.config_name(hidden_states.dtype),
            num_tokens,
            block_shape=self.block_shape,
        )

        if hidden_states.dtype == torch.bfloat16:
            compute_type = tl.bfloat16
        elif hidden_states.dtype == torch.float16:
            compute_type = tl.float16
        elif hidden_states.dtype == torch.float32:
            compute_type = tl.float32
        elif (
            hidden_states.dtype == torch.float8_e4m3fn
            or hidden_states.dtype == torch.float8_e4m3fnuz
        ):
            compute_type = tl.bfloat16
        else:
            raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")

        # Note that the output tensor might be in workspace1
        intermediate_cache1 = _resize_cache(workspace2, (num_tokens, top_k_num, N))
        cache2_dim = self.adjust_N_for_activation(N, activation)
        intermediate_cache2 = _resize_cache(
            workspace13, (num_tokens * top_k_num, cache2_dim)
        )
        intermediate_cache3 = _resize_cache(workspace2, (num_tokens, top_k_num, K))

        sorted_token_ids, expert_ids, num_tokens_post_padded = (
            _prepare_expert_assignment(
                topk_ids,
                config,
                num_tokens,
                top_k_num,
                global_num_experts,
                expert_map,
                use_int8_w8a16=self.quant_config.use_int8_w8a16,
                use_int4_w4a16=self.quant_config.use_int4_w4a16,
                block_shape=self.block_shape,
            )
        )

        # LoRA w13: applied to intermediate_cache1 before activation. When
        # the LoRA layer requested a dual-stream schedule, we run base w13
        # GEMM on the default stream and the LoRA fast-path on aux_stream;
        # the LoRA writes its delta into a fresh zero buffer (add_inputs=
        # False) and we sum it into intermediate_cache1 after both finish.

        sorted_token_ids_lora = None
        expert_ids_lora = None
        num_tokens_post_padded_lora = None
        token_lora_mapping = None
        lora_context = self._lora_context

        def _base_w13_fn():
            invoke_fused_moe_triton_kernel(
                hidden_states,
                w1,
                intermediate_cache1,
                a1q_scale if a1q_scale is not None else self.a1_scale,
                self.w1_scale,
                None,  # topk_weights
                sorted_token_ids,
                expert_ids,
                num_tokens_post_padded,
                False,  # mul_routed_weights
                top_k_num,
                config,
                compute_type=compute_type,
                use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
                use_int8_w8a8=self.quant_config.use_int8_w8a8,
                use_int8_w8a16=self.quant_config.use_int8_w8a16,
                use_int4_w4a16=self.quant_config.use_int4_w4a16,
                per_channel_quant=self.per_act_token_quant,
                block_shape=self.block_shape,
                B_bias=self.w1_bias,
            )

        if lora_context is not None and lora_context.aux_stream is not None:
            # add_inputs=False: kernel overwrites lora_delta_w13. zeros (not
            # empty) so untouched rows -- e.g. blocks where every program
            # early-exits because lora_id<0 -- stay at zero and the trailing
            # add_() is a no-op there.
            lora_delta_w13 = torch.zeros_like(intermediate_cache1)

            def _lora_w13_fn():
                return self.apply_w13_lora(
                    lora_context,
                    y=lora_delta_w13,
                    x=hidden_states,
                    topk_ids=topk_ids,
                    topk_weights=topk_weights,
                    expert_map=expert_map,
                    w1=w1,
                    w2=w2,
                    num_tokens=num_tokens,
                    top_k_num=top_k_num,
                    add_inputs=False,
                )

            assert lora_context.events is not None
            _, lora_meta = maybe_execute_in_parallel(
                _base_w13_fn,
                _lora_w13_fn,
                lora_context.events[0],
                lora_context.events[1],
                lora_context.aux_stream,
            )
            (
                sorted_token_ids_lora,
                expert_ids_lora,
                num_tokens_post_padded_lora,
                token_lora_mapping,
            ) = lora_meta
            intermediate_cache1.add_(lora_delta_w13)
        else:
            _base_w13_fn()
            if lora_context is not None:
                (
                    sorted_token_ids_lora,
                    expert_ids_lora,
                    num_tokens_post_padded_lora,
                    token_lora_mapping,
                ) = self.apply_w13_lora(
                    lora_context,
                    y=intermediate_cache1,
                    x=hidden_states,
                    topk_ids=topk_ids,
                    topk_weights=topk_weights,
                    expert_map=expert_map,
                    w1=w1,
                    w2=w2,
                    num_tokens=num_tokens,
                    top_k_num=top_k_num,
                )

        a2q_scale: torch.Tensor | None = None

        # Fuse SiLU+Mul + FP8 block quantize into a single kernel
        # when conditions permit (gated SiLU, fp8 block quant with
        # group_size=128, no LoRA requiring the BF16 intermediate).
        if (
            activation == MoEActivation.SILU
            and self.quant_config.use_fp8_w8a8
            and self.block_shape == [128, 128]
            and lora_context is None
            and not is_deep_gemm_e8m0_used()
        ):
            qintermediate_cache2, a2q_scale = ops.silu_and_mul_per_block_quant(
                intermediate_cache1.view(-1, N),
                group_size=128,
                quant_dtype=current_platform.fp8_dtype(),
            )
        else:
            self.activation(
                activation, intermediate_cache2, intermediate_cache1.view(-1, N)
            )

            qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
                intermediate_cache2,
                a2_scale,
                self.quant_dtype,
                self.per_act_token_quant,
                self.block_shape,
                quantization_emulation=self.quantization_emulation,
            )

        # LoRA w2: applied to intermediate_cache3 before moe_sum, using the
        # unquantized intermediate_cache2 as the lora_a input.  Reuses the
        # sorted_token_ids_lora computed above. Same dual-stream pattern as
        # the w13 pair: base GEMM on default stream, LoRA delta on aux,
        # join via .add_() into intermediate_cache3.
        def _base_w2_fn():
            invoke_fused_moe_triton_kernel(
                qintermediate_cache2,
                w2,
                intermediate_cache3,
                a2q_scale,
                self.w2_scale,
                topk_weights,
                sorted_token_ids,
                expert_ids,
                num_tokens_post_padded,
                not apply_router_weight_on_input,
                1,
                config,
                compute_type=compute_type,
                use_fp8_w8a8=self.quant_config.use_fp8_w8a8,
                use_int8_w8a8=self.quant_config.use_int8_w8a8,
                use_int8_w8a16=self.quant_config.use_int8_w8a16,
                use_int4_w4a16=self.quant_config.use_int4_w4a16,
                per_channel_quant=self.per_act_token_quant,
                block_shape=self.block_shape,
                B_bias=self.w2_bias,
            )

        if lora_context is not None and lora_context.aux_stream is not None:
            lora_delta_w2 = torch.zeros_like(intermediate_cache3)

            def _lora_w2_fn():
                self.apply_w2_lora(
                    lora_context,
                    y=lora_delta_w2,
                    x=intermediate_cache2,
                    topk_weights=topk_weights,
                    sorted_token_ids_lora=sorted_token_ids_lora,
                    expert_ids_lora=expert_ids_lora,
                    num_tokens_post_padded_lora=num_tokens_post_padded_lora,
                    token_lora_mapping=token_lora_mapping,
                    num_tokens=num_tokens,
                    w1=w1,
                    w2=w2,
                    top_k_num=top_k_num,
                    add_inputs=False,
                )

            assert lora_context.events is not None
            maybe_execute_in_parallel(
                _base_w2_fn,
                _lora_w2_fn,
                lora_context.events[2],
                lora_context.events[3],
                lora_context.aux_stream,
            )
            intermediate_cache3.add_(lora_delta_w2)
        else:
            _base_w2_fn()
            if lora_context is not None:
                self.apply_w2_lora(
                    lora_context,
                    y=intermediate_cache3,
                    x=intermediate_cache2,
                    topk_weights=topk_weights,
                    sorted_token_ids_lora=sorted_token_ids_lora,
                    expert_ids_lora=expert_ids_lora,
                    num_tokens_post_padded_lora=num_tokens_post_padded_lora,
                    token_lora_mapping=token_lora_mapping,
                    num_tokens=num_tokens,
                    w1=w1,
                    w2=w2,
                    top_k_num=top_k_num,
                )

        # separate function is required for MoE + LoRA
        self.moe_sum(intermediate_cache3, output)

    def moe_sum(self, input: torch.Tensor, output: torch.Tensor) -> None:
        ops.moe_sum(input, output)

TritonOrDeepGemmExperts

Bases: FallbackExperts

DeepGemm with fallback to Triton for low latency shapes.

Source code in vllm/model_executor/layers/fused_moe/experts/triton_deep_gemm_moe.py
class TritonOrDeepGemmExperts(FallbackExperts):
    """DeepGemm with fallback to Triton for low latency shapes."""

    def __init__(self, moe_config: FusedMoEConfig, quant_config: FusedMoEQuantConfig):
        super().__init__(
            experts=DeepGemmExperts(moe_config, quant_config),
            fallback_experts=TritonExperts(moe_config, quant_config),
        )

    @staticmethod
    def get_clses() -> tuple[
        type[mk.FusedMoEExpertsModular],
        type[mk.FusedMoEExpertsModular],
    ]:
        return (DeepGemmExperts, TritonExperts)

    def workspace_shapes(
        self,
        M: int,
        N: int,
        K: int,
        topk: int,
        global_num_experts: int,
        local_num_experts: int,
        expert_tokens_meta: mk.ExpertTokensMetadata | None,
        activation: MoEActivation,
    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
        # Note: the deep gemm workspaces are strictly larger than the triton
        # workspaces so we can be pessimistic here and allocate for DeepGemm
        # even if we fall back to triton later, e.g. if expert maps are set.
        if is_deep_gemm_e8m0_used() or _valid_deep_gemm_shape(M, N, K):
            return self.experts.workspace_shapes(
                M,
                N,
                K,
                topk,
                global_num_experts,
                local_num_experts,
                expert_tokens_meta,
                activation,
            )
        else:
            return self.fallback_experts.workspace_shapes(
                M,
                N,
                K,
                topk,
                global_num_experts,
                local_num_experts,
                expert_tokens_meta,
                activation,
            )

    def _select_experts_impl(
        self,
        hidden_states: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
    ) -> mk.FusedMoEExpertsModular:
        if is_deep_gemm_e8m0_used() or _valid_deep_gemm(hidden_states, w1, w2):
            return self.experts
        else:
            return self.fallback_experts

UnquantizedFusedMoEMethod

Bases: FusedMoEMethodBase, CustomOp

MoE method without quantization.

Source code in vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py
@CustomOp.register("unquantized_fused_moe")
class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
    """MoE method without quantization."""

    # --8<-- [end:unquantized_fused_moe]

    def __init__(self, moe: FusedMoEConfig):
        super().__init__(moe)
        self.unquantized_backend, self.experts_cls = select_unquantized_moe_backend(
            moe_config=self.moe,
        )

    @property
    def is_monolithic(self) -> bool:
        # Escape hatch for CPU, which stays on the old monolithic path.
        if self.unquantized_backend == UnquantizedMoeBackend.CPU:
            return True
        return super().is_monolithic

    @property
    def supports_eplb(self) -> bool:
        return True

    def maybe_make_prepare_finalize(
        self,
        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
    ):
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic for all but the CPU backend. CPU backend is monolithic. "
            "So this function should not be called."
        )

    def select_gemm_impl(
        self,
        prepare_finalize: FusedMoEPrepareAndFinalizeModular,
        layer: "RoutedExperts",
    ) -> FusedMoEExpertsModular:
        raise ValueError(
            f"{self.__class__.__name__} uses the new modular kernel initialization "
            "logic. This function should not be called."
        )

    def create_weights(
        self,
        layer: "RoutedExperts",
        num_experts: int,
        hidden_size: int,
        intermediate_size_per_partition: int,
        params_dtype: torch.dtype,
        **extra_weight_attrs,
    ):
        if self.moe.is_act_and_mul:
            w13_up_dim = 2 * intermediate_size_per_partition
        else:
            w13_up_dim = intermediate_size_per_partition
        # Fused gate_up_proj (column parallel)
        w13_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                w13_up_dim,
                hidden_size,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w13_weight", w13_weight)
        set_weight_attrs(w13_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w13_bias = torch.nn.Parameter(
                torch.zeros(num_experts, w13_up_dim, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w13_bias", w13_bias)
            set_weight_attrs(w13_bias, extra_weight_attrs)
        # down_proj (row parallel)
        w2_weight = torch.nn.Parameter(
            torch.empty(
                num_experts,
                hidden_size,
                intermediate_size_per_partition,
                dtype=params_dtype,
            ),
            requires_grad=False,
        )
        layer.register_parameter("w2_weight", w2_weight)
        set_weight_attrs(w2_weight, extra_weight_attrs)
        if self.moe.has_bias:
            w2_bias = torch.nn.Parameter(
                torch.zeros(num_experts, hidden_size, dtype=params_dtype),
                requires_grad=False,
            )
            layer.register_parameter("w2_bias", w2_bias)
            set_weight_attrs(w2_bias, extra_weight_attrs)

    def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
        # Pad the weight tensor. This is an optimization on ROCm platform, which
        # can benefit from tensors located far enough from one another in memory
        if (
            envs.VLLM_ROCM_MOE_PADDING
            and current_platform.is_rocm()
            and weight.stride(-1) == 1
            and (weight.stride(-2) * weight.element_size()) % 512 == 0
        ):
            num_pad = 256 // weight.element_size()
            weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
            torch.accelerator.empty_cache()

        return weight

    def _setup_kernel(
        self,
        layer: "RoutedExperts",
        w13: torch.Tensor,
        w2: torch.Tensor,
    ) -> None:
        # Shuffle weights to runtime format.
        w13_new, w2_new = convert_to_unquantized_kernel_format(
            self.unquantized_backend,
            layer=layer,
            w13_weight=w13,
            w2_weight=w2,
        )
        # `moe_kernel` is initialized to None in FusedMoEMethodBase.__init__;
        # On the first call we replace the parameter normally. On subsequent
        # calls (e.g. RL weight updates that re-trigger
        # process_weights_after_loading) the moe kernel has already been set
        # up and CUDA graphs may have captured the parameter addresses, so
        # we copy the shuffled data into the existing storage instead of
        # re-registering a new Parameter.
        is_weight_update = self.moe_kernel is not None  # type: ignore[has-type]
        replace_parameter(layer, "w13_weight", w13_new, prefer_copy=is_weight_update)
        replace_parameter(layer, "w2_weight", w2_new, prefer_copy=is_weight_update)

        # AITER backend requires weights to be marked as shuffled.
        if self.unquantized_backend == UnquantizedMoeBackend.AITER:
            layer.w13_weight.is_shuffled = True
            layer.w2_weight.is_shuffled = True

        if not is_weight_update:
            # Setup moe kernel only on the first call. For the unquantized
            # method, moe_quant_config is either the constant
            # FUSED_MOE_UNQUANTIZED_CONFIG or biased_moe_quant_config(...)
            # which references layer.w{13,2}_bias; since weight updates
            # mutate those bias tensors in place, the kernel does not need
            # to be re-built.
            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
            assert self.moe_quant_config is not None
            assert self.experts_cls is not None
            self.moe_kernel = make_unquantized_moe_kernel(
                quant_config=self.moe_quant_config,
                moe_config=self.moe,
                backend=self.unquantized_backend,
                experts_cls=self.experts_cls,
                routing_tables=layer._expert_routing_tables(),
            )

    def process_weights_after_loading(self, layer: "RoutedExperts") -> None:
        super().process_weights_after_loading(layer)

        # Padding the weight for better performance on ROCm.
        # _maybe_pad_weight is idempotent: on the first call it allocates a
        # padded storage and returns a strided view; on subsequent calls
        # (weight updates) the stride condition no longer matches so it
        # returns the input unchanged. The reassignment to .data is therefore
        # a no-op on updates and preserves the storage address (data_ptr)
        # used by captured CUDA graphs.
        layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data)
        layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data)

        if self.unquantized_backend in [
            UnquantizedMoeBackend.TPU,
            UnquantizedMoeBackend.OOT,
        ]:
            # OOT handles internally.
            return

        elif self.unquantized_backend == UnquantizedMoeBackend.CPU:
            # CPU stays on the old path — no oracle, no moe_kernel.
            from vllm.model_executor.layers.fused_moe import cpu_fused_moe

            if current_platform.get_cpu_architecture() == CpuArchEnum.X86:
                from vllm.model_executor.layers.utils import check_cpu_sgl_kernel

                dtype_w13 = layer.w13_weight.dtype
                _, n_w13, k_w13 = layer.w13_weight.size()
                dtype_w2 = layer.w2_weight.dtype
                _, n_w2, k_w2 = layer.w2_weight.size()
                if (
                    envs.VLLM_CPU_SGL_KERNEL
                    and check_cpu_sgl_kernel(n_w13, k_w13, dtype_w13)
                    and check_cpu_sgl_kernel(n_w2, k_w2, dtype_w2)
                ):
                    packed_w13_weight = torch.ops._C.convert_weight_packed(
                        layer.w13_weight
                    )
                    assert packed_w13_weight.size() == layer.w13_weight.size()
                    layer.w13_weight.copy_(packed_w13_weight)
                    del packed_w13_weight
                    packed_w2_weight = torch.ops._C.convert_weight_packed(
                        layer.w2_weight
                    )
                    assert packed_w2_weight.size() == layer.w2_weight.size()
                    layer.w2_weight.copy_(packed_w2_weight)
                    self.cpu_fused_moe: Callable = cpu_fused_moe.SGLFusedMOE(layer)
                else:
                    self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
            else:
                self.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer)
        elif self.unquantized_backend == UnquantizedMoeBackend.XPU:
            w13 = layer.w13_weight
            w2 = layer.w2_weight

            w13.data = w13.transpose(-1, -2).contiguous()
            w2.data = w2.transpose(-1, -2).contiguous()

            self._setup_kernel(
                layer=layer,
                w13=w13,
                w2=w2,
            )
        else:
            self._setup_kernel(
                layer=layer,
                w13=layer.w13_weight,
                w2=layer.w2_weight,
            )

    def get_fused_moe_quant_config(self, layer: torch.nn.Module) -> FusedMoEQuantConfig:
        if self.moe.has_bias:
            return biased_moe_quant_config(
                layer.w13_bias,
                layer.w2_bias,
            )
        else:
            return FUSED_MOE_UNQUANTIZED_CONFIG

    def apply(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        return self.forward(
            layer=layer,
            x=x,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            shared_experts=shared_experts,
            shared_experts_input=shared_experts_input,
        )

    def forward_native(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        assert self.moe_kernel is not None
        return self.moe_kernel.apply(
            hidden_states=x,
            w1=layer.w13_weight,
            w2=layer.w2_weight,
            topk_weights=topk_weights,
            topk_ids=topk_ids,
            activation=layer.activation,
            apply_router_weight_on_input=layer.apply_router_weight_on_input,
            global_num_experts=layer.global_num_experts,
            expert_map=layer.expert_map,
            shared_experts=shared_experts,
            shared_experts_input=shared_experts_input,
        )

    def forward_cuda(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        shared_experts: SharedExperts | None,
        shared_experts_input: torch.Tensor | None,
    ) -> torch.Tensor:
        return self.forward_native(
            layer,
            x,
            topk_weights,
            topk_ids,
            shared_experts,
            shared_experts_input,
        )

    def apply_monolithic(
        self,
        layer: "RoutedExperts",
        x: torch.Tensor,
        router_logits: torch.Tensor,
        input_ids: torch.Tensor | None = None,
    ) -> torch.Tensor:
        assert self.is_monolithic
        if self.unquantized_backend == UnquantizedMoeBackend.CPU:
            assert self.moe_kernel is None
            return self.cpu_fused_moe(
                layer,
                x,
                layer.use_grouped_topk,
                layer.top_k,
                router_logits,
                layer.renormalize,
                layer.topk_group,
                layer.num_expert_group,
                layer.global_num_experts,
                layer.expert_map,
                layer.custom_routing_function,
                layer.scoring_func,
                layer.routed_scaling_factor,
                layer.e_score_correction_bias,
                layer.apply_router_weight_on_input,
                layer.activation,
            )
        else:
            assert self.moe_kernel is not None
            return self.moe_kernel.apply_monolithic(
                x,
                layer.w13_weight,
                layer.w2_weight,
                router_logits,
                activation=layer.activation,
                global_num_experts=layer.global_num_experts,
                expert_map=layer.expert_map,
                apply_router_weight_on_input=layer.apply_router_weight_on_input,
                num_expert_group=layer.num_expert_group,
                topk_group=layer.topk_group,
                e_score_correction_bias=layer.e_score_correction_bias,
                routed_scaling_factor=layer.routed_scaling_factor,
            )

FusedMoE(num_experts, top_k, hidden_size, intermediate_size, params_dtype=None, renormalize=True, use_grouped_topk=False, num_expert_group=None, topk_group=None, quant_config=None, tp_size=None, dp_size=None, pcp_size=None, prefix='', custom_routing_function=None, router=None, scoring_func='softmax', routed_scaling_factor=1.0, swiglu_limit=None, e_score_correction_bias=None, apply_router_weight_on_input=False, activation='silu', enable_eplb=False, num_redundant_experts=0, has_bias=False, is_sequence_parallel=False, expert_mapping=None, n_shared_experts=None, router_logits_dtype=None, gate=None, shared_experts=None, shared_expert_gate=None, routed_input_transform=None, routed_output_transform=None, apply_routed_scale_to_output=False, zero_expert_type=None, hash_indices_table=None, runner_cls=None, runner_args=None, routed_experts_cls=None, routed_experts_args=None)

Factory function for creating MoE execution pipeline.

Creates and configures a complete MoE execution pipeline including: - Router (for token-to-expert assignment) - RoutedExperts (containing expert weight parameters) - MoERunner (orchestrates the complete forward pass)

The experts contain both MergedColumnParallel weights (gate_up_proj/w13) and RowParallelLinear weights (down_proj/w2).

Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We copy that naming convention here and handle any remapping in the load_weights function in each model implementation.

Parameters:

  • num_experts

    (int) –

    Number of experts in the model (global count)

  • top_k

    (int) –

    Number of experts selected for each token

  • hidden_size

    (int) –

    Input hidden state size of the transformer

  • intermediate_size

    (int) –

    Intermediate size of the experts

  • params_dtype

    (dtype | None, default: None ) –

    Data type for the parameters

  • renormalize

    (bool, default: True ) –

    Whether to renormalize the logits in the router

  • use_grouped_topk

    (bool, default: False ) –

    Whether to use grouped top-k routing

  • num_expert_group

    (int | None, default: None ) –

    Number of expert groups for grouped top-k

  • topk_group

    (int | None, default: None ) –

    Top-k value per group for grouped top-k

  • quant_config

    (QuantizationConfig | None, default: None ) –

    Quantization configuration

  • tp_size

    (int | None, default: None ) –

    Tensor parallelism size (None = use global default)

  • dp_size

    (int | None, default: None ) –

    Data parallelism size (None = use global default)

  • pcp_size

    (int | None, default: None ) –

    Pipeline context parallelism size (None = use global default)

  • prefix

    (str, default: '' ) –

    Layer name prefix for weight loading

  • custom_routing_function

    (Callable | None, default: None ) –

    Custom routing function override

  • router

    (FusedMoERouter | None, default: None ) –

    Pre-configured router instance (None = create default)

  • scoring_func

    (str, default: 'softmax' ) –

    Scoring function for routing ("softmax" or others)

  • routed_scaling_factor

    (float, default: 1.0 ) –

    Scaling factor applied to topk_weights or output

  • swiglu_limit

    (float | None, default: None ) –

    SwiGLU activation limit

  • e_score_correction_bias

    (Tensor | None, default: None ) –

    Expert score correction bias tensor

  • apply_router_weight_on_input

    (bool, default: False ) –

    Whether to apply router weights on input

  • activation

    (str, default: 'silu' ) –

    Activation function name ("silu", "gelu", etc.)

  • enable_eplb

    (bool, default: False ) –

    Whether to enable expert parallelism load balancer

  • num_redundant_experts

    (int, default: 0 ) –

    Number of redundant experts for EPLB

  • has_bias

    (bool, default: False ) –

    Whether expert layers have bias terms

  • is_sequence_parallel

    (bool, default: False ) –

    Whether sequence parallelism is enabled

  • expert_mapping

    (list[tuple[str, str, int, str]] | None, default: None ) –

    Expert parameter mapping for weight loading

  • n_shared_experts

    (int | None, default: None ) –

    Number of shared experts (ROCm aiter only)

  • router_logits_dtype

    (dtype | None, default: None ) –

    Data type for router logits buffers

  • gate

    (Module | None, default: None ) –

    Pre-configured gate module

  • shared_experts

    (Module | None, default: None ) –

    Pre-configured shared experts module

  • shared_expert_gate

    (Module | None, default: None ) –

    Pre-configured shared expert gate module

  • routed_input_transform

    (Module | None, default: None ) –

    Input transformation module

  • routed_output_transform

    (Module | None, default: None ) –

    Output transformation module

  • apply_routed_scale_to_output

    (bool, default: False ) –

    Whether to apply routed_scaling_factor to output instead of topk_weights

  • zero_expert_type

    (str | None, default: None ) –

    Type of zero expert handling

  • hash_indices_table

    (Tensor | None, default: None ) –

    Hash table for expert indices

  • runner_cls

    (type[MoERunner] | None, default: None ) –

    Custom MoERunner class (None = use default MoERunner)

  • runner_args

    (dict[str, Any] | None, default: None ) –

    Additional arguments for runner constructor

  • routed_experts_cls

    (type[RoutedExperts] | None, default: None ) –

    Custom RoutedExperts class (None = use default)

  • routed_experts_args

    (dict[str, Any] | None, default: None ) –

    Additional arguments for routed_experts constructor

Returns:

  • MoERunner ( MoERunner ) –

    Configured MoE execution pipeline ready for forward passes

Source code in vllm/model_executor/layers/fused_moe/layer.py
def FusedMoE(
    num_experts: int,  # Global number of experts
    top_k: int,
    hidden_size: int,
    intermediate_size: int,
    params_dtype: torch.dtype | None = None,
    renormalize: bool = True,
    use_grouped_topk: bool = False,
    num_expert_group: int | None = None,
    topk_group: int | None = None,
    quant_config: QuantizationConfig | None = None,
    tp_size: int | None = None,
    dp_size: int | None = None,
    pcp_size: int | None = None,
    prefix: str = "",
    custom_routing_function: Callable | None = None,
    router: FusedMoERouter | None = None,
    scoring_func: str = "softmax",
    routed_scaling_factor: float = 1.0,
    swiglu_limit: float | None = None,
    e_score_correction_bias: torch.Tensor | None = None,
    apply_router_weight_on_input: bool = False,
    activation: str = "silu",
    enable_eplb: bool = False,
    num_redundant_experts: int = 0,
    has_bias: bool = False,
    is_sequence_parallel: bool = False,
    expert_mapping: list[tuple[str, str, int, str]] | None = None,
    n_shared_experts: int | None = None,
    router_logits_dtype: torch.dtype | None = None,
    gate: torch.nn.Module | None = None,
    shared_experts: torch.nn.Module | None = None,
    shared_expert_gate: torch.nn.Module | None = None,
    routed_input_transform: torch.nn.Module | None = None,
    routed_output_transform: torch.nn.Module | None = None,
    apply_routed_scale_to_output: bool = False,
    zero_expert_type: str | None = None,
    hash_indices_table: torch.Tensor | None = None,
    runner_cls: type[MoERunner] | None = None,
    runner_args: dict[str, Any] | None = None,
    routed_experts_cls: type[RoutedExperts] | None = None,
    routed_experts_args: dict[str, Any] | None = None,
) -> MoERunner:
    """Factory function for creating MoE execution pipeline.

    Creates and configures a complete MoE execution pipeline including:
    - Router (for token-to-expert assignment)
    - RoutedExperts (containing expert weight parameters)
    - MoERunner (orchestrates the complete forward pass)

    The experts contain both MergedColumnParallel weights (gate_up_proj/w13)
    and RowParallelLinear weights (down_proj/w2).

    Note: Mixtral uses w1, w2, and w3 for gate, up, and down_proj. We
    copy that naming convention here and handle any remapping in the
    load_weights function in each model implementation.

    Args:
        num_experts: Number of experts in the model (global count)
        top_k: Number of experts selected for each token
        hidden_size: Input hidden state size of the transformer
        intermediate_size: Intermediate size of the experts
        params_dtype: Data type for the parameters
        renormalize: Whether to renormalize the logits in the router
        use_grouped_topk: Whether to use grouped top-k routing
        num_expert_group: Number of expert groups for grouped top-k
        topk_group: Top-k value per group for grouped top-k
        quant_config: Quantization configuration
        tp_size: Tensor parallelism size (None = use global default)
        dp_size: Data parallelism size (None = use global default)
        pcp_size: Pipeline context parallelism size (None = use global default)
        prefix: Layer name prefix for weight loading
        custom_routing_function: Custom routing function override
        router: Pre-configured router instance (None = create default)
        scoring_func: Scoring function for routing ("softmax" or others)
        routed_scaling_factor: Scaling factor applied to topk_weights or output
        swiglu_limit: SwiGLU activation limit
        e_score_correction_bias: Expert score correction bias tensor
        apply_router_weight_on_input: Whether to apply router weights on input
        activation: Activation function name ("silu", "gelu", etc.)
        enable_eplb: Whether to enable expert parallelism load balancer
        num_redundant_experts: Number of redundant experts for EPLB
        has_bias: Whether expert layers have bias terms
        is_sequence_parallel: Whether sequence parallelism is enabled
        expert_mapping: Expert parameter mapping for weight loading
        n_shared_experts: Number of shared experts (ROCm aiter only)
        router_logits_dtype: Data type for router logits buffers
        gate: Pre-configured gate module
        shared_experts: Pre-configured shared experts module
        shared_expert_gate: Pre-configured shared expert gate module
        routed_input_transform: Input transformation module
        routed_output_transform: Output transformation module
        apply_routed_scale_to_output: Whether to apply routed_scaling_factor to
                                      output instead of topk_weights
        zero_expert_type: Type of zero expert handling
        hash_indices_table: Hash table for expert indices
        runner_cls: Custom MoERunner class (None = use default MoERunner)
        runner_args: Additional arguments for runner constructor
        routed_experts_cls: Custom RoutedExperts class (None = use default)
        routed_experts_args: Additional arguments for routed_experts constructor

    Returns:
        MoERunner: Configured MoE execution pipeline ready for forward passes
    """
    vllm_config = get_current_vllm_config()

    layer_name = prefix

    moe_activation = MoEActivation.from_str(activation)
    is_act_and_mul = moe_activation.is_gated

    moe_parallel_config = make_parallel_config(
        tp_size=tp_size,
        dp_size=dp_size,
        pcp_size=pcp_size,
        is_sequence_parallel=is_sequence_parallel,
        parallel_config=vllm_config.parallel_config,
    )

    global_num_experts, logical_num_experts, num_fused_shared_experts = (
        determine_expert_counts(
            num_experts,
            num_redundant_experts,
            n_shared_experts,
            is_act_and_mul,
        )
    )

    # Initialize EPLB manager (or None?)
    eplb_state: EplbLayerState | None = None
    if enable_eplb:
        use_ep = moe_parallel_config.use_ep
        ep_size = moe_parallel_config.ep_size
        if use_ep and global_num_experts % ep_size != 0:
            raise ValueError(
                f"EPLB currently only supports even distribution of "
                f"experts across ranks. Got {global_num_experts} experts "
                f"and {ep_size} EP ranks."
            )
        eplb_state = EplbLayerState()
    else:
        assert num_redundant_experts == 0, (
            "Redundant experts are only supported with EPLB."
        )

    max_num_batched_tokens = vllm_config.scheduler_config.max_num_batched_tokens

    # Create ExpertMapManager to handle expert mapping and placement for EP.
    # See ExpertMapManager for a detailed description of what it does and when
    # it is required.
    expert_map_manager = ExpertMapManager(
        max_num_batched_tokens=max_num_batched_tokens,
        top_k=top_k,
        global_num_experts=global_num_experts,
        num_redundant_experts=num_redundant_experts,
        num_expert_group=num_expert_group,
        moe_parallel_config=moe_parallel_config,
        placement_strategy=vllm_config.parallel_config.expert_placement_strategy,
        enable_eplb=eplb_state is not None,
        num_fused_shared_experts=num_fused_shared_experts,
        rocm_aiter_enabled=rocm_aiter_ops.is_fused_moe_enabled() and is_act_and_mul,
    )

    # TODO(bnell): we should not have to create a router if the kernel is
    # monolithic.
    if router is None:
        router = create_fused_moe_router(
            top_k=top_k,
            global_num_experts=global_num_experts,
            eplb_state=eplb_state,
            renormalize=renormalize,
            use_grouped_topk=use_grouped_topk,
            num_expert_group=num_expert_group,
            topk_group=topk_group,
            custom_routing_function=custom_routing_function,
            scoring_func=scoring_func,
            # When apply_routed_scale_to_output is True, we set the scaling factor
            # to 1.0 so it ends up being a nop. Applying the scale will be handled
            # by the runner in this case.
            # The member variable must be set in the same way as the router since
            # some quantization methods can access it.
            routed_scaling_factor=routed_scaling_factor
            if not apply_routed_scale_to_output
            else 1.0,
            e_score_correction_bias=e_score_correction_bias,
            num_fused_shared_experts=num_fused_shared_experts,
            zero_expert_type=zero_expert_type,
            num_logical_experts=logical_num_experts,
            hash_indices_table=hash_indices_table,
        )

    if params_dtype is None:
        params_dtype = torch.get_default_dtype()

    # FIXME (varun): We should have a better way of inferring the activation
    # datatype. This works for now as the tensor datatype entering the MoE
    # operation is typically unquantized (i.e. float16/bfloat16).
    if vllm_config.model_config is not None:
        moe_in_dtype = vllm_config.model_config.dtype
    else:
        # TODO (bnell): This is a hack to get test_mixtral_moe to work
        # since model_config is not set in the pytest test.
        moe_in_dtype = params_dtype

    moe_config = FusedMoEConfig(
        num_experts=global_num_experts,
        experts_per_token=top_k,
        hidden_dim=hidden_size,
        intermediate_size=intermediate_size,
        num_local_experts=expert_map_manager.local_num_experts,
        num_logical_experts=logical_num_experts,
        moe_parallel_config=moe_parallel_config,
        in_dtype=moe_in_dtype,
        moe_backend=vllm_config.kernel_config.moe_backend,
        router_logits_dtype=router_logits_dtype,
        max_num_tokens=max_num_batched_tokens,
        has_bias=has_bias,
        is_lora_enabled=vllm_config.lora_config is not None,
        activation=moe_activation,
        device=vllm_config.device_config.device,
        routing_method=router.routing_method_type,  # Not ideal
        swiglu_limit=swiglu_limit,
        max_capture_size=vllm_config.compilation_config.max_cudagraph_capture_size,
    )

    logger.debug("FusedMoEConfig = %s", moe_config)

    # Create RoutedExperts instance BEFORE create_weights()
    # This will hold all expert weight parameters
    if routed_experts_cls is None:
        routed_experts_cls = RoutedExperts

    assert params_dtype is not None
    routed_experts = routed_experts_cls(
        layer_name,
        params_dtype,
        moe_config,
        quant_config,
        expert_map_manager=expert_map_manager,
        expert_mapping=expert_mapping,
        # Extra params that are needed by quant_methods, pass along for now
        # Prefer getting these from other sources, e.g. moe_config or
        # router object
        renormalize=renormalize,
        use_grouped_topk=use_grouped_topk,
        num_expert_group=num_expert_group,
        topk_group=topk_group,
        custom_routing_function=custom_routing_function,
        scoring_func=scoring_func,
        routed_scaling_factor=routed_scaling_factor,
        swiglu_limit=swiglu_limit,
        # TODO get from router? needs to be truncated?
        e_score_correction_bias=e_score_correction_bias,
        apply_router_weight_on_input=apply_router_weight_on_input,
        **routed_experts_args if routed_experts_args is not None else {},
    )

    if runner_cls is None:
        runner_cls = MoERunner

    runner = runner_cls(
        layer_name=layer_name,
        moe_config=moe_config,
        router=router,
        routed_experts=routed_experts,
        enable_dbo=vllm_config.parallel_config.enable_dbo,
        gate=gate,
        shared_expert_gate=shared_expert_gate,
        shared_experts=shared_experts,
        routed_input_transform=routed_input_transform,
        routed_output_transform=routed_output_transform,
        # When apply_routed_scale_to_output is True, we allow
        # the scaling factor to be passed to the runner, otherwise
        # we pass 1.0 so it ends up being a nop.
        routed_scaling_factor=routed_scaling_factor
        if apply_routed_scale_to_output
        else 1.0,
        **runner_args if runner_args is not None else {},
    )

    return runner

activation_without_mul(activation)

Get the non-gated variant of an activation function.

Parameters:

  • activation

    (str) –

    The activation function name (e.g., "silu", "gelu")

Returns:

  • str

    The non-gated activation name (e.g., "silu_no_mul", "gelu_no_mul")

Source code in vllm/model_executor/layers/fused_moe/activation.py
def activation_without_mul(activation: str) -> str:
    """Get the non-gated variant of an activation function.

    Args:
        activation: The activation function name (e.g., "silu", "gelu")

    Returns:
        The non-gated activation name (e.g., "silu_no_mul", "gelu_no_mul")
    """
    return MoEActivation.from_str(activation).without_mul().value

apply_moe_activation(activation, output, input)

Apply MoE activation function.

Source code in vllm/model_executor/layers/fused_moe/activation.py
def apply_moe_activation(
    activation: MoEActivation,
    output: torch.Tensor,
    input: torch.Tensor,
) -> torch.Tensor:
    """Apply MoE activation function."""
    assert input.dim() == 2, "Input must be 2D"
    assert output.dim() == 2, "Output must be 2D"
    if activation.is_gated:
        assert output.size(-1) * 2 == input.size(-1), (
            f"{activation.value} expects 2x ratio: "
            f"{output.size(-1) * 2} vs {input.size(-1)}"
        )
    else:
        assert output.size(-1) == input.size(-1), (
            f"{activation.value} expects equal sizes: "
            f"{output.size(-1)} vs {input.size(-1)}"
        )

    # Activations with gated multiplication (gate × activation(up))
    if activation == MoEActivation.SILU:
        torch.ops._C.silu_and_mul(output, input)
    elif activation == MoEActivation.GELU:
        torch.ops._C.gelu_and_mul(output, input)
    elif activation == MoEActivation.GELU_TANH:
        torch.ops._C.gelu_tanh_and_mul(output, input)
    elif activation == MoEActivation.SWIGLUOAI:
        torch.ops._C.swigluoai_and_mul(output, input)
    elif activation == MoEActivation.SWIGLUSTEP:
        from vllm.model_executor.layers.activation import swiglustep_and_mul_triton

        swiglustep_and_mul_triton(output, input)

    # Activations without gated multiplication
    elif activation == MoEActivation.SILU_NO_MUL:
        output.copy_(F.silu(input))
    elif activation == MoEActivation.GELU_NO_MUL:
        output.copy_(F.gelu(input))
    elif activation == MoEActivation.GELU_TANH_NO_MUL:
        output.copy_(F.gelu(input, approximate="tanh"))
    elif activation == MoEActivation.RELU2_NO_MUL:
        F.relu(input, inplace=True)
        torch.square(input, out=output)
    else:
        raise ValueError(f"Unsupported FusedMoe activation: {activation}")

    return output

fused_experts(hidden_states, w1, w2, topk_weights, topk_ids, activation=MoEActivation.SILU, apply_router_weight_on_input=False, global_num_experts=-1, expert_map=None, quant_config=None)

Run fused MoE expert computation using Triton kernels.

Source code in vllm/model_executor/layers/fused_moe/fused_moe.py
def fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    activation: MoEActivation = MoEActivation.SILU,
    apply_router_weight_on_input: bool = False,
    global_num_experts: int = -1,
    expert_map: torch.Tensor | None = None,
    quant_config: FusedMoEQuantConfig | None = None,
) -> torch.Tensor:
    """Run fused MoE expert computation using Triton kernels."""
    if quant_config is None:
        quant_config = FUSED_MOE_UNQUANTIZED_CONFIG

    return torch.ops.vllm.fused_experts(
        hidden_states=hidden_states,
        w1=w1,
        w2=w2,
        topk_weights=topk_weights,
        topk_ids=topk_ids,
        activation=activation.value,
        apply_router_weight_on_input=apply_router_weight_on_input,
        use_fp8_w8a8=quant_config.use_fp8_w8a8,
        use_int8_w8a8=quant_config.use_int8_w8a8,
        use_int8_w8a16=quant_config.use_int8_w8a16,
        use_int4_w4a16=quant_config.use_int4_w4a16,
        ocp_mx_scheme=quant_config.ocp_mx_scheme,
        per_channel_quant=quant_config.per_act_token_quant,
        global_num_experts=global_num_experts,
        expert_map=expert_map,
        w1_scale=quant_config.w1_scale,
        w2_scale=quant_config.w2_scale,
        w1_zp=quant_config.w1_zp,
        w2_zp=quant_config.w2_zp,
        a1_scale=quant_config.a1_scale,
        a2_scale=quant_config.a2_scale,
        block_shape=quant_config.block_shape,
        w1_bias=quant_config.w1_bias,
        w2_bias=quant_config.w2_bias,
    )

fused_moe_make_expert_params_mapping(model, ckpt_gate_proj_name, ckpt_down_proj_name, ckpt_up_proj_name, num_experts, num_redundant_experts=0, routed_experts_prefix='routed_experts')

Delegate to EPLB manager.

Source code in vllm/model_executor/layers/fused_moe/layer.py
def fused_moe_make_expert_params_mapping(
    model: torch.nn.Module,
    ckpt_gate_proj_name: str,
    ckpt_down_proj_name: str,
    ckpt_up_proj_name: str,
    num_experts: int,
    num_redundant_experts: int = 0,
    routed_experts_prefix: str = "routed_experts",
) -> list[tuple[str, str, int, str]]:
    """Delegate to EPLB manager."""
    return RoutedExperts.make_expert_params_mapping(
        model,
        ckpt_gate_proj_name,
        ckpt_down_proj_name,
        ckpt_up_proj_name,
        num_experts,
        num_redundant_experts,
        routed_experts_prefix,
    )