Skip to content

vllm.config.kernel

Classes:

  • IrOpPriorityConfig

    Configuration for vLLM IR op priority for dispatching/lowering during the forward pass.

  • KernelConfig

    Configuration for kernel selection and warmup behavior.

IrOpPriorityConfig

Configuration for vLLM IR op priority for dispatching/lowering during the forward pass. Each member is a list of strings, which will be installed in worker init via vllm.ir.ops.<op_name>.set_default(). A single comma-separated string is accepted as well.

If specified manually, platform defaults will be appended to the lists. See KernelConfig.set_platform_defaults().

Methods:

  • compute_hash

    Produces a hash unique to the pass configuration.

  • set_default

    Permanently set the IR op priority for all op members.

  • set_priority

    Context manager to set the IR op priority for all op members.

  • with_default

    A helper to create an IrOpPriorityConfig where fields not specified in kwargs use the given default list.

Attributes:

Source code in vllm/config/kernel.py
@config
class IrOpPriorityConfig:
    """
    Configuration for vLLM IR op priority for dispatching/lowering during the
    forward pass. Each member is a list of strings, which will be installed
    in worker init via vllm.ir.ops.<op_name>.set_default().
    A single comma-separated string is accepted as well.

    If specified manually, platform defaults will be appended to the lists.
    See KernelConfig.set_platform_defaults().
    """

    rms_norm: list[str] = Field(default_factory=list)
    """Priority list for vllm.ir.ops.rms_norm"""

    fused_add_rms_norm: list[str] = Field(default_factory=list)
    """Priority list for vllm.ir.ops.fused_add_rms_norm"""

    def compute_hash(self) -> str:
        """
        Produces a hash unique to this IR op priority configuration.
        Any new fields that affect compilation should be added to the hash.
        Any future fields that don't affect compilation should be excluded.

        Also, manually add IR op impl UUIDs to make sure they affect the compile cache.
        """
        factors = get_hash_factors(self, set())

        # Implementations are hidden from Dynamo,
        # so they don't show up in the traced files list.
        from vllm.ir.op import IrOp

        assert "_impls" not in factors
        # Map every op name to {provider: impl uuid} for the providers listed
        # in this config, so a changed implementation changes the hash.
        factors["_impls"] = {
            name: {
                provider: IrOp.registry[name].impls[provider].uuid() for provider in p
            }
            for name, p in asdict(self).items()  # type: ignore[call-overload]
        }

        return hash_factors(factors)

    @field_validator("*", mode="before")
    @classmethod
    def _to_list_str(cls, value: str | list[str]):
        """
        Normalize a field value to a list of strings before validation.

        A string is stripped of spaces and split on commas. Empty entries are
        dropped, so "" yields [] and "a,,b," yields ["a", "b"] instead of
        producing "" provider names. Any other value is returned unchanged
        after checking that all of its items are strings.

        Raises:
            ValueError: if any item is not a string.
        """
        if isinstance(value, str):
            value = [name for name in value.replace(" ", "").split(",") if name]

        # Raise explicitly rather than assert: asserts are stripped under -O.
        if not all(isinstance(v, str) for v in value):
            raise ValueError(f"IR op priority entries must be strings, got {value!r}")
        return value

    def _iter_op_priorities(self):
        """
        Yield (IrOp, priority_list) for each field, after importing the
        current platform's IR kernels. Each field is asserted to be non-None.
        """
        from vllm.ir.op import IrOp
        from vllm.platforms import current_platform

        current_platform.import_ir_kernels()

        for field in fields(self):  # type: ignore[arg-type]
            op_priority = getattr(self, field.name)
            assert op_priority is not None, (
                f"IR op priority for {field.name} must be set"
            )
            logger.debug("Setting IR op priority for %s to %s", field.name, op_priority)
            yield IrOp.registry[field.name], op_priority

    def set_default(self) -> None:
        """
        Permanently set the IR op priority for all op members.
        """
        for ir_op, op_priority in self._iter_op_priorities():
            ir_op.set_default(op_priority)

    @contextlib.contextmanager
    def set_priority(self):
        """
        Context manager to set the IR op priority for all op members.
        It also imports IR kernel implementations for the current platform
        to ensure all implementations are made available.
        """
        # ExitStack unwinds every per-op context manager entered so far when
        # the block exits, including when entering a later one fails.
        with contextlib.ExitStack() as stack:
            for ir_op, op_priority in self._iter_op_priorities():
                stack.enter_context(ir_op.set_priority(op_priority))
            yield

    @classmethod
    def with_default(
        cls, default: list[str], /, **kwargs: list[str]
    ) -> "IrOpPriorityConfig":
        """
        A helper to create an IrOpPriorityConfig where fields not specified in kwargs
        use the given default list.
        """
        for field in fields(cls):  # type: ignore[arg-type]
            if field.name not in kwargs:
                # Copy so that fields never share one mutable list.
                kwargs[field.name] = list(default)

        return cls(**kwargs)

fused_add_rms_norm = Field(default_factory=list) class-attribute instance-attribute

Priority list for vllm.ir.ops.fused_add_rms_norm

rms_norm = Field(default_factory=list) class-attribute instance-attribute

Priority list for vllm.ir.ops.rms_norm

_iter_op_priorities()

Yield (IrOp, priority_list) for each field, after importing platform kernels and validating each entry.

Source code in vllm/config/kernel.py
def _iter_op_priorities(self):
    """
    Generate one (IrOp, priority_list) pair per config field.

    The current platform's IR kernels are imported first. Each field value is
    asserted to be non-None, logged at debug level, and paired with the
    registry entry of the same name.
    """
    from vllm.ir.op import IrOp
    from vllm.platforms import current_platform

    current_platform.import_ir_kernels()

    for op_name in (f.name for f in fields(self)):  # type: ignore[arg-type]
        priority_list = getattr(self, op_name)
        assert priority_list is not None, f"IR op priority for {op_name} must be set"
        logger.debug("Setting IR op priority for %s to %s", op_name, priority_list)
        yield IrOp.registry[op_name], priority_list

compute_hash()

Produces a hash unique to the pass configuration. Any new fields that affect compilation should be added to the hash. Any future fields that don't affect compilation should be excluded.

Also, manually add IR op impl UUIDs to make sure they affect the compile cache.

Source code in vllm/config/kernel.py
def compute_hash(self) -> str:
    """
    Return a hash of this config's fields plus the UUIDs of the IR op
    implementations they name.

    Fields that affect compilation belong in the hash; fields that do not
    should be excluded. The implementation UUIDs are added by hand so that
    they affect the compile cache.
    """
    factors = get_hash_factors(self, set())

    # Dynamo never sees the implementations, so they are absent from the
    # traced files list and have to be accounted for here.
    from vllm.ir.op import IrOp

    assert "_impls" not in factors

    impl_uuids = {}
    for op_name, providers in asdict(self).items():  # type: ignore[call-overload]
        # The registry is only consulted for providers actually listed.
        impl_uuids[op_name] = {
            provider: IrOp.registry[op_name].impls[provider].uuid()
            for provider in providers
        }
    factors["_impls"] = impl_uuids

    return hash_factors(factors)

set_default()

Permanently set the IR op priority for all op members.

Source code in vllm/config/kernel.py
def set_default(self) -> None:
    """
    Install each field's priority list as the default of its IR op.
    """
    for entry in self._iter_op_priorities():
        op, priority_list = entry
        op.set_default(priority_list)

set_priority()

Context manager to set the IR op priority for all op members. It also imports IR kernel implementations for the current platform to ensure all implementations are made available.

Source code in vllm/config/kernel.py
@contextlib.contextmanager
def set_priority(self):
    """
    Context manager that applies every field's priority list to its IR op
    for the duration of the block.

    The ops come from _iter_op_priorities(), which also imports the IR kernel
    implementations of the current platform.
    """
    stack = contextlib.ExitStack()
    with stack:
        # Each per-op context manager is registered on the stack, so all of
        # them are exited together when the block ends.
        for op, priority_list in self._iter_op_priorities():
            stack.enter_context(op.set_priority(priority_list))
        yield

with_default(default, /, **kwargs) classmethod

A helper to create an IrOpPriorityConfig where fields not specified in kwargs use the given default list.

Source code in vllm/config/kernel.py
@classmethod
def with_default(
    cls, default: list[str], /, **kwargs: list[str]
) -> "IrOpPriorityConfig":
    """
    Build an IrOpPriorityConfig in which every field missing from kwargs
    receives its own copy of the given default list.
    """
    # One fresh copy per missing field, so fields never share a list object.
    filled_in = {
        spec.name: list(default)
        for spec in fields(cls)  # type: ignore[arg-type]
        if spec.name not in kwargs
    }
    return cls(**kwargs, **filled_in)

KernelConfig

Configuration for kernel selection and warmup behavior.

Methods:

Attributes:

Source code in vllm/config/kernel.py
@config
class KernelConfig:
    """Configuration for kernel selection and warmup behavior."""

    ir_op_priority: IrOpPriorityConfig = Field(default_factory=IrOpPriorityConfig)
    """
    vLLM IR op priority for dispatching/lowering during the forward pass.
    Platform defaults appended automatically during VllmConfig.__post_init__.
    """

    enable_flashinfer_autotune: bool = None  # type: ignore[assignment]
    """If True, run FlashInfer autotuning during kernel warmup."""

    moe_backend: MoEBackend = "auto"
    """Backend for MoE expert computation kernels. Available options:

    - "auto": Automatically select the best backend based on model and hardware
    - "triton": Use Triton-based fused MoE kernels
    - "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
    - "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
    - "cutlass": Use vLLM CUTLASS kernels
    - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
    - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
    - "flashinfer_b12x": Use FlashInfer CuteDSL fused MoE for SM12x
      (RTX Pro 6000 / DGX Spark)
    - "marlin": Use Marlin kernels (weight-only quantization)
    - "humming": Use Humming Mixed Precision kernels
    - "triton_unfused": Use Triton unfused MoE kernels
    - "aiter": Use AMD AITer kernels (ROCm only)
    - "emulation": use BF16/FP16 GEMM, dequantizing weights and
                   running QDQ on activations.
    """

    linear_backend: LinearBackend = "auto"
    """Backend for quantized linear layer GEMM kernels. Available options:

    - "auto": Automatically select the best backend based on model and hardware
    - "cutlass": Use CUTLASS-based kernels
    - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
    - "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels
    - "flashinfer_cudnn": Use FlashInfer with cuDNN kernels
    - "marlin": Use Marlin kernels
    - "triton": Use Triton-based kernels
    - "deep_gemm": Use DeepGEMM kernels
    - "torch": Use PyTorch native scaled_mm kernels
    - "aiter": Use AMD AITer kernels (ROCm only)
    - "machete": Use Machete kernels (mixed-precision)
    - "fbgemm": Use FBGEMM kernels
    - "conch": Use Conch mixed-precision kernels
    - "exllama": Use Exllama mixed-precision kernels
    - "emulation": Use slow dequant-to-BF16 emulation (for testing only)"""

    @field_validator("moe_backend", mode="before")
    @classmethod
    def _normalize_moe_backend(cls, value: Any) -> Any:
        """Lower-case string values and replace "-" with "_"; pass others through."""
        if not isinstance(value, str):
            return value
        return value.lower().replace("-", "_")

    @field_validator("linear_backend", mode="before")
    @classmethod
    def _normalize_linear_backend(cls, value: Any) -> Any:
        """Lower-case string values and replace "-" with "_"; pass others through."""
        if not isinstance(value, str):
            return value
        return value.lower().replace("-", "_")

    def compute_hash(self) -> str:
        """
        Return a hash of the fields of this kernel configuration.
        Any new fields that affect compilation should be added to the hash.
        Any future fields that don't affect compilation should be excluded.
        """
        # ir_op_priority is skipped here and re-added below through its own
        # compute_hash().
        excluded = {"enable_flashinfer_autotune", "ir_op_priority"}
        factors = get_hash_factors(self, excluded)
        factors["ir_op_priority"] = self.ir_op_priority.compute_hash()
        return hash_factors(factors)

    @field_validator("enable_flashinfer_autotune", mode="wrap")
    @classmethod
    def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
        """Skip validation if the value is `None` when initialization is delayed."""
        return None if value is None else handler(value)

    def set_platform_defaults(self, vllm_config: "VllmConfig") -> None:
        """
        Merge the current platform's default IR op priorities into this config.

        Entries already present in a field keep their position; platform
        entries that are missing are added after them.
        """
        from vllm.platforms import current_platform

        defaults = current_platform.get_default_ir_op_priority(vllm_config)
        logger.debug(
            "Setting platform-specific IR op priority defaults: %s, user-defined: %s",
            defaults,
            self.ir_op_priority,
        )

        user_config = self.ir_op_priority
        for op_name, platform_list in asdict(defaults).items():
            existing: list[str] = getattr(user_config, op_name)
            if existing is None:
                setattr(user_config, op_name, platform_list)
                continue

            # Only add what is missing: this keeps the call idempotent, which
            # matters because vllm_config.set_platform_defaults() may be called
            # multiple times (due to VllmConfig.__post_init__ manual call).
            # The additions are collected before extending the list.
            missing = [impl for impl in platform_list if impl not in existing]
            existing.extend(missing)

        logger.info(
            "Final IR op priority after setting platform defaults: %s",
            self.ir_op_priority,
        )

enable_flashinfer_autotune = None class-attribute instance-attribute

If True, run FlashInfer autotuning during kernel warmup.

ir_op_priority = Field(default_factory=IrOpPriorityConfig) class-attribute instance-attribute

vLLM IR op priority for dispatching/lowering during the forward pass. Platform defaults are appended automatically during VllmConfig.__post_init__.

linear_backend = 'auto' class-attribute instance-attribute

Backend for quantized linear layer GEMM kernels. Available options:

  • "auto": Automatically select the best backend based on model and hardware
  • "cutlass": Use CUTLASS-based kernels
  • "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
  • "flashinfer_trtllm": Use FlashInfer with TensorRT-LLM kernels
  • "flashinfer_cudnn": Use FlashInfer with cuDNN kernels
  • "marlin": Use Marlin kernels
  • "triton": Use Triton-based kernels
  • "deep_gemm": Use DeepGEMM kernels
  • "torch": Use PyTorch native scaled_mm kernels
  • "aiter": Use AMD AITer kernels (ROCm only)
  • "machete": Use Machete kernels (mixed-precision)
  • "fbgemm": Use FBGEMM kernels
  • "conch": Use Conch mixed-precision kernels
  • "exllama": Use Exllama mixed-precision kernels
  • "emulation": Use slow dequant-to-BF16 emulation (for testing only)

moe_backend = 'auto' class-attribute instance-attribute

Backend for MoE expert computation kernels. Available options:

  • "auto": Automatically select the best backend based on model and hardware
  • "triton": Use Triton-based fused MoE kernels
  • "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)
  • "deep_gemm_mega_moe": Use DeepGEMM mega MoE kernels
  • "cutlass": Use vLLM CUTLASS kernels
  • "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
  • "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
  • "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
  • "flashinfer_b12x": Use FlashInfer CuteDSL fused MoE for SM12x (RTX Pro 6000 / DGX Spark)
  • "marlin": Use Marlin kernels (weight-only quantization)
  • "humming": Use Humming Mixed Precision kernels
  • "triton_unfused": Use Triton unfused MoE kernels
  • "aiter": Use AMD AITer kernels (ROCm only)
  • "emulation": use BF16/FP16 GEMM, dequantizing weights and running QDQ on activations.

_skip_none_validation(value, handler) classmethod

Skip validation if the value is None when initialization is delayed.

Source code in vllm/config/kernel.py
@field_validator("enable_flashinfer_autotune", mode="wrap")
@classmethod
def _skip_none_validation(cls, value: Any, handler: Callable) -> Any:
    """Return `None` untouched (delayed initialization); otherwise validate."""
    # Only non-None values are handed to the wrapped validation handler.
    return None if value is None else handler(value)

compute_hash()

Produces a hash unique to the kernel configuration. Any new fields that affect compilation should be added to the hash. Any future fields that don't affect compilation should be excluded.

Source code in vllm/config/kernel.py
def compute_hash(self) -> str:
    """
    Return a hash of the fields of this kernel configuration.
    Any new fields that affect compilation should be added to the hash.
    Any future fields that don't affect compilation should be excluded.
    """
    # ir_op_priority is skipped here and re-added below through its own
    # compute_hash().
    excluded = {"enable_flashinfer_autotune", "ir_op_priority"}
    factors = get_hash_factors(self, excluded)
    factors["ir_op_priority"] = self.ir_op_priority.compute_hash()
    return hash_factors(factors)

set_platform_defaults(vllm_config)

Set platform-specific defaults for the kernel config.

Source code in vllm/config/kernel.py
def set_platform_defaults(self, vllm_config: "VllmConfig") -> None:
    """
    Merge the current platform's default IR op priorities into this config.

    Entries already present in a field keep their position; platform entries
    that are missing are added after them.
    """
    from vllm.platforms import current_platform

    defaults = current_platform.get_default_ir_op_priority(vllm_config)
    logger.debug(
        "Setting platform-specific IR op priority defaults: %s, user-defined: %s",
        defaults,
        self.ir_op_priority,
    )

    user_config = self.ir_op_priority
    for op_name, platform_list in asdict(defaults).items():
        existing: list[str] = getattr(user_config, op_name)
        if existing is None:
            setattr(user_config, op_name, platform_list)
            continue

        # Only add what is missing: this keeps the call idempotent, which
        # matters because vllm_config.set_platform_defaults() may be called
        # multiple times (due to VllmConfig.__post_init__ manual call).
        # The additions are collected before extending the list.
        missing = [impl for impl in platform_list if impl not in existing]
        existing.extend(missing)

    logger.info(
        "Final IR op priority after setting platform defaults: %s",
        self.ir_op_priority,
    )