`vllm.model_executor.layers.quantization.base_config` ¶

Classes:

QuantizationConfig –

Base class for quantization configs.
QuantizeMethodBase –

Base class for different quantized methods.

Functions:

method_has_implemented_embedding –

Not all quant methods have embedding implemented, so we need to check that it exists for our given method.

`QuantizationConfig` ¶

Bases: ABC

Base class for quantization configs.

Methods:

apply_vllm_mapper –

Interface for models to update module names referenced in quantization configs in order to reflect the vllm model structure.
from_config –

Create a config class from the model's quantization config.
get_cache_scale_mapper –

Mapping from checkpoint KV-cache scale names to vLLM scale names.
get_config_filenames –

List of filenames to search for in the model directory.
get_from_keys –

Get a value from the model's quantization config.
get_from_keys_or –

Get an optional value from the model's quantization config.
get_min_capability –

Minimum GPU capability to support the quantization method.
get_name –

Name of the quantization method.
get_quant_method –

Get the quantize method to use for the quantized layer.
get_supported_act_dtypes –

List of supported activation dtypes.
is_mxfp4_quant –

Determine if mxfp4 quantization will be used for this config.
maybe_update_config –

Interface to update values after config initialization.
override_quantization_method –

Detects if this quantization method can support a given checkpoint format by overriding the user specified quantization method.

Source code in vllm/model_executor/layers/quantization/base_config.py

class QuantizationConfig(ABC):
    """Abstract parent class shared by all quantization configs."""

    def __init__(self):
        super().__init__()
        # Starts out empty; models fill this mapping in while they initialize.
        self.packed_modules_mapping: dict[str, list[str]] = {}

    @abstractmethod
    def get_name(self) -> QuantizationMethods:
        """Return the name of the quantization method."""
        raise NotImplementedError

    @abstractmethod
    def get_supported_act_dtypes(self) -> list[torch.dtype]:
        """Return the list of activation dtypes that are supported."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def get_min_capability(cls) -> int:
        """Return the minimum GPU capability the quantization method needs.

        Examples of values: 70 for Volta, 75 for Turing, 80 for Ampere.
        The requirement comes from the custom CUDA kernels that the
        quantization method uses.
        """
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def get_config_filenames() -> list[str]:
        """Return the filenames to look for inside the model directory."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig":
        """Build a config instance from the model's quantization config."""
        raise NotImplementedError

    @classmethod
    def override_quantization_method(
        cls,
        hf_quant_cfg: dict[str, Any],
        user_quant: str | None,
        hf_config: Any = None,
    ) -> QuantizationMethods | None:
        """Optionally override the user-specified quantization method.

        This hook lets a quantization method detect that it can support a
        given checkpoint format and take over from whatever the user
        specified. Subclasses should override it only in exceptional
        circumstances; this base implementation never overrides.

        Args:
            hf_quant_cfg: The quantization config dict of the checkpoint.
            user_quant: The quantization method string given by the user.
            hf_config: The HuggingFace model config object (e.g. for
                model_type checks). May be None if not available.

        Returns:
            Always None in this base implementation.
        """
        return None

    @staticmethod
    def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
        """Look up a value in the model's quantization config.

        The candidate ``keys`` are tried in order and the value stored under
        the first one present in ``config`` is returned.

        Raises:
            ValueError: If none of ``keys`` is present in ``config``.
        """
        # A private sentinel keeps "no key matched" distinct from any real key.
        not_found = object()
        first_hit = next((name for name in keys if name in config), not_found)
        if first_hit is not_found:
            raise ValueError(
                f"Cannot find any of {keys} in the model's quantization config."
            )
        return config[first_hit]

    @staticmethod
    def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any:
        """Look up an optional value in the model's quantization config.

        Behaves like ``get_from_keys`` but returns ``default`` when that
        lookup raises ``ValueError``.
        """
        try:
            found = QuantizationConfig.get_from_keys(config, keys)
        except ValueError:
            return default
        return found

    @abstractmethod
    def get_quant_method(
        self, layer: torch.nn.Module, prefix: str
    ) -> QuantizeMethodBase | None:
        """Return the quantize method to use for the quantized layer.

        Args:
            layer: The layer the quant method is for.
            prefix: The full name of the layer in the state dict.
        Returns:
            The quantize method, or None if the given layer doesn't support
            a quant method.
        """
        raise NotImplementedError

    def get_cache_scale_mapper(self) -> "WeightsMapper | None":
        """Return the mapping from checkpoint KV-cache scale names to vLLM names.

        When a mapper is returned, `AutoWeightsLoader` applies it to the
        weight stream automatically, so individual model `load_weights`
        methods do not need to know about KV-cache scales. This base
        implementation returns None.
        """
        return None

    def apply_vllm_mapper(  # noqa: B027
        self, hf_to_vllm_mapper: "WeightsMapper"
    ):
        """
        Hook through which models update the module names referenced in
        quantization configs so that they reflect the vllm model structure.
        This base implementation does nothing.

        Args:
            hf_to_vllm_mapper: maps from hf model structure (the assumed
                structure of the qconfig) to vllm model structure
        """
        # TODO (@kylesayrs): add implementations for all subclasses
        return None

    def maybe_update_config(  # noqa: B027
        self,
        model_name: str,
        hf_config: PretrainedConfig | None = None,
        revision: str | None = None,
    ):
        """
        Hook for updating values after the config has been initialized.
        This base implementation does nothing.

        Args:
            model_name: The name of the model
            hf_config: The Hugging Face config of the model
            revision: The revision of the model
        Returns:
            None
        """
        # TODO: revision is never passed currently in vllm.py,
        # but is used in subclasses, should we remove this parameter?
        return None

    def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool:
        """
        Report whether mxfp4 quantization will be used for this config.

        Having this check lets hidden_size rounding happen before moe_config
        creation without needing to instantiate quant_method first.

        Args:
            prefix: The layer prefix/name in the model
            layer: The layer module

        Returns:
            True if this config uses MXFP4 quantization, False otherwise.
            This base implementation always returns False.
        """
        return False

`apply_vllm_mapper(hf_to_vllm_mapper)` ¶

Interface for models to update module names referenced in quantization configs in order to reflect the vllm model structure

Parameters:

hf_to_vllm_mapper ¶
(WeightsMapper) –

maps from hf model structure (the assumed structure of the qconfig) to vllm model structure

Source code in vllm/model_executor/layers/quantization/base_config.py

def apply_vllm_mapper(  # noqa: B027
    self, hf_to_vllm_mapper: "WeightsMapper"
):
    """
    Hook through which models update the module names referenced in
    quantization configs so that they reflect the vllm model structure.
    This implementation does nothing.

    Args:
        hf_to_vllm_mapper: maps from hf model structure (the assumed
            structure of the qconfig) to vllm model structure
    """
    # TODO (@kylesayrs): add implementations for all subclasses
    return None

`from_config(config)` `abstractmethod` `classmethod` ¶

Create a config class from the model's quantization config.

Source code in vllm/model_executor/layers/quantization/base_config.py

@classmethod
@abstractmethod
def from_config(cls, config: dict[str, Any]) -> "QuantizationConfig":
    """Build a config instance from the model's quantization config."""
    raise NotImplementedError

`get_cache_scale_mapper()` ¶

Mapping from checkpoint KV-cache scale names to vLLM scale names.

Returning a mapper here causes AutoWeightsLoader to apply it to the weight stream automatically; individual model load_weights methods do not need to know about KV-cache scales.

Source code in vllm/model_executor/layers/quantization/base_config.py

def get_cache_scale_mapper(self) -> "WeightsMapper | None":
    """Return the mapping from checkpoint KV-cache scale names to vLLM names.

    When a mapper is returned, `AutoWeightsLoader` applies it to the weight
    stream automatically, so individual model `load_weights` methods do not
    need to know about KV-cache scales. This implementation returns None.
    """
    return None

`get_config_filenames()` `abstractmethod` `staticmethod` ¶

List of filenames to search for in the model directory.

Source code in vllm/model_executor/layers/quantization/base_config.py

@staticmethod
@abstractmethod
def get_config_filenames() -> list[str]:
    """Return the filenames to look for inside the model directory."""
    raise NotImplementedError

`get_from_keys(config, keys)` `staticmethod` ¶

Get a value from the model's quantization config.

Source code in vllm/model_executor/layers/quantization/base_config.py

@staticmethod
def get_from_keys(config: dict[str, Any], keys: list[str]) -> Any:
    """Look up a value in the model's quantization config.

    The candidate ``keys`` are tried in order and the value stored under the
    first one present in ``config`` is returned.

    Raises:
        ValueError: If none of ``keys`` is present in ``config``.
    """
    # A private sentinel keeps "no key matched" distinct from any real key.
    not_found = object()
    first_hit = next((name for name in keys if name in config), not_found)
    if first_hit is not_found:
        raise ValueError(
            f"Cannot find any of {keys} in the model's quantization config."
        )
    return config[first_hit]

`get_from_keys_or(config, keys, default)` `staticmethod` ¶

Get an optional value from the model's quantization config.

Source code in vllm/model_executor/layers/quantization/base_config.py

@staticmethod
def get_from_keys_or(config: dict[str, Any], keys: list[str], default: Any) -> Any:
    """Look up an optional value in the model's quantization config.

    Delegates to ``QuantizationConfig.get_from_keys`` and returns ``default``
    when that lookup raises ``ValueError``.
    """
    try:
        found = QuantizationConfig.get_from_keys(config, keys)
    except ValueError:
        return default
    return found

`get_min_capability()` `abstractmethod` `classmethod` ¶

Minimum GPU capability to support the quantization method.

E.g., 70 for Volta, 75 for Turing, 80 for Ampere. This requirement is due to the custom CUDA kernels used by the quantization method.

Source code in vllm/model_executor/layers/quantization/base_config.py

@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
    """Return the minimum GPU capability the quantization method needs.

    Examples of values: 70 for Volta, 75 for Turing, 80 for Ampere.
    The requirement comes from the custom CUDA kernels that the
    quantization method uses.
    """
    raise NotImplementedError

`get_name()` `abstractmethod` ¶

Name of the quantization method.

Source code in vllm/model_executor/layers/quantization/base_config.py

@abstractmethod
def get_name(self) -> QuantizationMethods:
    """Return the name of the quantization method."""
    raise NotImplementedError

`get_quant_method(layer, prefix)` `abstractmethod` ¶

Get the quantize method to use for the quantized layer.

Parameters:

layer ¶
(Module) –

The layer for the quant method.
prefix ¶
(str) –

The full name of the layer in the state dict

Returns: The quantize method. None if the given layer doesn't support quant method.

Source code in vllm/model_executor/layers/quantization/base_config.py

@abstractmethod
def get_quant_method(
    self, layer: torch.nn.Module, prefix: str
) -> QuantizeMethodBase | None:
    """Return the quantize method to use for the quantized layer.

    Args:
        layer: The layer the quant method is for.
        prefix: The full name of the layer in the state dict.
    Returns:
        The quantize method, or None if the given layer doesn't support
        a quant method.
    """
    raise NotImplementedError

`get_supported_act_dtypes()` `abstractmethod` ¶

List of supported activation dtypes.

Source code in vllm/model_executor/layers/quantization/base_config.py

@abstractmethod
def get_supported_act_dtypes(self) -> list[torch.dtype]:
    """Return the list of activation dtypes that are supported."""
    raise NotImplementedError

`is_mxfp4_quant(prefix, layer)` ¶

Determine if mxfp4 quantization will be used for this config.

This allows hidden_size rounding to happen before moe_config creation without needing to instantiate quant_method first.

Parameters:

prefix ¶
(str) –

The layer prefix/name in the model
layer ¶
(Module) –

The layer module

Returns:

bool –

True if this config uses MXFP4 quantization, False otherwise

Source code in vllm/model_executor/layers/quantization/base_config.py

def is_mxfp4_quant(self, prefix: str, layer: torch.nn.Module) -> bool:
    """
    Report whether mxfp4 quantization will be used for this config.

    Having this check lets hidden_size rounding happen before moe_config
    creation without needing to instantiate quant_method first.

    Args:
        prefix: The layer prefix/name in the model
        layer: The layer module

    Returns:
        True if this config uses MXFP4 quantization, False otherwise.
        This implementation always returns False.
    """
    return False

`maybe_update_config(model_name, hf_config=None, revision=None)` ¶

Interface to update values after config initialization.

Parameters:

model_name ¶
(str) –

The name of the model
hf_config ¶
(PretrainedConfig | None, default: None ) –

The Hugging Face config of the model
revision ¶
(str | None, default: None ) –

The revision of the model

Returns:

Source code in vllm/model_executor/layers/quantization/base_config.py

def maybe_update_config(  # noqa: B027
    self,
    model_name: str,
    hf_config: PretrainedConfig | None = None,
    revision: str | None = None,
):
    """
    Hook for updating values after the config has been initialized.
    This implementation does nothing.

    Args:
        model_name: The name of the model
        hf_config: The Hugging Face config of the model
        revision: The revision of the model
    Returns:
        None
    """
    # TODO: revision is never passed currently in vllm.py,
    # but is used in subclasses, should we remove this parameter?
    return None

`override_quantization_method(hf_quant_cfg, user_quant, hf_config=None)` `classmethod` ¶

Detects if this quantization method can support a given checkpoint format by overriding the user specified quantization method. This method should only be overridden by subclasses in exceptional circumstances.

Parameters:

hf_quant_cfg ¶
(dict[str, Any]) –

The checkpoint's quantization config dict.
user_quant ¶
(str | None) –

The user-specified quantization method string.
hf_config ¶
(Any, default: None ) –

The HuggingFace model config object (e.g. for model_type checks). May be None if not available.

Source code in vllm/model_executor/layers/quantization/base_config.py

@classmethod
def override_quantization_method(
    cls,
    hf_quant_cfg: dict[str, Any],
    user_quant: str | None,
    hf_config: Any = None,
) -> QuantizationMethods | None:
    """Optionally override the user-specified quantization method.

    This hook lets a quantization method detect that it can support a
    given checkpoint format and take over from whatever the user
    specified. Subclasses should override it only in exceptional
    circumstances; this implementation never overrides.

    Args:
        hf_quant_cfg: The quantization config dict of the checkpoint.
        user_quant: The quantization method string given by the user.
        hf_config: The HuggingFace model config object (e.g. for
            model_type checks). May be None if not available.

    Returns:
        Always None in this implementation.
    """
    return None

`QuantizeMethodBase` ¶

Bases: ABC

Base class for different quantized methods.

Methods:

apply –

Apply the weights in layer to the input tensor.
create_weights –

Create weights for a layer.
embedding –

Gather embeddings in the layer based on indices in the input tensor.
process_weights_after_loading –

Process the weight after loading.

Source code in vllm/model_executor/layers/quantization/base_config.py

class QuantizeMethodBase(ABC):
    """Abstract parent class shared by the different quantized methods."""

    # Flag telling whether this method creates its weights on the meta device
    # for online quantization. When it is True, the weights are created on the
    # meta device and quantized layer-wise in process_weights_after_loading,
    # which reduces peak memory during loading.
    uses_meta_device: bool = False

    @abstractmethod
    def create_weights(
        self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
    ):
        """Create the weights for a layer.

        The created weights are set as attributes of the layer.
        """
        raise NotImplementedError

    @abstractmethod
    def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
        """Apply the weights held in ``layer`` to the input tensor.

        create_weights is expected to have been called on the layer first.
        """
        raise NotImplementedError

    # Optional hooks: subclasses are not required to implement these.
    def embedding(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
        """Gather embeddings in the layer using indices from the input tensor.

        create_weights is expected to have been called on the layer first.
        This base implementation raises NotImplementedError.
        """
        raise NotImplementedError

    def process_weights_after_loading(self, layer: nn.Module) -> None:
        """Process the weights once loading has finished.

        One example use is transposing weights for computation. This base
        implementation does nothing.
        """
        return None

`apply(layer, *args, **kwargs)` `abstractmethod` ¶

Apply the weights in layer to the input tensor.

Expects create_weights to have been called before on the layer.

Source code in vllm/model_executor/layers/quantization/base_config.py

@abstractmethod
def apply(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
    """Apply the weights held in ``layer`` to the input tensor.

    create_weights is expected to have been called on the layer first.
    """
    raise NotImplementedError

`create_weights(layer, *weight_args, **extra_weight_attrs)` `abstractmethod` ¶

Create weights for a layer.

The weights will be set as attributes of the layer.

Source code in vllm/model_executor/layers/quantization/base_config.py

@abstractmethod
def create_weights(
    self, layer: torch.nn.Module, *weight_args, **extra_weight_attrs
):
    """Create the weights for a layer.

    The created weights are set as attributes of the layer.
    """
    raise NotImplementedError

`embedding(layer, *args, **kwargs)` ¶

Gather embeddings in the layer based on indices in the input tensor.

Expects create_weights to have been called before on the layer.

Source code in vllm/model_executor/layers/quantization/base_config.py

def embedding(self, layer: torch.nn.Module, *args, **kwargs) -> torch.Tensor:
    """Gather embeddings in the layer using indices from the input tensor.

    create_weights is expected to have been called on the layer first.
    This implementation raises NotImplementedError.
    """
    raise NotImplementedError

`process_weights_after_loading(layer)` ¶

Process the weight after loading.

This can be used for example, to transpose weights for computation.

Source code in vllm/model_executor/layers/quantization/base_config.py

def process_weights_after_loading(self, layer: nn.Module) -> None:
    """Process the weights once loading has finished.

    One example use is transposing weights for computation. This
    implementation does nothing.
    """
    return None

`method_has_implemented_embedding(method_class)` ¶

Not all quant methods have embedding implemented, so we need to check that it exists for our given method. We check this by making sure the function has been changed from the base implementation.

Source code in vllm/model_executor/layers/quantization/base_config.py

def method_has_implemented_embedding(method_class: type[QuantizeMethodBase]) -> bool:
    """
    Tell whether ``method_class`` provides its own ``embedding``.

    Not every quant method implements embedding, so callers need a way to
    check. The check passes when the statically resolved ``embedding``
    attribute of ``method_class`` exists and is a different object from the
    one defined on ``QuantizeMethodBase``.
    """
    inherited = inspect.getattr_static(QuantizeMethodBase, "embedding", None)
    candidate = inspect.getattr_static(method_class, "embedding", None)

    if candidate is None:
        return False
    return candidate is not inherited

`vllm.model_executor.layers.quantization.base_config` ¶

`QuantizationConfig` ¶

`apply_vllm_mapper(hf_to_vllm_mapper)` ¶

`hf_to_vllm_mapper` ¶

`from_config(config)` `abstractmethod` `classmethod` ¶

`get_cache_scale_mapper()` ¶

`get_config_filenames()` `abstractmethod` `staticmethod` ¶

`get_from_keys(config, keys)` `staticmethod` ¶

`get_from_keys_or(config, keys, default)` `staticmethod` ¶

`get_min_capability()` `abstractmethod` `classmethod` ¶

`get_name()` `abstractmethod` ¶

`get_quant_method(layer, prefix)` `abstractmethod` ¶

`layer` ¶

`prefix` ¶

`get_supported_act_dtypes()` `abstractmethod` ¶

`is_mxfp4_quant(prefix, layer)` ¶

`prefix` ¶

`layer` ¶

`maybe_update_config(model_name, hf_config=None, revision=None)` ¶

`model_name` ¶

`hf_config` ¶

`revision` ¶

`override_quantization_method(hf_quant_cfg, user_quant, hf_config=None)` `classmethod` ¶

`hf_quant_cfg` ¶

`user_quant` ¶

`hf_config` ¶

`QuantizeMethodBase` ¶

`apply(layer, *args, **kwargs)` `abstractmethod` ¶

`create_weights(layer, *weight_args, **extra_weight_attrs)` `abstractmethod` ¶

`embedding(layer, *args, **kwargs)` ¶

`process_weights_after_loading(layer)` ¶

`method_has_implemented_embedding(method_class)` ¶

vllm.model_executor.layers.quantization.base_config ¶

QuantizationConfig ¶

apply_vllm_mapper(hf_to_vllm_mapper) ¶

hf_to_vllm_mapper ¶

from_config(config) abstractmethod classmethod ¶

get_cache_scale_mapper() ¶

get_config_filenames() abstractmethod staticmethod ¶

get_from_keys(config, keys) staticmethod ¶

get_from_keys_or(config, keys, default) staticmethod ¶

get_min_capability() abstractmethod classmethod ¶

get_name() abstractmethod ¶

get_quant_method(layer, prefix) abstractmethod ¶

layer ¶

prefix ¶

get_supported_act_dtypes() abstractmethod ¶

is_mxfp4_quant(prefix, layer) ¶

prefix ¶

layer ¶

maybe_update_config(model_name, hf_config=None, revision=None) ¶

model_name ¶

hf_config ¶

revision ¶

override_quantization_method(hf_quant_cfg, user_quant, hf_config=None) classmethod ¶

hf_quant_cfg ¶

user_quant ¶

hf_config ¶

QuantizeMethodBase ¶

apply(layer, *args, **kwargs) abstractmethod ¶

create_weights(layer, *weight_args, **extra_weight_attrs) abstractmethod ¶

embedding(layer, *args, **kwargs) ¶

process_weights_after_loading(layer) ¶

method_has_implemented_embedding(method_class) ¶

`vllm.model_executor.layers.quantization.base_config` ¶

`QuantizationConfig` ¶

`apply_vllm_mapper(hf_to_vllm_mapper)` ¶

`hf_to_vllm_mapper` ¶

`from_config(config)` `abstractmethod` `classmethod` ¶

`get_cache_scale_mapper()` ¶

`get_config_filenames()` `abstractmethod` `staticmethod` ¶

`get_from_keys(config, keys)` `staticmethod` ¶

`get_from_keys_or(config, keys, default)` `staticmethod` ¶

`get_min_capability()` `abstractmethod` `classmethod` ¶

`get_name()` `abstractmethod` ¶

`get_quant_method(layer, prefix)` `abstractmethod` ¶

`layer` ¶

`prefix` ¶

`get_supported_act_dtypes()` `abstractmethod` ¶

`is_mxfp4_quant(prefix, layer)` ¶

`prefix` ¶

`layer` ¶

`maybe_update_config(model_name, hf_config=None, revision=None)` ¶

`model_name` ¶

`hf_config` ¶

`revision` ¶

`override_quantization_method(hf_quant_cfg, user_quant, hf_config=None)` `classmethod` ¶

`hf_quant_cfg` ¶

`user_quant` ¶

`hf_config` ¶

`QuantizeMethodBase` ¶

`apply(layer, *args, **kwargs)` `abstractmethod` ¶

`create_weights(layer, *weight_args, **extra_weight_attrs)` `abstractmethod` ¶

`embedding(layer, *args, **kwargs)` ¶

`process_weights_after_loading(layer)` ¶

`method_has_implemented_embedding(method_class)` ¶