Skip to content

vllm.model_executor.layers.quantization.utils.humming_utils

Functions:

convert_linear_layer_to_humming_standard(layer, name_map)

Rename/reshape a linear layer's quantized params (the canonical MPLinear layout: `weight_packed` int32 + `weight_scale`) into the parameter names and layout humming's weight schema expects (`weight` / `weight_scale`).

Source code in vllm/model_executor/layers/quantization/utils/humming_utils.py
def convert_linear_layer_to_humming_standard(
    layer: LinearBase, name_map: dict[str, str]
):
    """Rename/reshape a linear layer's quantized params (the canonical MPLinear
    layout: ``weight_packed`` int32 + ``weight_scale``) into the parameter names
    and layout humming's weight schema expects (``weight`` / ``weight_scale``).

    Entries of ``name_map`` are processed in iteration order. For each entry
    the attribute currently stored on ``layer`` is read, converted according
    to its target name, removed from ``layer`` and re-attached under the
    target name:

    * ``"weight"``: transposed when the tensor advertises ``input_dim == 0``
      and ``output_dim == 1``, then flattened to 2-D (first dimension kept)
      and reinterpreted as ``torch.int32``.
    * ``"weight_scale"`` / ``"zero_point"``: transposed when the tensor
      advertises ``output_dim == 1``; a 1-D tensor gets a trailing unit
      dimension. ``"zero_point"`` is additionally reinterpreted as
      ``torch.int32``.
    * any other target name: moved over unchanged.

    Args:
        layer: Layer whose attributes are rewritten in place.
        name_map: Maps the target attribute name to the attribute name the
            tensor is currently stored under on ``layer``.

    Raises:
        AttributeError: If ``layer`` has no attribute named by a ``name_map``
            value.
        ValueError: If the ``"weight"`` tensor advertises an
            ``input_dim`` / ``output_dim`` pair other than ``(1, 0)`` or
            ``(0, 1)``.
    """
    for name, checkpoint_name in name_map.items():
        tensor = getattr(layer, checkpoint_name)

        if name == "weight":
            # Tensors without layout metadata are treated as already being
            # output-major (output_dim=0, input_dim=1).
            input_dim = getattr(tensor, "input_dim", 1)
            output_dim = getattr(tensor, "output_dim", 0)

            if input_dim == 0 and output_dim == 1:
                tensor = tensor.transpose(1, 0).contiguous()
            elif not (output_dim == 0 and input_dim == 1):
                # Explicit raise instead of ``assert``: the check must survive
                # ``python -O``, otherwise an unsupported layout would be
                # reinterpreted silently.
                raise ValueError(
                    f"Unsupported layout for {checkpoint_name!r}: "
                    f"input_dim={input_dim!r}, output_dim={output_dim!r}; "
                    "expected (input_dim, output_dim) to be (1, 0) or (0, 1)."
                )

            # Collapse trailing dims, then reinterpret the raw bytes as int32.
            tensor = tensor.view(tensor.size(0), -1).view(torch.int32)
        elif name in ("weight_scale", "zero_point"):
            if getattr(tensor, "output_dim", 0) == 1:
                tensor = tensor.transpose(0, 1).contiguous()
            if tensor.ndim == 1:
                tensor = tensor.unsqueeze(1)

            if name == "zero_point":
                tensor = tensor.view(torch.int32)

        # The conversions above return plain tensors; an untouched tensor may
        # still be a Parameter and is then reused as-is.
        if isinstance(tensor, torch.nn.Parameter):
            param = tensor
        else:
            param = torch.nn.Parameter(tensor, requires_grad=False)

        # Drop the source attribute only once the conversion has succeeded, so
        # a failing entry does not leave the layer without its parameter.
        delattr(layer, checkpoint_name)
        setattr(layer, name, param)