`vllm.model_executor.models.glm4_1v` ¶

Inference-only GLM-4.1V, GLM-4.6V-Flash, and AutoGLM-Phone-9B models compatible with HuggingFace weights.

Classes:

Glm4vForConditionalGeneration –
Glm4vImageEmbeddingInputs –

Dimensions:
Glm4vImagePixelInputs –

Dimensions:
Glm4vProcessingInfo –
Glm4vVideoEmbeddingInputs –

Dimensions:
Glm4vVideoPixelInputs –

Dimensions:
Glm4vVisionTransformer –

Functions:

all_gather_interleave –

All-gather the input tensor in an interleaved fashion across the model parallel group.

`Glm4vForConditionalGeneration` ¶

Bases: Module, SupportsMultiModal, SupportsEncoderCudaGraph, SupportsLoRA, SupportsPP, SupportsMRoPE

Methods:

forward –

Run forward pass for GLM-4V.
get_mm_mapping –

Get the module prefix in multimodal models

Source code in vllm/model_executor/models/glm4_1v.py

@MULTIMODAL_REGISTRY.register_processor(
    Glm4vMultiModalProcessor,
    info=Glm4vProcessingInfo,
    dummy_inputs=Glm4vDummyInputsBuilder,
)
class Glm4vForConditionalGeneration(
    nn.Module,
    SupportsMultiModal,
    SupportsEncoderCudaGraph,
    SupportsLoRA,
    SupportsPP,
    SupportsMRoPE,
):
    # Fused module name -> names of the separate projections packed into it.
    # NOTE(review): presumably consumed by the LoRA/quantization machinery
    # behind SupportsLoRA -- confirm against that interface.
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": ["gate_up_proj"],
    }

    # To ensure correct weight loading and mapping: each checkpoint name
    # prefix on the left is rewritten to the module path on the right.
    # The mapper is handed to the weight loader in load_weights().
    hf_to_vllm_mapper = WeightsMapper(
        orig_to_new_prefix={
            "lm_head.": "language_model.lm_head.",
            "model.language_model.": "language_model.model.",
            "model.visual.": "visual.",
        }
    )

    # NOTE(review): presumably advertises that the vision encoder supports the
    # "data" encoder TP mode checked in __init__ -- confirm with the consumer
    # of this flag.
    supports_encoder_tp_data = True

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality.startswith("image"):
            return "<|begin_of_image|><|image|><|end_of_image|>"
        if modality.startswith("video"):
            return "<|begin_of_video|><|video|><|end_of_video|>"

        raise ValueError("Only image or video modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the vision tower and the language model backbone.

        Args:
            vllm_config: Engine configuration; the HF config, the
                quantization config and the multimodal config are read
                from it.
            prefix: Module-name prefix that the ``visual`` and
                ``language_model`` submodule prefixes are derived from.
        """
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        multimodal_config = vllm_config.model_config.multimodal_config

        self.config = config
        self.model_config = vllm_config.model_config
        self.multimodal_config = multimodal_config
        # True when the multimodal encoder is configured for the "data" TP
        # mode; the image/video processing paths branch on this flag.
        self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"
        self.is_multimodal_pruning_enabled = (
            multimodal_config.is_multimodal_pruning_enabled()
        )

        # The vision transformer is created inside the tower-model marker for
        # both supported modalities.
        with self._mark_tower_model(vllm_config, {"image", "video"}):
            self.visual = Glm4vVisionTransformer(
                config.text_config,
                config.vision_config,
                # Falls back to 1e-5 when the HF config has no rms_norm_eps.
                norm_eps=getattr(config, "rms_norm_eps", 1e-5),
                quant_config=quant_config,
                prefix=maybe_prefix(prefix, "visual"),
            )

        # Pick the text backbone architecture from the HF model type; None
        # leaves the choice to init_vllm_registered_model.
        if config.model_type in ("glm4v", "glm_ocr", "glmga"):
            architectures = ["Glm4ForCausalLM"]
        elif config.model_type == "glm4v_moe":
            architectures = ["Glm4MoeForCausalLM"]
        else:
            architectures = None

        with self._mark_language_model(vllm_config):
            self.language_model = init_vllm_registered_model(
                vllm_config=vllm_config,
                hf_config=config.text_config,
                prefix=maybe_prefix(prefix, "language_model"),
                architectures=architectures,
            )

        # Re-export the language model's factory under this model's name.
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors
        )

    def _parse_and_validate_image_input(
        self, **kwargs: object
    ) -> Glm4vImageInputs | None:
        """Wrap raw image kwargs into a typed image input.

        Pixel values take precedence over precomputed embeddings when both
        are supplied. Returns ``None`` when neither is present.
        """
        grid = kwargs.pop("image_grid_thw", None)
        embeds = kwargs.pop("image_embeds", None)
        pixels = kwargs.pop("pixel_values", None)

        if pixels is not None:
            return Glm4vImagePixelInputs(
                type="pixel_values",
                pixel_values=pixels,
                image_grid_thw=grid,
            )

        if embeds is not None:
            return Glm4vImageEmbeddingInputs(
                type="image_embeds",
                image_embeds=embeds,
                image_grid_thw=grid,
            )

        return None

    def _parse_and_validate_video_input(
        self, **kwargs: object
    ) -> Glm4vVideoInputs | None:
        """Wrap raw video kwargs into a typed video input.

        Pixel values take precedence over precomputed embeddings when both
        are supplied. Returns ``None`` when neither is present.
        """
        grid = kwargs.pop("video_grid_thw", None)
        embeds = kwargs.pop("video_embeds", None)
        pixels = kwargs.pop("pixel_values_videos", None)

        if pixels is not None:
            return Glm4vVideoPixelInputs(
                type="pixel_values_videos",
                pixel_values_videos=pixels,
                video_grid_thw=grid,
            )

        if embeds is not None:
            return Glm4vVideoEmbeddingInputs(
                type="video_embeds",
                video_embeds=embeds,
                video_grid_thw=grid,
            )

        return None

    def _process_image_input(
        self, image_input: Glm4vImageInputs
    ) -> tuple[torch.Tensor, ...]:
        """Turn a parsed image input into one embedding tensor per image.

        Precomputed embeddings are only cast to the vision tower dtype;
        pixel values are run through the vision tower (or through the
        data-parallel sharded helper, whose result is returned as is).
        """
        grid_thw = image_input["image_grid_thw"]
        assert grid_thw.ndim == 2

        visual = self.visual
        if image_input["type"] == "image_embeds":
            image_embeds = image_input["image_embeds"].type(visual.dtype)
        else:
            pixel_values = image_input["pixel_values"].type(visual.dtype)
            if self.use_data_parallel:
                return run_dp_sharded_mrope_vision_model(
                    visual, pixel_values, grid_thw.tolist(), rope_type="rope_3d"
                )
            image_embeds = visual(pixel_values, grid_thw=grid_thw)

        # Split the concatenated embeddings back into per-image chunks.
        patches_per_token = visual.spatial_merge_size**2
        tokens_per_image = (grid_thw.prod(-1) // patches_per_token).tolist()
        return image_embeds.split(tokens_per_image)

    def _process_video_input(
        self, video_input: Glm4vVideoInputs
    ) -> tuple[torch.Tensor, ...]:
        """Turn a parsed video input into one embedding tensor per video.

        Precomputed embeddings are only cast to the vision tower dtype;
        pixel values are run through the vision tower (or through the
        data-parallel sharded helper, whose result is returned as is).
        """
        grid_thw = video_input["video_grid_thw"]
        assert grid_thw.ndim == 2

        visual = self.visual
        if video_input["type"] == "video_embeds":
            video_embeds = video_input["video_embeds"].type(visual.dtype)
        else:
            pixel_values_videos = video_input["pixel_values_videos"].type(visual.dtype)
            if self.use_data_parallel:
                return run_dp_sharded_mrope_vision_model(
                    visual,
                    pixel_values_videos,
                    grid_thw.tolist(),
                    rope_type="rope_3d",
                )
            video_embeds = visual(pixel_values_videos, grid_thw=grid_thw)

        # Split concatenated embeddings for each video item.
        patches_per_token = visual.spatial_merge_size**2
        tokens_per_video = (grid_thw.prod(-1) // patches_per_token).tolist()
        return video_embeds.split(tokens_per_video)

    # -- SupportsEncoderCudaGraph protocol methods --

    def get_encoder_cudagraph_config(self):
        """Describe the modalities and buffers used for encoder CUDA graphs."""
        from vllm.v1.worker.encoder_cudagraph_defs import (
            EncoderCudaGraphConfig,
        )

        if self.is_multimodal_pruning_enabled:
            # When EVS pruning is enabled, embed_multimodal post-processes
            # both image and video embeddings (mrope positions are appended
            # for image, prune+append for video). The encoder CUDA graph path
            # bypasses that post-process, producing inconsistent embedding
            # formats vs eager, so no modality is graph-captured.
            modalities = []
            max_frames = 1
        else:
            modalities = ["image", "video"]
            # max_frames_per_video is used for budget sizing.
            max_frames = self.get_max_frames_per_video()

        buffer_keys = [
            "pixel_values",
            "pos_embeds",
            "rotary_pos_emb_cos",
            "rotary_pos_emb_sin",
            "cu_seqlens",
            "max_seqlen",
            "sequence_lengths",
        ]
        return EncoderCudaGraphConfig(
            modalities=modalities,
            buffer_keys=buffer_keys,
            out_hidden_size=self.visual.out_hidden_size,
            max_frames_per_video=max_frames,
        )

    def get_input_modality(
        self,
        mm_kwargs: dict[str, Any],
    ) -> str:
        """Return "image" or "video" depending on which grid key is present.

        The image key wins when both ``image_grid_thw`` and
        ``video_grid_thw`` are present.

        Raises:
            AssertionError: If neither grid key is in ``mm_kwargs``.
        """
        for modality in ("image", "video"):
            if f"{modality}_grid_thw" in mm_kwargs:
                return modality
        raise AssertionError("This line should be unreachable.")

    def get_max_frames_per_video(self) -> int:
        """Return the frame count used for video budget sizing (at least 16)."""
        processing_info = MULTIMODAL_REGISTRY.get_processing_info(self.model_config)
        video_limit = self.multimodal_config.get_limit_per_prompt("video")
        most_frames = processing_info.get_num_frames_with_most_features(
            seq_len=self.model_config.max_model_len,
            mm_counts={"video": video_limit},
        )
        # Small 'max_frames_per_video' will cause 'tensor mismatch' in PR#43403
        # 16 is the default 'num_frames' of '_get_vision_info'
        floor_frames = 16
        return max(most_frames, floor_frames)

    def get_encoder_cudagraph_budget_range(
        self,
        vllm_config,
    ) -> tuple[int, int]:
        """Return the ``(min, max)`` token budget for encoder CUDA graphs.

        The lower bound is a fixed estimate of the smallest encoder input:
        a 224x224 image gives 16x16 patches (patch_size=14), and
        spatial_merge_size=2 reduces that to 8x8 = 64 tokens. The upper
        bound is the smaller of the scheduler's max_num_batched_tokens and
        the model's max_model_len.
        """
        smallest_budget = 64
        batched_token_cap = vllm_config.scheduler_config.max_num_batched_tokens
        context_cap = vllm_config.model_config.max_model_len
        largest_budget = min(batched_token_cap, context_cap)
        return (smallest_budget, largest_budget)

    def _get_pixel_values_by_modality(
        self,
        mm_kwargs: dict[str, Any],
    ) -> torch.Tensor:
        if self.get_input_modality(mm_kwargs) == "image":
            pixel_values = mm_kwargs["pixel_values"]
        else:
            pixel_values = mm_kwargs["pixel_values_videos"]
        return pixel_values

    def _get_grid_thw_by_modality(
        self,
        mm_kwargs: dict[str, Any],
    ) -> list[tuple[int, int, int]]:
        grid_thw_key = f"{self.get_input_modality(mm_kwargs)}_grid_thw"
        grid_thw = mm_kwargs[grid_thw_key]
        if not isinstance(grid_thw, list):
            grid_thw = grid_thw.tolist()
        return grid_thw

    def get_encoder_cudagraph_item_specs(
        self,
        mm_kwargs: dict[str, Any],
    ):
        """Return one ``EncoderItemSpec`` per item in ``mm_kwargs``.

        For a ``(t, h, w)`` grid the input size is ``t * h * w`` and the
        output token count is ``t * (h // m) * (w // m)``, where ``m`` is
        the vision tower's spatial merge size.
        """
        from vllm.v1.worker.encoder_cudagraph_defs import EncoderItemSpec

        merge = self.visual.spatial_merge_size
        specs = []
        for t, h, w in self._get_grid_thw_by_modality(mm_kwargs):
            specs.append(
                EncoderItemSpec(
                    input_size=t * h * w,
                    output_tokens=t * (h // merge) * (w // merge),
                )
            )
        return specs

    def select_encoder_cudagraph_items(
        self,
        mm_kwargs: dict[str, Any],
        indices: list[int],
    ) -> dict[str, Any]:
        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
        pixel_values = self._get_pixel_values_by_modality(mm_kwargs)

        if len(indices) == 0:
            if self.get_input_modality(mm_kwargs) == "image":
                return {
                    "pixel_values": pixel_values[:0],
                    "image_grid_thw": [],
                }
            else:
                return {
                    "pixel_values_videos": pixel_values[:0],
                    "video_grid_thw": [],
                }

        # Compute cumulative patch offsets for slicing pixel_values
        patches_per_item = [t * h * w for t, h, w in grid_thw]
        cum_patches = [0]
        for p in patches_per_item:
            cum_patches.append(cum_patches[-1] + p)

        selected_pv = torch.cat(
            [pixel_values[cum_patches[i] : cum_patches[i + 1]] for i in indices]
        )
        selected_grid = [grid_thw[i] for i in indices]

        if self.get_input_modality(mm_kwargs) == "image":
            return {
                "pixel_values": selected_pv,
                "image_grid_thw": selected_grid,
            }
        else:
            return {
                "pixel_values_videos": selected_pv,
                "video_grid_thw": selected_grid,
            }

    def prepare_encoder_cudagraph_capture_inputs(
        self,
        token_budget: int,
        max_batch_size: int,
        max_frames_per_batch: int,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Build the dummy inputs used to capture an encoder CUDA graph.

        Args:
            token_budget: Output-token budget of the graph; it is divided
                evenly between ``max_batch_size`` dummy items.
            max_batch_size: Number of dummy items in the capture batch.
            max_frames_per_batch: Frame budget of the batch; when it allows
                more than one frame per item, a video-style grid is used.
            device: Device of the dummy pixel tensor and the metadata.
            dtype: Dtype of the dummy pixel tensor.

        Returns:
            An ``EncoderCudaGraphCaptureInputs`` whose values are the
            metadata produced by ``self.visual.prepare_encoder_metadata``
            plus a random ``pixel_values`` tensor.
        """
        from vllm.v1.worker.encoder_cudagraph_defs import (
            EncoderCudaGraphCaptureInputs,
        )

        spatial_merge_size = self.visual.spatial_merge_size
        per_mm_item_output = token_budget // max_batch_size

        frames_per_item = max_frames_per_batch // max_batch_size
        if frames_per_item > 1:
            # Build the capture grid using a video-format layout so that
            # cu_seqlens is sized for video replays from the start.
            # cu_seqlens has one entry per attention sequence (one per frame),
            # so using T > 1 per item makes the buffer large enough without
            # relying solely on padding.
            # Ceiling ensures frames_per_item * tokens_per_frame >= per_mm_item_output
            # so the pixel_values buffer covers any valid single-item replay.
            tokens_per_frame = (
                per_mm_item_output + frames_per_item - 1
            ) // frames_per_item
            # Video-format grid_config (T=frames_per_item).
            # Each frame is a single row of merge blocks: h equals the merge
            # size and w is tokens_per_frame merge blocks wide.
            grid_config = [
                [
                    frames_per_item,
                    spatial_merge_size,
                    tokens_per_frame * spatial_merge_size,
                ]
                for _ in range(max_batch_size)
            ]
        else:
            # Image-format grid_config (T=1).
            grid_config = [
                [1, spatial_merge_size, per_mm_item_output * spatial_merge_size]
                for _ in range(max_batch_size)
            ]

        # Create dummy pixel_values
        # One row per patch; each row is a flattened
        # (channels, temporal_patch_size, patch_size, patch_size) patch.
        patch_embed = self.visual.patch_embed
        in_channels = patch_embed.proj.in_channels
        patch_size = patch_embed.patch_size
        temporal_patch_size = patch_embed.temporal_patch_size
        total_patches = sum(t * h * w for t, h, w in grid_config)
        flattened_patch_size = (
            in_channels * temporal_patch_size * patch_size * patch_size
        )
        dummy_pixel_values = torch.randn(
            total_patches, flattened_patch_size, device=device, dtype=dtype
        )

        # Override max_seqlen with a safe upper bound for capture.
        # max_seqlen.item() gets baked into the CUDA graph (not replayed),
        # so the capture value must cover any replay scenario.
        # Worst case: 1 item consuming the full budget ->
        # seq_len = token_budget * spatial_merge_size^2.
        metadata = self.visual.prepare_encoder_metadata(
            grid_config,
            max_batch_size=max_batch_size,
            max_frames_per_batch=max_frames_per_batch,
            max_seqlen_override=token_budget * (spatial_merge_size**2),
            device=device,
        )

        # Just use image-modality dummy input_buffer for capturing, since it's also
        # compatible for video inputs (has the same shape: [num_patches, C*T*P*P]).
        values = metadata | {
            "pixel_values": dummy_pixel_values,
        }

        return EncoderCudaGraphCaptureInputs(
            values=values,
        )

    def prepare_encoder_cudagraph_replay_buffers(
        self,
        mm_kwargs: dict[str, Any],
        max_batch_size: int,
        max_frames_per_batch: int,
    ):
        """Build the buffer values used to replay an encoder CUDA graph.

        Args:
            mm_kwargs: Kwargs of a single modality (image or video).
            max_batch_size: Forwarded to the metadata builder for images.
            max_frames_per_batch: Forwarded to the metadata builder for
                videos.

        Returns:
            An ``EncoderCudaGraphReplayBuffers`` whose values are the
            metadata produced by ``self.visual.prepare_encoder_metadata``
            plus the modality's pixel tensor under the ``pixel_values`` key.
        """
        # Imported at function scope, like the other encoder CUDA graph
        # definitions used in this class, so the name is always bound here.
        from vllm.v1.worker.encoder_cudagraph_defs import (
            EncoderCudaGraphReplayBuffers,
        )

        modality = self.get_input_modality(mm_kwargs)
        grid_thw_list = self._get_grid_thw_by_modality(mm_kwargs)

        if modality == "image":
            metadata = self.visual.prepare_encoder_metadata(
                grid_thw_list,
                max_batch_size=max_batch_size,
            )
        elif modality == "video":
            metadata = self.visual.prepare_encoder_metadata(
                grid_thw_list,
                max_frames_per_batch=max_frames_per_batch,
            )
        else:
            raise AssertionError("This line should be unreachable.")

        # Video pixels are also stored under "pixel_values": the replay
        # buffers use a single key for both modalities.
        values = metadata | {
            "pixel_values": self._get_pixel_values_by_modality(mm_kwargs),
        }
        return EncoderCudaGraphReplayBuffers(values=values)

    def encoder_cudagraph_forward(
        self,
        values: dict[str, torch.Tensor],
    ) -> torch.Tensor:
        pixel_values = values.pop("pixel_values")
        metadata = values
        return self.visual(pixel_values, None, encoder_metadata=metadata)

    def encoder_eager_forward(
        self,
        mm_kwargs: dict[str, Any],
    ) -> torch.Tensor:
        pixel_values = self._get_pixel_values_by_modality(mm_kwargs)
        grid_thw = self._get_grid_thw_by_modality(mm_kwargs)
        return self.visual(pixel_values, grid_thw)

    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
        mm_input_by_modality = {}

        # Preserve the order of modalities if there are multiple of them
        # from the order of kwargs.
        for input_key in kwargs:
            if (
                input_key in ("pixel_values", "image_embeds")
                and "image" not in mm_input_by_modality
            ):
                mm_input_by_modality["image"] = self._parse_and_validate_image_input(
                    **kwargs
                )
            if (
                input_key in ("pixel_values_videos", "video_embeds")
                and "video" not in mm_input_by_modality
            ):
                mm_input_by_modality["video"] = self._parse_and_validate_video_input(
                    **kwargs
                )
        return mm_input_by_modality

    def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
        mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
        if not mm_input_by_modality:
            return None

        # The result multimodal_embeddings is tuple of tensors, with each
        # tensor corresponding to a multimodal data item (image or video).
        multimodal_embeddings: tuple[torch.Tensor, ...] = ()

        # NOTE: It is important to iterate over the keys in this dictionary
        # to preserve the order of the modalities.
        for modality in mm_input_by_modality:
            multimodal_input = mm_input_by_modality[modality]
            if modality == "image":
                image_embeddings = self._process_image_input(multimodal_input)
                multimodal_embeddings += tuple(image_embeddings)
            if modality == "video":
                video_embeddings = self._process_video_input(multimodal_input)
                multimodal_embeddings += tuple(video_embeddings)
        return multimodal_embeddings

    def iter_mm_grid_thw(
        self, mm_features: list[MultiModalFeatureSpec]
    ) -> Iterator[tuple[int, int, int, int]]:
        """Yield ``(offset, t, h, w)`` for each multimodal item.

        ``h`` and ``w`` are the patch-grid sizes divided by the vision
        config's ``spatial_merge_size``. Features are visited in ascending
        order of ``mm_position.offset``.

        An image yields a single tuple. A video yields one tuple per frame
        (with ``t == 1``) when it has exactly one embed range per frame, and
        otherwise a single tuple covering all ``t`` frames, anchored at the
        feature's own offset.

        Raises:
            ValueError: If a feature's modality is neither "image" nor
                "video".
            AssertionError: If an image has more than one frame or embed
                range, or if a range length does not match the grid.
        """
        hf_config = self.config
        spatial_merge_size = hf_config.vision_config.spatial_merge_size
        for mm_feature in sorted(mm_features, key=lambda f: f.mm_position.offset):
            embed_ranges = mm_feature.mm_position.extract_embeds_range()
            if mm_feature.modality == "image":
                t, h, w = mm_feature.data["image_grid_thw"].data.tolist()
                assert t == 1, f"Image must have 1 frame, got {t}"
                assert len(embed_ranges) == 1
                offset, end = embed_ranges[0]
                # The range end is treated as inclusive, hence the "+ 1".
                assert end - offset + 1 == h * w // spatial_merge_size**2
                yield offset, t, h // spatial_merge_size, w // spatial_merge_size
            elif mm_feature.modality == "video":
                t, h, w = mm_feature.data["video_grid_thw"].data.tolist()
                llm_grid_h = h // spatial_merge_size
                llm_grid_w = w // spatial_merge_size
                num_tokens_per_frame = llm_grid_h * llm_grid_w

                if len(embed_ranges) == t:
                    # One embed range per frame: emit every frame separately
                    # at its own offset.
                    for offset, end in embed_ranges:
                        assert end - offset + 1 == num_tokens_per_frame
                        yield offset, 1, llm_grid_h, llm_grid_w
                else:
                    # Otherwise emit the whole video as one (t, h, w) block.
                    offset = mm_feature.mm_position.offset
                    yield offset, t, llm_grid_h, llm_grid_w
            else:
                raise ValueError(f"Unsupported modality: {mm_feature.modality}")

    def get_mrope_input_positions(
        self,
        input_tokens: list[int],
        mm_features: list[MultiModalFeatureSpec],
    ) -> tuple[torch.Tensor, int]:
        llm_pos_ids_list: list = []
        st = 0
        for (
            offset,
            llm_grid_t,
            llm_grid_h,
            llm_grid_w,
        ) in self.iter_mm_grid_thw(mm_features):
            text_len = offset - st
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
            llm_pos_ids_list.append(
                np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
            )
            grid_indices = np.indices((llm_grid_t, llm_grid_h, llm_grid_w)).reshape(
                3, -1
            )
            llm_pos_ids_list.append(grid_indices + text_len + st_idx)
            st = offset + llm_grid_t * llm_grid_h * llm_grid_w

        if st < len(input_tokens):
            text_len = len(input_tokens) - st
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
            llm_pos_ids_list.append(
                np.broadcast_to(np.arange(text_len), (3, text_len)) + st_idx
            )

        llm_positions = np.concatenate(llm_pos_ids_list, axis=1).reshape(3, -1)
        mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
        return torch.from_numpy(llm_positions), mrope_position_delta

    def forward(
        self,
        input_ids: torch.Tensor | None,
        positions: torch.Tensor,
        intermediate_tensors: IntermediateTensors | None = None,
        inputs_embeds: torch.Tensor | None = None,
        **kwargs: object,
    ) -> torch.Tensor | IntermediateTensors:
        """Run forward pass for GLM-4V.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            positions: Flattened (concatenated) position ids corresponding to a
                batch.
                **NOTE**: If mrope is enabled (default setting for GLM-4V
                opensource models), the shape will be `(3, seq_len)`,
                otherwise it will be `(seq_len,).
            intermediate_tensors: Optional intermediate tensors for pipeline
                parallelism.
            inputs_embeds: Optional pre-computed input embeddings.
            **kwargs: Additional keyword arguments.
        """
        if intermediate_tensors is not None:
            inputs_embeds = None

        hidden_states = self.language_model.model(
            input_ids=input_ids,
            positions=positions,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=inputs_embeds,
        )
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor | None:
        return self.language_model.compute_logits(hidden_states)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint weights, renaming them with ``hf_to_vllm_mapper``.

        Returns whatever the ``AutoWeightsLoader`` reports for the load.
        """
        name_mapper = self.hf_to_vllm_mapper
        return AutoWeightsLoader(self).load_weights(weights, mapper=name_mapper)

    def get_mm_mapping(self) -> MultiModelKeys:
        """
        Get the module prefix in multimodal models
        """
        module_prefixes = {
            "language_model": "language_model.model",
            "connector": "visual.merger.",
            "tower_model": "visual.",
        }
        return MultiModelKeys.from_string_field(**module_prefixes)

    def get_num_mm_encoder_tokens(
        self,
        num_image_tokens: int,
    ) -> int:
        merge_size = self.config.vision_config.spatial_merge_size
        return num_image_tokens * (merge_size**2)

    def get_num_mm_connector_tokens(
        self,
        num_vision_tokens: int,
    ) -> int:
        merge_size = self.config.vision_config.spatial_merge_size
        return num_vision_tokens // (merge_size**2)

`forward(input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs)` ¶

Run forward pass for GLM-4V.

Parameters:

input_ids ¶
(Tensor | None) –

Flattened (concatenated) input_ids corresponding to a batch.
positions ¶
(Tensor) –

Flattened (concatenated) position ids corresponding to a batch. NOTE: If mrope is enabled (default setting for GLM-4V open-source models), the shape will be `(3, seq_len)`; otherwise it will be `(seq_len,)`.
intermediate_tensors ¶
(IntermediateTensors | None, default: None ) –

Optional intermediate tensors for pipeline parallelism.
inputs_embeds ¶
(Tensor | None, default: None ) –

Optional pre-computed input embeddings.
**kwargs ¶
(object, default: {} ) –

Additional keyword arguments.

Source code in vllm/model_executor/models/glm4_1v.py

def forward(
    self,
    input_ids: torch.Tensor | None,
    positions: torch.Tensor,
    intermediate_tensors: IntermediateTensors | None = None,
    inputs_embeds: torch.Tensor | None = None,
    **kwargs: object,
) -> torch.Tensor | IntermediateTensors:
    """Run forward pass for GLM-4V.

    Args:
        input_ids: Flattened (concatenated) input_ids corresponding to a
            batch.
        positions: Flattened (concatenated) position ids corresponding to a
            batch.
            **NOTE**: If mrope is enabled (default setting for GLM-4V
            opensource models), the shape will be `(3, seq_len)`,
            otherwise it will be `(seq_len,).
        intermediate_tensors: Optional intermediate tensors for pipeline
            parallelism.
        inputs_embeds: Optional pre-computed input embeddings.
        **kwargs: Additional keyword arguments.
    """
    if intermediate_tensors is not None:
        inputs_embeds = None

    hidden_states = self.language_model.model(
        input_ids=input_ids,
        positions=positions,
        intermediate_tensors=intermediate_tensors,
        inputs_embeds=inputs_embeds,
    )
    return hidden_states

`get_mm_mapping()` ¶

Get the module prefix in multimodal models

Source code in vllm/model_executor/models/glm4_1v.py

def get_mm_mapping(self) -> MultiModelKeys:
    """
    Get the module prefix in multimodal models
    """
    module_prefixes = {
        "language_model": "language_model.model",
        "connector": "visual.merger.",
        "tower_model": "visual.",
    }
    return MultiModelKeys.from_string_field(**module_prefixes)

`Glm4vImageEmbeddingInputs` ¶

Bases: TensorSchema

Dimensions

f: Number of image features (varies based on image resolution)
h: Hidden size (must match language model backbone)
n: Number of images
g: Grid dimensions (3 for grid_t, grid_h, grid_w)

Source code in vllm/model_executor/models/glm4_1v.py

class Glm4vImageEmbeddingInputs(TensorSchema):
    """
    Precomputed image embeddings together with the grid of each image.

    Dimensions:
        - f: Number of image features (varies based on image resolution)
        - h: Hidden size (must match language model backbone)
        - n: Number of images
        - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
    """

    # Tag that distinguishes this schema from the pixel-value variant.
    type: Literal["image_embeds"] = "image_embeds"

    # Shape (f, h).
    image_embeds: Annotated[torch.Tensor, TensorShape("f", "h")]
    # Shape (n, 3); the "g" dimension is spelled as the literal 3 here.
    image_grid_thw: Annotated[torch.Tensor, TensorShape("n", 3)]

`Glm4vImagePixelInputs` ¶

Bases: TensorSchema

Dimensions

np: Number of patches
cpp: Number of channels * temporal_patch_size * patch_size * patch_size
ni: Number of images
g: Grid dimensions (3 for grid_t, grid_h, grid_w)

Source code in vllm/model_executor/models/glm4_1v.py

class Glm4vImagePixelInputs(TensorSchema):
    """
    Flattened image patches together with the grid of each image.

    Dimensions:
        - np: Number of patches
        - cpp: Flattened patch size. The encoder CUDA graph capture code in
          this module builds it as number of channels * temporal_patch_size
          * patch_size * patch_size.
        - ni: Number of images
        - g: Grid dimensions (3 for grid_t, grid_h, grid_w)
    """

    # Tag that distinguishes this schema from the embedding variant.
    type: Literal["pixel_values"] = "pixel_values"

    # Shape (np, cpp): one row per patch.
    pixel_values: Annotated[torch.Tensor, TensorShape("np", "cpp")]
    # Shape (ni, 3); the "g" dimension is spelled as the literal 3 here.
    image_grid_thw: Annotated[torch.Tensor, TensorShape("ni", 3)]

`Glm4vProcessingInfo` ¶

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/glm4_1v.py

class Glm4vProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": None, "video": 1}

    def get_image_processor(self, **kwargs: object) -> Glm4vImageProcessor:
        """Return the ``image_processor`` attribute of the HF processor.

        ``kwargs`` are forwarded unchanged to ``get_hf_processor``.
        """
        hf_processor = self.get_hf_processor(**kwargs)
        return hf_processor.image_processor

    def get_video_processor(self, **kwargs: object) -> Glm4vVideoProcessor:
        """Return the ``video_processor`` attribute of the HF processor.

        ``kwargs`` are forwarded unchanged to ``get_hf_processor``.
        """
        hf_processor = self.get_hf_processor(**kwargs)
        return hf_processor.video_processor

    def get_mm_max_tokens_per_item(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> Mapping[str, int] | None:
        """Estimate the maximum number of tokens one item of each modality needs.

        Args:
            seq_len: Not used by this implementation.
            mm_counts: Requested item count per modality; only modalities with
                a positive count get an entry in the result.

        Returns:
            ``None`` when the HF processor is a ``Glm4vProcessor``; otherwise a
            mapping from modality name to its maximum token count.
        """
        if isinstance(self.get_hf_processor(), Glm4vProcessor):
            return None

        per_item_max: dict[str, int] = {}

        if mm_counts.get("image", 0) > 0:
            per_item_max["image"] = self.get_max_image_tokens()

        if mm_counts.get("video", 0) <= 0:
            return per_item_max

        # The "longest_edge" entry is used here as a total pixel budget.
        pixel_budget = self.get_video_processor().size["longest_edge"]

        vision_config = self.get_hf_config().vision_config
        temporal_patch_size = vision_config.temporal_patch_size
        patch_area = vision_config.patch_size**2
        merge_area = vision_config.spatial_merge_size**2
        max_vision_tokens = pixel_budget // (
            temporal_patch_size * patch_area * merge_area
        )

        # GLMGA supports up to 640 frames (max_frames).
        max_grid_t = 640 // temporal_patch_size

        # Longest tokenized "<t> seconds" label among the probed values.
        tokenizer = self.get_tokenizer()
        label_lengths = []
        for second in range(min(max_grid_t, 300)):
            label_ids = tokenizer.encode(
                f"{second:.1f} seconds", add_special_tokens=False
            )
            label_lengths.append(len(label_ids))
        max_ts_tokens = max(label_lengths)

        # Per temporal step: two wrapper tokens plus the timestamp label;
        # the trailing +2 covers the two tokens that wrap the whole video.
        per_step_overhead = 2 + max_ts_tokens
        per_item_max["video"] = max_vision_tokens + max_grid_t * per_step_overhead + 2

        return per_item_max

    def get_data_parser(self):
        """Create the ``MultiModalDataParser`` for this model.

        The parser is built with ``video_needs_metadata=True`` and the hidden
        size reported by ``_get_expected_hidden_size``.
        """
        hidden_size = self._get_expected_hidden_size()
        return MultiModalDataParser(
            video_needs_metadata=True,
            expected_hidden_size=hidden_size,
        )

    def _get_vision_info(
        self,
        *,
        image_width: int,
        image_height: int,
        num_frames: int = 16,
        do_resize: bool = True,
        max_image_pixels: int = 28 * 28 * 2 * 30000,
    ) -> tuple[ImageSize, int]:
        """Compute the preprocessed frame size and the vision-token count.

        Args:
            image_width: Width of the input image or video frame.
            image_height: Height of the input image or video frame.
            num_frames: Number of frames (callers pass 1 for a single image).
            do_resize: When true the size is obtained from ``smart_resize``;
                otherwise the input size is used unchanged.
            max_image_pixels: Forwarded to ``smart_resize`` as ``max_pixels``.

        Returns:
            ``(preprocessed_size, num_vision_tokens)`` where the token count is
            ``grid_t * grid_h * grid_w // spatial_merge_size**2``.
        """
        vision_config = self.get_hf_config().vision_config
        patch_size = vision_config.patch_size
        merge_size = vision_config.spatial_merge_size
        temporal_patch_size = vision_config.temporal_patch_size

        if do_resize:
            resized_height, resized_width = smart_resize(
                # Never report fewer frames than one temporal patch.
                num_frames=max(num_frames, temporal_patch_size),
                height=image_height,
                width=image_width,
                factor=patch_size * merge_size,
                max_pixels=max_image_pixels,
            )
            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
        else:
            preprocessed_size = ImageSize(width=image_width, height=image_height)

        # NOTE: Frames are padded to be divisible by `temporal_patch_size`
        # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L294
        # Round up to the next multiple. The previous form
        # `num_frames + num_frames % temporal_patch_size` only produced a
        # multiple for temporal_patch_size <= 2; both forms agree for size 2.
        padded_num_frames = num_frames + (-num_frames) % temporal_patch_size

        grid_t = max(padded_num_frames // temporal_patch_size, 1)
        grid_h = preprocessed_size.height // patch_size
        grid_w = preprocessed_size.width // patch_size

        num_patches = grid_t * grid_h * grid_w
        num_vision_tokens = num_patches // (merge_size**2)

        return preprocessed_size, num_vision_tokens

    def _get_image_max_pixels(self) -> int:
        """Read max_pixels from the HF image processor config.

        Despite the name, ``longest_edge`` is a pixel **area** (total pixel
        count), not an edge length.  The HF processor passes it directly to
        ``smart_resize`` as the ``max_pixels`` argument, which constrains
        ``t_bar * h_bar * w_bar <= max_pixels``.
        """
        return self.get_image_processor().size["longest_edge"]

    def get_image_size_with_most_features(self) -> ImageSize:
        """Return the preprocessed size obtained for an oversized single image."""
        # Budget estimation for one image must use num_frames=1.  The
        # default of _get_vision_info is num_frames=16 (video), under which
        # smart_resize enforces 16*H*W <= max_pixels; that vastly
        # underestimates the spatial budget of a single image and leads to
        # encoder cache overflow for large images
        # (see https://github.com/vllm-project/vllm/issues/34040).
        pixel_budget = self._get_image_max_pixels()
        largest_size, _num_tokens = self._get_vision_info(
            image_width=9999999,
            image_height=9999999,
            num_frames=1,
            max_image_pixels=pixel_budget,
        )
        return largest_size

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        _, num_image_tokens = self._get_vision_info(
            image_width=image_width,
            image_height=image_height,
            num_frames=1,
            max_image_pixels=self._get_image_max_pixels(),
        )
        return num_image_tokens

    def get_max_image_tokens(self) -> int:
        target_width, target_height = self.get_image_size_with_most_features()

        return self.get_num_image_tokens(
            image_width=target_width,
            image_height=target_height,
        )

    def get_num_video_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
        num_frames: int,
    ) -> int:
        _, num_video_tokens = self._get_vision_info(
            image_width=image_width,
            image_height=image_height,
            num_frames=num_frames,
            max_image_pixels=28 * 28 * 2 * 30000,
        )
        return num_video_tokens

    def _get_max_video_frames(self, max_tokens: int) -> int:
        target_width, target_height = self.get_image_size_with_most_features()

        num_frames = 0

        while True:
            next_num_frames = num_frames + 1
            next_max_tokens = self.get_num_video_tokens(
                image_width=target_width,
                image_height=target_height,
                num_frames=next_num_frames,
            )
            if next_max_tokens > max_tokens or next_max_tokens == 0:
                break

            num_frames = next_num_frames

        return num_frames

    def get_num_frames_with_most_features(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> int:
        """Return the per-video frame count to assume for worst-case inputs.

        The tokens left in ``seq_len`` after reserving the maximum image
        tokens for every requested image are converted to a total frame
        count, split evenly across the requested videos, capped at
        ``_MAX_FRAMES_PER_VIDEO`` and floored at 1.
        """
        image_count = mm_counts.get("image", 0)
        video_count = mm_counts.get("video", 0)

        reserved_for_images = self.get_max_image_tokens() * image_count
        total_frames = self._get_max_video_frames(seq_len - reserved_for_images)

        # Treat "no videos requested" as a single video to avoid dividing by 0.
        frames_per_video = total_frames // max(video_count, 1)
        frames_per_video = min(frames_per_video, _MAX_FRAMES_PER_VIDEO)

        return max(frames_per_video, 1)

    def _get_video_second_idx_glm4v(
        self, metadata: dict[str, Any], total_frames: int
    ) -> list[int]:
        video_processor = self.get_video_processor()

        video_fps = metadata.get("fps", video_processor.fps)
        meta_frames = metadata.get("total_num_frames", total_frames)
        max_frame_idx = meta_frames - 1
        duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1)
        do_sample_frames = metadata["do_sample_frames"]
        if not do_sample_frames:
            frame_indices = metadata["frames_indices"]
        else:
            if duration <= video_processor.max_duration:
                n = int(math.floor(duration * video_processor.fps))
                frame_indices = [
                    min(
                        max_frame_idx,
                        int(math.ceil(i * video_fps / video_processor.fps)),
                    )
                    for i in range(n)
                ]
            else:
                num_samples = int(video_processor.max_duration * video_processor.fps)
                if num_samples >= meta_frames:
                    frame_indices = list(range(meta_frames))
                else:
                    target_seconds = np.linspace(
                        0, duration, num_samples, endpoint=True
                    )
                    frame_indices = [
                        min(max_frame_idx, int(math.ceil(t * video_fps)))
                        for t in target_seconds
                    ]

        seen, uniq = set(), []
        for idx in frame_indices:
            if idx not in seen:
                seen.add(idx)
                uniq.append(idx)
        if len(uniq) & 1:
            uniq.append(uniq[-1])
        frame_indices = uniq

        full_second_idxs = [int(idx / video_fps) for idx in frame_indices]
        timestamps_list = full_second_idxs[::2]
        selected_timestamps = []
        for idx in range(0, len(timestamps_list)):
            selected_timestamps.append(timestamps_list[idx])
        return selected_timestamps

    def _get_video_second_idx_glm46v(
        self, metadata: dict[str, Any], total_frames: int
    ) -> list[int]:
        video_processor = self.get_video_processor()

        video_fps = metadata["fps"]
        meta_frames = metadata.get("total_num_frames", total_frames)
        max_frame_idx = meta_frames - 1
        duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1)

        do_sample_frames = metadata.get("do_sample_frames", True)
        if not do_sample_frames:
            frame_indices = metadata["frames_indices"]
        else:
            DYNAMIC_FPS_THRES = {30: 3, 300: 1, 2400: 0.5}
            MAX_FRAME_COUNT_DYNAMIC = 640
            MAX_DURATION = 2400

            effective_duration = min(duration, MAX_DURATION)
            if effective_duration <= 30:
                target_fps = DYNAMIC_FPS_THRES[30]
            elif effective_duration <= 300:
                target_fps = DYNAMIC_FPS_THRES[300]
            else:
                target_fps = DYNAMIC_FPS_THRES[2400]

            temporal_patch_size = getattr(video_processor, "temporal_patch_size", 1)
            extract_t = int(effective_duration * target_fps * temporal_patch_size)
            extract_t = min(extract_t, MAX_FRAME_COUNT_DYNAMIC)

            duration_per_frame = 1 / video_fps
            timestamps = [i * duration_per_frame for i in range(meta_frames)]
            max_second = int(duration)

            if meta_frames < extract_t:
                frame_indices = np.linspace(
                    0, meta_frames - 1, extract_t, dtype=int
                ).tolist()
            else:
                frame_indices = []
                current_second = 0.0
                inv_fps = 1 / (temporal_patch_size * target_fps)
                for frame_index in range(meta_frames):
                    if timestamps[frame_index] >= current_second:
                        current_second += inv_fps
                        frame_indices.append(frame_index)
                        if current_second >= max_second:
                            break

            if len(frame_indices) < extract_t:
                if len(frame_indices) == 0:
                    start, end = 0, max(meta_frames - 1, 0)
                else:
                    start, end = frame_indices[0], frame_indices[-1]
                frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist()
            elif len(frame_indices) > extract_t:
                frame_indices = np.linspace(
                    0, meta_frames - 1, extract_t, dtype=int
                ).tolist()

        seen, uniq = set(), []
        for idx in frame_indices:
            if idx not in seen:
                seen.add(idx)
                uniq.append(idx)

        if len(uniq) & 1:
            uniq.append(uniq[-1])

        frame_indices = uniq
        full_second_idxs = [int(idx / video_fps) for idx in frame_indices]
        timestamps_list = full_second_idxs[::2]
        selected_timestamps = []
        for idx in range(len(timestamps_list)):
            selected_timestamps.append(timestamps_list[idx])
        return selected_timestamps

    def _is_glmga_model(self, processor: object) -> bool:
        """Detect GLMGA variant via its Glmga sub-processors."""
        for attr in ("image_processor", "video_processor"):
            sub = getattr(processor, attr, None)
            if sub and "Glmga" in type(sub).__name__:
                return True
        return False

    def _get_video_second_idx_glmga(
        self, metadata: dict[str, Any], total_frames: int
    ) -> list[int]:
        """Fixed fps=2 frame selection matching GlmgaVideoProcessor.sample_frames."""
        video_processor = self.get_video_processor()

        video_fps = metadata["fps"]
        meta_frames = metadata.get("total_num_frames", total_frames)
        max_frame_idx = meta_frames - 1
        duration = metadata.get("duration", round(max_frame_idx / video_fps) + 1)

        do_sample_frames = metadata.get("do_sample_frames", True)
        if not do_sample_frames:
            frame_indices = metadata["frames_indices"]
        else:
            target_fps = 2
            max_frames = getattr(video_processor, "max_frames", 640)
            extract_t = int(duration * target_fps)
            extract_t = min(extract_t, max_frames)

            duration_per_frame = 1 / video_fps
            timestamps = [i * duration_per_frame for i in range(meta_frames)]

            if meta_frames < extract_t:
                frame_indices = [
                    math.floor(i * meta_frames / extract_t) for i in range(extract_t)
                ]
            else:
                frame_indices = []
                current_second = 0.0
                inv_fps = 1 / target_fps
                for frame_index in range(meta_frames):
                    if timestamps[frame_index] >= current_second:
                        current_second += inv_fps
                        frame_indices.append(frame_index)
                        if current_second >= duration - inv_fps:
                            break

            if len(frame_indices) < extract_t:
                if len(frame_indices) == 0:
                    start, end = 0, max(meta_frames - 1, 0)
                else:
                    start, end = frame_indices[0], frame_indices[-1]
                frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist()
            elif len(frame_indices) > extract_t:
                frame_indices = np.linspace(
                    0, meta_frames - 1, extract_t, dtype=int
                ).tolist()

        seen, uniq = set(), []
        for idx in frame_indices:
            if idx not in seen:
                seen.add(idx)
                uniq.append(idx)

        if len(uniq) & 1:
            uniq.append(uniq[-1])

        frame_indices = uniq
        full_second_idxs = [int(idx / video_fps) for idx in frame_indices]
        timestamps_list = full_second_idxs[::2]
        return list(timestamps_list)

    def _construct_video_placeholder(
        self,
        video_array: np.ndarray,
        metadata: dict[str, Any],
        grid_thw: torch.Tensor,
    ) -> list[int]:
        """Build the placeholder token ids for one video.

        The result starts with the video-start token and ends with the
        video-end token.  In between, each timestamp contributes: the
        image-start token, the frame-embedding token repeated
        ``H * W // merge_size**2`` times, the image-end token and the token
        ids of the formatted timestamp.

        Args:
            video_array: Video frames; only ``len(video_array)`` is used, as
                the fallback frame count for timestamp selection.
            metadata: Video metadata forwarded to the timestamp selector.
            grid_thw: Tensor unpacked as ``(T, H, W)``; ``T`` is not used.
        """
        hf_processor = self.get_hf_processor()
        tokenizer = self.get_tokenizer()
        merge_size = hf_processor.image_processor.merge_size

        hf_config = self.get_hf_config()
        boi_token_id = hf_config.image_start_token_id
        eoi_token_id = hf_config.image_end_token_id
        bov_token_id = hf_config.video_start_token_id
        eov_token_id = hf_config.video_end_token_id
        merge_length = merge_size**2

        assert isinstance(grid_thw, torch.Tensor)

        is_glm4v = isinstance(hf_processor, Glm4vProcessor)
        frame_count = len(video_array)

        # Pick the timestamp selector that matches the processor variant.
        if is_glm4v:
            timestamps = self._get_video_second_idx_glm4v(metadata, frame_count)
        elif self._is_glmga_model(hf_processor):
            timestamps = self._get_video_second_idx_glmga(metadata, frame_count)
        else:
            timestamps = self._get_video_second_idx_glm46v(metadata, frame_count)

        timestamp_format = "{}" if is_glm4v else "{:.1f} seconds"
        timestamp_token_ids = []
        for stamp in timestamps:
            stamp_text = timestamp_format.format(stamp)
            timestamp_token_ids.append(
                tokenizer.encode(stamp_text, add_special_tokens=False)
            )

        _, grid_h, grid_w = grid_thw
        num_tokens_per_frame = int(grid_h * grid_w) // merge_length

        # Glm46VProcessor uses image_token_id for video frame embeddings;
        # Glm4vProcessor uses video_token_id.
        if is_glm4v or not TRANSFORMERS_WITH_GA:
            frame_embed_token_id = hf_processor.video_token_id
        else:
            frame_embed_token_id = hf_processor.image_token_id

        # Identical for every frame: start marker, embeddings, end marker.
        frame_segment = [
            boi_token_id,
            *([frame_embed_token_id] * num_tokens_per_frame),
            eoi_token_id,
        ]

        placeholder = [bov_token_id]
        for stamp_ids in timestamp_token_ids:
            placeholder.extend(frame_segment)
            placeholder.extend(stamp_ids)
        placeholder.append(eov_token_id)

        return placeholder

`_get_image_max_pixels()` ¶

Read max_pixels from the HF image processor config.

Despite the name, longest_edge is a pixel area (total pixel count), not an edge length. The HF processor passes it directly to smart_resize as the max_pixels argument, which constrains t_bar * h_bar * w_bar <= max_pixels.

Source code in vllm/model_executor/models/glm4_1v.py

def _get_image_max_pixels(self) -> int:
    """Return the pixel budget stored on the HF image processor.

    The value is read from ``size["longest_edge"]``.  Despite that name it
    is a pixel **area** (total pixel count), not an edge length: the HF
    processor hands it to ``smart_resize`` as ``max_pixels``, which
    constrains ``t_bar * h_bar * w_bar <= max_pixels``.
    """
    size_config = self.get_image_processor().size
    return size_config["longest_edge"]

`_get_video_second_idx_glmga(metadata, total_frames)` ¶

Fixed fps=2 frame selection matching GlmgaVideoProcessor.sample_frames.

Source code in vllm/model_executor/models/glm4_1v.py

def _get_video_second_idx_glmga(
    self, metadata: dict[str, Any], total_frames: int
) -> list[int]:
    """Fixed fps=2 frame selection matching GlmgaVideoProcessor.sample_frames.

    When ``metadata["do_sample_frames"]`` is false the indices in
    ``metadata["frames_indices"]`` are used.  Otherwise (the default)
    frames are sampled at 2 fps, capped at the video processor's
    ``max_frames`` (640 if unset).  Duplicate indices are then dropped,
    the list is padded to an even length by repeating its last entry, and
    every other index is converted with ``int(index / fps)``.

    Args:
        metadata: Video metadata; ``fps`` is required.
        total_frames: Frame count used when ``total_num_frames`` is absent.
    """
    video_processor = self.get_video_processor()

    video_fps = metadata["fps"]
    meta_frames = metadata.get("total_num_frames", total_frames)
    last_frame = meta_frames - 1
    duration = metadata.get("duration", round(last_frame / video_fps) + 1)

    if metadata.get("do_sample_frames", True):
        target_fps = 2
        max_frames = getattr(video_processor, "max_frames", 640)
        extract_t = min(int(duration * target_fps), max_frames)

        seconds_per_frame = 1 / video_fps
        timestamps = [i * seconds_per_frame for i in range(meta_frames)]

        if meta_frames < extract_t:
            # Fewer source frames than needed: repeat them evenly.
            frame_indices = [
                math.floor(i * meta_frames / extract_t) for i in range(extract_t)
            ]
        else:
            # Take the first frame at or after each sampling instant.
            frame_indices = []
            next_pick = 0.0
            pick_interval = 1 / target_fps
            stop_at = duration - pick_interval
            for frame_index, stamp in enumerate(timestamps):
                if stamp >= next_pick:
                    next_pick += pick_interval
                    frame_indices.append(frame_index)
                    if next_pick >= stop_at:
                        break

        # Resample so that exactly extract_t indices remain.
        if len(frame_indices) < extract_t:
            if frame_indices:
                start, end = frame_indices[0], frame_indices[-1]
            else:
                start, end = 0, max(last_frame, 0)
            frame_indices = np.linspace(start, end, extract_t, dtype=int).tolist()
        elif len(frame_indices) > extract_t:
            frame_indices = np.linspace(
                0, last_frame, extract_t, dtype=int
            ).tolist()
    else:
        frame_indices = metadata["frames_indices"]

    # Order-preserving de-duplication.
    unique = list(dict.fromkeys(frame_indices))
    if len(unique) % 2:
        unique.append(unique[-1])

    seconds = [int(index / video_fps) for index in unique]
    return seconds[::2]

`_is_glmga_model(processor)` ¶

Detect GLMGA variant via its Glmga sub-processors.

Source code in vllm/model_executor/models/glm4_1v.py

def _is_glmga_model(self, processor: object) -> bool:
    """Tell whether ``processor`` has a Glmga image or video sub-processor.

    A sub-processor counts when it is truthy and its class name contains
    ``"Glmga"``; missing attributes are treated as absent.
    """
    sub_processors = (
        getattr(processor, attr, None)
        for attr in ("image_processor", "video_processor")
    )
    return any(sub and "Glmga" in type(sub).__name__ for sub in sub_processors)

`Glm4vVideoEmbeddingInputs` ¶

Bases: TensorSchema

Dimensions

p: Number of video patches across all frames
h: Hidden size (must match language model backbone)
f: Number of frames
g: Grid dimensions (3 for grid_t which is usually 1 for processed video, grid_h, grid_w)

Source code in vllm/model_executor/models/glm4_1v.py

class Glm4vVideoEmbeddingInputs(TensorSchema):
    """
    Schema for video inputs supplied as ``video_embeds`` tensors.

    Dimensions:
        - p: Number of video patches across all frames
        - h: Hidden size (must match language model backbone)
        - f: Number of frames
        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    """

    # Literal tag identifying this input variant.
    type: Literal["video_embeds"] = "video_embeds"

    # Embedding tensor of shape (p, h).
    video_embeds: Annotated[torch.Tensor, TensorShape("p", "h")]
    # Grid tensor of shape (f, 3). The "g" axis listed in the docstring is
    # declared here as the literal size 3 rather than as a symbolic dim.
    video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]

`Glm4vVideoPixelInputs` ¶

Bases: TensorSchema

Dimensions

np: Number of patches
ctpp: Number of channels * temporal_patch_size * patch_size * patch_size
f: Number of frames
g: Grid dimensions (3 for grid_t which is usually 1 for processed video, grid_h, grid_w)

Source code in vllm/model_executor/models/glm4_1v.py

class Glm4vVideoPixelInputs(TensorSchema):
    """
    Schema for video inputs supplied as ``pixel_values_videos`` patch tensors.

    Dimensions:
        - np: Number of patches
        - ctpp: Number of channels * temporal_patch_size *
            patch_size * patch_size
        - f: Number of frames
        - g: Grid dimensions (3 for grid_t which is usually 1 for processed
          video, grid_h, grid_w)
    """

    # Literal tag identifying this input variant.
    type: Literal["pixel_values_videos"] = "pixel_values_videos"

    # Patch tensor of shape (np, ctpp).
    pixel_values_videos: Annotated[torch.Tensor, TensorShape("np", "ctpp")]
    # Grid tensor of shape (f, 3). The "g" axis listed in the docstring is
    # declared here as the literal size 3 rather than as a symbolic dim.
    video_grid_thw: Annotated[torch.Tensor, TensorShape("f", 3)]

`Glm4vVisionTransformer` ¶

Bases: Module

Methods:

pos_embeds_interpolate –

Pre-compute absolute position embeddings for all input samples.
prepare_encoder_metadata –

Compute encoder metadata from grid_thw_list.

Source code in vllm/model_executor/models/glm4_1v.py

class Glm4vVisionTransformer(nn.Module):
    def __init__(
        self,
        text_config: Glm4vTextConfig,
        vision_config: Glm4vVisionConfig,
        norm_eps: float = 1e-6,
        quant_config: QuantizationConfig | None = None,
        prefix: str = "",
    ) -> None:
        """Build the vision tower sub-modules from ``vision_config``.

        Args:
            text_config: Text-model config; not referenced in this constructor.
            vision_config: Source of patch sizes, depth, hidden sizes, head
                count and the RMSNorm epsilon used by the standalone norms.
            norm_eps: Epsilon of the RMSNorm factory passed to every block.
            quant_config: Optional quantization config forwarded to the
                blocks and the merger.
            prefix: Module-name prefix used to name the blocks and the merger.
        """
        super().__init__()

        # Tensor-parallel size is forced to 1 when the ViT runs data-parallel.
        use_data_parallel = is_vit_use_data_parallel()
        self.tp_size = (
            1 if use_data_parallel else get_tensor_model_parallel_world_size()
        )

        patch_size = vision_config.patch_size
        temporal_patch_size = vision_config.temporal_patch_size
        in_channels = vision_config.in_channels
        depth = vision_config.depth
        self.hidden_size = vision_config.hidden_size
        self.num_heads = vision_config.num_heads

        self.patch_size = vision_config.patch_size
        self.spatial_merge_size = vision_config.spatial_merge_size
        self.out_hidden_size = vision_config.out_hidden_size

        self.patch_embed = Glm4vVisionPatchEmbed(
            patch_size=patch_size,
            temporal_patch_size=temporal_patch_size,
            in_channels=in_channels,
            hidden_size=self.hidden_size,
        )

        # RMSNorm factory shared by all blocks (uses `norm_eps`, not the
        # config's rms_norm_eps used by the standalone norms below).
        norm_layer = partial(RMSNorm, eps=norm_eps)
        head_dim = self.hidden_size // self.num_heads
        # Rotary embedding configured with partial_rotary_factor=0.5.
        self.rotary_pos_emb = get_rope(
            head_size=head_dim,
            max_position=8192,
            is_neox_style=True,
            rope_parameters={"partial_rotary_factor": 0.5},
        )
        # NOTE: each block's MLP hidden width is taken from out_hidden_size.
        self.blocks = nn.ModuleList(
            [
                Glm4vVisionBlock(
                    dim=self.hidden_size,
                    num_heads=self.num_heads,
                    mlp_hidden_dim=vision_config.out_hidden_size,
                    norm_layer=norm_layer,
                    quant_config=quant_config,
                    prefix=f"{prefix}.blocks.{layer_idx}",
                )
                for layer_idx in range(depth)
            ]
        )
        self.merger = Glm4vPatchMerger(
            d_model=vision_config.out_hidden_size,
            context_dim=vision_config.intermediate_size,
            quant_config=quant_config,
            bias=False,
            prefix=f"{prefix}.merger",
        )
        self.embeddings = Glm4vVisionEmbeddings(vision_config)

        self.post_conv_layernorm = RMSNorm(
            vision_config.hidden_size, eps=vision_config.rms_norm_eps
        )
        # Conv with kernel == stride == spatial_merge_size, mapping
        # hidden_size channels to out_hidden_size channels.
        self.downsample = Conv2dLayer(
            in_channels=vision_config.hidden_size,
            out_channels=vision_config.out_hidden_size,
            kernel_size=vision_config.spatial_merge_size,
            stride=vision_config.spatial_merge_size,
        )
        self.post_layernorm = RMSNorm(
            vision_config.hidden_size, eps=vision_config.rms_norm_eps
        )

        # Attention backend selected from the head size and the default
        # dtype in effect at construction time.
        self.attn_backend = get_vit_attn_backend(
            head_size=head_dim,
            dtype=torch.get_default_dtype(),
        )

    @property
    def dtype(self) -> torch.dtype:
        """Dtype of the patch-embedding projection weight."""
        projection_weight = self.patch_embed.proj.weight
        return projection_weight.dtype

    @property
    def device(self) -> torch.device:
        """Device of the patch-embedding projection weight."""
        projection_weight = self.patch_embed.proj.weight
        return projection_weight.device

    def rot_pos_emb(
        self, grid_thw: list[list[int]]
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Gather rotary cos/sin values for every patch position.

        Args:
            grid_thw: One ``[t, h, w]`` entry per input item.

        Returns:
            ``(cos, sin, pos_ids)``.  ``pos_ids`` holds one ``(row, col)``
            pair per patch, with each item's pairs repeated ``t`` times;
            ``cos`` and ``sin`` are the cached values looked up at those
            pairs and flattened to one row per patch.
        """
        merge = self.spatial_merge_size

        def _window_order(grid_ids: torch.Tensor, h: int, w: int) -> torch.Tensor:
            # Reorder an (h, w) grid so the entries of each merge x merge
            # window are adjacent, then flatten.
            windows = grid_ids.reshape(h // merge, merge, w // merge, merge)
            return windows.permute(0, 2, 1, 3).flatten()

        per_item_ids = []
        for t, h, w in grid_thw:
            row_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            col_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            frame_ids = torch.stack(
                [_window_order(row_ids, h, w), _window_order(col_ids, h, w)],
                dim=-1,
            )
            per_item_ids.append(frame_ids.repeat(t, 1))
        pos_ids = torch.cat(per_item_ids, dim=0)
        max_grid_size = max(max(h, w) for _, h, w in grid_thw)

        # Use pre-computed cos_sin_cache from RotaryEmbedding
        cos, sin = self.rotary_pos_emb.get_cos_sin(max_grid_size)

        pos_ids = pos_ids.to(cos.device, non_blocking=True)
        return cos[pos_ids].flatten(1), sin[pos_ids].flatten(1), pos_ids

    def compute_attn_mask_seqlen(
        self,
        cu_seqlens: torch.Tensor,
    ) -> torch.Tensor | None:
        """Return the longest sequence length for backends that need it.

        For the FLASH_ATTN, ROCM_AITER_FA and TRITON_ATTN backends the result
        is the maximum difference between consecutive ``cu_seqlens`` entries;
        for any other backend it is ``None``.
        """
        backends_needing_max_seqlen = {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
            AttentionBackendEnum.TRITON_ATTN,
        }
        if self.attn_backend not in backends_needing_max_seqlen:
            return None

        seq_lengths = cu_seqlens[1:] - cu_seqlens[:-1]
        return seq_lengths.max()

    def pos_embeds_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
        """Pre-compute absolute position embeddings for all input samples.

        `self.embeddings` fuses token embeddings and position embeddings in a
        single call, which prevents preparing the position embeddings as the
        static metadata required by CUDA graph capture / replay.  This method
        separates the two by passing an all-zero token tensor, so the module's
        output contains position embeddings only (bicubic interpolation on the
        coordinates).  They are cached in `prepare_encoder_metadata` and added
        to the patch tokens in `forward` via `x = x + pos_embeds`, which keeps
        the forward graph compatible with CUDA graph replay.  Coordinates are
        generated exactly as in `rot_pos_emb` to guarantee spatial alignment.

        Args:
            grid_thw: One ``[t, h, w]`` entry per input item.

        Returns:
            The per-item outputs of `self.embeddings`, concatenated along
            dim 0 and cast to ``self.dtype``.
        """
        device = self.embeddings.position_embedding.weight.device
        dtype = self.dtype
        merge = self.spatial_merge_size

        def _window_order(grid_ids: torch.Tensor, h: int, w: int) -> torch.Tensor:
            # Same coordinate ordering as rot_pos_emb: the entries of each
            # merge x merge window are adjacent after flattening.
            windows = grid_ids.reshape(h // merge, merge, w // merge, merge)
            return windows.permute(0, 2, 1, 3).flatten()

        per_item_embeds = []
        for t, h, w in grid_thw:
            row_coords = _window_order(
                torch.arange(h).unsqueeze(1).expand(-1, w), h, w
            )
            col_coords = _window_order(
                torch.arange(w).unsqueeze(0).expand(h, -1), h, w
            )

            # All-zero tokens: the module's output is then position-only.
            zero_tokens = torch.zeros(
                h * w * t, self.hidden_size, device=device, dtype=dtype
            )
            item_embeds = self.embeddings(
                embeddings=zero_tokens,
                lengths=[h * w] * t,
                image_shapes=torch.tensor([[t, h, w]], device=device),
                h_coords=row_coords.repeat(t),
                w_coords=col_coords.repeat(t),
            )
            per_item_embeds.append(item_embeds)

        return torch.cat(per_item_embeds, dim=0).to(dtype)

    def prepare_encoder_metadata(
        self,
        grid_thw_list: list[list[int]],
        *,
        max_batch_size: int | None = None,
        max_frames_per_batch: int | None = None,
        max_seqlen_override: int | None = None,
        device: torch.device | None = None,
    ) -> dict[str, torch.Tensor | None]:
        """Build the per-batch metadata consumed by the vision encoder.

        One implementation serves the eager forward path as well as CUDA
        graph capture and replay, so the three never drift apart.

        Args:
            grid_thw_list: One ``[t, h, w]`` grid per input item.
            max_batch_size: Optional number of sequences to pad
                ``cu_seqlens`` up to (used for CUDA graph capture/replay).
            max_frames_per_batch: Optional pad target that takes precedence
                over ``max_batch_size``. Every item contributes ``t``
                attention sequences (one per frame), so video batches can
                hold more sequences than items.
            max_seqlen_override: Optional value used verbatim for
                ``max_seqlen`` instead of deriving it from ``cu_seqlens``
                (lets CUDA graph capture cover worst-case replays).
            device: Target device for the backend helpers; falls back to
                ``self.device`` when omitted.

        Returns:
            A dict with the keys ``pos_embeds``, ``rotary_pos_emb_cos``,
            ``rotary_pos_emb_sin``, ``sequence_lengths``, ``max_seqlen``
            and ``cu_seqlens``.
        """
        target_device = self.device if device is None else device

        # Absolute and rotary positional information.
        pos_embeds = self.pos_embeds_interpolate(grid_thw_list)
        cos, sin, _ = self.rot_pos_emb(grid_thw_list)

        # One attention sequence per frame: h * w patches, repeated t times.
        grid = np.array(grid_thw_list, dtype=np.int32)
        frame_counts = grid[:, 0]
        tokens_per_frame = grid[:, 1] * grid[:, 2]
        seq_ends = np.repeat(tokens_per_frame, frame_counts).cumsum(dtype=np.int32)
        cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), seq_ends])

        # Optionally extend cu_seqlens with zero-length trailing sequences
        # (the last offset repeated). The frame budget wins over the batch
        # size because videos contribute several sequences per item.
        pad_to = (
            max_batch_size if max_frames_per_batch is None else max_frames_per_batch
        )
        if pad_to is not None:
            missing = pad_to - (len(cu_seqlens) - 1)
            if missing > 0:
                filler = np.full(missing, cu_seqlens[-1], dtype=np.int32)
                cu_seqlens = np.concatenate([cu_seqlens, filler])

        # Backend-specific per-sequence lengths.
        sequence_lengths = MMEncoderAttention.maybe_compute_seq_lens(
            self.attn_backend, cu_seqlens, target_device
        )

        # Longest sequence, unless the caller pins it explicitly.
        max_seqlen_val = (
            max_seqlen_override
            if max_seqlen_override is not None
            else MMEncoderAttention.compute_max_seqlen(self.attn_backend, cu_seqlens)
        )
        # Deliberately a CPU tensor: attention wrappers call .item() on it,
        # and a GPU tensor would only add a D2H copy inside captured CUDA
        # graphs (the scalar is baked in at capture time anyway).
        max_seqlen = torch.tensor(max_seqlen_val, dtype=torch.int32)

        # Backend-specific transformation of the offsets themselves.
        final_cu_seqlens = MMEncoderAttention.maybe_recompute_cu_seqlens(
            self.attn_backend,
            cu_seqlens,
            self.hidden_size,
            self.tp_size,
            target_device,
        )

        return {
            "pos_embeds": pos_embeds,
            "rotary_pos_emb_cos": cos,
            "rotary_pos_emb_sin": sin,
            "sequence_lengths": sequence_lengths,
            "max_seqlen": max_seqlen,
            "cu_seqlens": final_cu_seqlens,
        }

    def forward(
        self,
        x: torch.Tensor,
        grid_thw: torch.Tensor | list[list[int]],
        *,
        encoder_metadata: dict[str, torch.Tensor] | None = None,
    ) -> torch.Tensor:
        """Encode patch inputs into merged vision features.

        Args:
            x: Patch input fed to ``self.patch_embed``.
            grid_thw: ``[t, h, w]`` grids, as a tensor or nested list. Only
                consulted when ``encoder_metadata`` is not supplied.
            encoder_metadata: Pre-computed result of
                ``prepare_encoder_metadata``; built on the fly when ``None``.

        Returns:
            The output of ``self.merger`` applied to the downsampled
            features, with rows of width ``self.out_hidden_size`` going in.
        """
        if encoder_metadata is None:
            grid_list = grid_thw if isinstance(grid_thw, list) else grid_thw.tolist()
            encoder_metadata = self.prepare_encoder_metadata(grid_list)

        # Patch embedding, normalisation and absolute position embeddings.
        hidden = self.patch_embed(x.to(device=self.device, dtype=self.dtype))
        hidden = self.post_conv_layernorm(hidden)
        hidden = hidden + encoder_metadata["pos_embeds"]

        # Transformer blocks operate on a tensor with an extra axis at dim 1.
        hidden = hidden.unsqueeze(1)
        for layer in self.blocks:
            hidden = layer(
                hidden,
                cu_seqlens=encoder_metadata["cu_seqlens"],
                rotary_pos_emb_cos=encoder_metadata["rotary_pos_emb_cos"],
                rotary_pos_emb_sin=encoder_metadata["rotary_pos_emb_sin"],
                max_seqlen=encoder_metadata["max_seqlen"],
            )

        # Adapter: group merge x merge patches, move channels to dim 1,
        # downsample, then project through the merger.
        hidden = self.post_layernorm(hidden)
        merge = self.spatial_merge_size
        windows = hidden.view(-1, merge, merge, hidden.shape[-1]).permute(0, 3, 1, 2)
        pooled = self.downsample(windows).view(-1, self.out_hidden_size)
        return self.merger(pooled)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load checkpoint tensors into this module's parameters.

        Checkpoint names containing a separate q/k/v or gate/up fragment are
        redirected to the corresponding fused parameter and loaded as a
        shard; every other tensor is loaded under its own name.

        Args:
            weights: Iterable of ``(checkpoint_name, tensor)`` pairs.

        Returns:
            The set of parameter names (after renaming) that were loaded.
        """
        # (fused parameter fragment, checkpoint fragment, shard id)
        fused_shards = (
            ("attn.qkv.", "attn.q.", "q"),
            ("attn.qkv.", "attn.k.", "k"),
            ("attn.qkv.", "attn.v.", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        )
        params = dict(self.named_parameters(remove_duplicate=False))
        loaded: set[str] = set()

        for name, tensor in weights:
            # The first mapping whose checkpoint fragment occurs in the name wins.
            hit = next((entry for entry in fused_shards if entry[1] in name), None)
            if hit is None:
                target = params[name]
                loader = getattr(target, "weight_loader", default_weight_loader)
                loader(target, tensor)
            else:
                fused_fragment, ckpt_fragment, shard_id = hit
                name = name.replace(ckpt_fragment, fused_fragment)
                target = params[name]
                loader = target.weight_loader
                loader(target, tensor, shard_id)
            loaded.add(name)
        return loaded

`pos_embeds_interpolate(grid_thw)` ¶

Pre-compute absolute position embeddings for all input samples. The original self.embeddings fused token embeddings and position embeddings in one call, which prevented preparing position embeddings as static metadata required by CUDA graph capture / replay. This method decouples the two by feeding an all-zero token tensor to self.embeddings. The module therefore only performs bicubic interpolation based on the coordinates and returns pure position embeddings. These are cached in prepare_encoder_metadata and later added to the patch tokens in forward via x = x + pos_embeds, keeping the forward graph compatible with CUDA graph replay. Coordinate generation matches rot_pos_emb exactly to guarantee spatial alignment.

Source code in vllm/model_executor/models/glm4_1v.py

def pos_embeds_interpolate(self, grid_thw: list[list[int]]) -> torch.Tensor:
    """Compute standalone absolute position embeddings for every input item.

    `self.embeddings` adds position embeddings to the token embeddings it is
    given in a single call, which makes it impossible to prepare the position
    part ahead of time as static metadata for CUDA graph capture / replay.
    Passing an all-zero token tensor sidesteps that: what comes back is the
    position embedding alone. `prepare_encoder_metadata` stores the result and
    `forward` adds it to the patch tokens (`x = x + pos_embeds`). Coordinates
    are generated the same way as in `rot_pos_emb` so both stay spatially
    aligned.

    Args:
        grid_thw: One `[t, h, w]` grid per input item.

    Returns:
        The per-item embeddings concatenated along dim 0, cast to `self.dtype`.
    """
    target_device = self.embeddings.position_embedding.weight.device
    out_dtype = self.dtype
    merge = self.spatial_merge_size

    def to_merge_order(coords: torch.Tensor, rows: int, cols: int) -> torch.Tensor:
        # Reorder an (h, w) coordinate grid so that each merge x merge
        # window becomes contiguous, then flatten it.
        windows = coords.reshape(rows // merge, merge, cols // merge, merge)
        return windows.permute(0, 2, 1, 3).flatten()

    per_item = []
    for t, h, w in grid_thw:
        row_idx = to_merge_order(torch.arange(h).unsqueeze(1).expand(-1, w), h, w)
        col_idx = to_merge_order(torch.arange(w).unsqueeze(0).expand(h, -1), h, w)

        # Zero tokens: the module output is then the position embedding only.
        zero_tokens = torch.zeros(
            h * w * t, self.hidden_size, device=target_device, dtype=out_dtype
        )
        per_item.append(
            self.embeddings(
                embeddings=zero_tokens,
                lengths=[h * w] * t,
                image_shapes=torch.tensor([[t, h, w]], device=target_device),
                # Every frame of an item shares the same spatial coordinates.
                h_coords=row_idx.repeat(t),
                w_coords=col_idx.repeat(t),
            )
        )

    return torch.cat(per_item, dim=0).to(out_dtype)

`prepare_encoder_metadata(grid_thw_list, *, max_batch_size=None, max_frames_per_batch=None, max_seqlen_override=None, device=None)` ¶

Compute encoder metadata from grid_thw_list.

Shared by the eager forward path, CUDA graph capture, and CUDA graph replay to avoid duplicated implementation.

Parameters:

grid_thw_list ¶
(list[list[int]]) –

Grid configurations as list of [t, h, w].
max_batch_size ¶
(int | None, default: None ) –

If set, pad cu_seqlens to this size (needed for CUDA graph capture/replay).
max_frames_per_batch ¶
(int | None, default: None ) –

If set, overrides max_batch_size for cu_seqlens padding. For video inputs each item contributes T attention sequences (frames); this sizes the buffer to the total frame budget so video replays never overflow.
max_seqlen_override ¶
(int | None, default: None ) –

If set, use this value for max_seqlen instead of computing from cu_seqlens (needed for CUDA graph capture to cover worst-case replay scenarios).
device ¶
(device | None, default: None ) –

Device to place tensors on. Defaults to self.device.

Source code in vllm/model_executor/models/glm4_1v.py

def prepare_encoder_metadata(
    self,
    grid_thw_list: list[list[int]],
    *,
    max_batch_size: int | None = None,
    max_frames_per_batch: int | None = None,
    max_seqlen_override: int | None = None,
    device: torch.device | None = None,
) -> dict[str, torch.Tensor | None]:
    """Build the per-batch metadata consumed by the vision encoder.

    One implementation serves the eager forward path as well as CUDA
    graph capture and replay, so the three never drift apart.

    Args:
        grid_thw_list: One ``[t, h, w]`` grid per input item.
        max_batch_size: Optional number of sequences to pad
            ``cu_seqlens`` up to (used for CUDA graph capture/replay).
        max_frames_per_batch: Optional pad target that takes precedence
            over ``max_batch_size``. Every item contributes ``t``
            attention sequences (one per frame), so video batches can
            hold more sequences than items.
        max_seqlen_override: Optional value used verbatim for
            ``max_seqlen`` instead of deriving it from ``cu_seqlens``
            (lets CUDA graph capture cover worst-case replays).
        device: Target device for the backend helpers; falls back to
            ``self.device`` when omitted.

    Returns:
        A dict with the keys ``pos_embeds``, ``rotary_pos_emb_cos``,
        ``rotary_pos_emb_sin``, ``sequence_lengths``, ``max_seqlen``
        and ``cu_seqlens``.
    """
    target_device = self.device if device is None else device

    # Absolute and rotary positional information.
    pos_embeds = self.pos_embeds_interpolate(grid_thw_list)
    cos, sin, _ = self.rot_pos_emb(grid_thw_list)

    # One attention sequence per frame: h * w patches, repeated t times.
    grid = np.array(grid_thw_list, dtype=np.int32)
    frame_counts = grid[:, 0]
    tokens_per_frame = grid[:, 1] * grid[:, 2]
    seq_ends = np.repeat(tokens_per_frame, frame_counts).cumsum(dtype=np.int32)
    cu_seqlens = np.concatenate([np.zeros(1, dtype=np.int32), seq_ends])

    # Optionally extend cu_seqlens with zero-length trailing sequences
    # (the last offset repeated). The frame budget wins over the batch
    # size because videos contribute several sequences per item.
    pad_to = (
        max_batch_size if max_frames_per_batch is None else max_frames_per_batch
    )
    if pad_to is not None:
        missing = pad_to - (len(cu_seqlens) - 1)
        if missing > 0:
            filler = np.full(missing, cu_seqlens[-1], dtype=np.int32)
            cu_seqlens = np.concatenate([cu_seqlens, filler])

    # Backend-specific per-sequence lengths.
    sequence_lengths = MMEncoderAttention.maybe_compute_seq_lens(
        self.attn_backend, cu_seqlens, target_device
    )

    # Longest sequence, unless the caller pins it explicitly.
    max_seqlen_val = (
        max_seqlen_override
        if max_seqlen_override is not None
        else MMEncoderAttention.compute_max_seqlen(self.attn_backend, cu_seqlens)
    )
    # Deliberately a CPU tensor: attention wrappers call .item() on it,
    # and a GPU tensor would only add a D2H copy inside captured CUDA
    # graphs (the scalar is baked in at capture time anyway).
    max_seqlen = torch.tensor(max_seqlen_val, dtype=torch.int32)

    # Backend-specific transformation of the offsets themselves.
    final_cu_seqlens = MMEncoderAttention.maybe_recompute_cu_seqlens(
        self.attn_backend,
        cu_seqlens,
        self.hidden_size,
        self.tp_size,
        target_device,
    )

    return {
        "pos_embeds": pos_embeds,
        "rotary_pos_emb_cos": cos,
        "rotary_pos_emb_sin": sin,
        "sequence_lengths": sequence_lengths,
        "max_seqlen": max_seqlen,
        "cu_seqlens": final_cu_seqlens,
    }

`all_gather_interleave(local_tensor, hidden_size, tp_size)` ¶

All-gather the input tensor interleavely across model parallel group.

Source code in vllm/model_executor/models/glm4_1v.py

def all_gather_interleave(local_tensor, hidden_size: int, tp_size: int):
    """Gather a tensor from all tensor-parallel ranks and interleave its chunks.

    Every rank's tensor is split along the last dimension into pieces of
    width ``hidden_size // tp_size``. The pieces are then concatenated on the
    last dimension so that the i-th piece of every rank (in rank order)
    precedes the (i+1)-th piece of any rank.

    Args:
        local_tensor: This rank's tensor; gather buffers are allocated with
            its shape and dtype.
        hidden_size: Full hidden size used to derive the chunk width.
        tp_size: Number of ranks in the tensor-parallel group.

    Returns:
        The interleaved concatenation of all ranks' chunks.
    """
    import torch.distributed as dist

    per_rank = [torch.zeros_like(local_tensor) for _ in range(tp_size)]
    tp_group = parallel_state.get_tp_group().device_group
    dist.all_gather(per_rank, local_tensor, group=tp_group)

    chunk_width = hidden_size // tp_size
    chunks_by_rank = [torch.split(gathered, chunk_width, -1) for gathered in per_rank]

    # zip(*...) groups the chunks that share an index across ranks.
    interleaved = []
    for same_index_chunks in zip(*chunks_by_rank):
        interleaved.extend(same_index_chunks)
    return torch.cat(interleaved, dim=-1)

`vllm.model_executor.models.glm4_1v` ¶

`Glm4vForConditionalGeneration` ¶

`forward(input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs)` ¶

`input_ids` ¶

`positions` ¶

`intermediate_tensors` ¶

`inputs_embeds` ¶

`**kwargs` ¶

`get_mm_mapping()` ¶

`Glm4vImageEmbeddingInputs` ¶

`Glm4vImagePixelInputs` ¶

`Glm4vProcessingInfo` ¶

`_get_image_max_pixels()` ¶

`_get_video_second_idx_glmga(metadata, total_frames)` ¶

`_is_glmga_model(processor)` ¶

`Glm4vVideoEmbeddingInputs` ¶

`Glm4vVideoPixelInputs` ¶

`Glm4vVisionTransformer` ¶

`pos_embeds_interpolate(grid_thw)` ¶

`prepare_encoder_metadata(grid_thw_list, *, max_batch_size=None, max_frames_per_batch=None, max_seqlen_override=None, device=None)` ¶

`grid_thw_list` ¶

`max_batch_size` ¶

`max_frames_per_batch` ¶

`max_seqlen_override` ¶

`device` ¶

`all_gather_interleave(local_tensor, hidden_size, tp_size)` ¶

vllm.model_executor.models.glm4_1v ¶

Glm4vForConditionalGeneration ¶

forward(input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs) ¶

input_ids ¶

positions ¶

intermediate_tensors ¶

inputs_embeds ¶

**kwargs ¶

get_mm_mapping() ¶

Glm4vImageEmbeddingInputs ¶

Glm4vImagePixelInputs ¶

Glm4vProcessingInfo ¶

_get_image_max_pixels() ¶

_get_video_second_idx_glmga(metadata, total_frames) ¶

_is_glmga_model(processor) ¶

Glm4vVideoEmbeddingInputs ¶

Glm4vVideoPixelInputs ¶

Glm4vVisionTransformer ¶

pos_embeds_interpolate(grid_thw) ¶

prepare_encoder_metadata(grid_thw_list, *, max_batch_size=None, max_frames_per_batch=None, max_seqlen_override=None, device=None) ¶

grid_thw_list ¶

max_batch_size ¶

max_frames_per_batch ¶

max_seqlen_override ¶

device ¶

all_gather_interleave(local_tensor, hidden_size, tp_size) ¶

`vllm.model_executor.models.glm4_1v` ¶

`Glm4vForConditionalGeneration` ¶

`forward(input_ids, positions, intermediate_tensors=None, inputs_embeds=None, **kwargs)` ¶

`input_ids` ¶

`positions` ¶

`intermediate_tensors` ¶

`inputs_embeds` ¶

`**kwargs` ¶

`get_mm_mapping()` ¶

`Glm4vImageEmbeddingInputs` ¶

`Glm4vImagePixelInputs` ¶

`Glm4vProcessingInfo` ¶

`_get_image_max_pixels()` ¶

`_get_video_second_idx_glmga(metadata, total_frames)` ¶

`_is_glmga_model(processor)` ¶

`Glm4vVideoEmbeddingInputs` ¶

`Glm4vVideoPixelInputs` ¶

`Glm4vVisionTransformer` ¶

`pos_embeds_interpolate(grid_thw)` ¶

`prepare_encoder_metadata(grid_thw_list, *, max_batch_size=None, max_frames_per_batch=None, max_seqlen_override=None, device=None)` ¶

`grid_thw_list` ¶

`max_batch_size` ¶

`max_frames_per_batch` ¶

`max_seqlen_override` ¶

`device` ¶

`all_gather_interleave(local_tensor, hidden_size, tp_size)` ¶