Skip to content

vllm.multimodal.utils

Functions:

argsort_mm_positions(mm_positions)

Given a MultiModalPlaceholders, output a sequence of keys to sort the dictionary by offset (starting index in the input sequence) in ascending order.

Returns:

A list of (modality, idx), which can be used to access an item by mm_positions[modality][idx].

Source code in vllm/multimodal/utils.py
def argsort_mm_positions(
    mm_positions: MultiModalPlaceholders,
) -> list[tuple[str, int]]:
    """
    Compute the order in which multi-modal placeholders appear in the input.

    Every placeholder in `mm_positions` is identified by a `(modality, idx)`
    key. The keys are returned sorted by the placeholder's `offset`
    (starting index in the input sequence) in ascending order.

    Args:
        mm_positions: Mapping from modality to its placeholder items.
            Each item must expose an `offset` attribute.

    Returns:
        A list of `(modality, idx)`, which can be used to access an item
        by `mm_positions[modality][idx]`.
    """
    keyed_offsets = []
    for modality, placeholders in mm_positions.items():
        for idx, placeholder in enumerate(placeholders):
            keyed_offsets.append(((modality, idx), placeholder.offset))

    # The sort is stable, so placeholders sharing an offset keep the order
    # in which they were visited above.
    keyed_offsets.sort(key=lambda pair: pair[1])

    return [key for key, _ in keyed_offsets]

encode_audio_base64(audio, sampling_rate, *, format='WAV')

Encode audio as base64.

Source code in vllm/multimodal/utils.py
def encode_audio_base64(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """
    Encode audio as base64.

    Args:
        audio: The audio data to encode.
        sampling_rate: The sampling rate paired with `audio`.
        format: Forwarded to `AudioMediaIO.encode_base64` as `audio_format`.

    Returns:
        The string produced by `AudioMediaIO.encode_base64`.
    """
    # The media IO layer takes audio as an `(audio, sampling_rate)` pair.
    media = (audio, sampling_rate)
    return AudioMediaIO().encode_base64(media, audio_format=format)

encode_audio_url(audio, sampling_rate, *, format='WAV')

Encode audio as a data URL.

Source code in vllm/multimodal/utils.py
def encode_audio_url(
    audio: np.ndarray,
    sampling_rate: int,
    *,
    format: str = "WAV",
) -> str:
    """
    Encode audio as a data URL.

    Args:
        audio: The audio data to encode.
        sampling_rate: The sampling rate paired with `audio`.
        format: Audio format; also used (lower-cased, as a file extension)
            to look up the MIME type of the data URL.

    Returns:
        A string of the form `data:<mimetype>;base64,<payload>`.
    """
    payload = encode_audio_base64(audio, sampling_rate, format=format)

    # Extensions missing from `mimetypes.types_map` fall back to "audio".
    extension = "." + format.lower()
    mimetype = mimetypes.types_map.get(extension, "audio")

    return f"data:{mimetype};base64,{payload}"

encode_image_base64(image, *, image_mode='RGB', format='PNG')

Encode a pillow image to base64 format.

By default, the image is converted into RGB format before being encoded.

Source code in vllm/multimodal/utils.py
def encode_image_base64(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Encode a pillow image to base64 format.

    By default, the image is converted into RGB format before being encoded.

    Args:
        image: The pillow image to encode.
        image_mode: Image mode passed to `ImageMediaIO`.
        format: Forwarded to `ImageMediaIO.encode_base64` as `image_format`.

    Returns:
        The string produced by `ImageMediaIO.encode_base64`.
    """
    encoder = ImageMediaIO(image_mode=image_mode)
    encoded = encoder.encode_base64(image, image_format=format)
    return encoded

encode_image_url(image, *, image_mode='RGB', format='PNG')

Encode a pillow image as a data URL.

By default, the image is converted into RGB format before being encoded.

Source code in vllm/multimodal/utils.py
def encode_image_url(
    image: Image.Image,
    *,
    image_mode: str = "RGB",
    format: str = "PNG",
) -> str:
    """
    Encode a pillow image as a data URL.

    By default, the image is converted into RGB format before being encoded.

    Args:
        image: The pillow image to encode.
        image_mode: Image mode forwarded to `encode_image_base64`.
        format: Image format; also used (lower-cased, as a file extension)
            to look up the MIME type of the data URL.

    Returns:
        A string of the form `data:<mimetype>;base64,<payload>`.
    """
    payload = encode_image_base64(image, image_mode=image_mode, format=format)

    # Extensions missing from `mimetypes.types_map` fall back to "image".
    extension = "." + format.lower()
    mimetype = mimetypes.types_map.get(extension, "image")

    return f"data:{mimetype};base64,{payload}"

fetch_audio(audio_url, audio_io_kwargs=None)

Parameters:

  • audio_url

    (str) –

    URL of the audio file to fetch.

  • audio_io_kwargs

    (dict[str, Any] | None, default: None ) –

    Additional kwargs passed to handle audio IO.

Warning

This method has direct access to local files and is only intended to be called by user code. Never call this from the online server!

Source code in vllm/multimodal/utils.py
def fetch_audio(
    audio_url: str,
    audio_io_kwargs: dict[str, Any] | None = None,
) -> tuple[np.ndarray, int | float]:
    """
    Fetch the audio at `audio_url` through a `MediaConnector`.

    Args:
        audio_url: URL of the audio file to fetch.
        audio_io_kwargs: Additional kwargs passed to handle audio IO.

    Returns:
        The value returned by `MediaConnector.fetch_audio`.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # A missing or empty kwargs dict is forwarded as `None`.
    io_kwargs = {"audio": audio_io_kwargs} if audio_io_kwargs else None

    connector = MediaConnector(
        media_io_kwargs=io_kwargs,
        allowed_local_media_path="/",
    )
    return connector.fetch_audio(audio_url)

fetch_image(image_url, image_io_kwargs=None)

Parameters:

  • image_url

    (str) –

    URL of the image file to fetch.

  • image_io_kwargs

    (dict[str, Any] | None, default: None ) –

    Additional kwargs passed to handle image IO.

Warning

This method has direct access to local files and is only intended to be called by user code. Never call this from the online server!

Source code in vllm/multimodal/utils.py
def fetch_image(
    image_url: str,
    image_io_kwargs: dict[str, Any] | None = None,
) -> Image.Image:
    """
    Fetch the image at `image_url` through a `MediaConnector`.

    Args:
        image_url: URL of the image file to fetch.
        image_io_kwargs: Additional kwargs passed to handle image IO.

    Returns:
        The value returned by `MediaConnector.fetch_image`.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # A missing or empty kwargs dict is forwarded as `None`.
    io_kwargs = {"image": image_io_kwargs} if image_io_kwargs else None

    connector = MediaConnector(
        media_io_kwargs=io_kwargs,
        allowed_local_media_path="/",
    )
    return connector.fetch_image(image_url)

fetch_video(video_url, video_io_kwargs=None)

Parameters:

  • video_url

    (str) –

    URL of the video file to fetch.

  • video_io_kwargs

    (dict[str, Any] | None, default: None ) –

    Additional kwargs passed to handle video IO.

Warning

This method has direct access to local files and is only intended to be called by user code. Never call this from the online server!

Source code in vllm/multimodal/utils.py
def fetch_video(
    video_url: str,
    video_io_kwargs: dict[str, Any] | None = None,
) -> tuple[npt.NDArray, dict[str, Any]]:
    """
    Fetch the video at `video_url` through a `MediaConnector`.

    Args:
        video_url: URL of the video file to fetch.
        video_io_kwargs: Additional kwargs passed to handle video IO.

    Returns:
        The value returned by `MediaConnector.fetch_video`.

    Warning:
        This method has direct access to local files and is only intended
        to be called by user code. Never call this from the online server!
    """
    # A missing or empty kwargs dict is forwarded as `None`.
    io_kwargs = {"video": video_io_kwargs} if video_io_kwargs else None

    connector = MediaConnector(
        media_io_kwargs=io_kwargs,
        allowed_local_media_path="/",
    )
    return connector.fetch_video(video_url)

get_mm_features_in_window(mm_features, start, end)

Return (lo, hi) indices for features overlapping [start, end).

Assumes mm_features are sorted by offset and non-overlapping, so offset + length is also sorted.

Source code in vllm/multimodal/utils.py
def get_mm_features_in_window(
    mm_features: list[MultiModalFeatureSpec],
    start: int,
    end: int,
) -> tuple[int, int]:
    """Return (lo, hi) indices for features overlapping [start, end).

    Assumes mm_features are sorted by offset and non-overlapping, so
    offset + length is also sorted.
    """
    lo = bisect.bisect_left(
        mm_features,
        start + 1,
        key=lambda f: f.mm_position.offset + f.mm_position.length,
    )
    hi = bisect.bisect_left(
        mm_features,
        end,
        key=lambda f: f.mm_position.offset,
    )
    return lo, hi

group_and_batch_mm_items(items, *, device=None, pin_memory=False)

Group consecutive items (possibly from different requests) into batches.

Items must be split across groups if any of the following occurs, as the batch would otherwise be invalid:

  • They have different fields (e.g. mixed image and embedding inputs).
  • They have different values in MultiModalSharedField.

Parameters:

  • items

    (Sequence[MultiModalKwargsItem]) –

    List of MultiModalKwargsItem.

  • device

    (Device, default: None ) –

    The device to place the grouped tensors on.

  • pin_memory

    (bool, default: False ) –

    Whether to pin memory for faster host-to-device transfer.

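Yields:

A tuple (num_items, grouped_kwargs), where:

  • num_items is the corresponding number of items;
  • grouped_kwargs is a dictionary of keyword arguments to pass to the model.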

Source code in vllm/multimodal/utils.py
def group_and_batch_mm_items(
    items: Sequence[MultiModalKwargsItem],
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
) -> Generator[tuple[int, BatchedTensorInputs]]:
    """
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    Args:
        items: List of `MultiModalKwargsItem`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(num_items, grouped_kwargs)`, where:
        - `num_items` is the corresponding number of items;
        - `grouped_kwargs` is a dictionary of keyword arguments to pass
          to the model.
    """
    # One signature per item: its fields in key order, each paired with the
    # group hash of its element. Neighbouring items can share a batch only
    # when their signatures are equal.
    signatures = []
    for item in items:
        fields = sorted(item.items(), key=lambda kv: kv[0])
        signatures.append(
            tuple((key, _get_group_hash(elem)) for key, elem in fields)
        )

    # Lengths of the runs of consecutive, identical signatures.
    run_lengths = [len(list(run)) for _, run in groupby(signatures)]

    offset = 0
    for run_length in run_lengths:
        stop = offset + run_length
        batch = _batch_mm_items(
            items[offset:stop],
            device=device,
            pin_memory=pin_memory,
        )

        yield run_length, batch

        offset = stop

    # Sanity check: the runs must cover every item exactly once.
    assert offset == len(items)

group_and_batch_mm_kwargs(mm_kwargs, *, device=None, pin_memory=False)

Group consecutive items (possibly from different requests) into batches.

Items must be split across groups if any of the following occurs, as the batch would otherwise be invalid:

  • They have different fields (e.g. mixed image and embedding inputs).
  • They have different values in MultiModalSharedField.

To simplify the implementation of embed_multimodal, we add another restriction that the items in a batch must belong to the same modality.

Parameters:

  • mm_kwargs

    (list[tuple[str, MultiModalKwargsItem]]) –

    List of (modality, item).

  • device

    (Device, default: None ) –

    The device to place the grouped tensors on.

  • pin_memory

    (bool, default: False ) –

    Whether to pin memory for faster host-to-device transfer.

Yields:

  • tuple[str, int, BatchedTensorInputs]

    A tuple (modality, num_items, grouped_kwargs), where:

    • modality is the modality of the batch;
    • num_items is the corresponding number of items;
    • grouped_kwargs is a dictionary of keyword arguments to pass to the model.
Source code in vllm/multimodal/utils.py
def group_and_batch_mm_kwargs(
    mm_kwargs: list[tuple[str, MultiModalKwargsItem]],
    *,
    device: torch.types.Device = None,
    pin_memory: bool = False,
) -> Generator[tuple[str, int, BatchedTensorInputs], None, None]:
    """
    Group consecutive items (possibly from different requests) into batches.

    Items must be split across groups if any of the following occurs,
    as the batch would otherwise be invalid:
    - They have different fields (e.g. mixed image and embedding inputs).
    - They have different values in `MultiModalSharedField`.

    To simplify the implementation of `embed_multimodal`, we add another
    restriction that the items in a batch must belong to the same modality.

    Args:
        mm_kwargs: List of `(modality, item)`.
        device: The device to place the grouped tensors on.
        pin_memory: Whether to pin memory for faster host-to-device transfer.

    Yields:
        A tuple `(modality, num_items, grouped_kwargs)`, where:
        - `modality` is the modality of the batch;
        - `num_items` is the corresponding number of items;
        - `grouped_kwargs` is a dictionary of keyword arguments to pass
          to the model.
    """
    # Split the input into runs of consecutive entries sharing a modality,
    # then let `group_and_batch_mm_items` batch each run.
    for modality, run in groupby(mm_kwargs, key=lambda pair: pair[0]):
        batches = group_and_batch_mm_items(
            [item for _, item in run],
            device=device,
            pin_memory=pin_memory,
        )

        for num_items, grouped_kwargs in batches:
            yield modality, num_items, grouped_kwargs