`vllm.model_executor.layers.pooler` ¶

Modules:

abstract –
common –
seqwise –

Poolers that produce an output aggregating all tokens in the sequence.
special –
tokwise –

Poolers that produce an output for each token in the sequence.

Classes:

BOSEOSFilter –

Filters the BOS and EOS token results from outputs.
DispatchPooler –

Dispatches calls to a sub-pooler based on the pooling task.
Pooler –

The interface required for all poolers used in pooling models in vLLM.
PoolingParamsUpdate –

`BOSEOSFilter` ¶

Bases: Pooler

Filters the BOS and EOS token results from outputs.

Source code in vllm/model_executor/layers/pooler/special.py

class BOSEOSFilter(Pooler):
    """Filters the BOS and EOS token results from outputs."""

    def __init__(
        self,
        pooler: Pooler,
        bos_token_id: int = -1,  # -1 disables the filtering
        eos_token_id: int = -1,
    ) -> None:
        super().__init__()

        self.pooler = pooler
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

    def extra_repr(self) -> str:
        return f"bos_token_id={self.bos_token_id}, eos_token_id={self.eos_token_id}"

    def get_supported_tasks(self) -> Set[PoolingTask]:
        return self.pooler.get_supported_tasks()

    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
        return PoolingParamsUpdate(requires_token_ids=True)

    def forward(
        self,
        hidden_states: torch.Tensor | list[torch.Tensor],
        pooling_metadata: PoolingMetadata,
    ) -> PoolerOutput:
        pooled_outputs = self.pooler(hidden_states, pooling_metadata)
        assert isinstance(pooled_outputs, list)
        prompt_token_ids = pooling_metadata.get_prompt_token_ids_cpu()

        for i, (prompt_len, token_ids) in enumerate(
            zip(pooling_metadata.prompt_lens, prompt_token_ids)
        ):
            pooled_data = pooled_outputs[i]
            assert (
                isinstance(pooled_data, torch.Tensor)
                and pooled_data.shape[0] == prompt_len
            )
            if int(token_ids[0]) == self.bos_token_id:
                pooled_data = pooled_data[1:]
            if int(token_ids[-1]) == self.eos_token_id:
                pooled_data = pooled_data[:-1]
            pooled_outputs[i] = pooled_data.squeeze(-1)

        return pooled_outputs

`DispatchPooler` ¶

Bases: Pooler

Dispatches calls to a sub-pooler based on the pooling task.

Source code in vllm/model_executor/layers/pooler/special.py

class DispatchPooler(Pooler):
    """Dispatches calls to a sub-pooler based on the pooling task."""

    @classmethod
    def for_embedding(cls, pooler_config: PoolerConfig):
        return cls(
            {
                "token_embed": pooler_for_token_embed(pooler_config),
                "embed": pooler_for_embed(pooler_config),
            },
        )

    @classmethod
    def for_seq_cls(
        cls,
        pooler_config: PoolerConfig,
        *,
        pooling: SequencePoolingMethod | SequencePoolingFn | None = None,
        classifier: ClassifierFn | None = None,
    ):
        return cls(
            {
                "token_classify": pooler_for_token_classify(
                    pooler_config,
                    pooling=AllPool(),
                    classifier=classifier,
                ),
                "classify": pooler_for_classify(
                    pooler_config,
                    pooling=pooling,
                    classifier=classifier,
                ),
            }
        )

    def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None:
        super().__init__()

        for task, pooler in poolers_by_task.items():
            if task not in pooler.get_supported_tasks():
                raise ValueError(
                    f"{pooler=} does not support {task=}. "
                    f"Supported tasks: {pooler.get_supported_tasks()}"
                )

        self.poolers_by_task = poolers_by_task

    def get_supported_tasks(self) -> Set[PoolingTask]:
        return set(self.poolers_by_task)

    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
        return self.poolers_by_task[task].get_pooling_updates(task)

    def forward(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> PoolerOutput:
        poolers_by_task = self.poolers_by_task
        cursor = pooling_metadata.pooling_cursor

        outputs = list[torch.Tensor | None]()
        offset = 0
        token_offset = 0
        for task, group in groupby(pooling_metadata.tasks):
            if not (pooler := poolers_by_task.get(task)):
                raise ValueError(
                    f"Unsupported task: {task!r} "
                    f"Supported tasks: {self.get_supported_tasks()}"
                )

            num_items = len(list(group))
            group_metadata = pooling_metadata[offset : offset + num_items]
            if cursor is None:
                group_hidden_states = hidden_states
            else:
                # Slice out this group's tokens so sub-poolers see only their
                # portion of the batch. Token offset is computed from the CPU
                # `num_scheduled_tokens_cpu` to avoid a GPU->CPU sync.
                group_cursor = group_metadata.pooling_cursor
                assert group_cursor is not None
                num_group_tokens = int(group_cursor.num_scheduled_tokens_cpu.sum())
                group_hidden_states = hidden_states[
                    token_offset : token_offset + num_group_tokens
                ]
                if token_offset:
                    # Shift first/last indices to be relative to the slice
                    # so seqwise poolers (which index `hidden_states` directly)
                    # remain correct.
                    pooling_cursor = dataclasses.replace(
                        group_cursor,
                        first_token_indices_gpu=(
                            group_cursor.first_token_indices_gpu - token_offset
                        ),
                        last_token_indices_gpu=(
                            group_cursor.last_token_indices_gpu - token_offset
                        ),
                    )
                    group_metadata = dataclasses.replace(
                        group_metadata, pooling_cursor=pooling_cursor
                    )
                token_offset += num_group_tokens

            group_output: PoolerOutput = pooler(group_hidden_states, group_metadata)

            outputs.extend(group_output)
            offset += num_items

        return outputs

    def extra_repr(self) -> str:
        s = f"supported_task={self.get_supported_tasks()}"
        return s

`Pooler` ¶

Bases: Module, ABC

The interface required for all poolers used in pooling models in vLLM.

Methods:

get_pooling_updates –

Construct the updated pooling parameters to use for a supported task.
get_supported_tasks –

Determine which pooling tasks are supported.

Source code in vllm/model_executor/layers/pooler/abstract.py

class Pooler(nn.Module, ABC):
    """The interface required for all poolers used in pooling models in vLLM."""

    @abstractmethod
    def get_supported_tasks(self) -> Set[PoolingTask]:
        """Determine which pooling tasks are supported."""
        raise NotImplementedError

    def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
        """
        Construct the updated pooling parameters to use for a supported task.
        """
        return PoolingParamsUpdate()

    @abstractmethod
    def forward(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> PoolerOutput:
        raise NotImplementedError

`get_pooling_updates(task)` ¶

Construct the updated pooling parameters to use for a supported task.

Source code in vllm/model_executor/layers/pooler/abstract.py

def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
    """
    Construct the updated pooling parameters to use for a supported task.
    """
    return PoolingParamsUpdate()

`get_supported_tasks()` `abstractmethod` ¶

Determine which pooling tasks are supported.

Source code in vllm/model_executor/layers/pooler/abstract.py

@abstractmethod
def get_supported_tasks(self) -> Set[PoolingTask]:
    """Determine which pooling tasks are supported."""
    raise NotImplementedError

`PoolingParamsUpdate` `dataclass` ¶

Attributes:

requires_token_ids (bool) –

Set this flag to enable prompt token IDs for your pooler.

Source code in vllm/model_executor/layers/pooler/common.py

@dataclass(frozen=True)
class PoolingParamsUpdate:
    requires_token_ids: bool = False
    """Set this flag to enable prompt token IDs for your pooler."""

    def __or__(self, other: "PoolingParamsUpdate") -> "PoolingParamsUpdate":
        return PoolingParamsUpdate(
            requires_token_ids=self.requires_token_ids or other.requires_token_ids,
        )

    def apply(self, params: PoolingParams) -> None:
        params.requires_token_ids = self.requires_token_ids

`requires_token_ids = False` `class-attribute` `instance-attribute` ¶

Set this flag to enable prompt token IDs for your pooler.

vllm.model_executor.layers.pooler ¶

BOSEOSFilter ¶

DispatchPooler ¶

Pooler ¶

get_pooling_updates(task) ¶

get_supported_tasks() abstractmethod ¶

PoolingParamsUpdate dataclass ¶

requires_token_ids = False class-attribute instance-attribute ¶

`vllm.model_executor.layers.pooler` ¶

`BOSEOSFilter` ¶

`DispatchPooler` ¶

`Pooler` ¶

`get_pooling_updates(task)` ¶

`get_supported_tasks()` `abstractmethod` ¶

`PoolingParamsUpdate` `dataclass` ¶

`requires_token_ids = False` `class-attribute` `instance-attribute` ¶