`vllm.entrypoints.cli.launch` ¶

Classes:

LaunchSubcommand –

The launch subcommand for the vLLM CLI.
LaunchSubcommandBase –

The base class of subcommands for vllm launch.
RenderSubcommand –

The render subcommand for vllm launch.

Functions:

run_launch_fastapi –

Run the online serving layer with FastAPI (no GPU inference).

`LaunchSubcommand` ¶

Bases: CLISubcommand

The launch subcommand for the vLLM CLI.

Uses nested sub-subcommands so each component can define its own arguments independently (e.g. vllm launch render).

Source code in vllm/entrypoints/cli/launch.py

class LaunchSubcommand(CLISubcommand):
    """Entry point for ``vllm launch``.

    Every launchable component is exposed as a nested sub-subcommand, so
    each one can declare its own arguments independently
    (e.g. ``vllm launch render``).
    """

    name = "launch"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Dispatch to the handler stored in ``args.launch_command``.

        When ``args`` carries a non-``None`` ``model_tag`` attribute, it is
        copied onto ``args.model`` before the handler is invoked.
        """
        model_tag = getattr(args, "model_tag", None)
        if model_tag is not None:
            args.model = model_tag

        args.launch_command(args)

    def validate(self, args: argparse.Namespace) -> None:
        """Check the parsed arguments via ``validate_parsed_serve_args``."""
        validate_parsed_serve_args(args)

    def subparser_init(
        self, subparsers: argparse._SubParsersAction
    ) -> FlexibleArgumentParser:
        """Register ``launch`` and one nested subparser per component.

        Components are the direct subclasses of ``LaunchSubcommandBase``.
        Each component's ``cmd`` is stored as the ``launch_command`` default
        of its subparser, which is what ``cmd`` above later dispatches to.

        Returns:
            The parser created for the ``launch`` subcommand itself.
        """
        parser = subparsers.add_parser(
            self.name,
            help=DESCRIPTION,
            description=DESCRIPTION,
            usage=f"vllm {self.name} <component> [options]",
        )
        # A component name is mandatory: ``vllm launch`` alone is an error.
        component_parsers = parser.add_subparsers(
            required=True, dest="launch_component"
        )

        for component in LaunchSubcommandBase.__subclasses__():
            component_parser = component_parsers.add_parser(
                component.name,
                help=component.help,
                description=component.help,
                usage=f"vllm {self.name} {component.name} [options]",
            )
            component_parser.set_defaults(launch_command=component.cmd)
            component.add_cli_args(component_parser)
            component_parser.epilog = VLLM_SUBCMD_PARSER_EPILOG.format(
                subcmd=f"{self.name} {component.name}"
            )

        return parser

`LaunchSubcommandBase` ¶

Bases: CLISubcommand

The base class of subcommands for vllm launch.

Methods:

add_cli_args –

Add the CLI arguments to the parser.

Source code in vllm/entrypoints/cli/launch.py

class LaunchSubcommandBase(CLISubcommand):
    """Common parent of the components available under `vllm launch`.

    ``LaunchSubcommand.subparser_init`` iterates over the direct subclasses
    of this class and registers each one as a nested subparser, using its
    ``name``, ``help``, ``add_cli_args`` and ``cmd``.
    """

    # Short description; used as both the help and the description text
    # of the component's subparser.
    help: str

    @classmethod
    def add_cli_args(cls, parser: FlexibleArgumentParser) -> None:
        """Populate ``parser`` with this component's CLI arguments.

        The default implementation adds the standard vLLM serving arguments
        through ``make_arg_parser``. Override it in a subclass to add
        component-specific arguments.
        """
        make_arg_parser(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the component; subclasses must provide an implementation."""
        raise NotImplementedError()

`add_cli_args(parser)` `classmethod` ¶

Add the CLI arguments to the parser.

By default, adds the standard vLLM serving arguments. Subclasses can override to add component-specific arguments.

Source code in vllm/entrypoints/cli/launch.py

@classmethod
def add_cli_args(cls, parser: FlexibleArgumentParser) -> None:
    """Add the CLI arguments to the parser.

    By default, adds the standard vLLM serving arguments by delegating to
    ``make_arg_parser``; the return value of that call is discarded.
    Subclasses can override to add component-specific arguments.

    Args:
        parser: The parser to populate. It is passed to ``make_arg_parser``
            unchanged.
    """
    make_arg_parser(parser)

`RenderSubcommand` ¶

Bases: LaunchSubcommandBase

The render subcommand for vllm launch.

Source code in vllm/entrypoints/cli/launch.py

class RenderSubcommand(LaunchSubcommandBase):
    """Component behind ``vllm launch render``."""

    name = "render"
    help = "Launch a GPU-less rendering server (preprocessing and postprocessing only)."

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Drive ``run_launch_fastapi`` to completion with ``uvloop.run``."""
        server = run_launch_fastapi(args)
        uvloop.run(server)

`run_launch_fastapi(args)` `async` ¶

Run the online serving layer with FastAPI (no GPU inference).

Source code in vllm/entrypoints/cli/launch.py

async def run_launch_fastapi(args: argparse.Namespace) -> None:
    """Run the online serving layer with FastAPI (no GPU inference).

    The listening socket is obtained first, then the model and vLLM configs
    are built and the renderer server is started. The coroutine returns once
    the server's shutdown task completes.

    The socket is closed on every exit path, including failures while
    building the configuration or starting the server.

    Args:
        args: Parsed CLI arguments. They are passed to ``setup_server``,
            ``AsyncEngineArgs.from_cli_args`` and
            ``build_and_serve_renderer``.

    Raises:
        KeyboardInterrupt: If SIGTERM is delivered while the handler
            installed here is still active.
    """

    # Interrupt initialization if SIGTERM arrives before uvicorn installs
    # its own signal handlers. Once uvicorn is running it replaces this.
    def _interrupt_init(*_) -> None:
        raise KeyboardInterrupt("terminated")

    signal.signal(signal.SIGTERM, _interrupt_init)

    # 1. Socket binding
    listen_address, sock = setup_server(args)

    # Everything after the socket exists runs under one try/finally so the
    # socket is released even when configuration or server start-up fails
    # (including the KeyboardInterrupt raised by the SIGTERM handler above).
    try:
        # 2. Build and serve the API server
        engine_args = AsyncEngineArgs.from_cli_args(args)
        model_config = engine_args.create_model_config()

        # Render servers preprocess data only — no inference, no quantized
        # kernels. Clear quantization so VllmConfig skips quant
        # dtype/capability validation.
        model_config.quantization = None

        # Render servers never allocate KV cache; suppress the spurious CPU
        # KV cache space warning from CpuPlatform.check_and_update_config.
        envs.VLLM_CPU_KVCACHE_SPACE = 0

        vllm_config = VllmConfig(model_config=model_config)
        shutdown_task = await build_and_serve_renderer(
            vllm_config, listen_address, sock, args
        )
        await shutdown_task
    finally:
        sock.close()

vllm.entrypoints.cli.launch ¶

LaunchSubcommand ¶

LaunchSubcommandBase ¶

add_cli_args(parser) classmethod ¶

RenderSubcommand ¶

run_launch_fastapi(args) async ¶

`vllm.entrypoints.cli.launch` ¶

`LaunchSubcommand` ¶

`LaunchSubcommandBase` ¶

`add_cli_args(parser)` `classmethod` ¶

`RenderSubcommand` ¶

`run_launch_fastapi(args)` `async` ¶