class ResponsesRequest(OpenAIBaseModel):
# Ordered by official OpenAI API documentation
# https://platform.openai.com/docs/api-reference/responses/create
background: bool | None = False
include: (
list[
Literal[
"code_interpreter_call.outputs",
"computer_call_output.output.image_url",
"file_search_call.results",
"message.input_image.image_url",
"message.output_text.logprobs",
"reasoning.encrypted_content",
]
]
| None
) = None
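# e.g. include=["message.output_text.logprobs"] requests per-token logprobs
# on the output text (see is_include_output_logprobs below).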
input: str | list[ResponseInputOutputItem]
instructions: str | None = None
max_output_tokens: int | None = None
max_tool_calls: int | None = None
metadata: Metadata | None = None
model: str | None = None
logit_bias: dict[str, float] | None = None
parallel_tool_calls: bool | None = True
previous_response_id: str | None = None
prompt: ResponsePrompt | None = None
reasoning: Reasoning | None = None
service_tier: Literal["auto", "default", "flex", "scale", "priority"] = "auto"
store: bool | None = True
stream: bool | None = False
temperature: float | None = None
text: ResponseTextConfig | None = None
tool_choice: ToolChoice = "auto"
tools: list[Tool] = Field(default_factory=list)
top_logprobs: int | None = 0
top_p: float | None = None
top_k: int | None = None
truncation: Literal["auto", "disabled"] | None = "disabled"
user: str | None = None
skip_special_tokens: bool = True
include_stop_str_in_output: bool = False
presence_penalty: float | None = Field(
default=None,
ge=-2.0,
le=2.0,
description=(
"Penalty applied to new tokens based on whether they already "
"appear in the text so far."
),
)
frequency_penalty: float | None = Field(
default=None,
ge=-2.0,
le=2.0,
description=(
"Penalty applied to new tokens based on their frequency in the "
"text so far."
),
)
prompt_cache_key: str | None = Field(
default=None,
description=(
"A key used to read from or write to the prompt cache. "
"Note: this field has not been implemented yet "
"and vLLM will ignore it."
),
)
# --8<-- [start:responses-extra-params]
request_id: str = Field(
default_factory=lambda: f"resp_{random_uuid()}",
description=(
"The request_id related to this request. If the caller does "
"not set it, a random_uuid is generated. This id is used "
"throughout the inference process and returned in the response."
),
)
media_io_kwargs: dict[str, dict[str, Any]] | None = Field(
default=None,
description=(
"Additional kwargs to pass to the media IO connectors, "
"keyed by modality. Merged with engine-level media_io_kwargs."
),
)
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
priority: int = Field(
default=0,
ge=_INT64_MIN,
le=_INT64_MAX,
description=(
"The priority of the request (lower means earlier handling; "
"default: 0). Any priority other than 0 will raise an error "
"if the served model does not use priority scheduling."
),
)
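# e.g. when the served model uses priority scheduling, priority=-1 is
# handled before the default priority 0 ("lower means earlier").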
cache_salt: str | None = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker from guessing prompts in "
"multi-user environments. The salt should be random, protected "
"from access by 3rd parties, and long enough to be unpredictable "
"(e.g., 43 characters base64-encoded, corresponding to 256 bits)."
),
)
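# Illustrative only: a client can generate a salt of the recommended
# strength with the standard library, e.g.
#   import secrets
#   cache_salt = secrets.token_urlsafe(32)  # 32 random bytes -> 43 base64url chars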
enable_response_messages: bool = Field(
default=False,
description=(
"Whether to return messages as part of the "
"response object. Currently only supported for non-background "
"requests."
),
)
# Similar to input_messages / output_messages in ResponsesResponse,
# we take in previous_input_messages (i.e., in Harmony format).
# This cannot be used in conjunction with previous_response_id.
# TODO: consider supporting non-Harmony messages as well
previous_input_messages: list[OpenAIHarmonyMessage | dict] | None = None
structured_outputs: StructuredOutputsParams | None = Field(
default=None,
description="Additional kwargs for structured outputs",
)
repetition_penalty: float | None = None
seed: int | None = Field(None, ge=_INT64_MIN, le=_INT64_MAX)
stop: str | list[str] | None = []
ignore_eos: bool = False
vllm_xargs: dict[str, str | int | float | list[str | int | float]] | None = Field(
default=None,
description=(
"Additional request parameters with (list of) string or "
"numeric values, used by custom extensions."
),
)
kv_transfer_params: dict[str, Any] | None = Field(
default=None,
description="KVTransfer parameters used for disaggregated serving.",
)
# --8<-- [end:responses-extra-params]
def build_chat_params(
self,
default_template: str | None,
default_template_content_format: ChatTemplateContentFormatOption,
) -> ChatParams:
from .utils import should_continue_final_message
# Check if we should continue the final message (partial completion)
# This enables Anthropic-style partial message completion where the
# user provides an incomplete assistant message to continue from.
continue_final = should_continue_final_message(self.input)
reasoning = self.reasoning
return ChatParams(
chat_template=default_template,
chat_template_content_format=default_template_content_format,
chat_template_kwargs=merge_kwargs( # To remove unset values
{},
dict(
add_generation_prompt=not continue_final,
continue_final_message=continue_final,
reasoning_effort=None if reasoning is None else reasoning.effort,
),
),
media_io_kwargs=self.media_io_kwargs,
)
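# Illustrative sketch of the continue-final-message path (request shape and
# template option assumed for the example, not taken from this file):
#   req = ResponsesRequest(input=[
#       {"role": "user", "content": "Finish this sentence."},
#       {"role": "assistant", "content": "The capital of France is"},
#   ])
#   params = req.build_chat_params(None, "auto")
# should_continue_final_message(...) returns True for the trailing partial
# assistant message, so the template is rendered with
# add_generation_prompt=False and continue_final_message=True, and the model
# extends that message rather than starting a new turn.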
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
return TokenizeParams(
max_total_tokens=model_config.max_model_len,
max_output_tokens=self.max_output_tokens or 0,
truncate_prompt_tokens=-1 if self.truncation != "disabled" else None,
max_total_tokens_param="max_model_len",
max_output_tokens_param="max_output_tokens",
)
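# Note (behavior as understood, not asserted by this file): truncation="auto"
# maps to truncate_prompt_tokens=-1, i.e. truncate the prompt to the model's
# maximum length, while the default "disabled" passes None so over-length
# prompts are rejected by validation instead.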
_DEFAULT_SAMPLING_PARAMS = {
"temperature": 1.0,
"top_p": 1.0,
"top_k": 0,
}
def to_sampling_params(
self,
default_max_tokens: int,
default_sampling_params: dict | None = None,
) -> SamplingParams:
if self.max_output_tokens is None:
max_tokens = default_max_tokens
else:
max_tokens = min(self.max_output_tokens, default_max_tokens)
default_sampling_params = default_sampling_params or {}
if (temperature := self.temperature) is None:
temperature = default_sampling_params.get(
"temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]
)
if (top_p := self.top_p) is None:
top_p = default_sampling_params.get(
"top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]
)
if (top_k := self.top_k) is None:
top_k = default_sampling_params.get(
"top_k", self._DEFAULT_SAMPLING_PARAMS["top_k"]
)
if (repetition_penalty := self.repetition_penalty) is None:
repetition_penalty = default_sampling_params.get("repetition_penalty", 1.0)
if (presence_penalty := self.presence_penalty) is None:
presence_penalty = default_sampling_params.get("presence_penalty", 0.0)
if (frequency_penalty := self.frequency_penalty) is None:
frequency_penalty = default_sampling_params.get("frequency_penalty", 0.0)
stop_token_ids = default_sampling_params.get("stop_token_ids")
# Structured output
structured_outputs = self.structured_outputs
# Also check text.format for OpenAI-style json_schema
if self.text is not None and self.text.format is not None:
if structured_outputs is not None:
raise VLLMValidationError(
"Cannot specify both structured_outputs and text.format",
parameter="structured_outputs",
)
response_format = self.text.format
if (
response_format.type == "json_schema"
and response_format.schema_ is not None
):
structured_outputs = StructuredOutputsParams(
json=response_format.schema_ # type: ignore[call-arg]
# --follow-imports skip hides the class definition but also hides
# multiple third-party conflicts, so it's the lesser of two evils
)
stop = self.stop if self.stop else []
if isinstance(stop, str):
stop = [stop]
extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
if self.kv_transfer_params:
extra_args["kv_transfer_params"] = self.kv_transfer_params
return SamplingParams.from_optional(
temperature=temperature,
top_p=top_p,
top_k=top_k,
max_tokens=max_tokens,
logprobs=self.top_logprobs if self.is_include_output_logprobs() else None,
stop_token_ids=stop_token_ids,
stop=stop,
frequency_penalty=frequency_penalty,
presence_penalty=presence_penalty,
repetition_penalty=repetition_penalty,
seed=self.seed,
ignore_eos=self.ignore_eos,
output_kind=(
RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY
),
structured_outputs=structured_outputs,
logit_bias=self.logit_bias,
extra_args=extra_args,
skip_clone=True, # Created fresh per request, safe to skip clone
skip_special_tokens=self.skip_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
)
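# Illustrative fallback order (values assumed for the example): a value set
# on the request wins, then the server's default_sampling_params, then
# _DEFAULT_SAMPLING_PARAMS:
#   req = ResponsesRequest(input="hi")  # temperature left unset
#   sp = req.to_sampling_params(
#       default_max_tokens=1024,
#       default_sampling_params={"temperature": 0.7},
#   )
#   assert sp.temperature == 0.7  # server default, not the hardcoded 1.0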
def is_include_output_logprobs(self) -> bool:
"""Check if the request includes output logprobs."""
if self.include is None:
return False
return (
isinstance(self.include, list)
and "message.output_text.logprobs" in self.include
)
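# e.g. include=["message.output_text.logprobs"] with top_logprobs=5 makes
# to_sampling_params() request 5 logprobs per output token; without that
# include entry, logprobs is passed as None.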
@model_validator(mode="before")
@classmethod
def validate_background(cls, data):
if not data.get("background"):
return data
if not data.get("store", True):
raise VLLMValidationError(
"background can only be used when `store` is true",
parameter="background",
)
return data
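# e.g. a payload with {"background": true, "store": false} is rejected here,
# since a background response must be stored to be retrievable later.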
@model_validator(mode="before")
@classmethod
def validate_prompt(cls, data):
if data.get("prompt") is not None:
raise VLLMValidationError(
"prompt template is not supported", parameter="prompt"
)
return data
@model_validator(mode="before")
@classmethod
def check_cache_salt_support(cls, data):
if data.get("cache_salt") is not None and (
not isinstance(data["cache_salt"], str) or not data["cache_salt"]
):
raise VLLMValidationError(
"Parameter 'cache_salt' must be a non-empty string if provided.",
parameter="cache_salt",
)
return data
@model_validator(mode="before")
@classmethod
def input_item_parsing(cls, data):
"""Parse input items that are missing required fields or that Pydantic
cannot disambiguate in a Union of TypedDict / BaseModel types.
Specifically handles:
- function_call -> ResponseFunctionToolCall
- reasoning -> ResponseReasoningItem (auto-generates id)
- message(role=assistant) -> ResponseOutputMessage (auto-generates
id/status and annotations)
Invalid structures are left for Pydantic to reject.
"""
input_data = data.get("input")
# Early return for None, strings, or bytes
if input_data is None or isinstance(input_data, (str, bytes)):
return data
# Convert iterators (like ValidatorIterator) to list
if not isinstance(input_data, list):
try:
input_data = list(input_data)
except TypeError:
# Not iterable, leave as-is for Pydantic to handle
return data
processed_input = []
for item in input_data:
if not isinstance(item, dict):
processed_input.append(item)
continue
item_type = item.get("type")
if item_type == "function_call":
try:
processed_input.append(ResponseFunctionToolCall(**item))
except ValidationError:
logger.debug(
"Failed to parse function_call to ResponseFunctionToolCall, "
"leaving for Pydantic validation"
)
processed_input.append(item)
elif item_type == "reasoning":
if "id" not in item:
item = {**item, "id": f"rs_{random_uuid()}"}
try:
processed_input.append(ResponseReasoningItem(**item))
except ValidationError:
logger.debug(
"Failed to parse reasoning to ResponseReasoningItem, "
"leaving for Pydantic validation"
)
processed_input.append(item)
elif item_type == "message" and item.get("role") == "assistant":
item = dict(item)
if "id" not in item:
item["id"] = f"msg_{random_uuid()}"
if "status" not in item:
item["status"] = "completed"
# ResponseOutputText requires annotations
if isinstance(item.get("content"), list):
new_content = []
for c in item["content"]:
if (
isinstance(c, dict)
and c.get("type") == "output_text"
and "annotations" not in c
):
c = {**c, "annotations": []}
new_content.append(c)
item["content"] = new_content
try:
processed_input.append(ResponseOutputMessage(**item))
except ValidationError:
logger.debug(
"Failed to parse assistant message to ResponseOutputMessage, "
"leaving for Pydantic validation"
)
processed_input.append(item)
else:
processed_input.append(item)
data["input"] = processed_input
return data
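# Illustrative normalization (payloads assumed for the example):
#   {"type": "reasoning", "summary": []}
#       -> ResponseReasoningItem with a generated id "rs_<uuid>"
#   {"type": "message", "role": "assistant",
#    "content": [{"type": "output_text", "text": "hi"}]}
#       -> ResponseOutputMessage with id/status filled in and
#          annotations=[] added to the output_text part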
@model_validator(mode="before")
@classmethod
def check_tool_usage(cls, data):
if not isinstance(data, dict):
return data
tools = data.get("tools")
tool_choice = data.get("tool_choice", "auto")
has_tools = tools is not None and len(tools) > 0
is_named_tool_choice = (
isinstance(tool_choice, dict) and tool_choice.get("type") == "function"
)
if not has_tools:
if tool_choice in ("auto", "none"):
data["tool_choice"] = "none"
elif tool_choice == "required":
raise VLLMValidationError(
"Tool choice 'required' must be specified with 'tools' parameter.",
parameter="tool_choice",
)
elif is_named_tool_choice:
raise VLLMValidationError(
"Tool choice 'function' not found in 'tools' parameter.",
parameter="tool_choice",
)
elif is_named_tool_choice and tools is not None:
tool_name = tool_choice.get("name")
tool_names = {
t.get("name") if isinstance(t, dict) else getattr(t, "name", None)
for t in tools
}
if not tool_name or tool_name not in tool_names:
raise VLLMValidationError(
"Tool choice 'function' not found in 'tools' parameter.",
parameter="tool_choice",
)
return data
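# Illustrative outcomes (payloads assumed for the example):
#   {"tool_choice": "auto"}                            -> coerced to "none"
#       (no tools supplied)
#   {"tool_choice": "required"}                        -> VLLMValidationError
#       (requires a non-empty "tools" list)
#   {"tool_choice": {"type": "function", "name": "f"},
#    "tools": [{"type": "function", "name": "g"}]}     -> VLLMValidationError
#       ("f" is not among the supplied tool names)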