Skip to content

vllm.outputs

_O module-attribute

_O = TypeVar('_O', default=PoolingOutput)

logger module-attribute

logger = init_logger(__name__)

ClassificationOutput dataclass

The output data of one classification output of a request.

Parameters:

Name Type Description Default
probs list[float]

The probability vector, which is a list of floats. Its length depends on the number of classes.

required
Source code in vllm/outputs.py
@dataclass
class ClassificationOutput:
    """Classification result produced for a single request.

    Args:
        probs: The probability vector as a flat list of floats, with one
            entry per class.
    """

    probs: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        """Build a classification output from a generic pooling output.

        Raises:
            ValueError: If ``pooling_output.data`` is not one-dimensional.
        """
        # Expected shape of the pooled tensor: (num_classes)
        class_probs = pooling_output.data
        if class_probs.ndim == 1:
            return ClassificationOutput(class_probs.tolist())

        raise ValueError("pooled_data should be a 1-D probability vector")

    @property
    def num_classes(self) -> int:
        """Number of classes, i.e. the length of ``probs``."""
        return len(self.probs)

    def __repr__(self) -> str:
        # Only the vector length is shown, not the probabilities themselves.
        count = self.num_classes
        return f"ClassificationOutput(num_classes={count})"

num_classes property

num_classes: int

probs instance-attribute

probs: list[float]

__init__

__init__(probs: list[float]) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    """Return a short summary that shows only the number of classes."""
    count = self.num_classes
    return f"ClassificationOutput(num_classes={count})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    """Build a ``ClassificationOutput`` from a generic pooling output.

    Raises:
        ValueError: If ``pooling_output.data`` is not one-dimensional.
    """
    # Expected shape of the pooled tensor: (num_classes)
    class_probs = pooling_output.data
    if class_probs.ndim == 1:
        return ClassificationOutput(class_probs.tolist())

    raise ValueError("pooled_data should be a 1-D probability vector")

ClassificationRequestOutput

Bases: PoolingRequestOutput[ClassificationOutput]

Source code in vllm/outputs.py
class ClassificationRequestOutput(PoolingRequestOutput[ClassificationOutput]):
    """Pooling request output whose payload is a ``ClassificationOutput``."""

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        """Re-wrap a generic pooling request output as a classification one.

        The request metadata is copied unchanged; only ``outputs`` is
        converted, via ``ClassificationOutput.from_base``.
        """
        fields = dict(
            request_id=request_output.request_id,
            outputs=ClassificationOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            num_cached_tokens=request_output.num_cached_tokens,
            finished=request_output.finished,
        )
        return ClassificationRequestOutput(**fields)

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    """Re-wrap a generic pooling request output as a classification one.

    The request metadata is copied unchanged; only ``outputs`` is
    converted, via ``ClassificationOutput.from_base``.
    """
    req_id = request_output.request_id
    converted = ClassificationOutput.from_base(request_output.outputs)
    return ClassificationRequestOutput(
        request_id=req_id,
        outputs=converted,
        prompt_token_ids=request_output.prompt_token_ids,
        num_cached_tokens=request_output.num_cached_tokens,
        finished=request_output.finished,
    )

CompletionOutput dataclass

The output data of one completion output of a request.

Parameters:

Name Type Description Default
index int

The index of the output in the request.

required
text str

The generated output text.

required
token_ids Sequence[int]

The token IDs of the generated output text.

required
cumulative_logprob float | None

The cumulative log probability of the generated output text.

required
logprobs SampleLogprobs | None

The log probabilities of the top probability words at each position if the logprobs are requested.

required
finish_reason str | None

The reason why the sequence is finished.

None
stop_reason int | str | None

The stop string or token id that caused the completion to stop, None if the completion finished for some other reason including encountering the EOS token.

None
lora_request LoRARequest | None

The LoRA request that was used to generate the output.

None
Source code in vllm/outputs.py
@dataclass
class CompletionOutput:
    """One generated completion belonging to a request.

    Args:
        index: Position of this output within the request.
        text: The generated output text.
        token_ids: Token IDs of the generated output text.
        cumulative_logprob: Cumulative log probability of the generated
            output text.
        logprobs: Log probabilities of the top probability words at each
            position, if logprobs were requested.
        finish_reason: Why the sequence finished; None while unfinished.
        stop_reason: The stop string or token id that caused the completion
            to stop; None if it finished for some other reason, including
            encountering the EOS token.
        lora_request: The LoRA request that was used to generate the output.
    """

    index: int
    text: str
    token_ids: GenericSequence[int]
    cumulative_logprob: float | None
    logprobs: SampleLogprobs | None
    finish_reason: str | None = None
    stop_reason: int | str | None = None
    lora_request: LoRARequest | None = None

    def finished(self) -> bool:
        """Return True once a finish reason has been recorded."""
        return not (self.finish_reason is None)

    def __repr__(self) -> str:
        # ``lora_request`` is not part of the printed representation.
        shown = (
            f"index={self.index}",
            f"text={self.text!r}",
            f"token_ids={self.token_ids}",
            f"cumulative_logprob={self.cumulative_logprob}",
            f"logprobs={self.logprobs}",
            f"finish_reason={self.finish_reason}",
            f"stop_reason={self.stop_reason}",
        )
        return "CompletionOutput(" + ", ".join(shown) + ")"

cumulative_logprob instance-attribute

cumulative_logprob: float | None

finish_reason class-attribute instance-attribute

finish_reason: str | None = None

index instance-attribute

index: int

logprobs instance-attribute

logprobs: SampleLogprobs | None

lora_request class-attribute instance-attribute

lora_request: LoRARequest | None = None

stop_reason class-attribute instance-attribute

stop_reason: int | str | None = None

text instance-attribute

text: str

token_ids instance-attribute

token_ids: Sequence[int]

__init__

__init__(
    index: int,
    text: str,
    token_ids: Sequence[int],
    cumulative_logprob: float | None,
    logprobs: SampleLogprobs | None,
    finish_reason: str | None = None,
    stop_reason: int | str | None = None,
    lora_request: LoRARequest | None = None,
) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    """Return a one-line description of the completion.

    ``lora_request`` is not part of the printed representation.
    """
    shown = (
        f"index={self.index}",
        f"text={self.text!r}",
        f"token_ids={self.token_ids}",
        f"cumulative_logprob={self.cumulative_logprob}",
        f"logprobs={self.logprobs}",
        f"finish_reason={self.finish_reason}",
        f"stop_reason={self.stop_reason}",
    )
    return "CompletionOutput(" + ", ".join(shown) + ")"

finished

finished() -> bool
Source code in vllm/outputs.py
def finished(self) -> bool:
    """Return True once a finish reason has been recorded."""
    return not (self.finish_reason is None)

EmbeddingOutput dataclass

The output data of one embedding output of a request.

Parameters:

Name Type Description Default
embedding list[float]

The embedding vector, which is a list of floats. Its length depends on the hidden dimension of the model.

required
Source code in vllm/outputs.py
@dataclass
class EmbeddingOutput:
    """Embedding result produced for a single request.

    Args:
        embedding: The embedding vector as a flat list of floats. Its
            length depends on the hidden dimension of the model.
    """

    embedding: list[float]

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        """Build an embedding output from a generic pooling output.

        Raises:
            ValueError: If ``pooling_output.data`` is not one-dimensional.
        """
        vector = pooling_output.data
        if vector.ndim == 1:
            return EmbeddingOutput(vector.tolist())

        raise ValueError("pooled_data should be a 1-D embedding vector")

    @property
    def hidden_size(self) -> int:
        """Length of the embedding vector."""
        return len(self.embedding)

    def __repr__(self) -> str:
        # Only the vector length is shown, not the embedding values.
        size = self.hidden_size
        return f"EmbeddingOutput(hidden_size={size})"

embedding instance-attribute

embedding: list[float]

hidden_size property

hidden_size: int

__init__

__init__(embedding: list[float]) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    """Return a short summary that shows only the embedding length."""
    size = self.hidden_size
    return f"EmbeddingOutput(hidden_size={size})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    """Build an ``EmbeddingOutput`` from a generic pooling output.

    Raises:
        ValueError: If ``pooling_output.data`` is not one-dimensional.
    """
    vector = pooling_output.data
    if vector.ndim == 1:
        return EmbeddingOutput(vector.tolist())

    raise ValueError("pooled_data should be a 1-D embedding vector")

EmbeddingRequestOutput

Bases: PoolingRequestOutput[EmbeddingOutput]

Source code in vllm/outputs.py
class EmbeddingRequestOutput(PoolingRequestOutput[EmbeddingOutput]):
    """Pooling request output whose payload is an ``EmbeddingOutput``."""

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        """Re-wrap a generic pooling request output as an embedding one.

        The request metadata is copied unchanged; only ``outputs`` is
        converted, via ``EmbeddingOutput.from_base``.
        """
        fields = dict(
            request_id=request_output.request_id,
            outputs=EmbeddingOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            num_cached_tokens=request_output.num_cached_tokens,
            finished=request_output.finished,
        )
        return EmbeddingRequestOutput(**fields)

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    """Re-wrap a generic pooling request output as an embedding one.

    The request metadata is copied unchanged; only ``outputs`` is
    converted, via ``EmbeddingOutput.from_base``.
    """
    req_id = request_output.request_id
    converted = EmbeddingOutput.from_base(request_output.outputs)
    return EmbeddingRequestOutput(
        request_id=req_id,
        outputs=converted,
        prompt_token_ids=request_output.prompt_token_ids,
        num_cached_tokens=request_output.num_cached_tokens,
        finished=request_output.finished,
    )

PoolingOutput dataclass

The output data of one pooling output of a request.

Parameters:

Name Type Description Default
data Tensor

The extracted hidden states.

required
Source code in vllm/outputs.py
@dataclass
class PoolingOutput:
    """The output data of one pooling output of a request.

    Args:
        data: The extracted hidden states.
    """

    data: torch.Tensor

    def __repr__(self) -> str:
        return f"PoolingOutput(data={self.data})"

    def __eq__(self, other: object) -> bool:
        """Return True when ``other`` holds a tensor of identical shape and values.

        Args:
            other: Object to compare against.

        Returns:
            False if ``other`` is not an instance of this object's class or
            if the two tensors differ in shape; otherwise whether every
            element compares equal.
        """
        if not isinstance(other, self.__class__):
            return False

        # Element-wise ``==`` broadcasts, so without this check tensors of
        # different but compatible shapes (e.g. (1,) vs (3,)) could compare
        # equal, and incompatible shapes would raise instead of being unequal.
        if self.data.shape != other.data.shape:
            return False

        return bool((self.data == other.data).all())

data instance-attribute

data: Tensor

__eq__

__eq__(other: object) -> bool
Source code in vllm/outputs.py
def __eq__(self, other: object) -> bool:
    """Return True when ``other`` holds a tensor of identical shape and values.

    Args:
        other: Object to compare against.

    Returns:
        False if ``other`` is not an instance of this object's class or if
        the two tensors differ in shape; otherwise whether every element
        compares equal.
    """
    if not isinstance(other, self.__class__):
        return False

    # Element-wise ``==`` broadcasts, so without this check tensors of
    # different but compatible shapes (e.g. (1,) vs (3,)) could compare
    # equal, and incompatible shapes would raise instead of being unequal.
    if self.data.shape != other.data.shape:
        return False

    return bool((self.data == other.data).all())

__init__

__init__(data: Tensor) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    """Return a representation that embeds the stored ``data`` value."""
    payload = self.data
    return f"PoolingOutput(data={payload})"

PoolingRequestOutput

Bases: Generic[_O]

The output data of a pooling request to the LLM.

Parameters:

Name Type Description Default
request_id str

A unique identifier for the pooling request.

required
outputs PoolingOutput

The pooling results for the given input.

required
prompt_token_ids list[int]

A list of token IDs used in the prompt.

required
num_cached_tokens int

The number of tokens with prefix cache hit.

required
finished bool

A flag indicating whether the pooling is completed.

required
Source code in vllm/outputs.py
class PoolingRequestOutput(Generic[_O]):
    """
    Result of a pooling request submitted to the LLM.

    Args:
        request_id (str): A unique identifier for the pooling request.
        outputs: The pooling results for the given input; its type is the
            class's type parameter.
        prompt_token_ids (list[int]): A list of token IDs used in the prompt.
        num_cached_tokens: The number of tokens with prefix cache hit.
        finished (bool): A flag indicating whether the pooling is completed.
    """

    def __init__(
        self,
        request_id: str,
        outputs: _O,
        prompt_token_ids: list[int],
        num_cached_tokens: int,
        finished: bool,
    ):
        # Plain attribute storage, in parameter order.
        self.request_id = request_id
        self.outputs = outputs
        self.prompt_token_ids = prompt_token_ids
        self.num_cached_tokens = num_cached_tokens
        self.finished = finished

    def __repr__(self):
        # The runtime class name is used so subclasses print their own name.
        cls_name = type(self).__name__
        shown = (
            f"request_id={self.request_id!r}",
            f"outputs={self.outputs!r}",
            f"prompt_token_ids={self.prompt_token_ids}",
            f"num_cached_tokens={self.num_cached_tokens}",
            f"finished={self.finished}",
        )
        return cls_name + "(" + ", ".join(shown) + ")"

finished instance-attribute

finished = finished

num_cached_tokens instance-attribute

num_cached_tokens = num_cached_tokens

outputs instance-attribute

outputs = outputs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    outputs: _O,
    prompt_token_ids: list[int],
    num_cached_tokens: int,
    finished: bool,
)
Source code in vllm/outputs.py
def __init__(
    self,
    request_id: str,
    outputs: _O,
    prompt_token_ids: list[int],
    num_cached_tokens: int,
    finished: bool,
):
    """Store every argument on the instance under the same name."""
    self.request_id = request_id
    self.outputs = outputs
    self.prompt_token_ids = prompt_token_ids
    self.num_cached_tokens = num_cached_tokens
    self.finished = finished

__repr__

__repr__()
Source code in vllm/outputs.py
def __repr__(self):
    """Return a description prefixed with the instance's runtime class name."""
    cls_name = type(self).__name__
    shown = (
        f"request_id={self.request_id!r}",
        f"outputs={self.outputs!r}",
        f"prompt_token_ids={self.prompt_token_ids}",
        f"num_cached_tokens={self.num_cached_tokens}",
        f"finished={self.finished}",
    )
    return cls_name + "(" + ", ".join(shown) + ")"

RequestOutput

The output data of a completion request to the LLM.

Parameters:

Name Type Description Default
request_id str

The unique ID of the request.

required
prompt str | None

The prompt string of the request. For encoder/decoder models, this is the decoder input prompt.

required
prompt_token_ids list[int] | None

The token IDs of the prompt. For encoder/decoder models, this is the decoder input prompt token ids.

required
prompt_logprobs PromptLogprobs | None

The log probabilities to return per prompt token.

required
outputs list[CompletionOutput]

The output sequences of the request.

required
finished bool

Whether the whole request is finished.

required
metrics RequestMetrics | RequestStateStats | None

Metrics associated with the request.

None
lora_request LoRARequest | None

The LoRA request that was used to generate the output.

None
encoder_prompt str | None

The encoder prompt string of the request. None if decoder-only.

None
encoder_prompt_token_ids list[int] | None

The token IDs of the encoder prompt. None if decoder-only.

None
num_cached_tokens int | None

The number of tokens with prefix cache hit.

None
kv_transfer_params dict[str, Any] | None

The params for remote K/V transfer.

None
Source code in vllm/outputs.py
class RequestOutput:
    """Result of a completion request submitted to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request. For encoder/decoder
            models, this is the decoder input prompt.
        prompt_token_ids: The token IDs of the prompt. For encoder/decoder
            models, this is the decoder input prompt token ids.
        prompt_logprobs: The log probabilities to return per prompt token.
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
        metrics: Metrics associated with the request.
        lora_request: The LoRA request that was used to generate the output.
        encoder_prompt: The encoder prompt string of the request.
            None if decoder-only.
        encoder_prompt_token_ids: The token IDs of the encoder prompt.
            None if decoder-only.
        num_cached_tokens: The number of tokens with prefix cache hit.
        multi_modal_placeholders: Keyword-only. Stored as an empty dict
            when omitted or otherwise falsy.
        kv_transfer_params: Keyword-only. The params for remote K/V transfer.
        **kwargs: Unrecognised keyword arguments; they are ignored after a
            warning is logged.
    """

    def __init__(
        self,
        request_id: str,
        prompt: str | None,
        prompt_token_ids: list[int] | None,
        prompt_logprobs: PromptLogprobs | None,
        outputs: list[CompletionOutput],
        finished: bool,
        metrics: RequestMetrics | RequestStateStats | None = None,
        lora_request: LoRARequest | None = None,
        encoder_prompt: str | None = None,
        encoder_prompt_token_ids: list[int] | None = None,
        num_cached_tokens: int | None = None,
        *,
        multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
        kv_transfer_params: dict[str, Any] | None = None,
        # Forward compatibility, code that uses args added in new release can
        # still run with older versions of vLLM without breaking.
        **kwargs: Any,
    ) -> None:
        # Extra keyword arguments are tolerated but reported.
        if kwargs:
            logger.warning_once(
                "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
            )

        # Request identity and prompt information.
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.prompt_logprobs = prompt_logprobs
        self.encoder_prompt = encoder_prompt
        self.encoder_prompt_token_ids = encoder_prompt_token_ids
        self.multi_modal_placeholders = (
            multi_modal_placeholders if multi_modal_placeholders else {}
        )

        # Generation results and bookkeeping.
        self.outputs = outputs
        self.finished = finished
        self.metrics = metrics
        self.lora_request = lora_request
        self.num_cached_tokens = num_cached_tokens
        self.kv_transfer_params = kv_transfer_params

    def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
        """Merge subsequent RequestOutput into this one"""

        # ``finished`` is sticky; ``kv_transfer_params`` always takes the
        # newest value, even when that value is None.
        self.finished |= next_output.finished
        self.kv_transfer_params = next_output.kv_transfer_params

        for incoming in next_output.outputs:
            # Position of the first stored completion with the same index.
            slot = next(
                (
                    pos
                    for pos, existing in enumerate(self.outputs)
                    if existing.index == incoming.index
                ),
                None,
            )

            if slot is None:
                # No completion with this index yet: start tracking it.
                self.outputs.append(incoming)
                continue

            if not aggregate:
                # Replace the output with the new one
                self.outputs[slot] = incoming
                continue

            # Merge outputs with same index
            current = self.outputs[slot]
            current.text += incoming.text
            if not isinstance(current.token_ids, MutableSequence):
                # Immutable sequences are copied to a list before extending.
                current.token_ids = list(current.token_ids)
            current.token_ids.extend(incoming.token_ids)
            if incoming.logprobs:
                assert current.logprobs is not None
                current.logprobs.extend(incoming.logprobs)
            # These fields are overwritten with the newest values rather
            # than accumulated.
            current.cumulative_logprob = incoming.cumulative_logprob
            current.finish_reason = incoming.finish_reason
            current.stop_reason = incoming.stop_reason

    def __repr__(self) -> str:
        # ``kv_transfer_params`` is not part of the printed representation.
        shown = (
            f"request_id={self.request_id}",
            f"prompt={self.prompt!r}",
            f"prompt_token_ids={self.prompt_token_ids}",
            f"encoder_prompt={self.encoder_prompt!r}",
            f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}",
            f"prompt_logprobs={self.prompt_logprobs}",
            f"outputs={self.outputs}",
            f"finished={self.finished}",
            f"metrics={self.metrics}",
            f"lora_request={self.lora_request}",
            f"num_cached_tokens={self.num_cached_tokens}",
            f"multi_modal_placeholders={self.multi_modal_placeholders}",
        )
        return "RequestOutput(" + ", ".join(shown) + ")"

encoder_prompt instance-attribute

encoder_prompt = encoder_prompt

encoder_prompt_token_ids instance-attribute

encoder_prompt_token_ids = encoder_prompt_token_ids

finished instance-attribute

finished = finished

kv_transfer_params instance-attribute

kv_transfer_params = kv_transfer_params

lora_request instance-attribute

lora_request = lora_request

metrics instance-attribute

metrics = metrics

multi_modal_placeholders instance-attribute

multi_modal_placeholders = multi_modal_placeholders or {}

num_cached_tokens instance-attribute

num_cached_tokens = num_cached_tokens

outputs instance-attribute

outputs = outputs

prompt instance-attribute

prompt = prompt

prompt_logprobs instance-attribute

prompt_logprobs = prompt_logprobs

prompt_token_ids instance-attribute

prompt_token_ids = prompt_token_ids

request_id instance-attribute

request_id = request_id

__init__

__init__(
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_logprobs: PromptLogprobs | None,
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: RequestMetrics
    | RequestStateStats
    | None = None,
    lora_request: LoRARequest | None = None,
    encoder_prompt: str | None = None,
    encoder_prompt_token_ids: list[int] | None = None,
    num_cached_tokens: int | None = None,
    *,
    multi_modal_placeholders: MultiModalPlaceholderDict
    | None = None,
    kv_transfer_params: dict[str, Any] | None = None,
    **kwargs: Any,
) -> None
Source code in vllm/outputs.py
def __init__(
    self,
    request_id: str,
    prompt: str | None,
    prompt_token_ids: list[int] | None,
    prompt_logprobs: PromptLogprobs | None,
    outputs: list[CompletionOutput],
    finished: bool,
    metrics: RequestMetrics | RequestStateStats | None = None,
    lora_request: LoRARequest | None = None,
    encoder_prompt: str | None = None,
    encoder_prompt_token_ids: list[int] | None = None,
    num_cached_tokens: int | None = None,
    *,
    multi_modal_placeholders: MultiModalPlaceholderDict | None = None,
    kv_transfer_params: dict[str, Any] | None = None,
    # Forward compatibility, code that uses args added in new release can
    # still run with older versions of vLLM without breaking.
    **kwargs: Any,
) -> None:
    """Store the request fields on the instance.

    Unrecognised keyword arguments are ignored after a warning is logged,
    and a falsy ``multi_modal_placeholders`` is stored as an empty dict.
    """
    if kwargs:
        logger.warning_once(
            "RequestOutput: Ignoring extra arguments: %s", str(kwargs)
        )

    # Request identity and prompt information.
    self.request_id = request_id
    self.prompt = prompt
    self.prompt_token_ids = prompt_token_ids
    self.prompt_logprobs = prompt_logprobs
    self.encoder_prompt = encoder_prompt
    self.encoder_prompt_token_ids = encoder_prompt_token_ids
    self.multi_modal_placeholders = (
        multi_modal_placeholders if multi_modal_placeholders else {}
    )

    # Generation results and bookkeeping.
    self.outputs = outputs
    self.finished = finished
    self.metrics = metrics
    self.lora_request = lora_request
    self.num_cached_tokens = num_cached_tokens
    self.kv_transfer_params = kv_transfer_params

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    """Return a one-line description of the request output.

    ``kv_transfer_params`` is not part of the printed representation.
    """
    shown = (
        f"request_id={self.request_id}",
        f"prompt={self.prompt!r}",
        f"prompt_token_ids={self.prompt_token_ids}",
        f"encoder_prompt={self.encoder_prompt!r}",
        f"encoder_prompt_token_ids={self.encoder_prompt_token_ids}",
        f"prompt_logprobs={self.prompt_logprobs}",
        f"outputs={self.outputs}",
        f"finished={self.finished}",
        f"metrics={self.metrics}",
        f"lora_request={self.lora_request}",
        f"num_cached_tokens={self.num_cached_tokens}",
        f"multi_modal_placeholders={self.multi_modal_placeholders}",
    )
    return "RequestOutput(" + ", ".join(shown) + ")"

add

add(next_output: RequestOutput, aggregate: bool) -> None

Merge subsequent RequestOutput into this one

Source code in vllm/outputs.py
def add(self, next_output: "RequestOutput", aggregate: bool) -> None:
    """Merge subsequent RequestOutput into this one"""

    # ``finished`` is sticky; ``kv_transfer_params`` always takes the newest
    # value, even when that value is None.
    self.finished |= next_output.finished
    self.kv_transfer_params = next_output.kv_transfer_params

    for incoming in next_output.outputs:
        # Position of the first stored completion with the same index.
        slot = next(
            (
                pos
                for pos, existing in enumerate(self.outputs)
                if existing.index == incoming.index
            ),
            None,
        )

        if slot is None:
            # No completion with this index yet: start tracking it.
            self.outputs.append(incoming)
            continue

        if not aggregate:
            # Replace the output with the new one
            self.outputs[slot] = incoming
            continue

        # Merge outputs with same index
        current = self.outputs[slot]
        current.text += incoming.text
        if not isinstance(current.token_ids, MutableSequence):
            # Immutable sequences are copied to a list before extending.
            current.token_ids = list(current.token_ids)
        current.token_ids.extend(incoming.token_ids)
        if incoming.logprobs:
            assert current.logprobs is not None
            current.logprobs.extend(incoming.logprobs)
        # These fields are overwritten with the newest values rather than
        # accumulated.
        current.cumulative_logprob = incoming.cumulative_logprob
        current.finish_reason = incoming.finish_reason
        current.stop_reason = incoming.stop_reason

ScoringOutput dataclass

The output data of one scoring output of a request.

Parameters:

Name Type Description Default
score float

The similarity score, which is a scalar value.

required
Source code in vllm/outputs.py
@dataclass
class ScoringOutput:
    """Scoring result produced for a single request.

    Args:
        score: The similarity score, which is a scalar value.
    """

    score: float

    @staticmethod
    def from_base(pooling_output: PoolingOutput):
        """Build a scoring output from a generic pooling output.

        Raises:
            ValueError: If the pooled data is not a scalar after squeezing.
        """
        # pooling_output shape:
        #   classify task: (num_classes) num_classes == 1
        #   embed task: a scalar value
        squeezed = pooling_output.data.squeeze()
        if squeezed.ndim == 0:
            return ScoringOutput(squeezed.item())

        raise ValueError("pooled_data should be a scalar score")

    def __repr__(self) -> str:
        value = self.score
        return f"ScoringOutput(score={value})"

score instance-attribute

score: float

__init__

__init__(score: float) -> None

__repr__

__repr__() -> str
Source code in vllm/outputs.py
def __repr__(self) -> str:
    """Return a representation that embeds the stored score."""
    value = self.score
    return f"ScoringOutput(score={value})"

from_base staticmethod

from_base(pooling_output: PoolingOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(pooling_output: PoolingOutput):
    """Build a ``ScoringOutput`` from a generic pooling output.

    Raises:
        ValueError: If the pooled data is not a scalar after squeezing.
    """
    # pooling_output shape:
    #   classify task: (num_classes) num_classes == 1
    #   embed task: a scalar value
    squeezed = pooling_output.data.squeeze()
    if squeezed.ndim == 0:
        return ScoringOutput(squeezed.item())

    raise ValueError("pooled_data should be a scalar score")

ScoringRequestOutput

Bases: PoolingRequestOutput[ScoringOutput]

Source code in vllm/outputs.py
class ScoringRequestOutput(PoolingRequestOutput[ScoringOutput]):
    """Pooling request output whose payload is a ``ScoringOutput``."""

    @staticmethod
    def from_base(request_output: PoolingRequestOutput):
        """Re-wrap a generic pooling request output as a scoring one.

        The request metadata is copied unchanged; only ``outputs`` is
        converted, via ``ScoringOutput.from_base``.
        """
        fields = dict(
            request_id=request_output.request_id,
            outputs=ScoringOutput.from_base(request_output.outputs),
            prompt_token_ids=request_output.prompt_token_ids,
            num_cached_tokens=request_output.num_cached_tokens,
            finished=request_output.finished,
        )
        return ScoringRequestOutput(**fields)

from_base staticmethod

from_base(request_output: PoolingRequestOutput)
Source code in vllm/outputs.py
@staticmethod
def from_base(request_output: PoolingRequestOutput):
    """Re-wrap a generic pooling request output as a scoring one.

    The request metadata is copied unchanged; only ``outputs`` is
    converted, via ``ScoringOutput.from_base``.
    """
    req_id = request_output.request_id
    converted = ScoringOutput.from_base(request_output.outputs)
    return ScoringRequestOutput(
        request_id=req_id,
        outputs=converted,
        prompt_token_ids=request_output.prompt_token_ids,
        num_cached_tokens=request_output.num_cached_tokens,
        finished=request_output.finished,
    )