| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
- from __future__ import annotations
- from typing import List, Union, Iterable
- from typing_extensions import Literal
- import httpx
- from .... import _legacy_response
- from ...._types import NOT_GIVEN, Body, Query, Headers, NotGiven
- from ...._utils import maybe_transform, async_maybe_transform
- from ...._compat import cached_property
- from ...._resource import SyncAPIResource, AsyncAPIResource
- from ...._response import to_streamed_response_wrapper, async_to_streamed_response_wrapper
- from ...._base_client import make_request_options
- from ....types.beta.realtime import session_create_params
- from ....types.beta.realtime.session_create_response import SessionCreateResponse
- __all__ = ["Sessions", "AsyncSessions"]
- class Sessions(SyncAPIResource):
- @cached_property
- def with_raw_response(self) -> SessionsWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
- For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
- """
- return SessionsWithRawResponse(self)
- @cached_property
- def with_streaming_response(self) -> SessionsWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- For more information, see https://www.github.com/openai/openai-python#with_streaming_response
- """
- return SessionsWithStreamingResponse(self)
- def create(
- self,
- *,
- client_secret: session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
- input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
- input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
- input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
- instructions: str | NotGiven = NOT_GIVEN,
- max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
- modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
- model: Literal[
- "gpt-realtime",
- "gpt-realtime-2025-08-28",
- "gpt-4o-realtime-preview",
- "gpt-4o-realtime-preview-2024-10-01",
- "gpt-4o-realtime-preview-2024-12-17",
- "gpt-4o-realtime-preview-2025-06-03",
- "gpt-4o-mini-realtime-preview",
- "gpt-4o-mini-realtime-preview-2024-12-17",
- ]
- | NotGiven = NOT_GIVEN,
- output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
- speed: float | NotGiven = NOT_GIVEN,
- temperature: float | NotGiven = NOT_GIVEN,
- tool_choice: str | NotGiven = NOT_GIVEN,
- tools: Iterable[session_create_params.Tool] | NotGiven = NOT_GIVEN,
- tracing: session_create_params.Tracing | NotGiven = NOT_GIVEN,
- turn_detection: session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
- voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]]
- | NotGiven = NOT_GIVEN,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
- ) -> SessionCreateResponse:
- """
- Create an ephemeral API token for use in client-side applications with the
- Realtime API. Can be configured with the same session parameters as the
- `session.update` client event.
- It responds with a session object, plus a `client_secret` key which contains a
- usable ephemeral API token that can be used to authenticate browser clients for
- the Realtime API.
- Args:
- client_secret: Configuration options for the generated client secret.
- input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
- `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
- (mono), and little-endian byte order.
- input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
- off. Noise reduction filters audio added to the input audio buffer before it is
- sent to VAD and the model. Filtering the audio can improve VAD and turn
- detection accuracy (reducing false positives) and model performance by improving
- perception of the input audio.
- input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
- `null` to turn off once on. Input audio transcription is not native to the
- model, since the model consumes audio directly. Transcription runs
- asynchronously through
- [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as guidance of input audio content rather than precisely
- what the model heard. The client can optionally set the language and prompt for
- transcription, these offer additional guidance to the transcription service.
- instructions: The default system instructions (i.e. system message) prepended to model calls.
- This field allows the client to guide the model on desired responses. The model
- can be instructed on response content and format, (e.g. "be extremely succinct",
- "act friendly", "here are examples of good responses") and on audio behavior
- (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
- instructions are not guaranteed to be followed by the model, but they provide
- guidance to the model on the desired behavior.
- Note that the server sets default instructions which will be used if this field
- is not set and are visible in the `session.created` event at the start of the
- session.
- max_response_output_tokens: Maximum number of output tokens for a single assistant response, inclusive of
- tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
- `inf` for the maximum available tokens for a given model. Defaults to `inf`.
- modalities: The set of modalities the model can respond with. To disable audio, set this to
- ["text"].
- model: The Realtime model used for this session.
- output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- For `pcm16`, output audio is sampled at a rate of 24kHz.
- speed: The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
- minimum speed. 1.5 is the maximum speed. This value can only be changed in
- between model turns, not while a response is in progress.
- temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
- temperature of 0.8 is highly recommended for best performance.
- tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
- a function.
- tools: Tools (functions) available to the model.
- tracing: Configuration options for tracing. Set to null to disable tracing. Once tracing
- is enabled for a session, the configuration cannot be modified.
- `auto` will create a trace for the session with default values for the workflow
- name, group id, and metadata.
- turn_detection: Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- set to `null` to turn off, in which case the client must manually trigger model
- response. Server VAD means that the model will detect the start and end of
- speech based on audio volume and respond at the end of user speech. Semantic VAD
- is more advanced and uses a turn detection model (in conjunction with VAD) to
- semantically estimate whether the user has finished speaking, then dynamically
- sets a timeout based on this probability. For example, if user audio trails off
- with "uhhm", the model will score a low probability of turn end and wait longer
- for the user to continue speaking. This can be useful for more natural
- conversations, but may have a higher latency.
- voice: The voice the model uses to respond. Voice cannot be changed during the session
- once the model has responded with audio at least once. Current voice options are
- `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
- return self._post(
- "/realtime/sessions",
- body=maybe_transform(
- {
- "client_secret": client_secret,
- "input_audio_format": input_audio_format,
- "input_audio_noise_reduction": input_audio_noise_reduction,
- "input_audio_transcription": input_audio_transcription,
- "instructions": instructions,
- "max_response_output_tokens": max_response_output_tokens,
- "modalities": modalities,
- "model": model,
- "output_audio_format": output_audio_format,
- "speed": speed,
- "temperature": temperature,
- "tool_choice": tool_choice,
- "tools": tools,
- "tracing": tracing,
- "turn_detection": turn_detection,
- "voice": voice,
- },
- session_create_params.SessionCreateParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=SessionCreateResponse,
- )
- class AsyncSessions(AsyncAPIResource):
- @cached_property
- def with_raw_response(self) -> AsyncSessionsWithRawResponse:
- """
- This property can be used as a prefix for any HTTP method call to return
- the raw response object instead of the parsed content.
- For more information, see https://www.github.com/openai/openai-python#accessing-raw-response-data-eg-headers
- """
- return AsyncSessionsWithRawResponse(self)
- @cached_property
- def with_streaming_response(self) -> AsyncSessionsWithStreamingResponse:
- """
- An alternative to `.with_raw_response` that doesn't eagerly read the response body.
- For more information, see https://www.github.com/openai/openai-python#with_streaming_response
- """
- return AsyncSessionsWithStreamingResponse(self)
- async def create(
- self,
- *,
- client_secret: session_create_params.ClientSecret | NotGiven = NOT_GIVEN,
- input_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
- input_audio_noise_reduction: session_create_params.InputAudioNoiseReduction | NotGiven = NOT_GIVEN,
- input_audio_transcription: session_create_params.InputAudioTranscription | NotGiven = NOT_GIVEN,
- instructions: str | NotGiven = NOT_GIVEN,
- max_response_output_tokens: Union[int, Literal["inf"]] | NotGiven = NOT_GIVEN,
- modalities: List[Literal["text", "audio"]] | NotGiven = NOT_GIVEN,
- model: Literal[
- "gpt-realtime",
- "gpt-realtime-2025-08-28",
- "gpt-4o-realtime-preview",
- "gpt-4o-realtime-preview-2024-10-01",
- "gpt-4o-realtime-preview-2024-12-17",
- "gpt-4o-realtime-preview-2025-06-03",
- "gpt-4o-mini-realtime-preview",
- "gpt-4o-mini-realtime-preview-2024-12-17",
- ]
- | NotGiven = NOT_GIVEN,
- output_audio_format: Literal["pcm16", "g711_ulaw", "g711_alaw"] | NotGiven = NOT_GIVEN,
- speed: float | NotGiven = NOT_GIVEN,
- temperature: float | NotGiven = NOT_GIVEN,
- tool_choice: str | NotGiven = NOT_GIVEN,
- tools: Iterable[session_create_params.Tool] | NotGiven = NOT_GIVEN,
- tracing: session_create_params.Tracing | NotGiven = NOT_GIVEN,
- turn_detection: session_create_params.TurnDetection | NotGiven = NOT_GIVEN,
- voice: Union[str, Literal["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"]]
- | NotGiven = NOT_GIVEN,
- # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
- # The extra values given here take precedence over values defined on the client or passed to this method.
- extra_headers: Headers | None = None,
- extra_query: Query | None = None,
- extra_body: Body | None = None,
- timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
- ) -> SessionCreateResponse:
- """
- Create an ephemeral API token for use in client-side applications with the
- Realtime API. Can be configured with the same session parameters as the
- `session.update` client event.
- It responds with a session object, plus a `client_secret` key which contains a
- usable ephemeral API token that can be used to authenticate browser clients for
- the Realtime API.
- Args:
- client_secret: Configuration options for the generated client secret.
- input_audio_format: The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
- `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
- (mono), and little-endian byte order.
- input_audio_noise_reduction: Configuration for input audio noise reduction. This can be set to `null` to turn
- off. Noise reduction filters audio added to the input audio buffer before it is
- sent to VAD and the model. Filtering the audio can improve VAD and turn
- detection accuracy (reducing false positives) and model performance by improving
- perception of the input audio.
- input_audio_transcription: Configuration for input audio transcription, defaults to off and can be set to
- `null` to turn off once on. Input audio transcription is not native to the
- model, since the model consumes audio directly. Transcription runs
- asynchronously through
- [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
- and should be treated as guidance of input audio content rather than precisely
- what the model heard. The client can optionally set the language and prompt for
- transcription, these offer additional guidance to the transcription service.
- instructions: The default system instructions (i.e. system message) prepended to model calls.
- This field allows the client to guide the model on desired responses. The model
- can be instructed on response content and format, (e.g. "be extremely succinct",
- "act friendly", "here are examples of good responses") and on audio behavior
- (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
- instructions are not guaranteed to be followed by the model, but they provide
- guidance to the model on the desired behavior.
- Note that the server sets default instructions which will be used if this field
- is not set and are visible in the `session.created` event at the start of the
- session.
- max_response_output_tokens: Maximum number of output tokens for a single assistant response, inclusive of
- tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
- `inf` for the maximum available tokens for a given model. Defaults to `inf`.
- modalities: The set of modalities the model can respond with. To disable audio, set this to
- ["text"].
- model: The Realtime model used for this session.
- output_audio_format: The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
- For `pcm16`, output audio is sampled at a rate of 24kHz.
- speed: The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
- minimum speed. 1.5 is the maximum speed. This value can only be changed in
- between model turns, not while a response is in progress.
- temperature: Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
- temperature of 0.8 is highly recommended for best performance.
- tool_choice: How the model chooses tools. Options are `auto`, `none`, `required`, or specify
- a function.
- tools: Tools (functions) available to the model.
- tracing: Configuration options for tracing. Set to null to disable tracing. Once tracing
- is enabled for a session, the configuration cannot be modified.
- `auto` will create a trace for the session with default values for the workflow
- name, group id, and metadata.
- turn_detection: Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
- set to `null` to turn off, in which case the client must manually trigger model
- response. Server VAD means that the model will detect the start and end of
- speech based on audio volume and respond at the end of user speech. Semantic VAD
- is more advanced and uses a turn detection model (in conjunction with VAD) to
- semantically estimate whether the user has finished speaking, then dynamically
- sets a timeout based on this probability. For example, if user audio trails off
- with "uhhm", the model will score a low probability of turn end and wait longer
- for the user to continue speaking. This can be useful for more natural
- conversations, but may have a higher latency.
- voice: The voice the model uses to respond. Voice cannot be changed during the session
- once the model has responded with audio at least once. Current voice options are
- `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
- extra_headers: Send extra headers
- extra_query: Add additional query parameters to the request
- extra_body: Add additional JSON properties to the request
- timeout: Override the client-level default timeout for this request, in seconds
- """
- extra_headers = {"OpenAI-Beta": "assistants=v2", **(extra_headers or {})}
- return await self._post(
- "/realtime/sessions",
- body=await async_maybe_transform(
- {
- "client_secret": client_secret,
- "input_audio_format": input_audio_format,
- "input_audio_noise_reduction": input_audio_noise_reduction,
- "input_audio_transcription": input_audio_transcription,
- "instructions": instructions,
- "max_response_output_tokens": max_response_output_tokens,
- "modalities": modalities,
- "model": model,
- "output_audio_format": output_audio_format,
- "speed": speed,
- "temperature": temperature,
- "tool_choice": tool_choice,
- "tools": tools,
- "tracing": tracing,
- "turn_detection": turn_detection,
- "voice": voice,
- },
- session_create_params.SessionCreateParams,
- ),
- options=make_request_options(
- extra_headers=extra_headers, extra_query=extra_query, extra_body=extra_body, timeout=timeout
- ),
- cast_to=SessionCreateResponse,
- )
- class SessionsWithRawResponse:
- def __init__(self, sessions: Sessions) -> None:
- self._sessions = sessions
- self.create = _legacy_response.to_raw_response_wrapper(
- sessions.create,
- )
- class AsyncSessionsWithRawResponse:
- def __init__(self, sessions: AsyncSessions) -> None:
- self._sessions = sessions
- self.create = _legacy_response.async_to_raw_response_wrapper(
- sessions.create,
- )
- class SessionsWithStreamingResponse:
- def __init__(self, sessions: Sessions) -> None:
- self._sessions = sessions
- self.create = to_streamed_response_wrapper(
- sessions.create,
- )
- class AsyncSessionsWithStreamingResponse:
- def __init__(self, sessions: AsyncSessions) -> None:
- self._sessions = sessions
- self.create = async_to_streamed_response_wrapper(
- sessions.create,
- )
|