// sessions.d.mts
  import { APIResource } from "../../../core/resource.mjs";
  import { APIPromise } from "../../../core/api-promise.mjs";
  import { RequestOptions } from "../../../internal/request-options.mjs";
  /**
   * API resource exposing the Realtime session creation call.
   */
  export declare class Sessions extends APIResource {
    /**
     * Create an ephemeral API token for use in client-side applications with the
     * Realtime API. Can be configured with the same session parameters as the
     * `session.update` client event.
     *
     * It responds with a session object, plus a `client_secret` key which contains a
     * usable ephemeral API token that can be used to authenticate browser clients for
     * the Realtime API.
     *
     * @param body - Session configuration. The argument itself is required, but
     *   every field of `SessionCreateParams` is optional.
     * @param options - Optional request options.
     * @returns An `APIPromise` of the `SessionCreateResponse`.
     *
     * @example
     * ```ts
     * const session = await client.beta.realtime.sessions.create({
     *   model: 'gpt-4o-realtime-preview',
     * });
     * ```
     */
    create(body: SessionCreateParams, options?: RequestOptions): APIPromise<SessionCreateResponse>;
  }
  /**
   * Realtime session object configuration.
   */
  export interface Session {
    /**
     * Unique identifier for the session that looks like `sess_1234567890abcdef`.
     */
    id?: string;
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
     * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
     * (mono), and little-endian byte order.
     */
    input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    input_audio_noise_reduction?: Session.InputAudioNoiseReduction;
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    input_audio_transcription?: Session.InputAudioTranscription;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The Realtime model used for this session.
     */
    model?:
      | 'gpt-4o-realtime-preview'
      | 'gpt-4o-realtime-preview-2024-10-01'
      | 'gpt-4o-realtime-preview-2024-12-17'
      | 'gpt-4o-realtime-preview-2025-06-03'
      | 'gpt-4o-mini-realtime-preview'
      | 'gpt-4o-mini-realtime-preview-2024-12-17';
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     * For `pcm16`, output audio is sampled at a rate of 24kHz.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
     * minimum speed. 1.5 is the maximum speed. This value can only be changed in
     * between model turns, not while a response is in progress.
     */
    speed?: number;
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
     * temperature of 0.8 is highly recommended for best performance.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<Session.Tool>;
    /**
     * Configuration options for tracing. Set to null to disable tracing. Once tracing
     * is enabled for a session, the configuration cannot be modified.
     *
     * `auto` will create a trace for the session with default values for the workflow
     * name, group id, and metadata.
     */
    tracing?: 'auto' | Session.TracingConfiguration;
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    turn_detection?: Session.TurnDetection;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
     */
    voice?: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }
  export declare namespace Session {
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    interface InputAudioNoiseReduction {
      /**
       * Type of noise reduction. `near_field` is for close-talking microphones such as
       * headphones, `far_field` is for far-field microphones such as laptop or
       * conference room microphones.
       */
      type?: 'near_field' | 'far_field';
    }
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    interface InputAudioTranscription {
      /**
       * The language of the input audio. Supplying the input language in
       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
       * format will improve accuracy and latency.
       */
      language?: string;
      /**
       * The model to use for transcription, current options are `gpt-4o-transcribe`,
       * `gpt-4o-mini-transcribe`, and `whisper-1`.
       */
      model?: string;
      /**
       * An optional text to guide the model's style or continue a previous audio
       * segment. For `whisper-1`, the
       * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
       * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
       * "expect words related to technology".
       */
      prompt?: string;
    }
    /**
     * A function tool made available to the model.
     */
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Granular configuration for tracing.
     */
    interface TracingConfiguration {
      /**
       * The group id to attach to this trace to enable filtering and grouping in the
       * traces dashboard.
       */
      group_id?: string;
      /**
       * The arbitrary metadata to attach to this trace to enable filtering in the traces
       * dashboard.
       */
      metadata?: unknown;
      /**
       * The name of the workflow to attach to this trace. This is used to name the trace
       * in the traces dashboard.
       */
      workflow_name?: string;
    }
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    interface TurnDetection {
      /**
       * Whether or not to automatically generate a response when a VAD stop event
       * occurs.
       */
      create_response?: boolean;
      /**
       * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
       * will wait longer for the user to continue speaking, `high` will respond more
       * quickly. `auto` is the default and is equivalent to `medium`.
       */
      eagerness?: 'low' | 'medium' | 'high' | 'auto';
      /**
       * Whether or not to automatically interrupt any ongoing response with output to
       * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
       * occurs.
       */
      interrupt_response?: boolean;
      /**
       * Used only for `server_vad` mode. Amount of audio to include before the VAD
       * detected speech (in milliseconds). Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
       * milliseconds). Defaults to 500ms. With shorter values the model will respond
       * more quickly, but may jump in on short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
       * defaults to 0.5. A higher threshold will require louder audio to activate the
       * model, and thus might perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection.
       */
      type?: 'server_vad' | 'semantic_vad';
    }
  }
  /**
   * A new Realtime session configuration, with an ephemeral key. Default TTL for
   * keys is one minute.
   */
  export interface SessionCreateResponse {
    /**
     * Ephemeral key returned by the API.
     */
    client_secret: SessionCreateResponse.ClientSecret;
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     */
    input_audio_format?: string;
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously and should be treated as rough guidance rather than the
     * representation understood by the model.
     */
    input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     */
    output_audio_format?: string;
    /**
     * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
     * minimum speed. 1.5 is the maximum speed. This value can only be changed in
     * between model turns, not while a response is in progress.
     */
    speed?: number;
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<SessionCreateResponse.Tool>;
    /**
     * Configuration options for tracing. Set to null to disable tracing. Once tracing
     * is enabled for a session, the configuration cannot be modified.
     *
     * `auto` will create a trace for the session with default values for the workflow
     * name, group id, and metadata.
     */
    tracing?: 'auto' | SessionCreateResponse.TracingConfiguration;
    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    turn_detection?: SessionCreateResponse.TurnDetection;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
     */
    voice?: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }
  export declare namespace SessionCreateResponse {
    /**
     * Ephemeral key returned by the API.
     */
    interface ClientSecret {
      /**
       * Timestamp for when the token expires. Currently, all tokens expire after one
       * minute.
       */
      expires_at: number;
      /**
       * Ephemeral key usable in client environments to authenticate connections to the
       * Realtime API. Use this in client-side environments rather than a standard API
       * token, which should only be used server-side.
       */
      value: string;
    }
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously and should be treated as rough guidance rather than the
     * representation understood by the model.
     */
    interface InputAudioTranscription {
      /**
       * The model to use for transcription.
       */
      model?: string;
    }
    /**
     * A function tool made available to the model.
     */
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Granular configuration for tracing.
     */
    interface TracingConfiguration {
      /**
       * The group id to attach to this trace to enable filtering and grouping in the
       * traces dashboard.
       */
      group_id?: string;
      /**
       * The arbitrary metadata to attach to this trace to enable filtering in the traces
       * dashboard.
       */
      metadata?: unknown;
      /**
       * The name of the workflow to attach to this trace. This is used to name the trace
       * in the traces dashboard.
       */
      workflow_name?: string;
    }
    /**
     * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
     * means that the model will detect the start and end of speech based on audio
     * volume and respond at the end of user speech.
     */
    interface TurnDetection {
      /**
       * Amount of audio to include before the VAD detected speech (in milliseconds).
       * Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
       * With shorter values the model will respond more quickly, but may jump in on
       * short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher
       * threshold will require louder audio to activate the model, and thus might
       * perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection, only `server_vad` is currently supported.
       */
      type?: string;
    }
  }
  /**
   * Request body accepted by `Sessions.create`. Every field is optional.
   */
  export interface SessionCreateParams {
    /**
     * Configuration options for the generated client secret.
     */
    client_secret?: SessionCreateParams.ClientSecret;
    /**
     * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. For
     * `pcm16`, input audio must be 16-bit PCM at a 24kHz sample rate, single channel
     * (mono), and little-endian byte order.
     */
    input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    input_audio_noise_reduction?: SessionCreateParams.InputAudioNoiseReduction | null;
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    input_audio_transcription?: SessionCreateParams.InputAudioTranscription | null;
    /**
     * The default system instructions (i.e. system message) prepended to model calls.
     * This field allows the client to guide the model on desired responses. The model
     * can be instructed on response content and format, (e.g. "be extremely succinct",
     * "act friendly", "here are examples of good responses") and on audio behavior
     * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
     * instructions are not guaranteed to be followed by the model, but they provide
     * guidance to the model on the desired behavior.
     *
     * Note that the server sets default instructions which will be used if this field
     * is not set and are visible in the `session.created` event at the start of the
     * session.
     */
    instructions?: string;
    /**
     * Maximum number of output tokens for a single assistant response, inclusive of
     * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
     * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
     */
    max_response_output_tokens?: number | 'inf';
    /**
     * The set of modalities the model can respond with. To disable audio, set this to
     * ["text"].
     */
    modalities?: Array<'text' | 'audio'>;
    /**
     * The Realtime model used for this session.
     */
    model?:
      | 'gpt-4o-realtime-preview'
      | 'gpt-4o-realtime-preview-2024-10-01'
      | 'gpt-4o-realtime-preview-2024-12-17'
      | 'gpt-4o-realtime-preview-2025-06-03'
      | 'gpt-4o-mini-realtime-preview'
      | 'gpt-4o-mini-realtime-preview-2024-12-17';
    /**
     * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
     * For `pcm16`, output audio is sampled at a rate of 24kHz.
     */
    output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
    /**
     * The speed of the model's spoken response. 1.0 is the default speed. 0.25 is the
     * minimum speed. 1.5 is the maximum speed. This value can only be changed in
     * between model turns, not while a response is in progress.
     */
    speed?: number;
    /**
     * Sampling temperature for the model, limited to [0.6, 1.2]. For audio models a
     * temperature of 0.8 is highly recommended for best performance.
     */
    temperature?: number;
    /**
     * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
     * a function.
     */
    tool_choice?: string;
    /**
     * Tools (functions) available to the model.
     */
    tools?: Array<SessionCreateParams.Tool>;
    /**
     * Configuration options for tracing. Set to `null` to disable tracing. Once tracing
     * is enabled for a session, the configuration cannot be modified.
     *
     * `auto` will create a trace for the session with default values for the workflow
     * name, group id, and metadata.
     */
    tracing?: 'auto' | SessionCreateParams.TracingConfiguration | null;
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    turn_detection?: SessionCreateParams.TurnDetection | null;
    /**
     * The voice the model uses to respond. Voice cannot be changed during the session
     * once the model has responded with audio at least once. Current voice options are
     * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
     */
    voice?: (string & {}) | 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
  }
  export declare namespace SessionCreateParams {
    /**
     * Configuration options for the generated client secret.
     */
    interface ClientSecret {
      /**
       * Configuration for the ephemeral token expiration.
       */
      expires_after?: ClientSecret.ExpiresAfter;
    }
    namespace ClientSecret {
      /**
       * Configuration for the ephemeral token expiration.
       */
      interface ExpiresAfter {
        /**
         * The anchor point for the ephemeral token expiration. Only `created_at` is
         * currently supported.
         */
        anchor: 'created_at';
        /**
         * The number of seconds from the anchor point to the expiration. Select a value
         * between `10` and `7200`.
         */
        seconds?: number;
      }
    }
    /**
     * Configuration for input audio noise reduction. This can be set to `null` to turn
     * off. Noise reduction filters audio added to the input audio buffer before it is
     * sent to VAD and the model. Filtering the audio can improve VAD and turn
     * detection accuracy (reducing false positives) and model performance by improving
     * perception of the input audio.
     */
    interface InputAudioNoiseReduction {
      /**
       * Type of noise reduction. `near_field` is for close-talking microphones such as
       * headphones, `far_field` is for far-field microphones such as laptop or
       * conference room microphones.
       */
      type?: 'near_field' | 'far_field';
    }
    /**
     * Configuration for input audio transcription, defaults to off and can be set to
     * `null` to turn off once on. Input audio transcription is not native to the
     * model, since the model consumes audio directly. Transcription runs
     * asynchronously through
     * [the /audio/transcriptions endpoint](https://platform.openai.com/docs/api-reference/audio/createTranscription)
     * and should be treated as guidance of input audio content rather than precisely
     * what the model heard. The client can optionally set the language and prompt for
     * transcription, these offer additional guidance to the transcription service.
     */
    interface InputAudioTranscription {
      /**
       * The language of the input audio. Supplying the input language in
       * [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
       * format will improve accuracy and latency.
       */
      language?: string;
      /**
       * The model to use for transcription, current options are `gpt-4o-transcribe`,
       * `gpt-4o-mini-transcribe`, and `whisper-1`.
       */
      model?: string;
      /**
       * An optional text to guide the model's style or continue a previous audio
       * segment. For `whisper-1`, the
       * [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting).
       * For `gpt-4o-transcribe` models, the prompt is a free text string, for example
       * "expect words related to technology".
       */
      prompt?: string;
    }
    /**
     * A function tool made available to the model.
     */
    interface Tool {
      /**
       * The description of the function, including guidance on when and how to call it,
       * and guidance about what to tell the user when calling (if anything).
       */
      description?: string;
      /**
       * The name of the function.
       */
      name?: string;
      /**
       * Parameters of the function in JSON Schema.
       */
      parameters?: unknown;
      /**
       * The type of the tool, i.e. `function`.
       */
      type?: 'function';
    }
    /**
     * Granular configuration for tracing.
     */
    interface TracingConfiguration {
      /**
       * The group id to attach to this trace to enable filtering and grouping in the
       * traces dashboard.
       */
      group_id?: string;
      /**
       * The arbitrary metadata to attach to this trace to enable filtering in the traces
       * dashboard.
       */
      metadata?: unknown;
      /**
       * The name of the workflow to attach to this trace. This is used to name the trace
       * in the traces dashboard.
       */
      workflow_name?: string;
    }
    /**
     * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
     * set to `null` to turn off, in which case the client must manually trigger model
     * response. Server VAD means that the model will detect the start and end of
     * speech based on audio volume and respond at the end of user speech. Semantic VAD
     * is more advanced and uses a turn detection model (in conjunction with VAD) to
     * semantically estimate whether the user has finished speaking, then dynamically
     * sets a timeout based on this probability. For example, if user audio trails off
     * with "uhhm", the model will score a low probability of turn end and wait longer
     * for the user to continue speaking. This can be useful for more natural
     * conversations, but may have a higher latency.
     */
    interface TurnDetection {
      /**
       * Whether or not to automatically generate a response when a VAD stop event
       * occurs.
       */
      create_response?: boolean;
      /**
       * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
       * will wait longer for the user to continue speaking, `high` will respond more
       * quickly. `auto` is the default and is equivalent to `medium`.
       */
      eagerness?: 'low' | 'medium' | 'high' | 'auto';
      /**
       * Whether or not to automatically interrupt any ongoing response with output to
       * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
       * occurs.
       */
      interrupt_response?: boolean;
      /**
       * Used only for `server_vad` mode. Amount of audio to include before the VAD
       * detected speech (in milliseconds). Defaults to 300ms.
       */
      prefix_padding_ms?: number;
      /**
       * Used only for `server_vad` mode. Duration of silence to detect speech stop (in
       * milliseconds). Defaults to 500ms. With shorter values the model will respond
       * more quickly, but may jump in on short pauses from the user.
       */
      silence_duration_ms?: number;
      /**
       * Used only for `server_vad` mode. Activation threshold for VAD (0.0 to 1.0), this
       * defaults to 0.5. A higher threshold will require louder audio to activate the
       * model, and thus might perform better in noisy environments.
       */
      threshold?: number;
      /**
       * Type of turn detection.
       */
      type?: 'server_vad' | 'semantic_vad';
    }
  }
  /**
   * Type-only re-exports, so the session types can also be referenced as
   * `Sessions.Session`, `Sessions.SessionCreateResponse` and
   * `Sessions.SessionCreateParams`.
   */
  export declare namespace Sessions {
    export {
      type Session as Session,
      type SessionCreateResponse as SessionCreateResponse,
      type SessionCreateParams as SessionCreateParams,
    };
  }
  //# sourceMappingURL=sessions.d.mts.map