> ## Documentation Index
> Fetch the complete documentation index at: https://docs.sudoapp.dev/llms.txt
> Use this file to discover all available pages before exploring further.

# Audio Chat Completions

> Add audio input and output to Chat Completions (OpenAI)

Models like GPT-4o can handle audio modalities. With Sudo, you can call OpenAI-compatible audio models to produce audio output and optionally send audio input alongside text.

<Note>
  Only certain providers currently support audio input/output with Chat Completions in this format (notably OpenAI). Before using audio, confirm your chosen model provider’s API accepts `modalities` and `input_audio` as shown here, and adjust parameters accordingly.
</Note>

## Audio Output from Text

Generate spoken audio from a text prompt using an audio-capable model.

<CodeGroup dropdown>
  ```typescript TypeScript theme={null}
  import { Sudo } from "sudo-ai";

  // Create the Sudo client. The API key is read from the SUDO_API_KEY environment
  // variable and falls back to an empty string when the variable is unset.
  const sudo = new Sudo({
    serverURL: "https://sudoapp.dev/api",
    apiKey: process.env.SUDO_API_KEY ?? "",
  });

  /**
   * Sends one text prompt to `gpt-4o-audio-preview`, requesting both the text
   * and audio modalities, and logs the text content of the first choice.
   */
  async function textToSpeech() {
    const response = await sudo.router.create({
      model: "gpt-4o-audio-preview",
      // Ask for both a text and an audio modality in the response.
      modalities: ["text", "audio"],
      // Voice and format requested for the audio output.
      audio: { voice: "alloy", format: "wav" },
      messages: [
        {
          role: "user",
          content: "Is a golden retriever a good family dog?",
        },
      ],
    });

    // The response may include both text and audio. Inspect the structure as providers evolve.
    console.log("Text:", response.choices[0].message.content);
    // Optionally check for audio fields if present in your provider’s response shape
    // console.log("Audio (base64):", response.choices[0].message.audio?.data);
  }

  // Run the example.
  textToSpeech();
  ```

  ```python Python theme={null}
  import os
  from sudo import Sudo

  # Open the client as a context manager. The API key comes from the
  # SUDO_API_KEY environment variable (os.getenv returns None when it is unset).
  with Sudo(
      server_url="https://sudoapp.dev/api",
      api_key=os.getenv("SUDO_API_KEY"),
  ) as client:
      response = client.router.create(
          model="gpt-4o-audio-preview",
          # Ask for both a text and an audio modality in the response.
          modalities=["text", "audio"],
          # Voice and format requested for the audio output.
          audio={"voice": "alloy", "format": "wav"},
          messages=[
              {
                  "role": "user",
                  "content": "Is a golden retriever a good family dog?",
              }
          ],
      )

      # The response may include both text and audio. Inspect the structure as providers evolve.
      print("Text:", response.choices[0].message.content)
      # Optionally check for audio fields if present in your provider’s response shape
      # print("Audio (base64):", getattr(response.choices[0].message, "audio", {}).get("data"))
  ```
</CodeGroup>

## Audio Input + Text

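Send user audio together with a text prompt. Encode your audio file as base64 and include it as a content part with `type: "input_audio"`, alongside the text part. As the examples below show, the payload field is named `inputAudio` in the TypeScript SDK and `input_audio` in the Python SDK.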

<CodeGroup dropdown>
  ```typescript TypeScript theme={null}
  import { Sudo } from "sudo-ai";
  import * as fs from "fs";
  import * as path from "path";

  // Create the Sudo client. The API key is read from the SUDO_API_KEY environment
  // variable and falls back to an empty string when the variable is unset.
  const sudo = new Sudo({
    serverURL: "https://sudoapp.dev/api",
    apiKey: process.env.SUDO_API_KEY ?? "",
  });

  /**
   * Loads an audio file from disk and packages it for an `input_audio` content part.
   *
   * @param audioPath - Path of the audio file to read.
   * @returns The file's bytes as base64 in `data`, and in `format` the lowercased
   *   extension without its dot (`"wav"` when the path has no extension).
   * @throws If the file cannot be read (the error raised by `fs.readFileSync`).
   */
  function encodeAudioToBase64(audioPath: string): { data: string; format: string } {
    // path.extname yields "" or a string starting with ".", so dropping the
    // first character removes exactly that dot.
    const extension = path.extname(audioPath).toLowerCase().slice(1);
    const encoded = fs.readFileSync(audioPath).toString("base64");

    if (extension === "") {
      return { data: encoded, format: "wav" };
    }
    return { data: encoded, format: extension };
  }

  /**
   * Sends a text question together with a base64-encoded recording to
   * `gpt-4o-audio-preview` and logs the text content of the first choice.
   */
  async function analyzeAudioWithText() {
    // Base64-encode the recording; `format` is derived from the file extension.
    const { data, format } = encodeAudioToBase64("path/to/recording.wav");

    const response = await sudo.router.create({
      model: "gpt-4o-audio-preview",
      modalities: ["text", "audio"],
      // Voice and format requested for the audio output.
      audio: { voice: "alloy", format: "wav" },
      messages: [
        {
          role: "user",
          // One message carrying two content parts: the question and the audio.
          content: [
            { type: "text", text: "What is in this recording?" },
            {
              type: "input_audio",
              inputAudio: {
                data,
                format,
              },
            },
          ],
        },
      ],
    });

    console.log("Text:", response.choices[0].message.content);
  }

  // Run the example.
  analyzeAudioWithText();
  ```

  ```python Python theme={null}
  import os
  import base64
  from sudo import Sudo

  def encode_audio_to_base64(audio_path: str):
      """Read an audio file and return its base64 payload plus a format label.

      Args:
          audio_path: Path of the audio file to read.

      Returns:
          A dict with ``"data"`` (base64 text of the file's bytes) and
          ``"format"`` (the lowercased file extension without the dot, or
          ``"wav"`` when the file name has no extension).

      Raises:
          OSError: If the file cannot be opened or read.
      """
      with open(audio_path, "rb") as f:
          data = base64.b64encode(f.read()).decode("utf-8")
      # Take the extension from the final path component only, so a dot in a
      # directory name (e.g. "./clips/recording") is not mistaken for one.
      extension = os.path.splitext(audio_path)[1]
      # Drop the leading dot (".wav" -> "wav"); fall back to "wav" when empty.
      audio_format = extension[1:].lower() or "wav"
      return {"data": data, "format": audio_format}

  # Open the client as a context manager. The API key comes from the
  # SUDO_API_KEY environment variable (os.getenv returns None when it is unset).
  with Sudo(
      server_url="https://sudoapp.dev/api",
      api_key=os.getenv("SUDO_API_KEY"),
  ) as client:
      # Base64-encode the recording; the helper also returns a format label.
      audio_b64 = encode_audio_to_base64("path/to/recording.wav")

      response = client.router.create(
          model="gpt-4o-audio-preview",
          modalities=["text", "audio"],
          # Voice and format requested for the audio output.
          audio={"voice": "alloy", "format": "wav"},
          messages=[
              {
                  "role": "user",
                  # One message carrying two content parts: the question and the audio.
                  "content": [
                      {"type": "text", "text": "What is in this recording?"},
                      {
                          "type": "input_audio",
                          "input_audio": {
                              "data": audio_b64["data"],
                              "format": audio_b64["format"],
                          },
                      },
                  ],
              }
          ],
      )

      print("Text:", response.choices[0].message.content)
  ```
</CodeGroup>

## Tips

* Prefer lossless or high-quality input formats for best transcription/understanding.
* Keep requests small; base64 encoding adds roughly a third to the file size, and very long audio can increase latency and cost.
* Verify your chosen model supports audio input/output and the requested `format`.
