Processing audio

Some language models support generating audio, processing audio input, or both. The following examples show how to use these capabilities in your application.

For a list of models capable of audio input or output, please refer to the multimodal models on the Models page. On that page, you can also look up model-specific parameters.

Transcribe audio into text

The following is an example of how you can use the audio input features to transcribe a sound file into text.

from opperai import AsyncOpper
from opperai.types import AudioInput
import asyncio

# Module-level asynchronous Opper client, used by transcribe_audio() below.
# NOTE(review): constructed without arguments, so credentials are presumably
# read from the environment -- confirm against the SDK documentation.
opper = AsyncOpper()

async def transcribe_audio(path: str) -> str:
    """Transcribe the audio file at ``path`` and return the text.

    The file is wrapped with ``AudioInput.from_path`` and passed to
    ``opper.call`` with ``str`` as the requested output type, using the
    ``gcp/gemini-1.5-flash-eu`` model.

    Args:
        path: Location of the audio file to transcribe.

    Returns:
        The first element of the pair returned by ``opper.call``; the
        second element is discarded.
    """
    audio = AudioInput.from_path(path)
    text, _ = await opper.call(
        name="async_transcribe_audio",
        instructions="given an audio file, return the transcription of the audio",
        input=audio,
        output_type=str,
        model="gcp/gemini-1.5-flash-eu",
    )
    return text

async def main():
    """Transcribe the sample clip ``test.mp3`` and print the result.

    The sample was generated at https://ttsmp3.com/ai and says
    "The quick brown fox jumps over the lazy dog."
    """
    print(await transcribe_audio("test.mp3"))

# Script entry point: run the main() coroutine to completion.
asyncio.run(main())
# Expected output:
# The quick brown fox jumps over the lazy dog.

import Client, { OpperMediaHandler } from "opperai";

const client = new Client();

// Send the sample audio file to the model and print the transcription.
async function main() {
    // Wrap the local file so it can be passed as call input.
    const audio = new OpperMediaHandler("test.mp3");

    const response = await client.call({
        name: "async_transcribe_audio",
        instructions: "given an audio file, return the transcription of the audio",
        input: audio.getInput(),
        model: "gcp/gemini-1.5-flash-eu",
    });
    const message = response.message;

    console.log(`Audio transcription: ${message}`);
    // Audio transcription: The quick brown fox jumps over the lazy dog.
}

main();

Generate audio from text (experimental)

The following is an example of how you can use the audio output features to generate speech from text, save it to a file, and play it back:

from opperai.types import BetaAudioOutput, CallConfiguration
from opperai import Opper
import sounddevice as sd
import soundfile as sf

# Module-level synchronous Opper client, used by generate_audio() below.
opper = Opper()

def generate_audio(text: str) -> BetaAudioOutput:
    """Ask the model to speak ``text`` and return the generated audio.

    Calls ``opper.call`` on the ``openai/gpt-4o-audio-preview`` model with
    ``BetaAudioOutput`` as the requested output type.

    Args:
        text: The text sent as input to the call.

    Returns:
        The first element of the pair returned by ``opper.call``; the
        second element is discarded.
    """
    # Model-specific parameters: request text and audio modalities, with
    # WAV-formatted audio from the "alloy" voice.
    model_parameters = {
        "modalities": ["text", "audio"],
        "audio": {"voice": "alloy", "format": "wav"},
    }
    speech, _ = opper.call(
        name="generate_audio",
        instructions="Say whatever I'm sending in a cheerful tone",
        input=text,
        output_type=BetaAudioOutput,
        model="openai/gpt-4o-audio-preview",
        configuration=CallConfiguration(model_parameters=model_parameters),
    )
    return speech

def play_audio(file_path: str):
    """Play the audio file at ``file_path`` and block until it finishes.

    Playback is best-effort: any exception raised while reading or
    playing the file is caught and reported on stdout instead of being
    propagated.

    Args:
        file_path: Location of the audio file to play.
    """
    print("Playing audio...")
    try:
        samples, rate = sf.read(file_path)
        sd.play(samples, rate)
        # Block here until the sound has finished playing.
        sd.wait()
    except Exception as playback_error:
        reason = str(playback_error)
        print(f"Audio playback error: {reason}")

def main():
    """Generate speech for a sample greeting, save it, and play it back.

    The audio returned by ``generate_audio`` is written to ``output.wav``
    in the current working directory and then played with ``play_audio``.
    """
    text = "Merry Christmas to you all"
    output_path = "output.wav"

    audio = generate_audio(text)
    # Save the audio to a file; binary mode because the payload is raw bytes.
    with open(output_path, "wb") as f:
        f.write(audio.bytes)

    play_audio(output_path)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()