Some language models support generating audio, processing audio inputs, or both. The following are examples of how to use these capabilities in your application.
For a list of models capable of audio input or output, please refer to the multimodal models on the Models page. On that page, you can also look up model-specific parameters.
Transcribe audio into text
The following is an example of how you can use the audio input features to transcribe a sound file into text.
from opperai import AsyncOpper
from opperai.types import AudioInput
import asyncio
# Shared asynchronous client; transcribe_audio() awaits its call() method.
opper = AsyncOpper()
async def transcribe_audio(path: str) -> str:
    """Transcribe the audio file at ``path`` and return the text.

    The file is wrapped with ``AudioInput.from_path`` and sent through the
    module-level async ``opper`` client to the ``gcp/gemini-1.5-flash-eu``
    model, with ``str`` requested as the output type.
    """
    audio = AudioInput.from_path(path)
    # call() is unpacked as a (result, response) pair; only the result is
    # needed here.
    text, _ = await opper.call(
        name="async_transcribe_audio",
        instructions="given an audio file, return the transcription of the audio",
        input=audio,
        output_type=str,
        model="gcp/gemini-1.5-flash-eu",
    )
    return text
async def main():
    """Transcribe the sample clip ``test.mp3`` and print the result."""
    # Source: https://ttsmp3.com/ai -- the clip says:
    # "The quick brown fox jumps over the lazy dog."
    sample_path = "test.mp3"
    print(await transcribe_audio(sample_path))
# Run the async entry point to completion.
asyncio.run(main())
# Expected output: The quick brown fox jumps over the lazy dog.
Generate audio from text (experimental)
The following is an example of how you can use the audio output features to generate speech from text, save it to a file, and play it back:
from opperai.types import BetaAudioOutput, CallConfiguration
from opperai import Opper
import sounddevice as sd
import soundfile as sf
# Shared synchronous client; generate_audio() uses its call() method.
opper = Opper()
def generate_audio(text: str) -> BetaAudioOutput:
    """Ask the model to speak ``text`` and return the resulting audio.

    Sends ``text`` through the module-level ``opper`` client to the
    ``openai/gpt-4o-audio-preview`` model, instructing it to use a
    cheerful tone.
    """
    # Model-specific parameters: request both text and audio modalities,
    # using the "alloy" voice and WAV as the audio format.
    model_parameters = {
        "modalities": ["text", "audio"],
        "audio": {"voice": "alloy", "format": "wav"},
    }
    # call() is unpacked as a pair; the second element is not used here.
    result, _ = opper.call(
        name="generate_audio",
        instructions="Say whatever I'm sending in a cheerful tone",
        input=text,
        output_type=BetaAudioOutput,
        model="openai/gpt-4o-audio-preview",
        configuration=CallConfiguration(model_parameters=model_parameters),
    )
    return result
def play_audio(file_path: str):
    """Play the sound file at ``file_path`` and wait for it to finish.

    Any exception raised while reading or playing the file is reported
    on stdout instead of being propagated to the caller.
    """
    print("Playing audio...")
    try:
        samples, rate = sf.read(file_path)
        sd.play(samples, rate)
        # Block here until the sound has finished playing.
        sd.wait()
    except Exception as err:
        reason = str(err)
        print(f"Audio playback error: {reason}")
def main():
    """Generate speech for a sample greeting, save it as a WAV file, and play it."""
    text = "Merry Christmas to you all"
    # One path variable so the file that is written is the file that is played.
    output_path = "output.wav"
    audio = generate_audio(text)
    # Save the audio to a file; binary mode because audio.bytes is written as-is.
    with open(output_path, "wb") as f:
        f.write(audio.bytes)
    play_audio(output_path)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()