Processing images
Some language models can generate images, take images as input, or both. The following examples show how to use these capabilities in your application.
For a list of models capable of image input or output, refer to the multimodal models on the Models page. There you can also look up model-specific parameters.
Generating images
Generating an image can be fun, and it can also be a useful capability in your product.
The following is a basic example of generating an image from a prompt and saving it to a file.
from opperai import AsyncOpper
from opperai.types import CallConfiguration, ImageOutput
from tempfile import NamedTemporaryFile
import asyncio

opper = AsyncOpper()

def save_file(data: bytes, path: str = None) -> str:
    # Write the image bytes to the given path, or to a new temporary .png file
    if path is None:
        with NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            path = temp_file.name

    with open(path, "wb") as f:
        f.write(data)

    return path

async def generate_image(description: str) -> ImageOutput:
    # Model-specific parameters (here, aspectRatio) are passed via CallConfiguration
    image, _ = await opper.call(
        name="generate_image",
        output_type=ImageOutput,
        input=description,
        model="gcp/imagen-3.0-generate-001-eu",
        configuration=CallConfiguration(
            model_parameters={
                "aspectRatio": "9:16",
            }
        ),
    )
    return image

async def main():
    description = "wide-angle photo of a person holding a presentation about AI in a room full of people"
    image = await generate_image(description)
    path = save_file(image.bytes)
    print(path)

asyncio.run(main())
# /var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpave568r8.png
import Client from "opperai";
import fs from "fs";
import path from "path";
import os from "os";

const client = new Client();

(async () => {

    const image = await client.generateImage({
        model: "gcp/imagen-3.0-generate-001-eu",
        prompt: "wide-angle photo of a person holding a presentation about AI in a room full of people",
        parameters: {
            aspectRatio: "9:16",
        },
    });

    // Write the generated image bytes to a uniquely named temporary file
    const tempFilePath = path.join(os.tmpdir(), `image-${Date.now()}-${Math.random().toString(36).substring(2, 15)}.png`);
    fs.writeFileSync(tempFilePath, image.bytes);
    console.log(`image written to temporary file: ${tempFilePath}`);
    // image written to temporary file: /var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/image-1727768092084-z07q0xpw5zp.png
})();
Using images as input
Processing images by using them as input to your models can unlock a variety of applications, such as image captioning, classification, or generating detailed descriptions based on the image content.
from opperai import Opper
from opperai.types import ImageInput

opper = Opper()

def describe_image(path: str) -> str:
    description, response = opper.call(
        name="describe_image",
        instructions="Describe the content of the image",
        output_type=str,
        input=ImageInput.from_path(path),
        model="openai/gpt-4o",
    )
    return description

image_path = "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpave568r8.png"
description = describe_image(image_path)
print(f"Image description: {description}")
# Image description: The image shows a person giving a presentation in a dark room. The presenter is pointing at a screen displaying a visual related to artificial intelligence (AI). The screen has the text 'AI' and 'AI RORENGE' along with circuit-like graphics. Several people are seated, watching the presentation attentively.
import Client, { OpperMediaHandler } from "opperai";

const client = new Client();

(async () => {
    const image = new OpperMediaHandler("/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/image-1727768092084-z07q0xpw5zp.png");

    const { message } = await client.call({
        name: "describe_image",
        instructions: "Describe the content of the image",
        input: image.getInput(),
        model: "openai/gpt-4o",
    });

    console.log(`Image description: ${message}`);
    // Image description: The image shows a person giving a presentation in a dark room. The presenter is pointing at a screen displaying a visual related to artificial intelligence (AI). The screen has the text 'AI' and 'AI RORENGE' along with circuit-like graphics. Several people are seated, watching the presentation attentively.
})();
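Because calls accept a structured output type, image input can also drive classification rather than free-text description. The following is a minimal sketch of that idea: the ImageCategory model and its labels are hypothetical, and it assumes the Python SDK accepts a Pydantic model as output_type in the same way it accepts str above.

from opperai import Opper
from opperai.types import ImageInput
from pydantic import BaseModel
from typing import Literal

opper = Opper()

# Hypothetical output model for illustration; choose fields and labels for your own use case
class ImageCategory(BaseModel):
    category: Literal["people", "nature", "technology", "other"]
    reasoning: str

def classify_image(path: str) -> ImageCategory:
    result, _ = opper.call(
        name="classify_image",
        instructions="Classify the image into one of the given categories",
        output_type=ImageCategory,
        input=ImageInput.from_path(path),
        model="openai/gpt-4o",
    )
    return result

category = classify_image("/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpave568r8.png")
print(f"Category: {category.category}")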
Common denominator
In this example, we pass in multiple images and ask the LLM to find the most common denominator among them.
from opperai import Opper
from opperai.types import ImageInput
from typing import List

opper = Opper()

def common_denominator(paths: List[str]) -> str:
    images = [ImageInput.from_path(path) for path in paths]
    description, response = opper.call(
        name="common_denominator",
        instructions="given a list of images, return the most common denominator of the images",
        output_type=str,
        input=images,
        model="openai/gpt-4o",
    )
    return description

image_paths = [
    "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpid366821.png",
    "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpuhl0wyg4.png",
    "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpijk_v13m.png",
    "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpqglezg1g.png",
]
description = common_denominator(image_paths)
print(f"Image description: {description}")
# Image description: People using VR headsets in a forest
import Client, { OpperMediaHandler } from "opperai";

const client = new Client();

(async () => {
    async function commonDenominator(paths: string[]): Promise<string> {
        const images = paths.map((path) => {
            const image = new OpperMediaHandler(path);

            return image.getInput();
        });

        const { message } = await client.call({
            name: "common_denominator",
            instructions: "given a list of images, return the most common denominator of the images",
            input: { images },
            model: "openai/gpt-4o",
        });
        return message;
    }

    const imagePaths = [
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpid366821.png",
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpuhl0wyg4.png",
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpijk_v13m.png",
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpqglezg1g.png",
    ];

    const description = await commonDenominator(imagePaths);
    console.log(`Image description: ${description}`);
    // Image description: The most common denominator among the images is that they all feature a person using a virtual reality (VR) headset in an outdoor forest environment.
})();
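To tie the two halves of this page together, here is a rough end-to-end sketch that generates an image, writes it to a temporary file, and feeds it back in for description. It simply combines the generation and image-input examples above, and assumes the async client accepts ImageInput the same way the synchronous client does.

import asyncio
from tempfile import NamedTemporaryFile

from opperai import AsyncOpper
from opperai.types import CallConfiguration, ImageInput, ImageOutput

opper = AsyncOpper()

async def main():
    # Generate an image, as in the generation example above
    image, _ = await opper.call(
        name="generate_image",
        output_type=ImageOutput,
        input="wide-angle photo of a person holding a presentation about AI in a room full of people",
        model="gcp/imagen-3.0-generate-001-eu",
        configuration=CallConfiguration(model_parameters={"aspectRatio": "9:16"}),
    )

    # Write the bytes to a temporary file so the image can be loaded as input
    with NamedTemporaryFile(delete=False, suffix=".png") as f:
        f.write(image.bytes)
        path = f.name

    # Feed the generated image back in and describe it
    description, _ = await opper.call(
        name="describe_image",
        instructions="Describe the content of the image",
        output_type=str,
        input=ImageInput.from_path(path),
        model="openai/gpt-4o",
    )
    print(description)

asyncio.run(main())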