Some language models support generating images, accepting images as input, or both. The following examples show how to use these capabilities in your application.
For a list of models capable of image input or output, refer to the multimodal models on the Models page. There you can also look up model-specific parameters.
Generating images
Generating an image can be fun, but it can also be a useful feature in your product.
The following is a basic way of generating an image from a prompt and saving it to a file.
import os
import base64
from typing import Optional
from tempfile import NamedTemporaryFile

from opperai import Opper

opper = Opper(
    http_bearer=os.getenv("OPPER_API_KEY"),
)

def save_file(data: bytes, path: Optional[str] = None) -> str:
    # Write the image bytes to the given path, or to a new temporary file
    if path is None:
        with NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            path = temp_file.name
    with open(path, "wb") as f:
        f.write(data)
    return path

def generate_image(description: str):
    response = opper.call(
        name="generate_image",
        input=description,
        model="openai/gpt-image-1",
    )
    # The generated image is returned as a base64-encoded string
    return response.images[0]

def main():
    description = "wide-angle photo of a person holding a presentation about AI in a room full of people"
    generated_image_str = generate_image(description)
    path = save_file(base64.b64decode(generated_image_str))
    print(path)

if __name__ == "__main__":
    main()

# /var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpave568r8.png
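Image models often expose model-specific parameters, such as output size or quality. The snippet below is a hypothetical sketch: it assumes the options listed on the Models page can be passed as part of the model argument, and the option name shown is a placeholder rather than a confirmed part of the SDK's API.

# Hypothetical sketch: passing model-specific options with the call.
# Whether `model` accepts an object with options, and which option names
# exist for a given model, is documented on the Models page; verify there
# before relying on this shape.
response = opper.call(
    name="generate_image",
    input="a watercolor painting of a lighthouse",
    model={
        "name": "openai/gpt-image-1",
        "options": {"size": "1024x1024"},  # assumed option name
    },
)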
Processing images
Using images as input to your models can unlock a variety of applications, such as image captioning, classification (sketched after the example below), or generating detailed descriptions based on the image content.
import os
import base64

from pydantic import BaseModel, Field
from opperai import Opper

opper = Opper(
    http_bearer=os.getenv("OPPER_API_KEY"),
)

class ImageDescription(BaseModel):
    description: str = Field(description="Description of the image content")

def describe_image(image_path: str) -> str:
    # Encode the image as a base64 data URL
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")
    media_input = f"data:image/png;base64,{image_data}"

    response = opper.call(
        name="describe_image",
        instructions="Describe the content of the image",
        output_schema=ImageDescription,
        input={
            "_opper_media_input": media_input,  # this field name is required by opper
        },
        model="openai/gpt-4o",
    )
    return response.description

def main():
    image_path = "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpave568r8.png"
    description = describe_image(image_path)
    print(f"Image description: {description}")
    # Image description: The image shows a person giving a presentation in a dark room. The presenter is pointing at a screen displaying a visual related to artificial intelligence (AI). The screen has the text 'AI' and 'AI RORENGE' along with circuit-like graphics. Several people are seated, watching the presentation attentively.

if __name__ == "__main__":
    main()
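The same input pattern works for classification. The sketch below reuses the setup from describe_image but swaps the output schema for a fixed set of labels; the categories are made up for illustration, so substitute your own.

from typing import Literal

class ImageLabel(BaseModel):
    # The label set here is illustrative; define categories that fit your use case
    label: Literal["photo", "illustration", "screenshot", "diagram"] = Field(
        description="Which category best matches the image"
    )

def classify_image(image_path: str) -> str:
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")

    response = opper.call(
        name="classify_image",
        instructions="Classify the image into one of the given categories",
        output_schema=ImageLabel,
        input={
            "_opper_media_input": f"data:image/png;base64,{image_data}",
        },
        model="openai/gpt-4o",
    )
    return response.label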
Common denominator
In this example, we pass in multiple images and ask the LLM to find the common denominator among them.
import os
import base64
from typing import List

from pydantic import BaseModel, Field
from opperai import Opper

opper = Opper(
    http_bearer=os.getenv("OPPER_API_KEY"),
)

class CommonDenominator(BaseModel):
    description: str = Field(description="Description of the common denominator among the images")

def common_denominator(image_paths: List[str]) -> str:
    # Encode each image as a base64 data URL
    image_inputs = []
    for path in image_paths:
        with open(path, "rb") as f:
            image_data = base64.b64encode(f.read()).decode("utf-8")
        image_inputs.append(f"data:image/png;base64,{image_data}")

    response = opper.call(
        name="common_denominator",
        instructions="given a list of images, return the most common denominator of the images",
        output_schema=CommonDenominator,
        input={
            "_opper_media_input": image_inputs,  # this field name is required by opper
        },
        model="openai/gpt-4o",
    )
    return response.description

def main():
    image_paths = [
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpid366821.png",
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpuhl0wyg4.png",
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpijk_v13m.png",
        "/var/folders/dx/lwwkf62n61j9z928d87z5txm0000gn/T/tmpqglezg1g.png",
    ]
    description = common_denominator(image_paths)
    print(f"Image description: {description}")
    # Image description: People using VR headsets in a forest

if __name__ == "__main__":
    main()
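One way to produce a list of input images like the one above is to combine generate_image and save_file from the first example. The prompts below are illustrative:

def generate_images(descriptions: List[str]) -> List[str]:
    # Generate one image per description and return the saved file paths
    paths = []
    for description in descriptions:
        image_str = generate_image(description)
        paths.append(save_file(base64.b64decode(image_str)))
    return paths

image_paths = generate_images([
    "a person wearing a VR headset in a forest",
    "two friends trying VR goggles among pine trees",
    "a hiker exploring virtual reality in the woods",
    "a child with a VR headset under tall trees",
])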
Extracting data from PDFs
You can use LLMs to extract structured data from PDF documents. This makes the content of PDFs programmable and searchable.
Here is a simple example of extracting structured information from a PDF document:
import base64
import os
import sys

from opperai import Opper
from pydantic import BaseModel, Field

opper = Opper(
    http_bearer=os.getenv("OPPER_API_KEY"),
)

class InvoiceData(BaseModel):
    invoice_number: str = Field(description="The invoice number")
    total_amount: float = Field(description="The total amount in dollars")
    vendor_name: str = Field(description="The name of the vendor or company")
    date: str = Field(description="The invoice date")
    items: list = Field(description="List of items and their prices")

def extract_invoice_data(pdf_path: str) -> dict:
    # Encode the PDF as a base64 data URL
    with open(pdf_path, "rb") as f:
        pdf_data = base64.b64encode(f.read()).decode("utf-8")
    media_input = f"data:application/pdf;base64,{pdf_data}"

    response = opper.call(
        name="extract_invoice",
        instructions="Extract structured invoice data from the PDF document",
        input={
            "_opper_media_input": media_input,  # this field name is required by opper
        },
        output_schema=InvoiceData.model_json_schema(),
    )
    return response.json_payload

def main():
    if len(sys.argv) < 2:
        print("Usage: python extract_invoice.py <path_to_pdf>")
        return
    pdf_path = sys.argv[1]
    result = extract_invoice_data(pdf_path)
    print(result)

if __name__ == "__main__":
    main()

# Output:
# {
#     'invoice_number': 'INV-2024-001',
#     'total_amount': 1250.00,
#     'vendor_name': 'Tech Solutions Inc.',
#     'date': '2024-01-15',
#     'items': [
#         {'item': 'Software License', 'price': 1000.00},
#         {'item': 'Support Services', 'price': 250.00}
#     ]
# }
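Because extract_invoice_data returns a plain dict, you can optionally round-trip it through the Pydantic model to validate the fields before using them:

# Validate the extracted dict against the schema; this raises a
# pydantic.ValidationError if a field is missing or has the wrong type
invoice = InvoiceData.model_validate(result)
print(invoice.total_amount)  # 1250.0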