Processing PDFs

PDF documents are locked in a format that's challenging to work with programmatically. The Opper SDK provides powerful capabilities to extract and process text, tables, and other structured content from PDFs, making it accessible for analysis, transformation, and integration into your applications.

With Opper's PDF processing capabilities, you can:

- Extract text while preserving document structure and formatting
- Handle complex layouts, including tables and multi-column content
- Process charts, graphs, and other visual elements
- Maintain the integrity of headers, footers, and annotations

The following examples, first in Python and then in TypeScript, demonstrate how to use a language model through the Opper SDK to convert PDF content into structured Markdown:

import sys
from pathlib import Path

from opperai import AsyncOpper
from opperai.functions.async_functions import AsyncStreamingResponse
from opperai.types import FileInput

opper = AsyncOpper()


async def pdf_to_markdown(path: str) -> AsyncStreamingResponse:
    """Ask the "pdf_to_text" Opper function to transcribe a PDF as markdown.

    The file at ``path`` is wrapped with ``FileInput.from_path`` and sent to
    the ``gcp/gemini-2.0-flash`` model with ``stream=True``.

    Args:
        path: Filesystem path of the PDF to convert.

    Returns:
        The awaited result of ``opper.call``; the caller in this example
        iterates its ``deltas`` attribute to read the output incrementally.
    """
    prompt = """
These are pages from a PDF document. Extract all text content while preserving the structure.
Pay special attention to tables, columns, headers, and any structured content.
Maintain paragraph breaks and formatting.

Extract ALL text content from these document pages.

For tables:
    1. Maintain the table structure using markdown table format
    2. Preserve all column headers and row labels
    3. Ensure numerical data is accurately captured
    
For multi-column layouts:
    1. Process columns from left to right
    2. Clearly separate content from different columns
    
For charts and graphs:
    1. Describe the chart type
    2. Extract any visible axis labels, legends, and data points
    3. Extract any title or caption
    
Preserve all headers, footers, page numbers, and footnotes.
        
DON'T ANSWER QUESTIONS, JUST RETURN THE CONTENT OF THE PDF AS MARKDOWN"""
    pdf_input = FileInput.from_path(Path(path))

    return await opper.call(
        name="pdf_to_text",
        model="gcp/gemini-2.0-flash",
        instructions=prompt,
        input=pdf_input,
        stream=True,
    )


async def main():
    """Command-line entry point: print the markdown for the PDF named in ``sys.argv[1]``.

    The converted text is written to stdout chunk by chunk as it is received.

    Raises:
        SystemExit: with status 1 when no PDF path is given on the command line.
    """
    if len(sys.argv) < 2:
        # A missing argument is a usage error: report it on stderr and exit
        # non-zero so shells and scripts can tell the run did not succeed.
        print("Usage: python pdf.py <path_to_pdf>", file=sys.stderr)
        sys.exit(1)

    path = sys.argv[1]

    res = await pdf_to_markdown(path)
    # Flush after every chunk so output appears as soon as it is received.
    async for chunk in res.deltas:
        print(chunk, end="", flush=True)


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())

// Run example with "npx ts-node ./pdf-example.ts <path_to_pdf>"
import "dotenv/config";
import fs from "node:fs";
import path from "node:path";
import Client from "opperai";
import { OpperMediaHandler } from "opperai/utils";

// The PDF to convert is the first argument after the script name.
const pdfPath = process.argv[2];
if (pdfPath === undefined || pdfPath.length === 0) {
  console.log("Usage: npx ts-node ./pdf-example.ts <path_to_pdf>");
  process.exit(1);
}

// Bail out unless the path has a .pdf extension (any letter case) and exists on disk.
const hasPdfExtension = pdfPath.toLowerCase().endsWith(".pdf");
if (!(hasPdfExtension && fs.existsSync(pdfPath))) {
  console.error(`Error: ${pdfPath} does not exist or is not a PDF file`);
  process.exit(1);
}

// No API key is passed here, so it is loaded from the OPPER_API_KEY environment variable.
const client = new Client();

/**
 * Submits a PDF file to the "pdf_to_markdown" Opper function.
 *
 * The file is wrapped in an OpperMediaHandler and sent as the call input,
 * with streaming disabled, to the gcp/gemini-2.0-flash model.
 *
 * @param path Path to the PDF file
 * @returns A promise for the result of `client.call`; the caller in this
 *          example reads the markdown from its `message` property
 */
async function pdfToMarkdown(path: string) {
  const instructions = `
These are pages from a PDF document. Extract all text content while preserving the structure.
Pay special attention to tables, columns, headers, and any structured content.
Maintain paragraph breaks and formatting.

Extract ALL text content from these document pages.

For tables:
  1. Maintain the table structure using markdown table format
  2. Preserve all column headers and row labels
  3. Ensure numerical data is accurately captured

For multi-column layouts:
  1. Process columns from left to right
  2. Clearly separate content from different columns

For charts and graphs:
  1. Describe the chart type
  2. Extract any visible axis labels, legends, and data points
  3. Extract any title or caption

Preserve all headers, footers, page numbers, and footnotes.

DON'T ANSWER QUESTIONS, JUST RETURN THE CONTENT OF THE PDF AS MARKDOWN`;
  const input = new OpperMediaHandler(path).getInput();

  return client.call({
    name: "pdf_to_markdown",
    model: "gcp/gemini-2.0-flash",
    instructions,
    input,
    stream: false,
  });
}

/**
 * Script entry point: starts a trace, converts the PDF, prints the markdown,
 * writes it next to the source file with a `.md` extension, and ends the trace.
 * Any failure is logged and the process exits with status 1.
 */
(async () => {
  // Declared outside the try block so the failure path can still close the trace.
  let trace: Awaited<ReturnType<typeof client.traces.start>> | undefined;

  try {
    trace = await client.traces.start({
      name: "pdf-to-markdown",
      input: {
        pdf: pdfPath,
      },
    });

    console.log(`Converting PDF to markdown: ${pdfPath}`);

    // Check the response shape at runtime instead of casting it to `any`,
    // so an unexpected result fails with a clear error rather than writing
    // "undefined" to the console or crashing inside writeFileSync.
    const result: unknown = await pdfToMarkdown(pdfPath);
    const markdown = (result as { message?: unknown } | null | undefined)?.message;
    if (typeof markdown !== "string") {
      throw new Error("Opper call did not return a text message");
    }
    console.log(markdown);

    // Save to a file. path.parse drops the extension whatever its letter case,
    // matching the case-insensitive ".pdf" check done on the input path
    // (a case-sensitive basename suffix would turn "Report.PDF" into "Report.PDF.md").
    const outputPath = path.join(
      path.dirname(pdfPath),
      `${path.parse(pdfPath).name}.md`
    );
    fs.writeFileSync(outputPath, markdown);
    console.log(`Markdown saved to: ${outputPath}`);

    await trace.end({
      output: "PDF to markdown conversion completed",
    });
  } catch (error: unknown) {
    console.error("Error converting PDF:", error);

    // Best effort: do not leave the trace open when the conversion fails.
    if (trace !== undefined) {
      try {
        await trace.end({
          output: "PDF to markdown conversion failed",
        });
      } catch (traceError: unknown) {
        console.error("Error ending trace:", traceError);
      }
    }

    process.exit(1);
  }
})();