Processing PDFs
PDF documents are locked in a format that is challenging to work with programmatically. The Opper SDK provides powerful capabilities to extract and process text, tables, and other structured content from PDFs, making that content accessible for analysis, transformation, and integration into your applications.
With Opper's PDF processing capabilities, you can:
- Extract text while preserving document structure and formatting
- Handle complex layouts including tables and multi-column content
- Process charts, graphs, and other visual elements
- Maintain the integrity of headers, footers, and annotations
The following examples, first in Python and then in TypeScript, demonstrate how to use a language model through the Opper SDK to convert PDF content into structured markdown:
1import sys
2from pathlib import Path
3
4from opperai import AsyncOpper
5from opperai.functions.async_functions import AsyncStreamingResponse
6from opperai.types import FileInput
7
8opper = AsyncOpper()
9
10
async def pdf_to_markdown(path: str) -> AsyncStreamingResponse:
    """Start a streaming Opper call that transcribes the PDF at ``path`` to markdown.

    The file is wrapped with ``FileInput.from_path`` and passed as the call
    input together with the extraction prompt below. The call is made with
    ``stream=True`` and its awaited result is returned unchanged.

    Args:
        path: Filesystem path of the PDF to process.

    Returns:
        Whatever ``opper.call`` resolves to for a streaming request.
    """
    extraction_prompt = """
These are pages from a PDF document. Extract all text content while preserving the structure.
Pay special attention to tables, columns, headers, and any structured content.
Maintain paragraph breaks and formatting.

Extract ALL text content from these document pages.

For tables:
 1. Maintain the table structure using markdown table format
 2. Preserve all column headers and row labels
 3. Ensure numerical data is accurately captured

For multi-column layouts:
 1. Process columns from left to right
 2. Clearly separate content from different columns

For charts and graphs:
 1. Describe the chart type
 2. Extract any visible axis labels, legends, and data points
 3. Extract any title or caption

Preserve all headers, footers, page numbers, and footnotes.

DON'T ANSWER QUESTIONS, JUST RETURN THE CONTENT OF THE PDF AS MARKDOWN"""

    pdf_input = FileInput.from_path(Path(path))

    response = await opper.call(
        name="pdf_to_text",
        model="gcp/gemini-2.0-flash",
        instructions=extraction_prompt,
        input=pdf_input,
        stream=True,
    )
    return response
44
45
async def main():
    """Command-line entry point: stream the markdown for the PDF named in ``sys.argv[1]``.

    Each chunk yielded by the response's ``deltas`` iterator is printed as it
    arrives, without a trailing newline and with the output flushed.

    Raises:
        SystemExit: With status 1 when no PDF path was supplied.
    """
    if len(sys.argv) < 2:
        # A missing argument is a usage error: report it on stderr and exit
        # non-zero so shells and scripts can detect the failure.
        print("Usage: python pdf.py <path_to_pdf>", file=sys.stderr)
        sys.exit(1)

    path = sys.argv[1]

    res = await pdf_to_markdown(path)
    async for chunk in res.deltas:
        print(chunk, end="", flush=True)


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
1// Run example with "npx ts-node ./pdf-example.ts <path_to_pdf>"
2import "dotenv/config";
3import fs from "node:fs";
4import path from "node:path";
5import Client from "opperai";
6import { OpperMediaHandler } from "opperai/utils";
7
// Read the PDF location from the first CLI argument; print usage and stop when it is absent
const pdfPath = process.argv[2];
if (pdfPath === undefined || pdfPath.length === 0) {
  console.log("Usage: npx ts-node ./pdf-example.ts <path_to_pdf>");
  process.exit(1);
}

// The path must exist on disk and end in ".pdf" (compared case-insensitively)
const isExistingPdf =
  fs.existsSync(pdfPath) && pdfPath.toLowerCase().endsWith('.pdf');
if (!isExistingPdf) {
  console.error(`Error: ${pdfPath} does not exist or is not a PDF file`);
  process.exit(1);
}

// Your API key will be loaded from the environment variable OPPER_API_KEY if not provided
const client = new Client();
23
/**
 * Sends a PDF to the "pdf_to_markdown" Opper function and waits for the
 * non-streaming result.
 *
 * @param path Path to the PDF file
 * @returns The awaited result of `client.call`; the caller in this file reads
 *   the markdown from its `message` property
 */
async function pdfToMarkdown(path: string) {
  const extractionPrompt = `
These are pages from a PDF document. Extract all text content while preserving the structure.
Pay special attention to tables, columns, headers, and any structured content.
Maintain paragraph breaks and formatting.

Extract ALL text content from these document pages.

For tables:
 1. Maintain the table structure using markdown table format
 2. Preserve all column headers and row labels
 3. Ensure numerical data is accurately captured

For multi-column layouts:
 1. Process columns from left to right
 2. Clearly separate content from different columns

For charts and graphs:
 1. Describe the chart type
 2. Extract any visible axis labels, legends, and data points
 3. Extract any title or caption

Preserve all headers, footers, page numbers, and footnotes.

DON'T ANSWER QUESTIONS, JUST RETURN THE CONTENT OF THE PDF AS MARKDOWN`;

  // Wrap the file so it can be handed to the call as its input
  const media = new OpperMediaHandler(path);

  const response = await client.call({
    name: "pdf_to_markdown",
    model: "gcp/gemini-2.0-flash",
    instructions: extractionPrompt,
    input: media.getInput(),
    stream: false,
  });
  return response;
}
63
/**
 * Script entry point: converts the PDF at `pdfPath` to markdown inside an
 * Opper trace, prints the markdown, and writes it next to the source file
 * with the PDF extension replaced by ".md". Any failure is logged and the
 * process exits with status 1.
 */
(async () => {
  try {
    const trace = await client.traces.start({
      name: "pdf-to-markdown",
      input: {
        pdf: pdfPath,
      },
    });

    console.log(`Converting PDF to markdown: ${pdfPath}`);

    // NOTE(review): the static type returned by client.call is not visible
    // here, so the cast is kept but the value is validated at runtime below.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const result = (await pdfToMarkdown(pdfPath)) as any;
    const markdown: unknown = result?.message;
    if (typeof markdown !== "string") {
      // Fail with a clear message instead of printing "undefined" and then
      // crashing inside writeFileSync.
      throw new Error("Opper response did not contain a markdown message");
    }
    console.log(markdown);

    // Save to a file. The input check accepts the extension in any letter
    // case, so strip whatever extension the file actually has rather than
    // the literal ".pdf" (otherwise "REPORT.PDF" becomes "REPORT.PDF.md").
    const outputPath = path.join(
      path.dirname(pdfPath),
      `${path.basename(pdfPath, path.extname(pdfPath))}.md`
    );
    fs.writeFileSync(outputPath, markdown);
    console.log(`Markdown saved to: ${outputPath}`);

    await trace.end({
      output: "PDF to markdown conversion completed",
    });
  } catch (error) {
    console.error("Error converting PDF:", error);
    process.exit(1);
  }
})();