Introduction
I used to think "vision models" were mostly for captions. The cookbook convinced me otherwise: vision gets interesting when you treat images like documents—charts, invoices, slides, screenshots—and you build pipelines around extraction.
That said, I don't consider vision perfectly reliable. I usually assume I'll need guardrails (structured outputs, second-pass checks, sometimes a human review step) when the extracted data matters.
1. Basic Image Analysis
Location: multimodal/getting_started_with_vision.ipynb
Image Input Formats
import base64
import httpx
from anthropic import Anthropic

client = Anthropic()

# From URL
def analyze_image_url(url: str, question: str) -> str:
    image_data = base64.standard_b64encode(httpx.get(url).content).decode("utf-8")
    media_type = "image/jpeg"  # or detect from URL
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1000,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_data
                    }
                },
                {"type": "text", "text": question}
            ]
        }]
    )
    return response.content[0].text

# From file
def analyze_image_file(path: str, question: str) -> str:
    with open(path, "rb") as f:
        image_data = base64.standard_b64encode(f.read()).decode("utf-8")
    media_type = "image/png" if path.endswith(".png") else "image/jpeg"
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1000,
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "source": {"type": "base64", "media_type": media_type, "data": image_data}
                },
                {"type": "text", "text": question}
            ]
        }]
    )
    return response.content[0].text
In practice, I keep an eye on image size and compression. If you're sending huge screenshots, you'll pay for it (latency and tokens), and sometimes the model does worse because important text becomes small and blurry.
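When I want a quick guard, I downscale before encoding. Here's a minimal sketch with Pillow; the ~1568 px long-edge cap and the function name are my own working assumptions, so tune the threshold for your images.

import io

from PIL import Image

def downscale_for_vision(image_bytes: bytes, max_edge: int = 1568) -> bytes:
    # 1568 px on the long edge is my rule of thumb, not a hard limit;
    # shrink less if important text becomes unreadable after resizing.
    img = Image.open(io.BytesIO(image_bytes))
    scale = max_edge / max(img.size)
    if scale < 1:
        img = img.resize((int(img.width * scale), int(img.height * scale)), Image.LANCZOS)
    buffer = io.BytesIO()
    img.save(buffer, format="PNG")
    return buffer.getvalue()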
2. Vision Best Practices
Location: multimodal/best_practices_for_vision.ipynb
Image Placement
When I'm sending multiple images, I've had better luck putting images first and questions last. It sounds small, but it noticeably reduces confusion about "which image are we talking about?"
def compare_images(images: list[tuple[str, str]], question: str) -> str:
    content = []
    for i, (data, media_type) in enumerate(images):
        content.append({
            "type": "image",
            "source": {"type": "base64", "media_type": media_type, "data": data}
        })
        content.append({"type": "text", "text": f"Image {i+1}"})
    content.append({"type": "text", "text": question})
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1500,
        messages=[{"role": "user", "content": content}]
    )
    return response.content[0].text
Prompt Structure for Vision
import json

def structured_image_analysis(image_data: str, media_type: str) -> dict:
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": """Analyze this image and provide:
1. **Description**: What is shown in the image?
2. **Key Elements**: List the main objects/text visible
3. **Context**: What is the purpose or setting?
4. **Notable Details**: Any important observations

Format as JSON."""}
            ]
        }]
    )
    return json.loads(response.content[0].text)
I'll be honest: "format as JSON" works often, but not always. When I need this to be robust, I prefer tool-based extraction (same idea as in the tool-use post) so I'm not depending on perfect JSON formatting.
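For comparison, here's what the tool-based version of the same analysis looks like. The record_image_analysis tool name and schema are mine, not from the cookbook; the point is that forcing a tool call hands you a parsed dict instead of JSON embedded in prose.

analysis_tool = {
    "name": "record_image_analysis",
    "description": "Record the structured analysis of an image",
    "input_schema": {
        "type": "object",
        "properties": {
            "description": {"type": "string"},
            "key_elements": {"type": "array", "items": {"type": "string"}},
            "context": {"type": "string"},
            "notable_details": {"type": "array", "items": {"type": "string"}}
        },
        "required": ["description", "key_elements", "context", "notable_details"]
    }
}

def structured_image_analysis_via_tool(image_data: str, media_type: str) -> dict:
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        tools=[analysis_tool],
        # Force the tool call so the output is always structured.
        tool_choice={"type": "tool", "name": "record_image_analysis"},
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": "Analyze this image."}
            ]
        }]
    )
    # The tool input arrives already parsed; no json.loads needed.
    return next(b for b in response.content if b.type == "tool_use").input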
3. Chart and Graph Interpretation
Location: multimodal/reading_charts_graphs_powerpoints.ipynb
Extracting Data from Charts
import json

def extract_chart_data(image_data: str, media_type: str) -> dict:
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": """Extract all data from this chart.
Return JSON with:
{
  "chart_type": "bar/line/pie/etc",
  "title": "...",
  "x_axis": {"label": "...", "values": [...]},
  "y_axis": {"label": "...", "unit": "..."},
  "data_series": [
    {"name": "...", "values": [...]}
  ],
  "insights": ["..."]
}"""}
            ]
        }]
    )
    return json.loads(response.content[0].text)
Charts are where I'm most cautious. The model is good at "reading" plots, but if axis ticks are tiny or the chart is stylized, it can guess. When accuracy matters, I'll ask it to call out uncertainty ("values approximate") or extract only what's clearly labeled.
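One cheap consistency check I run on the extracted JSON before trusting it. This is a sketch against the schema requested above; the check and messages are mine, and it only catches structural problems, not misread values.

def sanity_check_chart(data: dict) -> list[str]:
    # Every data series should have one value per x-axis entry.
    warnings = []
    x_values = data.get("x_axis", {}).get("values", [])
    for series in data.get("data_series", []):
        n = len(series.get("values", []))
        if n != len(x_values):
            warnings.append(
                f"series {series.get('name')!r}: {n} values vs {len(x_values)} x-axis entries"
            )
    return warnings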
PowerPoint Slide Analysis
import json

def analyze_slide(image_data: str, media_type: str, slide_number: int) -> dict:
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=1500,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": f"""Analyze slide {slide_number}.
Extract:
- Title
- Main points (as bullet list)
- Any data/charts (summarize)
- Speaker notes suggestions

Return as JSON."""}
            ]
        }]
    )
    return json.loads(response.content[0].text)
4. Form Extraction (OCR)
Location: multimodal/how_to_transcribe_text.ipynb
Invoice Processing
import json

def extract_invoice(image_data: str, media_type: str) -> dict:
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                {"type": "text", "text": """Extract all information from this invoice.
Return JSON:
{
  "vendor": {"name": "", "address": "", "contact": ""},
  "invoice_number": "",
  "date": "",
  "due_date": "",
  "line_items": [{"description": "", "quantity": 0, "unit_price": 0, "total": 0}],
  "subtotal": 0,
  "tax": 0,
  "total": 0,
  "payment_terms": ""
}"""}
            ]
        }]
    )
    return json.loads(response.content[0].text)
If you're automating invoices, my experience is that you'll want validation rules (totals add up, dates parse, tax is non-negative) and an exception queue. The model can read messy scans, but it's not a substitute for accounting sanity checks.
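Here's a sketch of the kind of validation layer I mean, written against the JSON shape requested above. The tolerance and the ISO-date assumption are mine; if your invoices use other date formats, normalize first or ask for ISO 8601 in the prompt.

from datetime import date

def validate_invoice(inv: dict, tolerance: float = 0.01) -> list[str]:
    errors = []
    # Totals should add up, within a small rounding tolerance.
    line_total = sum(item.get("total", 0) for item in inv.get("line_items", []))
    if abs(line_total - inv.get("subtotal", 0)) > tolerance:
        errors.append("line items do not sum to subtotal")
    if abs(inv.get("subtotal", 0) + inv.get("tax", 0) - inv.get("total", 0)) > tolerance:
        errors.append("subtotal + tax does not equal total")
    if inv.get("tax", 0) < 0:
        errors.append("tax is negative")
    # Dates should parse (assuming ISO 8601 here).
    try:
        date.fromisoformat(inv.get("date", ""))
    except ValueError:
        errors.append("date does not parse as ISO 8601")
    return errors

Anything that fails a check goes to the exception queue instead of straight into the books.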
5. Crop Tool for Detailed Analysis
Location: multimodal/crop_tool.ipynb
The crop tool is a clever pattern: let the model ask for a zoomed-in region instead of hoping it can read tiny text from the full image.
import base64
import io

from PIL import Image

def crop_image(image_bytes: bytes, x1: int, y1: int, x2: int, y2: int) -> str:
    # Crop the requested region and return it as base64-encoded PNG.
    img = Image.open(io.BytesIO(image_bytes))
    cropped = img.crop((x1, y1, x2, y2))
    buffer = io.BytesIO()
    cropped.save(buffer, format="PNG")
    return base64.standard_b64encode(buffer.getvalue()).decode()

crop_tool = {
    "name": "crop_image",
    "description": "Crop a region of the image for closer inspection",
    "input_schema": {
        "type": "object",
        "properties": {
            "x1": {"type": "integer", "description": "Left coordinate"},
            "y1": {"type": "integer", "description": "Top coordinate"},
            "x2": {"type": "integer", "description": "Right coordinate"},
            "y2": {"type": "integer", "description": "Bottom coordinate"},
            "reason": {"type": "string", "description": "Why this region needs inspection"}
        },
        "required": ["x1", "y1", "x2", "y2", "reason"]
    }
}
def analyze_with_crop(image_bytes: bytes, question: str) -> str:
    image_data = base64.standard_b64encode(image_bytes).decode()
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": image_data}},
            {"type": "text", "text": f"{question}\n\nIf you need to see a region more clearly, use the crop_image tool."}
        ]
    }]
    response = client.messages.create(
        model="claude-sonnet-4-20250514",
        max_tokens=2000,
        tools=[crop_tool],
        messages=messages
    )
    # Keep serving crops until the model stops asking for them.
    while response.stop_reason == "tool_use":
        tool_block = next(b for b in response.content if b.type == "tool_use")
        cropped_data = crop_image(
            image_bytes,
            tool_block.input["x1"],
            tool_block.input["y1"],
            tool_block.input["x2"],
            tool_block.input["y2"]
        )
        messages.append({"role": "assistant", "content": response.content})
        messages.append({
            "role": "user",
            "content": [
                {"type": "tool_result", "tool_use_id": tool_block.id, "content": [
                    {"type": "image", "source": {"type": "base64", "media_type": "image/png", "data": cropped_data}}
                ]}
            ]
        })
        response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=2000,
            tools=[crop_tool],
            messages=messages
        )
    return response.content[0].text
If you implement something like this, I'd add coordinate bounds checks and maybe rate limits—partly for safety, partly to avoid wasting cycles on repeated crops of almost the same region.
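Concretely, that might look like a guard wrapped around crop_image (the minimum size and the crop cap are arbitrary numbers of mine):

MAX_CROPS_PER_REQUEST = 5  # arbitrary cap to stop runaway crop loops

def safe_crop(image_bytes: bytes, x1: int, y1: int, x2: int, y2: int,
              min_size: int = 8) -> str:
    img = Image.open(io.BytesIO(image_bytes))
    # Normalize coordinate order, then clamp to the actual image bounds.
    x1, x2 = sorted((x1, x2))
    y1, y2 = sorted((y1, y2))
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(img.width, x2), min(img.height, y2)
    if x2 - x1 < min_size or y2 - y1 < min_size:
        raise ValueError("crop region is out of bounds or too small to be useful")
    return crop_image(image_bytes, x1, y1, x2, y2)

In the loop above, I'd also count crops per request and bail out (or return a text tool_result explaining the refusal) once the cap is hit.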
Summary
| Capability | Use Case |
|---|---|
| Basic Vision | Image description, object detection |
| Multi-image | Comparison, before/after analysis |
| Chart Analysis | Data extraction from visualizations |
| OCR/Forms | Invoice processing, document digitization |
| Crop Tool | Detailed inspection of specific regions |
Next I'll get into agent patterns, which is where vision + tool use + retrieval start to feel like a cohesive workflow instead of separate features.