Multimodal Interaction
1. Multimodal Input/Output Map
| Modality | Direction | Recommended Library / Model |
|---|---|---|
| Image understanding | Input → LLM | GPT-4o / Claude 3.5 Sonnet / Gemini Pro Vision |
| Audio transcription | Input → Text | OpenAI Whisper (API or local) |
| Document intelligence | PDF/Docx → Text+Layout | Azure DI / Unstructured / pdfplumber |
| Image generation | Text → Image | DALL-E 3 / Stable Diffusion (via LangChain tool) |
| Text-to-speech | Text → Audio | OpenAI TTS / ElevenLabs |
| Video frames | Video → Images → LLM | OpenCV frame extraction + vision model |
2. Vision Models with LangChain
GPT-4o and similar vision models accept images as part of the message. LangChain represents image content as a list containing text and image_url parts.
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
llm = ChatOpenAI(model="gpt-4o", max_tokens=1024)
# Pass image as URL
message = HumanMessage(content=[
    {"type": "text", "text": "Describe what you see in this image."},
    {
        "type": "image_url",
        "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/240px-PNG_transparency_demonstration_1.png"},
    },
])
response = llm.invoke([message])
print(response.content)
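To analyse a local file instead of a URL, encode the image as base64 and embed it as a data URL: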
import base64
from pathlib import Path
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
def encode_image(path: str) -> str:
    return base64.b64encode(Path(path).read_bytes()).decode("utf-8")
llm = ChatOpenAI(model="gpt-4o")
image_b64 = encode_image("invoice.png")
message = HumanMessage(content=[
    {"type": "text", "text": "Extract all line items, quantities, and prices from this invoice."},
    {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{image_b64}"},
    },
])
response = llm.invoke([message])
print(response.content)
"detail": "high" or "detail": "low" inside image_url to control token usage. High detail costs more tokens but captures fine text; low is cheaper for general scene understanding.
3. Structured Output from Images
Combine vision input with with_structured_output() to extract typed data directly from images.
from pydantic import BaseModel, Field
from typing import List
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
class LineItem(BaseModel):
    description: str
    quantity: int
    unit_price: float
    total: float

class Invoice(BaseModel):
    invoice_number: str
    vendor: str
    date: str
    line_items: List[LineItem]
    grand_total: float
structured_llm = ChatOpenAI(model="gpt-4o").with_structured_output(Invoice)
def extract_invoice(image_b64: str) -> Invoice:
    message = HumanMessage(content=[
        {"type": "text", "text": "Extract structured invoice data from this image."},
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
    ])
    return structured_llm.invoke([message])
invoice = extract_invoice(encode_image("invoice.png"))
print(f"Total: {invoice.grand_total}, Items: {len(invoice.line_items)}")
4. Audio Transcription with Whisper
Whisper converts speech to text. Use the OpenAI Whisper API for production, or run the open-source model locally for air-gapped environments.
from openai import OpenAI
client = OpenAI()
def transcribe_audio(audio_path: str, language: str = "en") -> str:
with open(audio_path, "rb") as f:
transcript = client.audio.transcriptions.create(
model="whisper-1",
file=f,
language=language,
response_format="text",
)
return transcript
text = transcribe_audio("customer_call.mp3")
print(text)
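For local or air-gapped use, install the open-source package and run the model on your own hardware: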
import whisper # pip install openai-whisper
model = whisper.load_model("base") # tiny / base / small / medium / large
result = model.transcribe("customer_call.mp3")
print(result["text"])
# Access segment-level timestamps (pass word_timestamps=True to transcribe() for word-level)
for segment in result["segments"]:
    print(f"[{segment['start']:.1f}s] {segment['text']}")
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
openai_client = OpenAI()
llm = ChatOpenAI(model="gpt-4o-mini")
prompt = ChatPromptTemplate.from_messages([
("system", "You are a helpful voice assistant. Be concise."),
("human", "{transcript}"),
])
chain = prompt | llm
def handle_voice_input(audio_path: str) -> str:
    # Step 1: Speech → Text
    with open(audio_path, "rb") as f:
        transcript = openai_client.audio.transcriptions.create(
            model="whisper-1", file=f, response_format="text"
        )
    # Step 2: Text → Agent
    response = chain.invoke({"transcript": transcript})
    return response.content
answer = handle_voice_input("question.mp3")
print(answer)
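To close the loop with text-to-speech (listed in the modality map above), the reply can be synthesised back to audio. A minimal sketch using the OpenAI TTS endpoint; the model and voice names are illustrative defaults, and the speak helper is not part of the earlier examples:
from pathlib import Path
from openai import OpenAI

tts_client = OpenAI()

def speak(text: str, out_path: str = "reply.mp3") -> str:
    """Synthesise speech for a reply and save it to disk."""
    response = tts_client.audio.speech.create(
        model="tts-1",   # "tts-1-hd" trades latency for quality
        voice="alloy",
        input=text,
    )
    Path(out_path).write_bytes(response.content)
    return out_path

audio_file = speak(handle_voice_input("question.mp3"))  # voice in, voice out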
5. Document Intelligence with Unstructured
The unstructured library extracts text, tables, headers, and layout metadata from PDFs, Word docs, HTML, and more — preserving semantic structure for better RAG chunking.
pip install "unstructured[all-docs]" langchain-community
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
loader = UnstructuredFileLoader(
    "annual_report.pdf",
    mode="elements",    # preserves element types (Title, Table, NarrativeText)
    strategy="hi_res",  # uses OCR for scanned PDFs
)
elements = loader.load()
# Filter only narrative text, skip headers/footers
narrative = [e for e in elements if e.metadata.get("category") == "NarrativeText"]
print(f"Loaded {len(narrative)} narrative elements")
# Chunk for RAG
splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
chunks = splitter.split_documents(narrative)
print(f"Split into {len(chunks)} chunks")
from langchain_community.document_loaders import UnstructuredFileLoader
loader = UnstructuredFileLoader("financial_report.pdf", mode="elements")
elements = loader.load()
tables = [e for e in elements if e.metadata.get("category") == "Table"]
for i, table in enumerate(tables):
    print(f"--- Table {i+1} ---")
    print(table.page_content[:300])  # plain-text rendering of the table
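With the hi_res strategy, Unstructured can also attach an HTML rendering of each table in the element metadata (typically under text_as_html), which is often easier for an LLM to read than the flattened text.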
6. Multimodal RAG
Multimodal RAG retrieves both text and images relevant to a query, then passes them together to a vision model for synthesis.
import base64
from pathlib import Path
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from langchain_core.documents import Document
# 1. Build a mixed-modality store
# Images are stored with base64 content; text chunks stored normally
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
store = Chroma(embedding_function=embeddings, collection_name="multimodal-rag")
def index_image(image_path: str, caption: str):
    b64 = base64.b64encode(Path(image_path).read_bytes()).decode()
    doc = Document(
        page_content=caption,  # embed the caption for retrieval
        metadata={"type": "image", "b64": b64, "path": image_path},
    )
    store.add_documents([doc])

def index_text(text: str, source: str):
    doc = Document(page_content=text, metadata={"type": "text", "source": source})
    store.add_documents([doc])
# 2. Retrieve relevant documents
def retrieve(query: str, k: int = 4) -> list[Document]:
    return store.similarity_search(query, k=k)
# 3. Build multimodal prompt from retrieved docs
def build_multimodal_prompt(query: str, docs: list[Document]) -> list:
    parts = [{"type": "text", "text": f"Answer the question using the provided context.\n\nQuestion: {query}\n\nContext:"}]
    for doc in docs:
        if doc.metadata.get("type") == "image":
            parts.append({"type": "text", "text": f"\n[Image: {doc.page_content}]"})
            parts.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/png;base64,{doc.metadata['b64']}"},
            })
        else:
            parts.append({"type": "text", "text": f"\n{doc.page_content}"})
    return parts
# 4. Run the multimodal RAG chain
llm = ChatOpenAI(model="gpt-4o", max_tokens=1024)
def multimodal_rag_query(question: str) -> str:
    docs = retrieve(question)
    content_parts = build_multimodal_prompt(question, docs)
    response = llm.invoke([HumanMessage(content=content_parts)])
    return response.content
# Index some content
index_text("Our Q3 revenue was $4.2M, up 18% YoY.", source="q3-report.pdf")
index_image("q3_chart.png", "Q3 revenue bar chart showing monthly breakdown")
answer = multimodal_rag_query("What was the Q3 revenue growth?")
print(answer)
7. Image Generation as an Agent Tool
Agents can generate images by wrapping DALL-E or Stable Diffusion in a LangChain tool.
from openai import OpenAI
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
openai_client = OpenAI()
@tool
def generate_image(prompt: str, size: str = "1024x1024") -> str:
"""Generate an image from a text prompt using DALL-E 3.
Returns the URL of the generated image.
size: '1024x1024' | '1792x1024' | '1024x1792'
"""
response = openai_client.images.generate(
model="dall-e-3",
prompt=prompt,
size=size,
quality="standard",
n=1,
)
url = response.data[0].url
return f"Image generated: {url}"
llm = ChatOpenAI(model="gpt-4o")
agent = create_react_agent(llm, tools=[generate_image])
result = agent.invoke({
    "messages": [("human", "Create an image of a futuristic city skyline at sunset.")]
})
print(result["messages"][-1].content)
8. Video Frame Analysis
For video, extract key frames and pass them to a vision model for analysis.
import base64
import cv2 # pip install opencv-python
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
def extract_frames(video_path: str, interval_sec: int = 5) -> list[str]:
    """Extract frames every N seconds. Returns list of base64 strings."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = max(int(fps * interval_sec), 1)  # guard against fps=0 on unreadable files
    frames = []
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % frame_interval == 0:
            _, buffer = cv2.imencode(".jpg", frame)
            frames.append(base64.b64encode(buffer).decode("utf-8"))
        frame_idx += 1
    cap.release()
    return frames
def analyse_video(video_path: str, question: str) -> str:
    frames = extract_frames(video_path, interval_sec=10)
    llm = ChatOpenAI(model="gpt-4o", max_tokens=1024)
    content = [{"type": "text", "text": f"Analyse these video frames and answer: {question}"}]
    for b64 in frames[:10]:  # limit to 10 frames to control token cost
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}})
    response = llm.invoke([HumanMessage(content=content)])
    return response.content
answer = analyse_video("product_demo.mp4", "What product features are demonstrated?")
print(answer)