LangSmith: Observability & Evaluation
LangSmith Fundamentals
LangSmith is LangChain's hosted platform for LLM observability, evaluation, and prompt management. It integrates with LangChain and LangGraph through automatic instrumentation: no code changes are needed, only a few environment variables (the tracing flag and API key are required; project name and endpoint are optional).
# .env file — all LangChain/LangGraph calls are traced automatically
LANGCHAIN_TRACING_V2=true
LANGSMITH_API_KEY=ls__your_key_here
LANGCHAIN_PROJECT=enterprise-rag-prod # organises traces by project
LANGCHAIN_ENDPOINT=https://api.smith.langchain.com # default
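With those variables set, existing LangChain code is traced without modification. A minimal sketch (assuming langchain-openai is installed and OPENAI_API_KEY is set):
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# No LangSmith imports are needed: with LANGCHAIN_TRACING_V2=true, this single
# invocation appears as a trace in the "enterprise-rag-prod" project.
chain = (
    ChatPromptTemplate.from_template("Summarise in one sentence: {text}")
    | ChatOpenAI(model="gpt-4o")
    | StrOutputParser()
)
print(chain.invoke({"text": "LangSmith records every step of a chain as a span."}))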
Tracing & Debugging
Every chain.invoke() or graph.invoke() produces a trace. The trace view in LangSmith shows a waterfall of spans — each span is one step in the pipeline with its inputs, outputs, duration, and token count.
Trace: "What is our refund policy?" 1.84s | 312 tokens ├── ChatPromptTemplate.format 2ms ├── ContextualCompressionRetriever 420ms │ ├── VectorStoreRetriever (k=20) 380ms [20 docs retrieved] │ └── CohereRerank (top_n=4) 38ms [4 docs kept] ├── ChatOpenAI (gpt-4o) 1,240ms | 248 tokens in / 64 out └── StrOutputParser 1ms
from langsmith import traceable, Client
from langchain_core.runnables import RunnableConfig
client = Client()
# @traceable: instruments a non-LangChain function as a LangSmith span
@traceable(name="custom_preprocessing", run_type="chain", tags=["preprocessing"])
def preprocess_query(query: str, user_id: str) -> dict:
    """Normalise and enrich the query before passing to the agent."""
    cleaned = query.strip().lower()
    return {
        "original_query": query,
        "cleaned_query": cleaned,
        "word_count": len(cleaned.split()),
        "user_id": user_id,
    }
# Adding metadata to any LangChain runnable call
# (`model` is your chat model instance, e.g. ChatOpenAI, defined elsewhere)
def agent_node_with_metadata(state: dict) -> dict:
    response = model.invoke(
        state["messages"],
        config=RunnableConfig(
            metadata={
                "user_id": state.get("user_id"),
                "session_id": state.get("session_id"),
                "department": state.get("department", "general"),
            },
            tags=["production", "v2.1.0"],
        ),
    )
    return {"messages": [response]}
# Attach user feedback to a completed run
def record_user_feedback(run_id: str, score: float, comment: str) -> None:
    """Record thumbs-up (1) or thumbs-down (0) feedback linked to a trace."""
    client.create_feedback(
        run_id=run_id,
        key="user_rating",
        score=score,  # numeric (e.g. 0.0-1.0) or boolean; here 1 = up, 0 = down
        comment=comment,
    )
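Traces can also be queried from the SDK, which is handy for quick latency or error checks outside the UI. A sketch using Client.list_runs (the project name and filter arguments here are assumptions; adjust them to your setup):
from datetime import datetime, timedelta, timezone

# Root runs from the last hour in the production project
runs = client.list_runs(
    project_name="enterprise-rag-prod",
    is_root=True,
    start_time=datetime.now(timezone.utc) - timedelta(hours=1),
)
for run in runs:
    latency = (run.end_time - run.start_time).total_seconds() if run.end_time else None
    print(run.name, run.status, latency, run.total_tokens)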
Datasets & Evaluation
Datasets in LangSmith are collections of input/output examples used to systematically evaluate your pipeline. The evaluation loop runs your pipeline against each example and scores the result.
from langsmith import Client
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langsmith.schemas import Run, Example
client = Client()
# ── Create or load a dataset ──
dataset_name = "enterprise-rag-golden-set"
try:
    dataset = client.read_dataset(dataset_name=dataset_name)
except Exception:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Golden set for enterprise RAG evaluation",
    )
# Add examples (question + expected answer pairs)
examples = [
    ("What is the SLA for P1 incidents?",
     "P1 incidents have a 4-hour response SLA per the service agreement."),
    ("How do I reset 2FA?",
     "Go to Account Settings → Security → Reset Two-Factor Authentication."),
    ("What is the maximum API rate limit?",
     "The standard tier allows 1,000 requests per minute."),
]
for question, expected in examples:
    client.create_example(
        inputs={"question": question},
        outputs={"answer": expected},
        dataset_id=dataset.id,
    )
# ── Define the pipeline to evaluate ──
def rag_pipeline(inputs: dict) -> dict:
    # rag_chain is your existing retrieval chain (retriever + prompt + model), built elsewhere
    answer = rag_chain.invoke(inputs["question"])
    return {"answer": answer}
# ── Built-in evaluators ──
# qa: LLM judges if the answer correctly answers the question
# criteria: LLM judges against custom criteria
def custom_grounded_evaluator(run: Run, example: Example) -> dict:
    """Custom evaluator: check that the answer references a source."""
    answer = (run.outputs or {}).get("answer", "")
    grounded = "[Source:" in answer or "page" in answer.lower()
    return {"key": "grounded", "score": 1 if grounded else 0}
# ── Run evaluation ──
results = evaluate(
    rag_pipeline,
    data=dataset_name,
    evaluators=[
        LangChainStringEvaluator("qa"),
        LangChainStringEvaluator(
            "criteria",
            config={"criteria": {"helpful": "Does the answer fully address the question?"}},
        ),
        custom_grounded_evaluator,
    ],
    experiment_prefix="rag-v2-test",
    metadata={"model": "gpt-4o", "retriever": "hybrid-rerank", "k": 4},
)
print(f"Experiment: {results.experiment_name}")
df = results.to_pandas()  # one "feedback.<key>" column per evaluator
for col in (c for c in df.columns if c.startswith("feedback.")):
    print(f"  {col.removeprefix('feedback.')}: {df[col].mean():.2f}")
Prompt Management with LangChain Hub
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# ── Push a prompt to the Hub ──
rag_prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an enterprise support assistant.
Answer using ONLY the provided context. Cite the source document.
If the answer is not in the context, say "I don't have that information."
Context:
{context}"""),
    ("human", "{question}"),
])
# Push to hub — creates a new version each time
hub.push("your-org/enterprise-rag-prompt", rag_prompt, new_repo_is_public=False)
# ── Pull the latest version at runtime ──
prompt = hub.pull("your-org/enterprise-rag-prompt")
chain = prompt | model | StrOutputParser()
# ── Pull a specific version by commit hash ──
prompt_v1 = hub.pull("your-org/enterprise-rag-prompt:abc123def")
# ── A/B test two prompt versions on the same dataset ──
# Pull each tagged version once and build the chain outside the target function.
# NOTE: the dataset inputs must supply every variable the prompt expects; for a
# RAG prompt like the one above, retrieve context inside the target function.
chain_v1 = hub.pull("your-org/enterprise-rag-prompt:v1") | model | StrOutputParser()
chain_v2 = hub.pull("your-org/enterprise-rag-prompt:v2") | model | StrOutputParser()
results_v1 = evaluate(
    lambda inputs: {"answer": chain_v1.invoke(inputs)},
    data=dataset_name, evaluators=[LangChainStringEvaluator("qa")],
    experiment_prefix="prompt-v1",
)
results_v2 = evaluate(
    lambda inputs: {"answer": chain_v2.invoke(inputs)},
    data=dataset_name, evaluators=[LangChainStringEvaluator("qa")],
    experiment_prefix="prompt-v2",
)
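The two experiments can be compared side by side in the LangSmith UI, or summarised in code. A short sketch that assumes to_pandas() names feedback columns "feedback.<key>":
for label, res in [("v1", results_v1), ("v2", results_v2)]:
    df = res.to_pandas()
    means = {c.removeprefix("feedback."): round(df[c].mean(), 2)
             for c in df.columns if c.startswith("feedback.")}
    print(f"{label} ({res.experiment_name}): {means}")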
CI/CD Integration
"""
ci_eval.py — run this in your CI pipeline (GitHub Actions, GitLab CI, etc.)
Exit code 0 = pass, 1 = quality regression detected.
"""
import sys
from langsmith import Client
from langsmith.evaluation import evaluate, LangChainStringEvaluator
client = Client()
QUALITY_THRESHOLD = 0.80 # 80% minimum on every metric
DATASET_NAME = "enterprise-rag-golden-set"
def run_ci_evaluation() -> bool:
    """Returns True if all quality gates pass."""
    results = evaluate(
        rag_pipeline,  # the pipeline under test, imported from your application code
        data=DATASET_NAME,
        evaluators=[
            LangChainStringEvaluator("qa"),
            LangChainStringEvaluator(
                "criteria",
                config={"criteria": {"helpful": "Fully addresses the question"}},
            ),
        ],
        experiment_prefix="ci-run",
    )
    df = results.to_pandas()
    # one "feedback.<key>" column per evaluator; average each column for the metric score
    feedback_cols = [c for c in df.columns if c.startswith("feedback.")]
    all_pass = True
    for col in feedback_cols:
        metric = col.removeprefix("feedback.")
        score = df[col].mean()
        status = "✓ PASS" if score >= QUALITY_THRESHOLD else "✗ FAIL"
        print(f"  {status} {metric}: {score:.2f} (threshold: {QUALITY_THRESHOLD})")
        if score < QUALITY_THRESHOLD:
            all_pass = False
    return all_pass
if __name__ == "__main__":
    print(f"Running CI evaluation on dataset: {DATASET_NAME}")
    passed = run_ci_evaluation()
    if not passed:
        print("\n❌ Quality regression detected. Blocking deployment.")
        sys.exit(1)
    else:
        print("\n✅ All quality gates passed.")
        sys.exit(0)
Create at least 20 representative examples before going to production. Curate from real user queries once live. After any significant change (new retriever, new model, new prompt), run the eval suite and compare experiment results in the LangSmith UI before deploying.
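Curation can be scripted too. A hedged sketch (the run filters and input/output field names are assumptions about your pipeline) that copies recent production traces into the golden dataset as candidate examples:
from langsmith import Client

client = Client()
dataset = client.read_dataset(dataset_name="enterprise-rag-golden-set")

# Pull recent successful root runs from production and stage them as examples;
# review and correct the expected answers before treating them as ground truth.
for run in client.list_runs(
    project_name="enterprise-rag-prod",
    is_root=True,
    error=False,
    limit=50,
):
    if run.inputs and run.outputs:
        client.create_example(
            inputs={"question": run.inputs.get("question", "")},
            outputs={"answer": run.outputs.get("answer", "")},
            dataset_id=dataset.id,
        )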