Instructor 结构化输出¶

为什么要学¶

LLM 的输出是自由文本，但程序需要结构化数据。Instructor 解决了这个核心问题：

类型安全：用 Pydantic 模型定义输出结构，自动验证
零解析代码：不需要写正则或 JSON 解析逻辑
自动重试：验证失败时自动让 LLM 修正
多模型支持：支持 OpenAI、Anthropic、Ollama、Gemini 等多种模型
Pythonic：使用原生 Python 类型提示，IDE 自动补全体验好

一句话：让 LLM 可靠地输出你定义的任何 Python 数据结构。

核心概念¶

白话解释¶

Instructor = 给 LLM 装一个"结构化输出适配器"
Pydantic Model = 告诉 LLM "我要的数据长这样"
Validation = 如果 LLM 返回格式不对，自动让它重来
Retry = "你没按格式来，请修正" 的自动化过程

核心概念对照表¶

概念	说明	类比
Pydantic Model	定义输出数据结构	表单模板
Field	模型中的一个字段(含描述)	表单中的一个填写项
Validator	字段验证规则	表单校验(必填/格式等)
Patch/Wrap	Instructor注入方式	给OpenAI客户端"升级"
response_model	指定返回类型	期望的返回值类型
max_retries	验证失败最大重试次数	"最多让LLM改几次"
Partial	流式返回部分结构化数据	边填表边显示
Iterable	流式返回列表中的项	逐条返回结果

安装配置¶

安装¶

pip install instructor

# Install with provider-specific extras.
# The requirement is quoted so shells that treat [] as glob characters
# (e.g. zsh) hand it to pip unchanged instead of failing to expand it.
# NOTE(review): confirm each extra name against the installed instructor
# release; pip only prints a warning when an extra is not provided.
pip install "instructor[openai]"
pip install "instructor[anthropic]"
pip install "instructor[google-generativeai]"
pip install "instructor[ollama]"

基本配置¶

import instructor
from openai import OpenAI
from pydantic import BaseModel

# The three options below are alternatives: each one rebinds `client`,
# so keep only the one that matches your provider.

# Option 1: wrap an OpenAI client; from_openai returns the client to call
client = instructor.from_openai(OpenAI())

# Option 2: wrap an Anthropic client
from anthropic import Anthropic
client = instructor.from_anthropic(Anthropic())

# Option 3: local Ollama, reached with the OpenAI client pointed at localhost
from openai import OpenAI
client = instructor.from_openai(
    OpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
    mode=instructor.Mode.JSON
)

快速上手¶

第一个结构化输出¶

import instructor
from openai import OpenAI
from pydantic import BaseModel

# Wrap the OpenAI client; the wrapped create() is called with response_model below
client = instructor.from_openai(OpenAI())

# Target schema: the fields the reply is parsed into
class UserInfo(BaseModel):
    name: str
    age: int
    email: str

# Ask the LLM for a UserInfo instance instead of free text
user = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=UserInfo,
    messages=[
        {"role": "user", "content": "从以下文本中提取用户信息：张三今年25岁，邮箱是zhangsan@example.com"}
    ]
)

print(user)
# Example output: UserInfo(name='张三', age=25, email='zhangsan@example.com')
print(user.name)  # 张三
print(user.age)   # 25

带描述的字段¶

from pydantic import BaseModel, Field

# Review schema: every field carries a description, and ge/le bound the
# rating to the 1-10 range.
class MovieReview(BaseModel):
    title: str = Field(description="电影名称")
    rating: float = Field(description="评分(1-10)", ge=1, le=10)
    summary: str = Field(description="一句话总结")
    pros: list[str] = Field(description="优点列表")
    cons: list[str] = Field(description="缺点列表")
    recommended: bool = Field(description="是否推荐观看")

# Request a MovieReview for the film named in the prompt
review = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=MovieReview,
    messages=[
        {"role": "user", "content": "评价电影《盗梦空间》"}
    ]
)

# The result is a typed object, so fields are read as plain attributes
print(f"{review.title}: {review.rating}/10")
print(f"推荐: {'是' if review.recommended else '否'}")
print(f"优点: {', '.join(review.pros)}")

带验证的输出¶

from pydantic import BaseModel, Field, field_validator

# Structured answer to a coding request: language, code, explanation and
# time complexity, with validators on the language and code fields.
class CodeSolution(BaseModel):
    language: str = Field(description="编程语言")
    code: str = Field(description="完整可运行的代码")
    explanation: str = Field(description="代码解释")
    time_complexity: str = Field(description="时间复杂度，如O(n)")

    @field_validator("language")
    @classmethod
    def validate_language(cls, v: str) -> str:
        # Normalize once before comparing, so a value such as " Python " is
        # accepted instead of failing over whitespace or casing alone.
        normalized = v.strip().lower()
        allowed = ["python", "javascript", "typescript", "go", "rust"]
        if normalized not in allowed:
            raise ValueError(f"语言必须是以下之一: {allowed}")
        # The normalized spelling is what ends up stored on the model.
        return normalized

    @field_validator("code")
    @classmethod
    def validate_code(cls, v: str) -> str:
        # Measure the stripped text so whitespace-only output counts as
        # empty; the original string is returned untouched to keep its
        # indentation intact.
        if len(v.strip()) < 10:
            raise ValueError("代码不能为空或过短")
        return v

# Request a CodeSolution for the prompt below
solution = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=CodeSolution,
    max_retries=3,  # retry automatically when validation fails
    messages=[
        {"role": "user", "content": "用Python实现二分查找"}
    ]
)

进阶用法¶

1. 嵌套模型¶

from pydantic import BaseModel, Field
from typing import Optional
from enum import Enum

# Closed set of priority labels; members are also plain strings
class Priority(str, Enum):
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

# One unit of work inside a plan
class SubTask(BaseModel):
    title: str
    estimated_hours: float
    priority: Priority

# Top-level plan; `tasks` nests a list of SubTask models
class ProjectPlan(BaseModel):
    project_name: str
    description: str
    total_hours: float
    tasks: list[SubTask] = Field(description="拆分的子任务列表")
    risks: list[str] = Field(description="潜在风险")
    tech_stack: list[str]

# Request a full ProjectPlan, nested sub-tasks included
plan = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=ProjectPlan,
    messages=[
        {"role": "user", "content": "规划一个个人博客网站项目，使用Next.js + Supabase"}
    ]
)

# Nested items are typed too: priority is a Priority member, hence .value
for task in plan.tasks:
    print(f"[{task.priority.value}] {task.title} ({task.estimated_hours}h)")

2. 流式输出(Partial)¶

from instructor import Partial

# Stream partially populated objects as the response arrives
for partial_user in client.chat.completions.create_partial(
    model="gpt-4o-mini",
    response_model=UserInfo,
    messages=[{"role": "user", "content": "提取信息：李四30岁，lisi@test.com"}]
):
    print(partial_user)
    # Fields fill in step by step (illustrative; the exact repr may differ):
    # UserInfo(name='李四', age=None, email=None)
    # → UserInfo(name='李四', age=30, email=None)
    # → UserInfo(name='李四', age=30, email='lisi@test.com')

3. 列表提取(Iterable)¶

# Schema for a single extracted person
class Person(BaseModel):
    name: str
    role: str
    company: str

# Extract several entities from one text; response_model is the item type
people = client.chat.completions.create_iterable(
    model="gpt-4o-mini",
    response_model=Person,
    messages=[{
        "role": "user",
        "content": """提取所有人物信息：
        张三是阿里巴巴的高级工程师，
        李四在字节跳动担任产品经理，
        王五是腾讯的设计师。"""
    }]
)

for person in people:  # items are yielded one at a time
    print(f"{person.name} - {person.role} @ {person.company}")

4. 多模型支持¶

# Anthropic Claude
from anthropic import Anthropic
import instructor

client = instructor.from_anthropic(Anthropic())
# Here the call goes through client.messages.create and passes max_tokens
result = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    response_model=UserInfo,
    messages=[{"role": "user", "content": "..."}]
)

# Google Gemini: the model is chosen when the GenerativeModel is constructed
import google.generativeai as genai
client = instructor.from_gemini(
    client=genai.GenerativeModel("gemini-1.5-flash")
)

# Ollama (local): reuse the OpenAI client, pointed at the local server.
# OpenAI is imported here so this snippet does not depend on an import
# made in an earlier section.
from openai import OpenAI
client = instructor.from_openai(
    OpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
    mode=instructor.Mode.JSON
)

5. 分类任务¶

from enum import Enum
# Field is imported alongside BaseModel because the model below uses it
from pydantic import BaseModel, Field

# Closed set of sentiment labels; members are also plain strings
class Sentiment(str, Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

# Classification result; confidence is constrained to the range 0-1
class TextClassification(BaseModel):
    sentiment: Sentiment
    confidence: float = Field(ge=0, le=1)
    keywords: list[str] = Field(description="关键情感词")
    reasoning: str = Field(description="判断理由")

# Classify one review; the reply is parsed into TextClassification
result = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=TextClassification,
    messages=[{
        "role": "user",
        "content": "分类这条评论的情感：这个产品质量很差，完全不值这个价格，退货过程也很麻烦"
    }]
)

# sentiment is a Sentiment member, so .value yields the raw label string
print(f"情感: {result.sentiment.value} (置信度: {result.confidence})")

6. 批量处理¶

import asyncio
from openai import AsyncOpenAI

# Async variant: wraps AsyncOpenAI; its create() is awaited in extract_info
async_client = instructor.from_openai(AsyncOpenAI())

async def extract_info(text: str) -> UserInfo:
    """Extract a single UserInfo from ``text`` with one awaited LLM call.

    Args:
        text: Free-form text that mentions the user's details.

    Returns:
        The UserInfo parsed from the model's reply.
    """
    request = {"role": "user", "content": f"提取信息：{text}"}
    parsed = await async_client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=UserInfo,
        messages=[request],
    )
    return parsed

async def batch_extract(texts: list[str]) -> list[UserInfo]:
    """Run extract_info over every text concurrently.

    Args:
        texts: Input texts, one extraction request per entry.

    Returns:
        The extracted UserInfo objects, in the same order as ``texts``.
    """
    # Star-unpacking consumes the map, creating one coroutine per text
    # before gather schedules them together.
    gathered = await asyncio.gather(*map(extract_info, texts))
    return gathered

# Placeholder inputs; asyncio.run drives the coroutine from synchronous code
texts = ["张三25岁...", "李四30岁...", "王五28岁..."]
results = asyncio.run(batch_extract(texts))

7. 重试与自定义验证¶

from tenacity import retry, stop_after_attempt
# Field is imported alongside BaseModel because the model below uses it
from pydantic import BaseModel, Field, model_validator

# Answer with at least one cited source and a confidence in the range 0-1
class SafeAnswer(BaseModel):
    answer: str
    sources: list[str] = Field(min_length=1)
    confidence: float = Field(ge=0, le=1)

    @model_validator(mode="after")
    def validate_answer(self) -> "SafeAnswer":
        # Cross-field rules: each check relates two fields, which the
        # per-field constraints above cannot express.
        if self.confidence > 0.8 and len(self.sources) < 2:
            raise ValueError("高置信度回答需要至少2个来源")
        if "不确定" in self.answer and self.confidence > 0.5:
            raise ValueError("含'不确定'的回答置信度不应超过0.5")
        return self

# Request a SafeAnswer for the question below
result = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=SafeAnswer,
    max_retries=3,  # retry at most 3 times when validation fails
    messages=[{"role": "user", "content": "量子计算的现状如何?"}]
)

常见问题¶

Q1: 和 OpenAI 原生 JSON Mode 的区别？¶

特性	Instructor	OpenAI JSON Mode
类型验证	Pydantic完整验证	只保证是有效JSON
自动重试	验证失败自动修正	无
嵌套结构	完美支持	需要手动处理
IDE支持	完整类型提示	无
多模型	支持多家模型提供商	仅OpenAI

Q2: 验证一直失败怎么办？¶

简化 Pydantic 模型（去掉过于严格的验证）
增加 Field description 给 LLM 更多上下文
使用更强的模型（例如用 gpt-4o 替换 gpt-4o-mini）
增加 max_retries

Q3: 处理 Optional 字段？¶

from typing import Optional

# Three ways to declare a field: required, optional, and with a default
class FlexibleOutput(BaseModel):
    required_field: str
    optional_field: Optional[str] = None  # the LLM may leave this unset
    with_default: str = "默认值"

Q4: 性能开销大吗？¶

Instructor 本身几乎零开销
主要成本在 LLM API 调用
重试会增加调用次数（通常额外 1-2 次）
建议设置合理的 max_retries（2-3 次）

Q5: 支持流式输出吗？¶

支持。使用 create_partial 获取逐步填充的对象，适合前端实时展示；需要逐条返回列表中的结果时，可使用 create_iterable。

参考资源¶

Instructor 官方文档 - 完整文档
Instructor GitHub - 源代码
Pydantic 文档 - 数据验证
Cookbook - 示例集
Jason Liu Blog - 作者博客