跳转至

Instructor 结构化输出

为什么要学

LLM 的输出是自由文本,但程序需要结构化数据。Instructor 解决了这个核心问题:

  • 类型安全:用 Pydantic 模型定义输出结构,自动验证
  • 零解析代码:不需要写正则或 JSON 解析逻辑
  • 自动重试:验证失败时自动让 LLM 修正
  • 全模型支持:OpenAI/Anthropic/Ollama/Gemini 等全支持
  • Pythonic:原生 Python 类型提示,IDE 自动补全完美

一句话:让 LLM 可靠地输出你定义的任何 Python 数据结构。

核心概念

白话解释

  • Instructor = 给 LLM 装一个"结构化输出适配器"
  • Pydantic Model = 告诉 LLM "我要的数据长这样"
  • Validation = 如果 LLM 返回格式不对,自动让它重来
  • Retry = "你没按格式来,请修正" 的自动化过程

核心概念对照表

| 概念 | 说明 | 类比 |
| --- | --- | --- |
| Pydantic Model | 定义输出数据结构 | 表单模板 |
| Field | 模型中的一个字段(含描述) | 表单中的一个填写项 |
| Validator | 字段验证规则 | 表单校验(必填/格式等) |
| Patch/Wrap | Instructor注入方式 | 给OpenAI客户端"升级" |
| response_model | 指定返回类型 | 期望的返回值类型 |
| max_retries | 验证失败最大重试次数 | "最多让LLM改几次" |
| Partial | 流式返回部分结构化数据 | 边填表边显示 |
| Iterable | 流式返回列表中的项 | 逐条返回结果 |

安装配置

安装

pip install instructor

# With support for a specific LLM provider.
# The requirement is quoted so that shells such as zsh do not interpret the
# square brackets as a glob pattern and abort with "no matches found".
pip install "instructor[openai]"
pip install "instructor[anthropic]"
pip install "instructor[google-generativeai]"
pip install "instructor[ollama]"

基本配置

import instructor
from openai import OpenAI
from pydantic import BaseModel

# Option 1: patch approach - hand an existing OpenAI client to Instructor.
client = instructor.from_openai(OpenAI())

# Option 2: use Anthropic.
# NOTE: every option below rebinds the same `client` name, so when this
# snippet is run top to bottom only the last assignment remains in effect.
# Keep just the option that matches your provider.
from anthropic import Anthropic
client = instructor.from_anthropic(Anthropic())

# Option 3: use a local Ollama server. The OpenAI client is pointed at
# localhost:11434 and Instructor is switched to JSON mode.
# NOTE(review): the api_key value looks like a placeholder - confirm.
from openai import OpenAI
client = instructor.from_openai(
    OpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
    mode=instructor.Mode.JSON
)

快速上手

第一个结构化输出

import instructor
from openai import OpenAI
from pydantic import BaseModel

client = instructor.from_openai(OpenAI())

# Define the data structure you want back.
class UserInfo(BaseModel):
    name: str
    age: int
    email: str

# Call the LLM and get structured data: passing response_model makes the
# call return a UserInfo instance, as the prints below show.
user = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=UserInfo,
    messages=[
        {"role": "user", "content": "从以下文本中提取用户信息:张三今年25岁,邮箱是zhangsan@example.com"}
    ]
)

print(user)
# UserInfo(name='张三', age=25, email='zhangsan@example.com')
print(user.name)  # 张三
print(user.age)   # 25

带描述的字段

from pydantic import BaseModel, Field

# Review schema. Each Field carries a description that gives the LLM more
# context about what to put in that field; `rating` is additionally
# constrained to the closed range 1..10 via ge/le.
class MovieReview(BaseModel):
    title: str = Field(description="电影名称")
    rating: float = Field(description="评分(1-10)", ge=1, le=10)
    summary: str = Field(description="一句话总结")
    pros: list[str] = Field(description="优点列表")
    cons: list[str] = Field(description="缺点列表")
    recommended: bool = Field(description="是否推荐观看")

# Request a review and have the answer returned as a MovieReview.
# Relies on a `client` created earlier in the tutorial.
review = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=MovieReview,
    messages=[
        {"role": "user", "content": "评价电影《盗梦空间》"}
    ]
)

# Typed attribute access on the result; `cons` and `summary` are populated
# as well but are not printed here.
print(f"{review.title}: {review.rating}/10")
print(f"推荐: {'是' if review.recommended else '否'}")
print(f"优点: {', '.join(review.pros)}")

带验证的输出

from pydantic import BaseModel, Field, field_validator

# Schema for a generated code answer, with two custom field validators.
# Documented with comments instead of a class docstring so the model's
# schema is left exactly as the field definitions describe it.
class CodeSolution(BaseModel):
    language: str = Field(description="编程语言")
    code: str = Field(description="完整可运行的代码")
    explanation: str = Field(description="代码解释")
    time_complexity: str = Field(description="时间复杂度,如O(n)")

    @field_validator("language")
    @classmethod
    def validate_language(cls, v: str) -> str:
        """Normalize the language name and reject unsupported languages.

        Surrounding whitespace is stripped and the name is lower-cased before
        the membership check, so values such as " Python" are accepted.

        Returns:
            The normalized (stripped, lower-case) language name.

        Raises:
            ValueError: if the normalized name is not in the allowed list.
        """
        allowed = ["python", "javascript", "typescript", "go", "rust"]
        normalized = v.strip().lower()
        if normalized not in allowed:
            raise ValueError(f"语言必须是以下之一: {allowed}")
        return normalized

    @field_validator("code")
    @classmethod
    def validate_code(cls, v: str) -> str:
        """Reject code that is empty or too short.

        The length is measured after stripping whitespace so that a string
        made only of spaces or newlines cannot pass the check.

        Returns:
            The code unchanged (indentation and trailing newlines preserved).

        Raises:
            ValueError: if fewer than 10 non-surrounding characters remain.
        """
        if len(v.strip()) < 10:
            raise ValueError("代码不能为空或过短")
        return v

# Relies on a `client` created earlier in the tutorial.
solution = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=CodeSolution,
    max_retries=3,  # retry automatically when validation fails
    messages=[
        {"role": "user", "content": "用Python实现二分查找"}
    ]
)

进阶用法

1. 嵌套模型

from pydantic import BaseModel, Field
from typing import Optional
from enum import Enum

# Closed set of priority levels; subclassing str makes each member a string.
class Priority(str, Enum):
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"

# One unit of work inside a project plan.
class SubTask(BaseModel):
    title: str
    estimated_hours: float
    priority: Priority

# Top-level model: nests a list of SubTask models inside the plan.
class ProjectPlan(BaseModel):
    project_name: str
    description: str
    total_hours: float
    tasks: list[SubTask] = Field(description="拆分的子任务列表")
    risks: list[str] = Field(description="潜在风险")
    tech_stack: list[str]

# Relies on a `client` created earlier in the tutorial.
plan = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=ProjectPlan,
    messages=[
        {"role": "user", "content": "规划一个个人博客网站项目,使用Next.js + Supabase"}
    ]
)

# Walk the nested sub-tasks; `.value` yields the enum's string ("high", ...).
for task in plan.tasks:
    print(f"[{task.priority.value}] {task.title} ({task.estimated_hours}h)")

2. 流式输出(Partial)

from instructor import Partial

# Stream partially populated structured data.
# Relies on `client` and `UserInfo` defined earlier in the tutorial.
for partial_user in client.chat.completions.create_partial(
    model="gpt-4o-mini",
    response_model=UserInfo,
    messages=[{"role": "user", "content": "提取信息:李四30岁,lisi@test.com"}]
):
    print(partial_user)
    # Filled in progressively: UserInfo(name='李四', age=None, email=None)
    # → UserInfo(name='李四', age=30, email=None)
    # → UserInfo(name='李四', age=30, email='lisi@test.com')

3. 列表提取(Iterable)

# One extracted entity: a person, their role and their company.
class Person(BaseModel):
    name: str
    role: str
    company: str

# Extract multiple entities from the text.
# response_model is the single-item model; the call result is iterated below.
# Relies on a `client` created earlier in the tutorial.
people = client.chat.completions.create_iterable(
    model="gpt-4o-mini",
    response_model=Person,
    messages=[{
        "role": "user",
        "content": """提取所有人物信息:
        张三是阿里巴巴的高级工程师,
        李四在字节跳动担任产品经理,
        王五是腾讯的设计师。"""
    }]
)

for person in people:  # items are streamed back one at a time
    print(f"{person.name} - {person.role} @ {person.company}")

4. 多模型支持

# Anthropic Claude
# NOTE: the three sections below each rebind `client`; when the snippet is
# run top to bottom only the last (Ollama) client remains bound. Keep just
# the section for the provider you use.
from anthropic import Anthropic
import instructor

client = instructor.from_anthropic(Anthropic())
# This call goes through `client.messages.create` and passes `max_tokens`.
# Relies on `UserInfo` defined earlier in the tutorial.
result = client.messages.create(
    model="claude-sonnet-4-20250514",
    max_tokens=1024,
    response_model=UserInfo,
    messages=[{"role": "user", "content": "..."}]
)

# Google Gemini: the model name is given when constructing GenerativeModel.
import google.generativeai as genai
client = instructor.from_gemini(
    client=genai.GenerativeModel("gemini-1.5-flash")
)

# Ollama (local): OpenAI client pointed at localhost:11434, JSON mode.
# Relies on `OpenAI` imported earlier in the tutorial.
client = instructor.from_openai(
    OpenAI(base_url="http://localhost:11434/v1", api_key="ollama"),
    mode=instructor.Mode.JSON
)

5. 分类任务

from enum import Enum
# Field is used by TextClassification below, so it is imported here too;
# without it the snippet raises NameError when run on its own.
from pydantic import BaseModel, Field

# Closed set of sentiment labels the model may choose from.
class Sentiment(str, Enum):
    POSITIVE = "positive"
    NEGATIVE = "negative"
    NEUTRAL = "neutral"

# Classification result: label, confidence constrained to 0..1, the
# sentiment-bearing keywords, and the reasoning behind the label.
class TextClassification(BaseModel):
    sentiment: Sentiment
    confidence: float = Field(ge=0, le=1)
    keywords: list[str] = Field(description="关键情感词")
    reasoning: str = Field(description="判断理由")

# Relies on a `client` created earlier in the tutorial.
result = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=TextClassification,
    messages=[{
        "role": "user",
        "content": "分类这条评论的情感:这个产品质量很差,完全不值这个价格,退货过程也很麻烦"
    }]
)

# `.value` yields the enum's string form ("positive" / "negative" / "neutral").
print(f"情感: {result.sentiment.value} (置信度: {result.confidence})")

6. 批量处理

import asyncio
from openai import AsyncOpenAI

# Async Instructor client shared by the extraction coroutines below.
async_client = instructor.from_openai(AsyncOpenAI())


async def extract_info(text: str) -> UserInfo:
    """Request a single ``UserInfo`` extracted from *text*."""
    prompt = {"role": "user", "content": f"提取信息:{text}"}
    user_info = await async_client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=UserInfo,
        messages=[prompt],
    )
    return user_info


async def batch_extract(texts: list[str]) -> list[UserInfo]:
    """Run ``extract_info`` for every text concurrently.

    One coroutine is created per input text and all of them are awaited
    together with ``asyncio.gather``.
    """
    return await asyncio.gather(*map(extract_info, texts))


texts = ["张三25岁...", "李四30岁...", "王五28岁..."]
results = asyncio.run(batch_extract(texts))

7. 重试与自定义验证

# NOTE: `retry` and `stop_after_attempt` are imported by the original snippet
# but not used below; retries are configured through `max_retries`.
from tenacity import retry, stop_after_attempt
# Field is used by SafeAnswer below, so it is imported here too; without it
# the snippet raises NameError when run on its own.
from pydantic import BaseModel, Field, model_validator

# Answer schema with field-level constraints (at least one source,
# confidence within 0..1) plus a cross-field check.
class SafeAnswer(BaseModel):
    answer: str
    sources: list[str] = Field(min_length=1)
    confidence: float = Field(ge=0, le=1)

    @model_validator(mode="after")
    def validate_answer(self) -> "SafeAnswer":
        """Apply cross-field rules.

        Raises:
            ValueError: if confidence is above 0.8 with fewer than two
                sources, or if the answer contains "不确定" while confidence
                is above 0.5.
        """
        # Custom validation logic
        if self.confidence > 0.8 and len(self.sources) < 2:
            raise ValueError("高置信度回答需要至少2个来源")
        if "不确定" in self.answer and self.confidence > 0.5:
            raise ValueError("含'不确定'的回答置信度不应超过0.5")
        return self

# Relies on a `client` created earlier in the tutorial.
result = client.chat.completions.create(
    model="gpt-4o-mini",
    response_model=SafeAnswer,
    max_retries=3,  # retry at most 3 times when validation fails
    messages=[{"role": "user", "content": "量子计算的现状如何?"}]
)

常见问题

Q1: 和 OpenAI 原生 JSON Mode 的区别?

| 特性 | Instructor | OpenAI JSON Mode |
| --- | --- | --- |
| 类型验证 | Pydantic完整验证 | 只保证是有效JSON |
| 自动重试 | 验证失败自动修正 | 无 |
| 嵌套结构 | 完美支持 | 需要手动处理 |
| IDE支持 | 完整类型提示 | 无 |
| 多模型 | 全平台 | 仅OpenAI |

Q2: 验证一直失败怎么办?

  • 简化 Pydantic 模型(去掉过于严格的验证)
  • 增加 Field description 给 LLM 更多上下文
  • 使用更强的模型(gpt-4o vs gpt-4o-mini)
  • 增加 max_retries

Q3: 处理 Optional 字段?

from typing import Optional

# Shows the three kinds of field: required, optional, and defaulted.
class FlexibleOutput(BaseModel):
    required_field: str
    optional_field: Optional[str] = None  # the LLM may leave this unfilled
    with_default: str = "默认值"

Q4: 性能开销大吗?

  • Instructor 本身几乎零开销
  • 主要成本在 LLM API 调用
  • 重试会增加调用次数(通常 1-2 次)
  • 建议设置合理的 max_retries(2-3 次)

Q5: 支持流式输出吗?

支持。使用 create_partial 获取逐步填充的对象,适合前端实时展示。

参考资源