tools 工具
Pandera — 轻量级 pandas/Polars 数据框验证库
一句话说明
Pandera 用装饰器或 Schema 类给 DataFrame 定义类型和约束,像类型系统一样在运行时验证数据;它比 Great Expectations 更轻量,更适合在代码中内联使用。
安装与配置
# pip 安装
pip install pandera # 当前版本 0.21+
# 可选扩展(方括号内不能有空格;加引号以避免 zsh 等 shell 展开方括号)
pip install "pandera[polars]" # Polars 支持
pip install "pandera[hypotheses]" # 统计假设检验支持
pip install "pandera[mypy]" # mypy 类型检查集成
# 验证
python -c "import pandera as pa; print(pa.__version__)"
核心用法
基于 Schema 验证
import pandera as pa
import pandas as pd
# Declare the expected columns: a dtype plus value constraints for each one.
schema = pa.DataFrameSchema(
    {
        "age": pa.Column(
            int,  # integer dtype
            checks=[
                pa.Check.between(0, 120),  # value must lie in 0-120
            ],
            # Null rejection is a column option, not a Check: pandera has no
            # built-in `Check.not_null`, so referencing it fails when the
            # schema is built. `nullable=False` expresses the same rule.
            nullable=False,
        ),
        "email": pa.Column(
            str,
            checks=pa.Check.str_matches(r"^[^@]+@[^@]+\.[^@]+$"),  # regex match
            nullable=True,  # missing values are allowed
        ),
        "salary": pa.Column(
            float,
            checks=pa.Check(lambda x: x > 0, error="薪资必须大于0"),  # custom check
            nullable=False,
        ),
    }
)
# Build a small sample frame and run it through the schema. `validate`
# hands the frame back when every check passes and raises SchemaError
# otherwise, so reaching the print means the data is valid.
df = pd.DataFrame(
    data={
        "age": [25, 30, 45],
        "email": ["a@b.com", None, "c@d.com"],
        "salary": [5000.0, 6000.0, 7000.0],
    }
)
validated_df = schema.validate(df)
print("验证通过!")
使用装饰器验证(最简洁)
import pandas as pd
import pandera as pa
from pandera.typing import DataFrame, Series
# Class-based schema: each annotated attribute declares one expected column,
# and the nested Config holds the schema-wide options.
class PatientSchema(pa.DataFrameModel):
    patient_id: Series[int]  # integer ID
    age: Series[int] = pa.Field(ge=0, le=120)  # 0-120
    bmi: Series[float] = pa.Field(nullable=True)  # may be missing
    diagnosis: Series[str] = pa.Field(isin=["健康", "糖尿病", "高血压"])

    class Config:
        name = "patient_data"  # schema name
        strict = True  # extra columns are not allowed
        coerce = True  # convert columns to the declared types
# Output schema for `process_patients`. `PatientSchema` is declared with
# strict = True, so a frame carrying the extra `risk_flag` column cannot be
# validated against it; this subclass declares that column explicitly.
class PatientRiskSchema(PatientSchema):
    risk_flag: Series[bool]  # True when age > 60


# `check_types` validates the annotated argument and the annotated return value.
@pa.check_types
def process_patients(df: DataFrame[PatientSchema]) -> DataFrame[PatientRiskSchema]:
    """Return a copy of ``df`` with a boolean ``risk_flag`` column added.

    Args:
        df: Patient records; validated against ``PatientSchema`` on entry.

    Returns:
        A new frame with every input column plus ``risk_flag``, which is
        True where ``age`` is greater than 60. The input frame is not
        modified.
    """
    flagged = df.copy()
    flagged["risk_flag"] = flagged["age"] > 60
    return flagged
# Demo: build a three-row patient frame (one BMI missing) and pass it
# through the validated function.
test_df = pd.DataFrame(
    data={
        "patient_id": [1, 2, 3],
        "age": [25, 65, 45],
        "bmi": [22.5, None, 28.0],
        "diagnosis": ["健康", "糖尿病", "高血压"],
    }
)
result = process_patients(test_df)
实战案例
菌群丰度数据验证
import pandas as pd
import pandera as pa
from pandera.typing import DataFrame, Series
class OTUSchema(pa.DataFrameModel):
    # One row per sample of a diversity table; each attribute is one column.
    sample_id: Series[str] = pa.Field(unique=True)  # sample IDs must be unique
    shannon: Series[float] = pa.Field(ge=0.0, le=10.0)  # Shannon index
    observed_asv: Series[int] = pa.Field(ge=0)  # ASV count, non-negative
    group: Series[str] = pa.Field(isin=["T2D", "健康对照"])

    class Config:
        coerce = True  # convert columns to the declared types
@pa.check_types
def analyze_diversity(otu_df: DataFrame[OTUSchema]) -> pd.DataFrame:
    """Summarise the Shannon index per group.

    The input is validated against ``OTUSchema`` by ``check_types`` before
    the body runs.

    Args:
        otu_df: Diversity table with at least ``group`` and ``shannon`` columns.

    Returns:
        The ``describe()`` summary of ``shannon`` for each ``group`` value.
    """
    shannon_by_group = otu_df.groupby("group")["shannon"]
    return shannon_by_group.describe()
# Load the diversity table from disk; the call below validates it as a
# side effect of the `check_types` decorator on `analyze_diversity`.
otu_df = pd.read_csv(filepath_or_buffer="otu_diversity.csv")
stats = analyze_diversity(otu_df)
常见报错与解决

| 报错 | 原因 | 解决 |
| --- | --- | --- |
| `SchemaError: column '...' not in dataframe` | 列缺失 | 检查列名是否一致(大小写) |
| `SchemaError: expected type ... got ...` | 类型不匹配 | Schema 加 `coerce=True` 自动转换 |
| `SchemaError: ... nullable` | 空值检查失败 | 字段加 `nullable=True` |
| `SchemaError: isin` | 值不在允许集合内 | 检查数据实际值,更新 `isin` 列表 |
速查表

| 操作 | 代码 |
| --- | --- |
| 基本 Schema | `pa.DataFrameSchema({"col": pa.Column(int)})` |
| 范围检查 | `pa.Check.between(0, 100)` |
| 非空检查 | `pa.Column(int, nullable=False)`(默认即不允许空值) |
| 枚举检查 | `pa.Field(isin=["A","B"])` |
| 允许空值 | `pa.Field(nullable=True)` |
| 自动转类型 | `pa.Field(coerce=True)` / `Config.coerce=True` |
| 装饰器验证 | `@pa.check_types` |
| 验证 | `schema.validate(df)` |