69. Biopython 完全指南¶
一句话说明: Biopython 是 Python 的生物信息学工具箱,提供序列读写、比对、BLAST、数据库查询、系统发育等功能的统一接口,是生信 Python 编程的基础库。
1. Biopython 是什么¶
白话解释: 如果 Python 是一把瑞士军刀,Biopython 就是专门给生物学家打造的那套刀片——读 FASTA、做 BLAST、查 NCBI 数据库,一个库全搞定,不用自己从零写解析器。
安装:
# 用pip安装(推荐)
pip install biopython
# 用conda安装
conda install -c conda-forge biopython
# 验证安装
python -c "import Bio; print(Bio.__version__)"
# 输出类似:1.84(截至2025年的最新稳定版)
2. 核心模块详解¶
2.1 Bio.SeqIO —— 序列读写(最常用)¶
白话: 读写各种序列文件(FASTA、GenBank、FASTQ等)的万能接口。
from Bio import SeqIO
# ===== 读取单条序列 =====
record = SeqIO.read("single_gene.fasta", "fasta") # 只有一条序列时用read
print(record.id) # 序列ID
print(record.description) # 完整描述行(>后面的全部内容)
print(record.seq) # 序列本身(Seq对象)
print(len(record)) # 序列长度
# ===== 读取多条序列 =====
for record in SeqIO.parse("genes.fasta", "fasta"): # 多条序列用parse(返回迭代器)
print(f"{record.id}\t长度: {len(record)}")
# ===== 读取为字典(按ID快速查找)=====
seq_dict = SeqIO.to_dict(SeqIO.parse("genes.fasta", "fasta"))
print(seq_dict["gene_001"].seq) # 通过ID直接访问
# ===== 读取GenBank格式 =====
for record in SeqIO.parse("sequence.gb", "genbank"):
print(f"ID: {record.id}")
print(f"描述: {record.description}")
for feature in record.features: # 遍历注释特征
if feature.type == "CDS": # 只看编码区
print(f" CDS位置: {feature.location}")
if "gene" in feature.qualifiers:
print(f" 基因名: {feature.qualifiers['gene'][0]}")
# ===== 写入FASTA文件 =====
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
records = [
SeqRecord(Seq("ATCGATCG"), id="gene_001", description="sample gene 1"),
SeqRecord(Seq("GCGCATAT"), id="gene_002", description="sample gene 2"),
]
SeqIO.write(records, "output.fasta", "fasta") # 写入FASTA格式
# ===== 格式转换 =====
records = SeqIO.parse("input.genbank", "genbank")
SeqIO.write(records, "output.fasta", "fasta") # GenBank → FASTA 一行搞定
# ===== 读取FASTQ =====
for record in SeqIO.parse("reads.fastq", "fastq"):
quals = record.letter_annotations["phred_quality"] # 质量值列表
avg_qual = sum(quals) / len(quals) # 平均质量
print(f"{record.id}\t平均质量: {avg_qual:.1f}")
2.2 Bio.Seq —— 序列操作¶
from Bio.Seq import Seq
seq = Seq("ATGATCTCGTAA") # 创建DNA序列对象
# ===== 基本操作 =====
print(seq.complement()) # TACTAGAGCATT —— 互补链
print(seq.reverse_complement()) # TTACGAGATCAT —— 反向互补链(最常用)
print(seq.transcribe()) # AUGAUCUCGUAA —— DNA→RNA(T→U)
print(seq.translate()) # MIS* —— DNA→蛋白质(*=终止密码子)
print(seq.translate(to_stop=True)) # MIS —— 翻译到终止密码子前停止
# ===== 切片和拼接 =====
print(seq[0:3]) # ATG —— 前3个碱基(起始密码子)
print(seq[::-1]) # AATGCTCTAGTA —— 反转序列
new_seq = seq + Seq("AAAA") # ATGATCTCGTAAAAAA —— 拼接
# ===== 搜索 =====
print(seq.count("ATG")) # 1 —— ATG出现次数
print(seq.find("TCG")) # 5 —— TCG首次出现位置(0-based)
# ===== 注意:Biopython的Seq对象是不可变的(immutable)=====
# seq[0] = "G" # ❌ 报错!
mutable_seq = seq.tomutable() # 转为可变序列
mutable_seq[0] = "G" # ✅ 可以修改
seq2 = mutable_seq.toseq() # 转回不可变
2.3 Bio.SeqUtils —— 序列工具¶
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction # 注意:新版用gc_fraction,旧版是GC
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis
seq = Seq("ATCGATCGCCGCATAT")
# ===== GC含量 =====
gc = gc_fraction(seq) # 返回0-1之间的小数
print(f"GC含量: {gc:.2%}") # GC含量: 50.00%
# ===== 分子量 =====
mw = molecular_weight(seq, "DNA") # DNA分子量(单位:道尔顿)
print(f"分子量: {mw:.1f} Da")
# ===== 蛋白质分析 =====
protein = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPK")
print(f"分子量: {protein.molecular_weight():.1f}") # 蛋白质分子量
print(f"等电点: {protein.isoelectric_point():.2f}") # 等电点
print(f"氨基酸组成: {protein.get_amino_acids_percent()}") # 各氨基酸比例
2.4 Bio.Blast —— 网络BLAST¶
from Bio.Blast import NCBIWWW, NCBIXML
# ===== 在线BLAST(向NCBI服务器提交查询)=====
sequence = "ATGATCTCGTAAGATCGATCG"
result_handle = NCBIWWW.qblast(
"blastn", # 程序:blastn/blastp/blastx/tblastn/tblastx
"nt", # 数据库:nt(核酸)/nr(蛋白质)/swissprot
sequence # 查询序列
)
# ===== 解析BLAST结果 =====
blast_records = NCBIXML.parse(result_handle)
for blast_record in blast_records:
print(f"查询序列: {blast_record.query}")
for alignment in blast_record.alignments[:5]: # 取前5个比对结果
print(f"\n 命中: {alignment.title[:80]}")
for hsp in alignment.hsps: # HSP = 高分片段对
print(f" E-value: {hsp.expect}") # E值越小越好
print(f" 得分: {hsp.score}")
print(f" 一致性: {hsp.identities}/{hsp.align_length}")
print(f" Query: {hsp.query[:60]}")
print(f" Subject: {hsp.sbjct[:60]}")
# ===== 保存结果(网络BLAST很慢,建议保存XML)=====
with open("blast_result.xml", "w") as f:
f.write(result_handle.read())
# ===== 从本地XML文件读取 =====
with open("blast_result.xml") as f:
blast_records = NCBIXML.parse(f)
for record in blast_records:
print(f"命中数: {len(record.alignments)}")
2.5 Bio.Entrez —— NCBI数据库查询¶
from Bio import Entrez
Entrez.email = "your_email@example.com" # NCBI要求提供邮箱(必须设置)
# ===== 搜索PubMed =====
handle = Entrez.esearch(
db="pubmed", # 数据库:pubmed/nucleotide/protein/gene
term="metagenomics AND type 2 diabetes", # 搜索词
retmax=5 # 最多返回5条
)
record = Entrez.read(handle)
print(f"总结果数: {record['Count']}")
print(f"返回ID: {record['IdList']}") # 文献的PMID列表
# ===== 获取文献摘要 =====
handle = Entrez.efetch(
db="pubmed",
id=record['IdList'], # 传入PMID列表
rettype="abstract",
retmode="text"
)
print(handle.read()) # 打印摘要文本
# ===== 下载核酸序列 =====
handle = Entrez.efetch(
db="nucleotide",
id="NM_000546", # TP53的RefSeq ID
rettype="fasta",
retmode="text"
)
fasta_data = handle.read()
print(fasta_data[:200]) # 打印前200个字符
# ===== 批量下载并保存 =====
handle = Entrez.efetch(db="nucleotide", id="NM_000546", rettype="gb", retmode="text")
with open("TP53.gb", "w") as f:
f.write(handle.read()) # 保存GenBank格式文件
2.6 Bio.Phylo —— 系统发育树¶
from Bio import Phylo
from io import StringIO
# ===== 读取Newick格式的树 =====
tree = Phylo.read("tree.nwk", "newick") # 读取Newick格式树文件
Phylo.draw_ascii(tree) # 在终端画ASCII树
# ===== 从字符串创建树 =====
newick_str = "(((A:0.1,B:0.2):0.3,C:0.4):0.5,D:0.6);"
tree = Phylo.read(StringIO(newick_str), "newick")
# ===== 遍历树的节点 =====
for clade in tree.find_clades(): # 遍历所有节点
if clade.name:
print(f"节点: {clade.name}, 枝长: {clade.branch_length}")
# ===== 计算树的信息 =====
print(f"叶节点数: {tree.count_terminals()}") # 叶节点(物种)数量
print(f"总枝长: {tree.total_branch_length():.2f}") # 总进化距离
# ===== 格式转换 =====
Phylo.write(tree, "output.xml", "phyloxml") # Newick → PhyloXML
2.7 Bio.PDB —— 蛋白质结构¶
from Bio.PDB import PDBParser, PDBIO
# ===== 解析PDB文件 =====
parser = PDBParser(QUIET=True) # QUIET=True 关闭警告
structure = parser.get_structure("protein", "1fat.pdb") # 解析PDB文件
# ===== 遍历结构层次:Structure → Model → Chain → Residue → Atom =====
for model in structure:
for chain in model:
print(f"链: {chain.id}, 残基数: {len(list(chain.get_residues()))}")
for residue in list(chain.get_residues())[:3]: # 看前3个残基
print(f" 残基: {residue.get_resname()} {residue.id[1]}")
for atom in residue:
print(f" 原子: {atom.name}, 坐标: {atom.get_vector()}")
3. 常用操作汇总¶
3.1 读 FASTA + 统计¶
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
# 批量处理:读取FASTA,统计每条序列的长度和GC含量
results = []
for record in SeqIO.parse("genes.fasta", "fasta"):
gc = gc_fraction(record.seq) # 计算GC含量
results.append({
"id": record.id,
"length": len(record),
"gc": round(gc, 4)
})
print(f"{record.id}\t{len(record)}bp\tGC={gc:.2%}")
# 找出最长的序列
longest = max(results, key=lambda x: x["length"])
print(f"\n最长序列: {longest['id']} ({longest['length']}bp)")
3.2 反向互补 + 翻译¶
from Bio.Seq import Seq
dna = Seq("ATGATCTCGTAA")
# 正向翻译
protein = dna.translate()
print(f"正向翻译: {protein}") # MIS*
# 反向互补链翻译
rc = dna.reverse_complement()
protein_rc = rc.translate()
print(f"反向互补翻译: {protein_rc}") # 可能不同
# 三框翻译(查找所有可能的ORF)
for frame in range(3):
subseq = dna[frame:] # 从第frame位开始
# 补齐为3的倍数
subseq = subseq[:len(subseq) - len(subseq) % 3]
protein = subseq.translate()
print(f"Frame +{frame+1}: {protein}")
3.3 Motif搜索¶
from Bio import SeqIO
import re
# 搜索所有包含特定motif的序列
motif_pattern = re.compile(r"ATG[ATCG]{6,9}TAA") # ATG开头,TAA结尾,中间6-9个碱基
for record in SeqIO.parse("genes.fasta", "fasta"):
matches = motif_pattern.finditer(str(record.seq))
for match in matches:
print(f"{record.id}\t位置:{match.start()}-{match.end()}\t{match.group()}")
3.4 批量处理多个文件¶
import os
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
fasta_dir = "./fasta_files/" # FASTA文件目录
for filename in os.listdir(fasta_dir):
if filename.endswith((".fasta", ".fa", ".fna")): # 只处理FASTA文件
filepath = os.path.join(fasta_dir, filename)
total_len = 0
seq_count = 0
for record in SeqIO.parse(filepath, "fasta"):
total_len += len(record)
seq_count += 1
print(f"{filename}\t序列数: {seq_count}\t总长度: {total_len}bp")
4. Biopython vs 同类工具对比¶
| 工具 | 用途 | 优势 | 劣势 |
|---|---|---|---|
| Biopython | 通用生信工具箱 | 功能全面,社区大,文档好 | 处理大文件慢(纯Python) |
| pysam | 读写BAM/SAM/VCF/BCF | C底层实现,速度极快 | 只做比对文件,不做序列分析 |
| HTSeq | RNA-seq计数(htseq-count) | 简单易用,常用于featureCounts替代 | 功能较窄 |
| scikit-bio | 生态学/微生物组分析 | 多样性分析、距离矩阵 | 社区较小,更新慢 |
| pyfaidx | 快速索引FASTA | 随机访问极快(用.fai索引) | 只做FASTA读取 |
| gffutils | 解析GFF/GTF注释文件 | SQLite后端,查询灵活 | 只做注释文件 |
选择建议: - 通用序列操作、格式转换、NCBI查询 → Biopython - BAM/SAM文件处理 → pysam - 大FASTA文件随机访问 → pyfaidx - RNA-seq读计数 → HTSeq 或 featureCounts
5. 面试编程题¶
题1:统计FASTA文件中每条序列的GC含量¶
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
def fasta_gc_stats(filepath):
"""读取FASTA文件,返回每条序列的ID和GC含量"""
results = {}
for record in SeqIO.parse(filepath, "fasta"):
results[record.id] = round(gc_fraction(record.seq), 4)
return results
# stats = fasta_gc_stats("genes.fasta")
# for seq_id, gc in sorted(stats.items(), key=lambda x: x[1], reverse=True):
# print(f"{seq_id}\tGC={gc:.2%}")
题2:提取FASTA中长度大于N的序列¶
from Bio import SeqIO
def filter_by_length(input_file, output_file, min_length=500):
"""过滤掉长度小于min_length的序列"""
filtered = ( # 生成器表达式(省内存)
record for record in SeqIO.parse(input_file, "fasta")
if len(record) >= min_length
)
count = SeqIO.write(filtered, output_file, "fasta")
print(f"保留了 {count} 条序列(长度 >= {min_length}bp)")
# filter_by_length("contigs.fasta", "filtered.fasta", min_length=1000)
题3:从GenBank提取CDS序列¶
from Bio import SeqIO
def extract_cds(genbank_file):
"""从GenBank文件中提取所有CDS的核酸序列"""
cds_sequences = []
for record in SeqIO.parse(genbank_file, "genbank"):
for feature in record.features:
if feature.type == "CDS": # 只看CDS特征
cds_seq = feature.extract(record.seq) # 提取对应序列
gene_name = feature.qualifiers.get("gene", ["unknown"])[0]
cds_sequences.append((gene_name, str(cds_seq)))
return cds_sequences
# for name, seq in extract_cds("sequence.gb"):
# print(f">{name}\n{seq}")
题4:批量FASTA → 反向互补¶
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
def reverse_complement_fasta(input_file, output_file):
"""将FASTA中所有序列转为反向互补"""
rc_records = []
for record in SeqIO.parse(input_file, "fasta"):
rc_record = SeqRecord(
record.seq.reverse_complement(), # 反向互补
id=record.id + "_rc", # ID加后缀
description="reverse complement"
)
rc_records.append(rc_record)
SeqIO.write(rc_records, output_file, "fasta")
print(f"已生成 {len(rc_records)} 条反向互补序列")
# reverse_complement_fasta("genes.fasta", "genes_rc.fasta")
题5:查询NCBI并下载序列¶
from Bio import Entrez, SeqIO
Entrez.email = "your@email.com"
def search_and_download(query, db="nucleotide", max_results=5):
"""搜索NCBI并下载序列"""
# Step 1: 搜索
handle = Entrez.esearch(db=db, term=query, retmax=max_results)
result = Entrez.read(handle)
ids = result["IdList"]
print(f"搜索 '{query}',找到 {result['Count']} 条,下载前 {len(ids)} 条")
# Step 2: 下载
if ids:
handle = Entrez.efetch(db=db, id=ids, rettype="fasta", retmode="text")
records = list(SeqIO.parse(handle, "fasta"))
for r in records:
print(f" {r.id}: {r.description[:60]}... ({len(r)}bp)")
return records
return []
# records = search_and_download("BRCA1 Homo sapiens")
6. 速查表¶
| 操作 | 代码 |
|---|---|
| 安装 | pip install biopython |
| 读单条FASTA | SeqIO.read("f.fa", "fasta") |
| 读多条FASTA | SeqIO.parse("f.fa", "fasta") |
| 写FASTA | SeqIO.write(records, "out.fa", "fasta") |
| 格式转换 | SeqIO.write(SeqIO.parse("in.gb","genbank"), "out.fa","fasta") |
| 反向互补 | seq.reverse_complement() |
| 翻译 | seq.translate() / seq.translate(to_stop=True) |
| 转录 | seq.transcribe() |
| GC含量 | gc_fraction(seq) (新版) |
| 搜索NCBI | Entrez.esearch(db="pubmed", term="...") |
| 下载序列 | Entrez.efetch(db="nucleotide", id="...", rettype="fasta") |
| 在线BLAST | NCBIWWW.qblast("blastn", "nt", seq) |
| 读系统发育树 | Phylo.read("tree.nwk", "newick") |
| 画ASCII树 | Phylo.draw_ascii(tree) |
| 解析PDB | PDBParser().get_structure("name", "file.pdb") |
| Seq→字符串 | str(seq) |
| 字符串→Seq | Seq("ATCG") |
7. 延伸资源¶
| 资源 | 类型 | 说明 |
|---|---|---|
| Biopython Tutorial | 官方教程 | 最权威的Biopython教程(英文) |
| Biopython API文档 | API参考 | 所有模块的详细文档 |
| Biopython Cookbook | Wiki | 实用代码片段集合 |
| Biopython GitHub | 源码 | 最新代码和Issue讨论 |
| Rosalind | 刷题 | 用Biopython解生信编程题 |
面试提示: 面试中提到"用Python处理序列"时,主动说出 Biopython 的 SeqIO.parse 和 Seq 对象会加分。但也要知道它的局限——处理超大文件(>1GB)时 pysam 或 pyfaidx 更合适。