跳转至

69. Biopython 完全指南

一句话说明: Biopython 是 Python 的生物信息学工具箱,提供序列读写、比对、BLAST、数据库查询、系统发育等功能的统一接口,是生信 Python 编程的基础库。


1. Biopython 是什么

白话解释: 如果 Python 是一把瑞士军刀,Biopython 就是专门给生物学家打造的那套刀片——读 FASTA、做 BLAST、查 NCBI 数据库,一个库全搞定,不用自己从零写解析器。

安装:

# 用pip安装(推荐)
pip install biopython

# 用conda安装
conda install -c conda-forge biopython

# 验证安装
python -c "import Bio; print(Bio.__version__)"
# 输出类似:1.84(截至2025年的最新稳定版)

2. 核心模块详解

2.1 Bio.SeqIO —— 序列读写(最常用)

白话: 读写各种序列文件(FASTA、GenBank、FASTQ等)的万能接口。

from Bio import SeqIO

# ===== 读取单条序列 =====
record = SeqIO.read("single_gene.fasta", "fasta")  # 只有一条序列时用read
print(record.id)           # 序列ID
print(record.description)  # 完整描述行(>后面的全部内容)
print(record.seq)          # 序列本身(Seq对象)
print(len(record))         # 序列长度

# ===== 读取多条序列 =====
for record in SeqIO.parse("genes.fasta", "fasta"):  # 多条序列用parse(返回迭代器)
    print(f"{record.id}\t长度: {len(record)}")

# ===== 读取为字典(按ID快速查找)=====
seq_dict = SeqIO.to_dict(SeqIO.parse("genes.fasta", "fasta"))
print(seq_dict["gene_001"].seq)   # 通过ID直接访问

# ===== 读取GenBank格式 =====
for record in SeqIO.parse("sequence.gb", "genbank"):
    print(f"ID: {record.id}")
    print(f"描述: {record.description}")
    for feature in record.features:             # 遍历注释特征
        if feature.type == "CDS":               # 只看编码区
            print(f"  CDS位置: {feature.location}")
            if "gene" in feature.qualifiers:
                print(f"  基因名: {feature.qualifiers['gene'][0]}")

# ===== 写入FASTA文件 =====
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

records = [
    SeqRecord(Seq("ATCGATCG"), id="gene_001", description="sample gene 1"),
    SeqRecord(Seq("GCGCATAT"), id="gene_002", description="sample gene 2"),
]
SeqIO.write(records, "output.fasta", "fasta")  # 写入FASTA格式

# ===== 格式转换 =====
records = SeqIO.parse("input.genbank", "genbank")
SeqIO.write(records, "output.fasta", "fasta")  # GenBank → FASTA 一行搞定

# ===== 读取FASTQ =====
for record in SeqIO.parse("reads.fastq", "fastq"):
    quals = record.letter_annotations["phred_quality"]  # 质量值列表
    avg_qual = sum(quals) / len(quals)                   # 平均质量
    print(f"{record.id}\t平均质量: {avg_qual:.1f}")

2.2 Bio.Seq —— 序列操作

from Bio.Seq import Seq

seq = Seq("ATGATCTCGTAA")   # 创建DNA序列对象

# ===== 基本操作 =====
print(seq.complement())             # TACTAGAGCATT —— 互补链
print(seq.reverse_complement())     # TTACGAGATCAT —— 反向互补链(最常用)
print(seq.transcribe())             # AUGAUCUCGUAA —— DNA→RNA(T→U)
print(seq.translate())              # MIS* —— DNA→蛋白质(*=终止密码子)
print(seq.translate(to_stop=True))  # MIS —— 翻译到终止密码子前停止

# ===== 切片和拼接 =====
print(seq[0:3])    # ATG —— 前3个碱基(起始密码子)
print(seq[::-1])   # AATGCTCTAGTA —— 反转序列
new_seq = seq + Seq("AAAA")  # ATGATCTCGTAAAAAA —— 拼接

# ===== 搜索 =====
print(seq.count("ATG"))    # 1 —— ATG出现次数
print(seq.find("TCG"))     # 5 —— TCG首次出现位置(0-based)

# ===== 注意:Biopython的Seq对象是不可变的(immutable)=====
# seq[0] = "G"  # ❌ 报错!
mutable_seq = seq.tomutable()  # 转为可变序列
mutable_seq[0] = "G"           # ✅ 可以修改
seq2 = mutable_seq.toseq()     # 转回不可变

2.3 Bio.SeqUtils —— 序列工具

from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction          # 注意:新版用gc_fraction,旧版是GC
from Bio.SeqUtils import molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis

seq = Seq("ATCGATCGCCGCATAT")

# ===== GC含量 =====
gc = gc_fraction(seq)                         # 返回0-1之间的小数
print(f"GC含量: {gc:.2%}")                     # GC含量: 50.00%

# ===== 分子量 =====
mw = molecular_weight(seq, "DNA")             # DNA分子量(单位:道尔顿)
print(f"分子量: {mw:.1f} Da")

# ===== 蛋白质分析 =====
protein = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPK")
print(f"分子量: {protein.molecular_weight():.1f}")       # 蛋白质分子量
print(f"等电点: {protein.isoelectric_point():.2f}")      # 等电点
print(f"氨基酸组成: {protein.get_amino_acids_percent()}") # 各氨基酸比例

2.4 Bio.Blast —— 网络BLAST

from Bio.Blast import NCBIWWW, NCBIXML

# ===== 在线BLAST(向NCBI服务器提交查询)=====
sequence = "ATGATCTCGTAAGATCGATCG"
result_handle = NCBIWWW.qblast(
    "blastn",                # 程序:blastn/blastp/blastx/tblastn/tblastx
    "nt",                    # 数据库:nt(核酸)/nr(蛋白质)/swissprot
    sequence                 # 查询序列
)

# ===== 解析BLAST结果 =====
blast_records = NCBIXML.parse(result_handle)
for blast_record in blast_records:
    print(f"查询序列: {blast_record.query}")
    for alignment in blast_record.alignments[:5]:           # 取前5个比对结果
        print(f"\n  命中: {alignment.title[:80]}")
        for hsp in alignment.hsps:                          # HSP = 高分片段对
            print(f"    E-value: {hsp.expect}")             # E值越小越好
            print(f"    得分: {hsp.score}")
            print(f"    一致性: {hsp.identities}/{hsp.align_length}")
            print(f"    Query:   {hsp.query[:60]}")
            print(f"    Subject: {hsp.sbjct[:60]}")

# ===== 保存结果(网络BLAST很慢,建议保存XML)=====
with open("blast_result.xml", "w") as f:
    f.write(result_handle.read())

# ===== 从本地XML文件读取 =====
with open("blast_result.xml") as f:
    blast_records = NCBIXML.parse(f)
    for record in blast_records:
        print(f"命中数: {len(record.alignments)}")

2.5 Bio.Entrez —— NCBI数据库查询

from Bio import Entrez

Entrez.email = "your_email@example.com"   # NCBI要求提供邮箱(必须设置)

# ===== 搜索PubMed =====
handle = Entrez.esearch(
    db="pubmed",                          # 数据库:pubmed/nucleotide/protein/gene
    term="metagenomics AND type 2 diabetes",  # 搜索词
    retmax=5                              # 最多返回5条
)
record = Entrez.read(handle)
print(f"总结果数: {record['Count']}")
print(f"返回ID: {record['IdList']}")      # 文献的PMID列表

# ===== 获取文献摘要 =====
handle = Entrez.efetch(
    db="pubmed",
    id=record['IdList'],                  # 传入PMID列表
    rettype="abstract",
    retmode="text"
)
print(handle.read())                      # 打印摘要文本

# ===== 下载核酸序列 =====
handle = Entrez.efetch(
    db="nucleotide",
    id="NM_000546",                       # TP53的RefSeq ID
    rettype="fasta",
    retmode="text"
)
fasta_data = handle.read()
print(fasta_data[:200])                   # 打印前200个字符

# ===== 批量下载并保存 =====
handle = Entrez.efetch(db="nucleotide", id="NM_000546", rettype="gb", retmode="text")
with open("TP53.gb", "w") as f:
    f.write(handle.read())                # 保存GenBank格式文件

2.6 Bio.Phylo —— 系统发育树

from Bio import Phylo
from io import StringIO

# ===== 读取Newick格式的树 =====
tree = Phylo.read("tree.nwk", "newick")   # 读取Newick格式树文件
Phylo.draw_ascii(tree)                     # 在终端画ASCII树

# ===== 从字符串创建树 =====
newick_str = "(((A:0.1,B:0.2):0.3,C:0.4):0.5,D:0.6);"
tree = Phylo.read(StringIO(newick_str), "newick")

# ===== 遍历树的节点 =====
for clade in tree.find_clades():           # 遍历所有节点
    if clade.name:
        print(f"节点: {clade.name}, 枝长: {clade.branch_length}")

# ===== 计算树的信息 =====
print(f"叶节点数: {tree.count_terminals()}")     # 叶节点(物种)数量
print(f"总枝长: {tree.total_branch_length():.2f}") # 总进化距离

# ===== 格式转换 =====
Phylo.write(tree, "output.xml", "phyloxml")       # Newick → PhyloXML

2.7 Bio.PDB —— 蛋白质结构

from Bio.PDB import PDBParser, PDBIO

# ===== 解析PDB文件 =====
parser = PDBParser(QUIET=True)                    # QUIET=True 关闭警告
structure = parser.get_structure("protein", "1fat.pdb")  # 解析PDB文件

# ===== 遍历结构层次:Structure → Model → Chain → Residue → Atom =====
for model in structure:
    for chain in model:
        print(f"链: {chain.id}, 残基数: {len(list(chain.get_residues()))}")
        for residue in list(chain.get_residues())[:3]:   # 看前3个残基
            print(f"  残基: {residue.get_resname()} {residue.id[1]}")
            for atom in residue:
                print(f"    原子: {atom.name}, 坐标: {atom.get_vector()}")

3. 常用操作汇总

3.1 读 FASTA + 统计

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

# 批量处理:读取FASTA,统计每条序列的长度和GC含量
results = []
for record in SeqIO.parse("genes.fasta", "fasta"):
    gc = gc_fraction(record.seq)                    # 计算GC含量
    results.append({
        "id": record.id,
        "length": len(record),
        "gc": round(gc, 4)
    })
    print(f"{record.id}\t{len(record)}bp\tGC={gc:.2%}")

# 找出最长的序列
longest = max(results, key=lambda x: x["length"])
print(f"\n最长序列: {longest['id']} ({longest['length']}bp)")

3.2 反向互补 + 翻译

from Bio.Seq import Seq

dna = Seq("ATGATCTCGTAA")

# 正向翻译
protein = dna.translate()
print(f"正向翻译: {protein}")          # MIS*

# 反向互补链翻译
rc = dna.reverse_complement()
protein_rc = rc.translate()
print(f"反向互补翻译: {protein_rc}")   # 可能不同

# 三框翻译(查找所有可能的ORF)
for frame in range(3):
    subseq = dna[frame:]                            # 从第frame位开始
    # 补齐为3的倍数
    subseq = subseq[:len(subseq) - len(subseq) % 3]
    protein = subseq.translate()
    print(f"Frame +{frame+1}: {protein}")

3.3 Motif搜索

from Bio import SeqIO
import re

# 搜索所有包含特定motif的序列
motif_pattern = re.compile(r"ATG[ATCG]{6,9}TAA")   # ATG开头,TAA结尾,中间6-9个碱基

for record in SeqIO.parse("genes.fasta", "fasta"):
    matches = motif_pattern.finditer(str(record.seq))
    for match in matches:
        print(f"{record.id}\t位置:{match.start()}-{match.end()}\t{match.group()}")

3.4 批量处理多个文件

import os
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

fasta_dir = "./fasta_files/"                        # FASTA文件目录

for filename in os.listdir(fasta_dir):
    if filename.endswith((".fasta", ".fa", ".fna")): # 只处理FASTA文件
        filepath = os.path.join(fasta_dir, filename)
        total_len = 0
        seq_count = 0
        for record in SeqIO.parse(filepath, "fasta"):
            total_len += len(record)
            seq_count += 1
        print(f"{filename}\t序列数: {seq_count}\t总长度: {total_len}bp")

4. Biopython vs 同类工具对比

工具用途优势劣势
Biopython通用生信工具箱功能全面,社区大,文档好处理大文件慢(纯Python)
pysam读写BAM/SAM/VCF/BCFC底层实现,速度极快只做比对文件,不做序列分析
HTSeqRNA-seq计数(htseq-count)简单易用,常用于featureCounts替代功能较窄
scikit-bio生态学/微生物组分析多样性分析、距离矩阵社区较小,更新慢
pyfaidx快速索引FASTA随机访问极快(用.fai索引)只做FASTA读取
gffutils解析GFF/GTF注释文件SQLite后端,查询灵活只做注释文件

选择建议: - 通用序列操作、格式转换、NCBI查询 → Biopython - BAM/SAM文件处理 → pysam - 大FASTA文件随机访问 → pyfaidx - RNA-seq读计数 → HTSeq 或 featureCounts


5. 面试编程题

题1:统计FASTA文件中每条序列的GC含量

from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

def fasta_gc_stats(filepath):
    """读取FASTA文件,返回每条序列的ID和GC含量"""
    results = {}
    for record in SeqIO.parse(filepath, "fasta"):
        results[record.id] = round(gc_fraction(record.seq), 4)
    return results

# stats = fasta_gc_stats("genes.fasta")
# for seq_id, gc in sorted(stats.items(), key=lambda x: x[1], reverse=True):
#     print(f"{seq_id}\tGC={gc:.2%}")

题2:提取FASTA中长度大于N的序列

from Bio import SeqIO

def filter_by_length(input_file, output_file, min_length=500):
    """过滤掉长度小于min_length的序列"""
    filtered = (                                    # 生成器表达式(省内存)
        record for record in SeqIO.parse(input_file, "fasta")
        if len(record) >= min_length
    )
    count = SeqIO.write(filtered, output_file, "fasta")
    print(f"保留了 {count} 条序列(长度 >= {min_length}bp)")

# filter_by_length("contigs.fasta", "filtered.fasta", min_length=1000)

题3:从GenBank提取CDS序列

from Bio import SeqIO

def extract_cds(genbank_file):
    """从GenBank文件中提取所有CDS的核酸序列"""
    cds_sequences = []
    for record in SeqIO.parse(genbank_file, "genbank"):
        for feature in record.features:
            if feature.type == "CDS":                       # 只看CDS特征
                cds_seq = feature.extract(record.seq)       # 提取对应序列
                gene_name = feature.qualifiers.get("gene", ["unknown"])[0]
                cds_sequences.append((gene_name, str(cds_seq)))
    return cds_sequences

# for name, seq in extract_cds("sequence.gb"):
#     print(f">{name}\n{seq}")

题4:批量FASTA → 反向互补

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def reverse_complement_fasta(input_file, output_file):
    """将FASTA中所有序列转为反向互补"""
    rc_records = []
    for record in SeqIO.parse(input_file, "fasta"):
        rc_record = SeqRecord(
            record.seq.reverse_complement(),            # 反向互补
            id=record.id + "_rc",                       # ID加后缀
            description="reverse complement"
        )
        rc_records.append(rc_record)
    SeqIO.write(rc_records, output_file, "fasta")
    print(f"已生成 {len(rc_records)} 条反向互补序列")

# reverse_complement_fasta("genes.fasta", "genes_rc.fasta")

题5:查询NCBI并下载序列

from Bio import Entrez, SeqIO

Entrez.email = "your@email.com"

def search_and_download(query, db="nucleotide", max_results=5):
    """搜索NCBI并下载序列"""
    # Step 1: 搜索
    handle = Entrez.esearch(db=db, term=query, retmax=max_results)
    result = Entrez.read(handle)
    ids = result["IdList"]
    print(f"搜索 '{query}',找到 {result['Count']} 条,下载前 {len(ids)} 条")

    # Step 2: 下载
    if ids:
        handle = Entrez.efetch(db=db, id=ids, rettype="fasta", retmode="text")
        records = list(SeqIO.parse(handle, "fasta"))
        for r in records:
            print(f"  {r.id}: {r.description[:60]}... ({len(r)}bp)")
        return records
    return []

# records = search_and_download("BRCA1 Homo sapiens")

6. 速查表

操作代码
安装pip install biopython
读单条FASTASeqIO.read("f.fa", "fasta")
读多条FASTASeqIO.parse("f.fa", "fasta")
写FASTASeqIO.write(records, "out.fa", "fasta")
格式转换SeqIO.write(SeqIO.parse("in.gb","genbank"), "out.fa","fasta")
反向互补seq.reverse_complement()
翻译seq.translate() / seq.translate(to_stop=True)
转录seq.transcribe()
GC含量gc_fraction(seq) (新版)
搜索NCBIEntrez.esearch(db="pubmed", term="...")
下载序列Entrez.efetch(db="nucleotide", id="...", rettype="fasta")
在线BLASTNCBIWWW.qblast("blastn", "nt", seq)
读系统发育树Phylo.read("tree.nwk", "newick")
画ASCII树Phylo.draw_ascii(tree)
解析PDBPDBParser().get_structure("name", "file.pdb")
Seq→字符串str(seq)
字符串→SeqSeq("ATCG")

7. 延伸资源

资源类型说明
Biopython Tutorial官方教程最权威的Biopython教程(英文)
Biopython API文档API参考所有模块的详细文档
Biopython CookbookWiki实用代码片段集合
Biopython GitHub源码最新代码和Issue讨论
Rosalind刷题用Biopython解生信编程题

面试提示: 面试中提到"用Python处理序列"时,主动说出 Biopython 的 SeqIO.parse 和 Seq 对象会加分。但也要知道它的局限——处理超大文件(>1GB)时 pysam 或 pyfaidx 更合适。