553_生信编程面试题精选
一句话说明
生信面试编程题通常结合 DNA/蛋白质序列、基因组区间、表达量矩阵等领域知识,考查编程和生物学的双重能力。
核心知识点
题型分类
1. 序列处理类:DNA互补、GC含量、k-mer、最长公共子序列
2. 区间操作类:外显子合并、覆盖深度计算、BED文件处理
3. 矩阵操作类:表达量标准化、差异基因筛选
4. 文件解析类:VCF/BED/GFF格式解析
5. 统计计算类:Fisher精确检验、FDR校正
实战代码/设计图/模板
题1:DNA互补反转(经典)
def reverse_complement(dna: str) -> str:
    """
    Return the reverse complement of a DNA sequence.

    The input is upper-cased, every base is swapped for its partner
    (A<->T, G<->C, N stays N) and the strand is read back to front.
    Any character outside A/T/G/C/N is emitted as 'N'.

    Example: ATCG -> CGAT
    """
    # Lookup table built by pairing each base with its complement.
    pairing = dict(zip("ATGCN", "TACGN"))
    backwards = dna.upper()[::-1]
    return ''.join([pairing.get(base, 'N') for base in backwards])
# Tests: plain sequence, repeated bases, and the empty-string edge case
assert reverse_complement("ATCG") == "CGAT"
assert reverse_complement("AATTCC") == "GGAATT"
assert reverse_complement("") == ""
题2:k-mer频率分析
from collections import Counter
def kmer_frequency(sequence: str, k: int) -> dict:
    """
    Compute the relative frequency of every k-mer in a sequence.

    Args:
        sequence: Sequence to scan; used as given (no case folding).
        k: Window length.

    Returns:
        Mapping of k-mer -> count / number_of_windows, ordered from most
        to least frequent. Empty when k is not positive or is longer than
        the sequence.

    Time is O(n * k): each of the n - k + 1 windows is a length-k slice.
    Space grows with the number of distinct k-mers, which is at most
    min(4^k, n - k + 1) for a 4-letter alphabet.
    """
    # A non-positive k has no meaningful windows: k == 0 would count empty
    # strings and a negative k would produce nonsensical slices.
    if k <= 0 or k > len(sequence):
        return {}
    total = len(sequence) - k + 1
    # Feed the windows straight into Counter instead of building a list first.
    counts = Counter(sequence[i:i + k] for i in range(total))
    return {kmer: count / total for kmer, count in counts.most_common()}
def find_overrepresented_kmers(seq: str, k: int, threshold: float = 0.01) -> list:
    """Return (kmer, frequency) pairs whose frequency clears both cut-offs.

    A k-mer is kept when its frequency is greater than `threshold` and also
    greater than twice 1 / 4**k, the frequency expected if all k-mers over a
    4-letter alphabet were equally likely.
    """
    observed = kmer_frequency(seq, k)
    uniform_expectation = 1 / (4 ** k)
    hits = []
    for kmer, freq in observed.items():
        if freq > threshold and freq > uniform_expectation * 2:
            hits.append((kmer, freq))
    return hits
# Demo: print the 3-mer frequencies of a short repetitive sequence
seq = "ATCGATCGATCG"
print(kmer_frequency(seq, 3))
题3:外显子覆盖深度计算
def calculate_coverage(reads: list[tuple[int, int]], length: int) -> list[int]:
    """
    Compute per-position coverage depth from aligned read intervals.

    Args:
        reads: [(start, end), ...] with 1-based, inclusive coordinates.
        length: Length of the reference.

    Returns:
        A list of `length` depths; index 0 holds the depth at position 1.
        Empty when `length` is not positive.

    Reads are clipped to the reference range [1, length]; reads lying
    entirely outside it, or with start > end, contribute nothing.

    Uses a difference array: O(n + L) instead of the brute-force O(n * L).
    """
    if length <= 0:
        return []
    # Slot 0 is unused and slot length + 1 absorbs the decrement of reads
    # that end on the last position.
    diff = [0] * (length + 2)
    for start, end in reads:
        # Clip to the reference so out-of-range reads can neither raise
        # IndexError nor wrap around through negative indices.
        lo = max(start, 1)
        hi = min(end, length)
        if lo > hi:
            continue
        diff[lo] += 1
        diff[hi + 1] -= 1
    coverage = []
    depth = 0
    # The running sum of the deltas is the depth at each position.
    for delta in diff[1:length + 1]:
        depth += delta
        coverage.append(depth)
    return coverage
# Demo: three overlapping reads on a reference of length 10
reads = [(1, 5), (3, 8), (6, 10)]
cov = calculate_coverage(reads, 10)
print(cov)
# [1, 1, 2, 2, 2, 2, 2, 2, 1, 1]  (position 8 is covered by both (3, 8) and (6, 10))
题4:VCF文件解析
def _parse_info(info: str) -> dict:
    """Split a VCF INFO column into {key: raw string value}; entries without '=' are dropped."""
    pairs = {}
    for item in info.split(';'):
        if '=' in item:
            key, val = item.split('=', 1)
            pairs[key] = val
    return pairs


def _first_number(raw, cast, default):
    """Convert the first comma-separated entry of an INFO value with `cast`.

    Returns `default` when the value is absent or cannot be converted
    (for example the VCF missing-value marker '.').
    """
    if raw is None:
        return default
    head = raw.split(',', 1)[0]
    try:
        return cast(head)
    except ValueError:
        return default


def parse_vcf(vcf_file: str, min_qual: float = 30.0) -> list[dict]:
    """
    Parse a VCF file and drop low-quality variants.

    Args:
        vcf_file: Path to a tab-separated VCF text file.
        min_qual: Minimum QUAL a record needs to be kept. A QUAL of '.'
            is treated as 0.0.

    Returns:
        One dict per kept record with keys 'chrom', 'pos' (int), 'ref',
        'alt', 'qual' (float), 'dp' (int) and 'af' (float). 'dp' and 'af'
        come from the INFO keys DP and AF and fall back to 0 when the key
        is absent or its value is not numeric. For comma-separated values
        (multi-allelic sites) only the first entry is used, so the result
        stays a single number.

    Lines starting with '#', lines with fewer than 8 columns, and records
    whose POS or QUAL cannot be parsed are skipped.
    """
    variants = []
    with open(vcf_file, encoding="utf-8") as f:
        for line in f:
            if line.startswith('#'):
                continue  # header / meta-information line
            fields = line.strip().split('\t')
            if len(fields) < 8:
                continue
            chrom, pos, _vid, ref, alt, qual, _filt, info = fields[:8]
            # A malformed POS or QUAL marks the whole record as unusable;
            # skip it instead of aborting the parse of the entire file.
            try:
                position = int(pos)
                qual_score = float(qual) if qual != '.' else 0.0
            except ValueError:
                continue
            if qual_score < min_qual:
                continue
            info_dict = _parse_info(info)
            variants.append({
                'chrom': chrom,
                'pos': position,
                'ref': ref,
                'alt': alt,
                'qual': qual_score,
                'dp': _first_number(info_dict.get('DP'), int, 0),
                'af': _first_number(info_dict.get('AF'), float, 0.0),
            })
    return variants
# Example of a single VCF record line (tab-separated, 8 fixed columns)
sample_line = "chr1\t925952\t.\tG\tA\t50\tPASS\tDP=30;AF=0.5;ANN=missense"
题5:基因表达差异筛选
import numpy as np
from scipy import stats
def find_deg(expression: dict, group1: list[int], group2: list[int],
             fc_threshold: float = 2.0, pval_threshold: float = 0.05,
             pseudocount: float = 0.01) -> list[dict]:
    """
    Simple differential-expression screen (fold change + two-sample t-test).

    Args:
        expression: {gene_id: [expression of sample 0, sample 1, ...]}.
        group1: Sample indices (positions in each value list) of the
            reference group.
        group2: Sample indices of the comparison group.
        fc_threshold: Minimum fold change in either direction; compared
            on the log2 scale.
        pval_threshold: A gene is kept only if its p-value is below this.
        pseudocount: Added to both group means before taking the ratio so
            that a zero mean does not produce a division by zero.

    Returns:
        Dicts with keys 'gene', 'log2fc' (rounded to 3 decimals), 'pval'
        (rounded to 6 decimals) and 'direction' ('up' when group2 is
        higher, otherwise 'down'), ordered by ascending p-value.

    Genes whose mean is 0 in both groups are skipped. The p-values are
    not corrected for multiple testing.
    """
    min_abs_log2fc = np.log2(fc_threshold)  # loop-invariant, compute once
    hits = []  # (unrounded p-value, result record)
    for gene, values in expression.items():
        vals = np.array(values)
        g1 = vals[group1]
        g2 = vals[group2]
        mean1 = np.mean(g1)
        mean2 = np.mean(g2)
        if mean1 == 0 and mean2 == 0:
            continue
        log2fc = np.log2((mean2 + pseudocount) / (mean1 + pseudocount))
        _, pval = stats.ttest_ind(g1, g2)
        if abs(log2fc) >= min_abs_log2fc and pval < pval_threshold:
            hits.append((pval, {
                'gene': gene,
                'log2fc': round(log2fc, 3),
                'pval': round(pval, 6),
                'direction': 'up' if log2fc > 0 else 'down'
            }))
    # Rank on the unrounded p-value: after rounding to 6 decimals every
    # highly significant gene collapses to 0.0 and the ordering is lost.
    hits.sort(key=lambda hit: hit[0])
    return [record for _, record in hits]
面试常问点
| 题型 | 关键点 |
|---|---|
| DNA序列处理 | 注意大小写、N碱基、边界空串 |
| 区间操作 | 差分数组优化覆盖深度;排序后扫描合并区间 |
| VCF解析 | 注释行以#开头;PASS vs 其他过滤状态 |
| 表达量处理 | 先log变换;zero值处理;多重检验校正 |
| k-mer | 时间复杂度O(n*k),空间随k指数增长 |
速查表
生信常见数据格式速记:
BED: chrom start end name score strand(0-based,半开区间)
VCF: CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT SAMPLE...)
GFF3: seqid source type start end score strand phase attributes(GFF2/GTF 中对应列名为 seqname / feature / frame)
SAM: QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL
FASTA: >header\nsequence
FASTQ: @header\nsequence\n+\nquality
区间注意:
BED: 0-based,半开区间 [start, end)
VCF: 1-based,闭区间
GTF: 1-based,闭区间
Python切片: 0-based,半开区间(同BED)