# 554_Python面试高频考点
## 一句话说明
Python 面试重点考查数据结构、函数特性、面向对象、生成器和并发,生信岗还会加考 Pandas/NumPy 操作。
## 核心知识点
### 必掌握核心特性
1. 列表推导式/生成器表达式
2. 装饰器(@decorator)
3. 上下文管理器(with语句)
4. 迭代器/生成器(yield)
5. `*args`/`**kwargs`
6. 默认可变参数陷阱
7. 浅拷贝vs深拷贝
8. GIL(全局解释器锁)
9. 类的魔术方法(`__init__`, `__len__`, `__iter__` 等)
10. 异常处理
## 实战代码/设计图/模板
### 高频考题1:默认可变参数陷阱
# 错误示范(经典坑)
def append_gene(gene, gene_list=[]):  # a list as the default argument!
    """Append ``gene`` to ``gene_list`` and return the list.

    ANTI-PATTERN, kept on purpose for the demonstration: the default list
    is created once, when the ``def`` statement is executed, and is then
    shared by every call that omits ``gene_list``, so genes accumulate
    across calls.
    """
    gene_list.append(gene)
    return gene_list
print(append_gene("BRCA1")) # ['BRCA1']
print(append_gene("TP53")) # ['BRCA1', 'TP53'] ← 预期 ['TP53']!
# 正确写法
def append_gene(gene, gene_list=None):
    """Append ``gene`` to ``gene_list`` and return the list.

    Args:
        gene: Item to append.
        gene_list: List to extend in place. When omitted (``None``), a new
            empty list is created for this call, so calls never share state.

    Returns:
        The list that was appended to (the caller's list, or the new one).
    """
    if gene_list is None:
        gene_list = []
    gene_list.append(gene)
    return gene_list
### 高频考题2:装饰器
import time
from functools import wraps
def timer(func):
    """Timing decorator: print how long each call to ``func`` takes.

    The returned wrapper forwards all positional and keyword arguments to
    ``func``, prints the elapsed seconds once the call has returned, and
    passes the return value through unchanged. Nothing is printed when
    ``func`` raises.
    """
    @wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted, skewing the measurement.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(f"{func.__name__} 运行耗时: {elapsed:.3f}秒")
        return result
    return wrapper
@timer
def align_reads(fastq_file: str) -> str:
    """Align reads to the reference genome (placeholder implementation).

    The body only sleeps for one second to simulate a slow step and then
    returns the fixed name "output.bam"; ``fastq_file`` is not read.
    """
    time.sleep(1)  # simulate a long-running operation
    return "output.bam"
# 等价于:align_reads = timer(align_reads)
align_reads("sample.fastq.gz")
# 输出: align_reads 运行耗时: 1.001秒
### 高频考题3:生成器处理大文件
def parse_fastq(filepath: str):
    """Lazily parse a FASTQ file, yielding one record at a time.

    Records are read as consecutive four-line groups (header, sequence,
    '+' separator, quality), so only the current record is held in memory.
    This makes the function suitable for very large files.

    Args:
        filepath: Path of the file; it is opened with the built-in
            ``open()`` in text mode.

    Yields:
        dict with keys ``'id'`` (header line without its leading '@'),
        ``'sequence'`` and ``'quality'``. A four-line group whose first
        line does not start with '@' is skipped.
    """
    with open(filepath) as f:
        while True:
            raw_header = f.readline()
            if not raw_header:
                # readline() returns '' only at end of file.
                break
            header = raw_header.strip()
            if not header:
                # A blank line is not EOF: skip it rather than stop and
                # silently drop every record that follows.
                continue
            seq = f.readline().strip()
            f.readline()  # '+' separator line, content not needed
            quality = f.readline().strip()
            if header.startswith('@'):
                yield {
                    'id': header[1:],
                    'sequence': seq,
                    'quality': quality
                }
# Usage: records are handled one by one; the file is never loaded whole
for read in parse_fastq("huge_sample.fastq"):
    if len(read['sequence']) > 100:
        process_read(read)
### 高频考题4:上下文管理器
class DatabaseTransaction:
    """Context manager that runs a ``with`` block as a database transaction.

    On a clean exit it calls ``commit()`` on the connection; if the block
    raised, it calls ``rollback()`` and lets the exception propagate.
    """

    def __init__(self, connection):
        # The connection object must provide commit() and rollback().
        self.conn = connection

    def __enter__(self):
        print("开始事务")
        return self.conn

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            self.conn.commit()
            print("事务提交成功")
        else:
            self.conn.rollback()
            print(f"事务回滚,错误: {exc_val}")
        return False  # do not suppress the exception
# 使用
with DatabaseTransaction(db_conn) as conn:
    conn.execute("INSERT INTO samples VALUES (...)")
# 如果出错会自动回滚
### 高频考题5:Pandas 操作
import pandas as pd
import numpy as np
# Build sample data (a mock OTU abundance table): one row per sample,
# one column per OTU, plus the experimental group of each sample.
df = pd.DataFrame({
'sample_id': ['S1', 'S2', 'S3'],
'OTU1': [100, 200, 0],
'OTU2': [50, 0, 300],
'group': ['control', 'treatment', 'treatment']
})
# Common operations
# 1. Filter rows with a boolean mask.
#    NOTE: taken before step 3, so `treatment` holds the raw counts.
treatment = df[df['group'] == 'treatment']
# 2. Select columns by a condition on their names
otu_cols = [c for c in df.columns if c.startswith('OTU')]
# 3. Relative abundance: divide every row by its own total so each row
#    sums to 1. A row whose OTU counts sum to 0 would become NaN (0/0).
df[otu_cols] = df[otu_cols].div(df[otu_cols].sum(axis=1), axis=0)
# 4. Group-wise statistics (mean relative abundance per group)
mean_abundance = df.groupby('group')[otu_cols].mean()
# 5. Pivot table: samples as rows, groups as columns, OTU1 as values
pivot = df.pivot_table(index='sample_id', columns='group', values='OTU1', aggfunc='mean')
# 6. Melt (wide -> long format): one row per (sample, OTU) pair
long_df = pd.melt(df, id_vars=['sample_id', 'group'],
value_vars=otu_cols,
var_name='OTU', value_name='abundance')
# 7. Handling missing values. Both calls modify `df` in place.
#    NOTE: once fillna(0) has run no NaN is left, so the dropna() below
#    cannot remove any row; in practice the two are alternatives.
df.fillna(0, inplace=True)
df.dropna(subset=['OTU1'], inplace=True)
### 高频考题6:列表推导式 vs map/filter
# List comprehension (Pythonic, recommended).
# `sequences` is not defined in this snippet; it must be an iterable of
# strings supplied by the caller.
# NOTE: despite its name, gc_contents holds the number of G plus C bases
# per sequence, not a fraction.
gc_contents = [seq.count('G') + seq.count('C') for seq in sequences]
# With a filter condition
long_seqs = [seq for seq in sequences if len(seq) > 100]
# Nested comprehension (matrix transpose)
matrix = [[1,2,3],[4,5,6]]
transposed = [[row[i] for row in matrix] for i in range(len(matrix[0]))]
# Generator expression (no intermediate list is built; good for large data)
total_gc = sum(seq.count('G') + seq.count('C') for seq in sequences)
# map/filter (functional style, used less often)
gc_map = list(map(lambda s: s.count('G') + s.count('C'), sequences))
## 面试常问点
| 问题 | 简答 |
|---|---|
| Python GIL 是什么? | CPython 的全局解释器锁:同一时刻只允许一个线程执行 Python 字节码(3.13 起提供可选的无 GIL 构建) |
| 多线程 vs 多进程? | IO密集→多线程;CPU密集→多进程(绕过GIL) |
| 浅拷贝 vs 深拷贝? | copy.copy() 只复制最外层容器,嵌套对象仍与原对象共享;copy.deepcopy() 递归复制所有嵌套对象 |
| @staticmethod vs @classmethod? | static不接收self/cls;classmethod接收cls可访问类变量 |
| yield vs return? | 含 yield 的函数被调用后返回生成器,按需惰性产出值并在两次取值之间保留状态;return 结束函数并一次性返回结果 |
## 速查表
# Quick reference of common idioms
# Swap two variables (Pythonic)
a, b = b, a
# Merge dicts (Python 3.9+); on duplicate keys the value from dict2 wins
merged = dict1 | dict2
# Iterate with an index
for i, val in enumerate(lst):
    ...
# Iterate two sequences in lockstep (stops at the shorter one)
for a, b in zip(list1, list2):
    ...
# Custom sort key: list.sort() sorts in place and returns None;
# use sorted(genes, key=...) when a new list is wanted instead.
genes.sort(key=lambda g: (g.chrom, g.start))
# any/all
# GC-rich means G and C bases make up more than half of the sequence.
# Count single bases: seq.count('GC') counts "GC" dinucleotides, which
# can never exceed len(seq) / 2, so that test would always be False.
has_gc_rich = any(seq.count('G') + seq.count('C') > 0.5 * len(seq) for seq in seqs)
all_pass = all(len(seq) > 50 for seq in seqs)
# f-string with a percent format spec
print(f"GC含量: {gc:.2%}")  # e.g. gc = 0.523 prints: GC含量: 52.30%