生信面试 SQL 数据库题精选¶
一句话概述:生信面试中 SQL 题集中在基本查询(SELECT/WHERE/JOIN)、分组聚合(GROUP BY)、子查询和窗口函数四个方向,掌握这些就能处理 TCGA、GEO 等公共数据库的数据提取和分析。
核心知识点速查表¶
| 概念 | 白话解释 |
|---|---|
| SELECT | 选择要查看的列(相当于awk取列) |
| WHERE | 筛选行的条件(相当于grep/awk条件) |
| JOIN | 合并两张表(相当于R的merge) |
| GROUP BY | 按某列分组统计(相当于R的group_by) |
| ORDER BY | 排序(相当于sort) |
| 子查询 | 查询嵌套查询(先查出结果再查) |
| 索引(INDEX) | 加速查询的数据结构(类似tabix索引) |
| 主键(PRIMARY KEY) | 唯一标识每行的列 |
一、基础查询(SELECT/WHERE)¶
-- ========== 基础查询 ==========
-- 查看全部数据
SELECT * FROM genes; -- 选择genes表的所有列所有行
-- 选择特定列
SELECT gene_name, chromosome, start_pos, end_pos
FROM genes; -- 只看基因名、染色体、起止位置
-- 条件筛选(WHERE)
SELECT gene_name, log2fc, padj
FROM diff_expression
WHERE padj < 0.05; -- 只要padj<0.05的行
-- 多条件(AND/OR)
SELECT gene_name, log2fc, padj
FROM diff_expression
WHERE padj < 0.05 -- 显著
AND ABS(log2fc) > 1; -- 且|logFC|>1(差异倍数大)
-- 模糊匹配(LIKE)
SELECT * FROM genes
WHERE gene_name LIKE 'BRCA%'; -- 以BRCA开头的基因(BRCA1, BRCA2)
SELECT * FROM genes
WHERE gene_name LIKE '%kinase%'; -- 名字包含kinase的基因
-- IN — 多值匹配(相当于grep -f)
SELECT * FROM diff_expression
WHERE gene_name IN ('TP53', 'BRCA1', 'EGFR', 'KRAS', 'MYC');
-- BETWEEN — 范围查询
SELECT * FROM variants
WHERE position BETWEEN 1000000 AND 2000000 -- 位置在1M到2M之间
AND chromosome = 'chr1'; -- 且在1号染色体
-- IS NULL — 查找缺失值
SELECT * FROM samples
WHERE survival_time IS NULL; -- 找没有生存时间数据的样本
-- DISTINCT — 去重
SELECT DISTINCT chromosome
FROM variants; -- 有变异的染色体列表(不重复)
-- LIMIT — 限制返回行数
SELECT * FROM diff_expression
ORDER BY padj ASC -- 按padj升序
LIMIT 20; -- 只要前20行(最显著的20个基因)
二、排序与聚合(ORDER BY / GROUP BY)¶
-- ========== 排序 ==========
-- 单列排序
SELECT gene_name, padj
FROM diff_expression
ORDER BY padj ASC; -- 按padj升序(小→大)
SELECT gene_name, log2fc
FROM diff_expression
ORDER BY ABS(log2fc) DESC; -- 按|logFC|降序(大→小)
-- 多列排序
SELECT chromosome, position, ref_allele, alt_allele
FROM variants
ORDER BY chromosome ASC, position ASC; -- 先按染色体排,再按位置排
-- ========== 聚合函数 ==========
-- COUNT — 计数
SELECT COUNT(*) AS total_variants -- AS 给结果取别名
FROM variants
WHERE filter_status = 'PASS'; -- 统计PASS变异数
-- SUM / AVG / MIN / MAX
SELECT
AVG(log2fc) AS mean_logfc, -- 平均logFC
MIN(padj) AS min_padj, -- 最小padj
MAX(ABS(log2fc)) AS max_abs_logfc -- 最大|logFC|
FROM diff_expression
WHERE padj < 0.05;
-- ========== GROUP BY — 分组统计(高频考点) ==========
-- 每条染色体有多少个变异
SELECT chromosome,
COUNT(*) AS variant_count -- 每组的计数
FROM variants
GROUP BY chromosome -- 按染色体分组
ORDER BY variant_count DESC; -- 按数量降序
-- 每种变异类型的数量
SELECT variant_type,
COUNT(*) AS count
FROM variants
WHERE filter_status = 'PASS' -- 只看PASS的
GROUP BY variant_type; -- 按类型分组
-- 结果示例:SNP 50000, INDEL 8000, MNV 200
-- HAVING — 对分组结果再筛选(WHERE对行,HAVING对组)
SELECT gene_name,
COUNT(*) AS mutation_count
FROM somatic_mutations
GROUP BY gene_name
HAVING COUNT(*) >= 10 -- 至少有10个突变的基因
ORDER BY mutation_count DESC;
-- 面试经典:统计每个样本的突变数(TMB的基础)
SELECT sample_id,
COUNT(*) AS total_mutations,
SUM(CASE WHEN variant_type='SNP' THEN 1 ELSE 0 END) AS snp_count,
SUM(CASE WHEN variant_type='INDEL' THEN 1 ELSE 0 END) AS indel_count
FROM somatic_mutations
WHERE filter_status = 'PASS'
GROUP BY sample_id
ORDER BY total_mutations DESC;
三、表连接(JOIN,核心考点)¶
-- ========== JOIN:合并多张表 ==========
-- 场景:基因表 + 差异表达表 + 通路注释表
-- genes: gene_id, gene_name, chromosome, start_pos
-- diff_expr: gene_id, log2fc, padj, baseMean
-- pathways: gene_id, pathway_name, pathway_id
-- INNER JOIN — 只保留两表都有的(交集)
SELECT g.gene_name, d.log2fc, d.padj
FROM genes g -- g 是 genes 表的别名
INNER JOIN diff_expr d -- d 是 diff_expr 表的别名
ON g.gene_id = d.gene_id -- 连接条件:gene_id相同
WHERE d.padj < 0.05
ORDER BY d.padj;
-- LEFT JOIN — 保留左表所有行(左表为主)
SELECT g.gene_name, d.log2fc, d.padj
FROM genes g
LEFT JOIN diff_expr d
ON g.gene_id = d.gene_id;
-- 即使基因没有差异表达数据,也会显示(log2fc和padj为NULL)
-- 多表连接
SELECT g.gene_name, d.log2fc, d.padj, p.pathway_name
FROM diff_expr d
INNER JOIN genes g ON d.gene_id = g.gene_id
INNER JOIN pathways p ON d.gene_id = p.gene_id
WHERE d.padj < 0.05 AND ABS(d.log2fc) > 1
ORDER BY d.padj;
-- 结果:显著差异基因 + 它们所在的通路
-- 面试经典:找出在多个数据集中都显著的基因
SELECT a.gene_name
FROM dataset1_results a
INNER JOIN dataset2_results b ON a.gene_name = b.gene_name
WHERE a.padj < 0.05 AND ABS(a.log2fc) > 1
AND b.padj < 0.05 AND ABS(b.log2fc) > 1;
-- SELF JOIN — 自连接(同一张表连接自己)
-- 场景:找同一基因在不同样本中的表达差异
SELECT a.gene_name,
a.expression AS tumor_expr,
b.expression AS normal_expr,
a.expression - b.expression AS diff
FROM expression a
INNER JOIN expression b
ON a.gene_name = b.gene_name
WHERE a.sample_type = 'Tumor'
AND b.sample_type = 'Normal';
JOIN 类型总结¶
-- JOIN 类型速记:
-- INNER JOIN: A ∩ B (交集,两边都有才保留)
-- LEFT JOIN: A(左表全保留,右表没有的填NULL)
-- RIGHT JOIN: B(右表全保留,左表没有的填NULL)
-- FULL OUTER JOIN: A ∪ B(并集,都保留,没有的填NULL)
-- 生信最常用:INNER JOIN 和 LEFT JOIN
四、子查询与高级用法¶
-- ========== 子查询 ==========
-- 标量子查询(返回单个值)
SELECT gene_name, log2fc
FROM diff_expression
WHERE log2fc > (
SELECT AVG(log2fc) -- 子查询:平均logFC
FROM diff_expression
WHERE padj < 0.05
);
-- IN子查询(返回一列值)
-- 找出在通路"cell cycle"中的差异基因
SELECT gene_name, log2fc, padj
FROM diff_expression
WHERE gene_id IN (
SELECT gene_id -- 子查询:cell cycle通路的基因
FROM pathways
WHERE pathway_name = 'cell cycle'
) AND padj < 0.05;
-- EXISTS子查询(检查是否存在)
-- 找有突变数据的差异表达基因
SELECT d.gene_name, d.log2fc
FROM diff_expression d
WHERE EXISTS (
SELECT 1
FROM somatic_mutations m
WHERE m.gene_name = d.gene_name -- 关联:同一基因
);
-- ========== CASE WHEN — 条件表达式 ==========
SELECT gene_name, log2fc, padj,
CASE
WHEN padj < 0.05 AND log2fc > 1 THEN 'Up' -- 上调
WHEN padj < 0.05 AND log2fc < -1 THEN 'Down' -- 下调
ELSE 'NS' -- 不显著
END AS regulation -- 新列名
FROM diff_expression;
-- ========== 窗口函数(进阶,面试加分) ==========
-- ROW_NUMBER — 排名
SELECT gene_name, padj,
ROW_NUMBER() OVER (ORDER BY padj ASC) AS rank -- 按padj排名
FROM diff_expression
WHERE padj < 0.05;
-- RANK — 排名(允许并列)
SELECT gene_name, log2fc,
RANK() OVER (
PARTITION BY chromosome -- 每条染色体单独排名
ORDER BY ABS(log2fc) DESC -- 按|logFC|排
) AS chr_rank
FROM diff_expression
WHERE padj < 0.05;
-- 面试经典:每条染色体上padj最小的3个基因
SELECT * FROM (
SELECT gene_name, chromosome, padj,
ROW_NUMBER() OVER (
PARTITION BY chromosome
ORDER BY padj ASC
) AS rn
FROM diff_expression
WHERE padj < 0.05
) ranked
WHERE rn <= 3; -- 每组只要前3
五、建表与数据操作¶
-- ========== 创建表 ==========
CREATE TABLE samples (
sample_id VARCHAR(50) PRIMARY KEY, -- 样本ID(主键,唯一)
patient_id VARCHAR(50) NOT NULL, -- 患者ID(不能为空)
sample_type VARCHAR(20), -- 样本类型(Tumor/Normal)
age INT, -- 年龄
gender CHAR(1), -- 性别(M/F)
stage VARCHAR(10), -- 分期
survival_days INT, -- 生存天数
survival_status INT -- 生存状态(0=存活, 1=死亡)
);
-- ========== 插入数据 ==========
INSERT INTO samples VALUES
('TCGA-A1', 'P001', 'Tumor', 55, 'F', 'IIA', 730, 0),
('TCGA-A2', 'P002', 'Tumor', 62, 'M', 'IIIB', 365, 1);
-- ========== 更新数据 ==========
UPDATE samples
SET stage = 'IIB'
WHERE sample_id = 'TCGA-A1';
-- ========== 删除数据 ==========
DELETE FROM samples
WHERE survival_days IS NULL; -- 删除没有生存数据的行
-- ========== 创建索引(加速查询) ==========
CREATE INDEX idx_gene_name ON diff_expression(gene_name); -- 基因名索引
CREATE INDEX idx_chr_pos ON variants(chromosome, position); -- 复合索引
-- 面试要点:
-- 索引加速查找但减慢插入/更新
-- 经常作为WHERE/JOIN条件的列应建索引
-- 类似BAM的.bai索引和VCF的.tbi索引
六、生信实战 SQL¶
-- ========== 实战:TCGA数据分析场景 ==========
-- 场景1:找出特定癌种中高频突变基因(Top 20)
SELECT gene_name,
COUNT(DISTINCT sample_id) AS mutated_samples,
COUNT(*) AS total_mutations,
ROUND(COUNT(DISTINCT sample_id) * 100.0 /
(SELECT COUNT(DISTINCT sample_id) FROM samples), 2
) AS frequency_pct -- 突变频率百分比
FROM somatic_mutations
WHERE cancer_type = 'BRCA' -- 乳腺癌
AND filter_status = 'PASS'
GROUP BY gene_name
HAVING COUNT(DISTINCT sample_id) >= 5 -- 至少5个样本有突变
ORDER BY mutated_samples DESC
LIMIT 20;
-- 场景2:比较肿瘤和正常组织的基因表达差异
SELECT t.gene_name,
AVG(t.tpm) AS avg_tumor_tpm,
AVG(n.tpm) AS avg_normal_tpm,
AVG(t.tpm) - AVG(n.tpm) AS diff
FROM expression t
INNER JOIN expression n
ON t.gene_name = n.gene_name
AND t.patient_id = n.patient_id -- 配对样本
WHERE t.sample_type = 'Tumor'
AND n.sample_type = 'Normal'
GROUP BY t.gene_name
HAVING AVG(t.tpm) > 1 OR AVG(n.tpm) > 1 -- 排除低表达
ORDER BY ABS(AVG(t.tpm) - AVG(n.tpm)) DESC;
-- 场景3:生存分析数据准备
SELECT s.sample_id,
s.survival_days,
s.survival_status,
s.stage,
CASE
WHEN e.tpm > (SELECT PERCENTILE_CONT(0.5)
WITHIN GROUP (ORDER BY tpm)
FROM expression
WHERE gene_name = 'TP53')
THEN 'High'
ELSE 'Low'
END AS tp53_group -- TP53中位数分组
FROM samples s
LEFT JOIN expression e
ON s.sample_id = e.sample_id
AND e.gene_name = 'TP53'
WHERE s.sample_type = 'Tumor';
七、常见报错与解决¶
| 问题 | 原因 | 解决方法 |
|---|---|---|
| Column ambiguous | JOIN后两表有同名列 | 用表名.列名指定 |
| GROUP BY error | SELECT的列不在GROUP BY或聚合函数中 | 加入GROUP BY或用聚合函数 |
| NULL比较 | WHERE col = NULL不对 | 用IS NULL或IS NOT NULL |
| 查询太慢 | 大表没索引 | 建索引,用EXPLAIN分析 |
| 数据类型不匹配 | 字符串和数字比较 | 用CAST转换类型 |
八、面试高频问题¶
Q1:INNER JOIN 和 LEFT JOIN 的区别?
INNER JOIN 只返回两表都匹配的行(交集)。LEFT JOIN 返回左表所有行,右表没匹配的填 NULL。生信中常用 LEFT JOIN 保留所有基因,即使某些基因没有注释信息。
Q2:WHERE 和 HAVING 的区别?
WHERE 在分组前过滤行,HAVING 在分组后过滤组。例如先 WHERE 过滤 PASS 变异,再 GROUP BY 基因后 HAVING 过滤突变数>=10的基因。
Q3:如何优化慢查询?
(1) 在 WHERE/JOIN 的列上建索引;(2) 避免 SELECT *,只选需要的列;(3) 用 EXPLAIN 查看查询计划;(4) 减少子查询,改用 JOIN;(5) 大表分页查询用 LIMIT。
Q4:什么是事务(Transaction)?
事务保证一组操作要么全部成功,要么全部失败(ACID特性)。比如插入样本数据时,如果中途出错,事务回滚保证数据一致性。
BEGIN; INSERT...; UPDATE...; COMMIT;
九、速查表¶
-- === SQL 面试速查 ===
-- 基本查询
SELECT col FROM table WHERE condition ORDER BY col LIMIT n;
-- 聚合
SELECT col, COUNT(*), AVG(val), SUM(val)
FROM table GROUP BY col HAVING COUNT(*) > n;
-- 连接
SELECT a.*, b.col FROM a INNER JOIN b ON a.id = b.id;
SELECT a.*, b.col FROM a LEFT JOIN b ON a.id = b.id;
-- 子查询
SELECT * FROM t WHERE col IN (SELECT col FROM t2 WHERE ...);
-- 窗口函数
SELECT col, ROW_NUMBER() OVER (PARTITION BY grp ORDER BY val) AS rn
FROM table;
-- CASE
SELECT CASE WHEN condition THEN 'A' ELSE 'B' END AS label FROM t;
-- 建表
CREATE TABLE t (id INT PRIMARY KEY, name VARCHAR(50), val FLOAT);
CREATE INDEX idx_name ON t(name);
-- 执行顺序(面试常问):
-- FROM → WHERE → GROUP BY → HAVING → SELECT → ORDER BY → LIMIT
参考资料:SQL必知必会 (Ben Forta 2023)、W3Schools SQL Tutorial、TCGA GDC Data Portal文档