跳转至

生信面试 SQL 数据库题精选

一句话概述:生信面试中 SQL 题集中在基本查询(SELECT/WHERE/JOIN)、分组聚合(GROUP BY)、子查询和窗口函数四个方向,掌握这些就能处理 TCGA、GEO 等公共数据库的数据提取和分析。


核心知识点速查表

概念白话解释
SELECT选择要查看的列(相当于awk取列)
WHERE筛选行的条件(相当于grep/awk条件)
JOIN合并两张表(相当于R的merge)
GROUP BY按某列分组统计(相当于R的group_by)
ORDER BY排序(相当于sort)
子查询查询嵌套查询(先查出结果再查)
索引(INDEX)加速查询的数据结构(类似tabix索引)
主键(PRIMARY KEY)唯一标识每行的列

一、基础查询(SELECT/WHERE)

-- ========== 基础查询 ==========

-- 查看全部数据
SELECT * FROM genes;                   -- 选择genes表的所有列所有行

-- 选择特定列
SELECT gene_name, chromosome, start_pos, end_pos
FROM genes;                            -- 只看基因名、染色体、起止位置

-- 条件筛选(WHERE)
SELECT gene_name, log2fc, padj
FROM diff_expression
WHERE padj < 0.05;                     -- 只要padj<0.05的行

-- 多条件(AND/OR)
SELECT gene_name, log2fc, padj
FROM diff_expression
WHERE padj < 0.05                      -- 显著
  AND ABS(log2fc) > 1;                 -- 且|logFC|>1(差异倍数大)

-- 模糊匹配(LIKE)
SELECT * FROM genes
WHERE gene_name LIKE 'BRCA%';         -- 以BRCA开头的基因(BRCA1, BRCA2)

SELECT * FROM genes
WHERE gene_name LIKE '%kinase%';       -- 名字包含kinase的基因

-- IN — 多值匹配(相当于grep -f)
SELECT * FROM diff_expression
WHERE gene_name IN ('TP53', 'BRCA1', 'EGFR', 'KRAS', 'MYC');

-- BETWEEN — 范围查询
SELECT * FROM variants
WHERE position BETWEEN 1000000 AND 2000000  -- 位置在1M到2M之间
  AND chromosome = 'chr1';                   -- 且在1号染色体

-- IS NULL — 查找缺失值
SELECT * FROM samples
WHERE survival_time IS NULL;           -- 找没有生存时间数据的样本

-- DISTINCT — 去重
SELECT DISTINCT chromosome
FROM variants;                         -- 有变异的染色体列表(不重复)

-- LIMIT — 限制返回行数
SELECT * FROM diff_expression
ORDER BY padj ASC                      -- 按padj升序
LIMIT 20;                              -- 只要前20行(最显著的20个基因)

二、排序与聚合(ORDER BY / GROUP BY)

-- ========== 排序 ==========

-- 单列排序
SELECT gene_name, padj
FROM diff_expression
ORDER BY padj ASC;                     -- 按padj升序(小→大)

SELECT gene_name, log2fc
FROM diff_expression
ORDER BY ABS(log2fc) DESC;            -- 按|logFC|降序(大→小)

-- 多列排序
SELECT chromosome, position, ref_allele, alt_allele
FROM variants
ORDER BY chromosome ASC, position ASC; -- 先按染色体排,再按位置排

-- ========== 聚合函数 ==========

-- COUNT — 计数
SELECT COUNT(*) AS total_variants      -- AS 给结果取别名
FROM variants
WHERE filter_status = 'PASS';          -- 统计PASS变异数

-- SUM / AVG / MIN / MAX
SELECT
    AVG(log2fc) AS mean_logfc,         -- 平均logFC
    MIN(padj) AS min_padj,             -- 最小padj
    MAX(ABS(log2fc)) AS max_abs_logfc  -- 最大|logFC|
FROM diff_expression
WHERE padj < 0.05;

-- ========== GROUP BY — 分组统计(高频考点) ==========

-- 每条染色体有多少个变异
SELECT chromosome,
       COUNT(*) AS variant_count       -- 每组的计数
FROM variants
GROUP BY chromosome                    -- 按染色体分组
ORDER BY variant_count DESC;           -- 按数量降序

-- 每种变异类型的数量
SELECT variant_type,
       COUNT(*) AS count
FROM variants
WHERE filter_status = 'PASS'           -- 只看PASS的
GROUP BY variant_type;                 -- 按类型分组
-- 结果示例:SNP 50000, INDEL 8000, MNV 200

-- HAVING — 对分组结果再筛选(WHERE对行,HAVING对组)
SELECT gene_name,
       COUNT(*) AS mutation_count
FROM somatic_mutations
GROUP BY gene_name
HAVING COUNT(*) >= 10                  -- 至少有10个突变的基因
ORDER BY mutation_count DESC;

-- 面试经典:统计每个样本的突变数(TMB的基础)
SELECT sample_id,
       COUNT(*) AS total_mutations,
       SUM(CASE WHEN variant_type='SNP' THEN 1 ELSE 0 END) AS snp_count,
       SUM(CASE WHEN variant_type='INDEL' THEN 1 ELSE 0 END) AS indel_count
FROM somatic_mutations
WHERE filter_status = 'PASS'
GROUP BY sample_id
ORDER BY total_mutations DESC;

三、表连接(JOIN,核心考点)

-- ========== JOIN:合并多张表 ==========

-- 场景:基因表 + 差异表达表 + 通路注释表
-- genes: gene_id, gene_name, chromosome, start_pos
-- diff_expr: gene_id, log2fc, padj, baseMean
-- pathways: gene_id, pathway_name, pathway_id

-- INNER JOIN — 只保留两表都有的(交集)
SELECT g.gene_name, d.log2fc, d.padj
FROM genes g                           -- g 是 genes 表的别名
INNER JOIN diff_expr d                 -- d 是 diff_expr 表的别名
  ON g.gene_id = d.gene_id            -- 连接条件:gene_id相同
WHERE d.padj < 0.05
ORDER BY d.padj;

-- LEFT JOIN — 保留左表所有行(左表为主)
SELECT g.gene_name, d.log2fc, d.padj
FROM genes g
LEFT JOIN diff_expr d
  ON g.gene_id = d.gene_id;
-- 即使基因没有差异表达数据,也会显示(log2fc和padj为NULL)

-- 多表连接
SELECT g.gene_name, d.log2fc, d.padj, p.pathway_name
FROM diff_expr d
INNER JOIN genes g ON d.gene_id = g.gene_id
INNER JOIN pathways p ON d.gene_id = p.gene_id
WHERE d.padj < 0.05 AND ABS(d.log2fc) > 1
ORDER BY d.padj;
-- 结果:显著差异基因 + 它们所在的通路

-- 面试经典:找出在多个数据集中都显著的基因
SELECT a.gene_name
FROM dataset1_results a
INNER JOIN dataset2_results b ON a.gene_name = b.gene_name
WHERE a.padj < 0.05 AND ABS(a.log2fc) > 1
  AND b.padj < 0.05 AND ABS(b.log2fc) > 1;

-- SELF JOIN — 自连接(同一张表连接自己)
-- 场景:找同一基因在不同样本中的表达差异
SELECT a.gene_name,
       a.expression AS tumor_expr,
       b.expression AS normal_expr,
       a.expression - b.expression AS diff
FROM expression a
INNER JOIN expression b
  ON a.gene_name = b.gene_name
WHERE a.sample_type = 'Tumor'
  AND b.sample_type = 'Normal';

JOIN 类型总结

-- JOIN 类型速记:
-- INNER JOIN: A ∩ B (交集,两边都有才保留)
-- LEFT JOIN:  A(左表全保留,右表没有的填NULL)
-- RIGHT JOIN: B(右表全保留,左表没有的填NULL)
-- FULL OUTER JOIN: A ∪ B(并集,都保留,没有的填NULL)
-- 生信最常用:INNER JOIN 和 LEFT JOIN

四、子查询与高级用法

-- ========== 子查询 ==========

-- 标量子查询(返回单个值)
SELECT gene_name, log2fc
FROM diff_expression
WHERE log2fc > (
    SELECT AVG(log2fc)                 -- 子查询:平均logFC
    FROM diff_expression
    WHERE padj < 0.05
);

-- IN子查询(返回一列值)
-- 找出在通路"cell cycle"中的差异基因
SELECT gene_name, log2fc, padj
FROM diff_expression
WHERE gene_id IN (
    SELECT gene_id                     -- 子查询:cell cycle通路的基因
    FROM pathways
    WHERE pathway_name = 'cell cycle'
) AND padj < 0.05;

-- EXISTS子查询(检查是否存在)
-- 找有突变数据的差异表达基因
SELECT d.gene_name, d.log2fc
FROM diff_expression d
WHERE EXISTS (
    SELECT 1
    FROM somatic_mutations m
    WHERE m.gene_name = d.gene_name    -- 关联:同一基因
);

-- ========== CASE WHEN — 条件表达式 ==========
SELECT gene_name, log2fc, padj,
    CASE
        WHEN padj < 0.05 AND log2fc > 1 THEN 'Up'        -- 上调
        WHEN padj < 0.05 AND log2fc < -1 THEN 'Down'     -- 下调
        ELSE 'NS'                                          -- 不显著
    END AS regulation                                      -- 新列名
FROM diff_expression;

-- ========== 窗口函数(进阶,面试加分) ==========

-- ROW_NUMBER — 排名
SELECT gene_name, padj,
    ROW_NUMBER() OVER (ORDER BY padj ASC) AS rank  -- 按padj排名
FROM diff_expression
WHERE padj < 0.05;

-- RANK — 排名(允许并列)
SELECT gene_name, log2fc,
    RANK() OVER (
        PARTITION BY chromosome        -- 每条染色体单独排名
        ORDER BY ABS(log2fc) DESC      -- 按|logFC|排
    ) AS chr_rank
FROM diff_expression
WHERE padj < 0.05;

-- 面试经典:每条染色体上padj最小的3个基因
SELECT * FROM (
    SELECT gene_name, chromosome, padj,
        ROW_NUMBER() OVER (
            PARTITION BY chromosome
            ORDER BY padj ASC
        ) AS rn
    FROM diff_expression
    WHERE padj < 0.05
) ranked
WHERE rn <= 3;                         -- 每组只要前3

五、建表与数据操作

-- ========== 创建表 ==========
CREATE TABLE samples (
    sample_id VARCHAR(50) PRIMARY KEY, -- 样本ID(主键,唯一)
    patient_id VARCHAR(50) NOT NULL,   -- 患者ID(不能为空)
    sample_type VARCHAR(20),           -- 样本类型(Tumor/Normal)
    age INT,                           -- 年龄
    gender CHAR(1),                    -- 性别(M/F)
    stage VARCHAR(10),                 -- 分期
    survival_days INT,                 -- 生存天数
    survival_status INT                -- 生存状态(0=存活, 1=死亡)
);

-- ========== 插入数据 ==========
INSERT INTO samples VALUES
('TCGA-A1', 'P001', 'Tumor', 55, 'F', 'IIA', 730, 0),
('TCGA-A2', 'P002', 'Tumor', 62, 'M', 'IIIB', 365, 1);

-- ========== 更新数据 ==========
UPDATE samples
SET stage = 'IIB'
WHERE sample_id = 'TCGA-A1';

-- ========== 删除数据 ==========
DELETE FROM samples
WHERE survival_days IS NULL;           -- 删除没有生存数据的行

-- ========== 创建索引(加速查询) ==========
CREATE INDEX idx_gene_name ON diff_expression(gene_name);  -- 基因名索引
CREATE INDEX idx_chr_pos ON variants(chromosome, position); -- 复合索引

-- 面试要点:
-- 索引加速查找但减慢插入/更新
-- 经常作为WHERE/JOIN条件的列应建索引
-- 类似BAM的.bai索引和VCF的.tbi索引

六、生信实战 SQL

-- ========== 实战:TCGA数据分析场景 ==========

-- 场景1:找出特定癌种中高频突变基因(Top 20)
SELECT gene_name,
       COUNT(DISTINCT sample_id) AS mutated_samples,
       COUNT(*) AS total_mutations,
       ROUND(COUNT(DISTINCT sample_id) * 100.0 /
             (SELECT COUNT(DISTINCT sample_id) FROM samples), 2
       ) AS frequency_pct                     -- 突变频率百分比
FROM somatic_mutations
WHERE cancer_type = 'BRCA'                    -- 乳腺癌
  AND filter_status = 'PASS'
GROUP BY gene_name
HAVING COUNT(DISTINCT sample_id) >= 5         -- 至少5个样本有突变
ORDER BY mutated_samples DESC
LIMIT 20;

-- 场景2:比较肿瘤和正常组织的基因表达差异
SELECT t.gene_name,
       AVG(t.tpm) AS avg_tumor_tpm,
       AVG(n.tpm) AS avg_normal_tpm,
       AVG(t.tpm) - AVG(n.tpm) AS diff
FROM expression t
INNER JOIN expression n
  ON t.gene_name = n.gene_name
  AND t.patient_id = n.patient_id             -- 配对样本
WHERE t.sample_type = 'Tumor'
  AND n.sample_type = 'Normal'
GROUP BY t.gene_name
HAVING AVG(t.tpm) > 1 OR AVG(n.tpm) > 1      -- 排除低表达
ORDER BY ABS(AVG(t.tpm) - AVG(n.tpm)) DESC;

-- 场景3:生存分析数据准备
SELECT s.sample_id,
       s.survival_days,
       s.survival_status,
       s.stage,
       CASE
           WHEN e.tpm > (SELECT PERCENTILE_CONT(0.5)
                         WITHIN GROUP (ORDER BY tpm)
                         FROM expression
                         WHERE gene_name = 'TP53')
           THEN 'High'
           ELSE 'Low'
       END AS tp53_group                      -- TP53中位数分组
FROM samples s
LEFT JOIN expression e
  ON s.sample_id = e.sample_id
  AND e.gene_name = 'TP53'
WHERE s.sample_type = 'Tumor';

七、常见报错与解决

问题原因解决方法
Column ambiguousJOIN后两表有同名列表名.列名指定
GROUP BY errorSELECT的列不在GROUP BY或聚合函数中加入GROUP BY或用聚合函数
NULL比较WHERE col = NULL不对IS NULLIS NOT NULL
查询太慢大表没索引建索引,用EXPLAIN分析
数据类型不匹配字符串和数字比较用CAST转换类型

八、面试高频问题

Q1:INNER JOIN 和 LEFT JOIN 的区别?

INNER JOIN 只返回两表都匹配的行(交集)。LEFT JOIN 返回左表所有行,右表没匹配的填 NULL。生信中常用 LEFT JOIN 保留所有基因,即使某些基因没有注释信息。

Q2:WHERE 和 HAVING 的区别?

WHERE 在分组前过滤行,HAVING 在分组后过滤组。例如先 WHERE 过滤 PASS 变异,再 GROUP BY 基因后 HAVING 过滤突变数>=10的基因。

Q3:如何优化慢查询?

(1) 在 WHERE/JOIN 的列上建索引;(2) 避免 SELECT *,只选需要的列;(3) 用 EXPLAIN 查看查询计划;(4) 减少子查询,改用 JOIN;(5) 大表分页查询用 LIMIT。

Q4:什么是事务(Transaction)?

事务保证一组操作要么全部成功,要么全部失败(ACID特性)。比如插入样本数据时,如果中途出错,事务回滚保证数据一致性。BEGIN; INSERT...; UPDATE...; COMMIT;


九、速查表

-- === SQL 面试速查 ===

-- 基本查询
SELECT col FROM table WHERE condition ORDER BY col LIMIT n;

-- 聚合
SELECT col, COUNT(*), AVG(val), SUM(val)
FROM table GROUP BY col HAVING COUNT(*) > n;

-- 连接
SELECT a.*, b.col FROM a INNER JOIN b ON a.id = b.id;
SELECT a.*, b.col FROM a LEFT JOIN b ON a.id = b.id;

-- 子查询
SELECT * FROM t WHERE col IN (SELECT col FROM t2 WHERE ...);

-- 窗口函数
SELECT col, ROW_NUMBER() OVER (PARTITION BY grp ORDER BY val) AS rn
FROM table;

-- CASE
SELECT CASE WHEN condition THEN 'A' ELSE 'B' END AS label FROM t;

-- 建表
CREATE TABLE t (id INT PRIMARY KEY, name VARCHAR(50), val FLOAT);
CREATE INDEX idx_name ON t(name);

-- 执行顺序(面试常问):
-- FROM → WHERE → GROUP BY → HAVING → SELECT → ORDER BY → LIMIT

参考资料:SQL必知必会 (Ben Forta 2023)、W3Schools SQL Tutorial、TCGA GDC Data Portal文档