410_SQL递归查询CTE¶
一句话说明¶
CTE(公共表表达式)就是SQL里的临时命名子查询,递归CTE则能处理树形/层级结构数据,比如物种分类树、基因本体GO树。
核心知识点¶
CTE基础语法¶
递归CTE结构¶
WITH RECURSIVE tree AS (
-- 锚点部分(Base Case):查询根节点
SELECT id, name, parent_id, 0 AS depth FROM nodes WHERE parent_id IS NULL
UNION ALL
-- 递归部分:引用自身,查询子节点
SELECT n.id, n.name, n.parent_id, tree.depth + 1
FROM nodes n
JOIN tree ON n.parent_id = tree.id -- 连接上一层结果
)
SELECT * FROM tree;
应用场景¶
| 场景 | 说明 |
|---|---|
| GO本体树 | 找某GO term的所有祖先/子孙 |
| 物种分类 | 界→门→纲→目→科→属→种 |
| 组织架构 | 找某部门所有下级部门 |
| 路径查找 | 找两节点间的路径 |
实战SQL示例¶
-- ========== 数据库:GO本体(基因本体论)层级结构 ==========
-- 建表
CREATE TABLE go_terms (
go_id VARCHAR(10) PRIMARY KEY, -- GO:0001234
go_name TEXT NOT NULL, -- 术语名称
parent_id VARCHAR(10), -- 父节点GO ID(NULL=根节点)
category VARCHAR(20) -- biological_process/molecular_function/cellular_component
);
-- 插入示例数据(生物过程子树)
INSERT INTO go_terms VALUES
('GO:0008150', 'biological_process', NULL, 'BP'), -- 根节点
('GO:0009987', 'cellular process', 'GO:0008150', 'BP'), -- 一级子节点
('GO:0044699', 'single-organism process', 'GO:0008150', 'BP'),
('GO:0009056', 'catabolic process', 'GO:0009987', 'BP'), -- 二级
('GO:0044248', 'cellular catabolic process', 'GO:0009056', 'BP'), -- 三级
('GO:0006914', 'autophagy', 'GO:0009987', 'BP'),
('GO:0000045', 'autophagosome assembly', 'GO:0006914', 'BP'); -- 四级
-- ========== 1. 基础CTE:用多个CTE简化复杂查询 ==========
-- 找表达量 > 均值 + 2标准差 的高表达基因(分步计算更清晰)
WITH
stats AS (
-- 第一步:计算统计量
SELECT
AVG(expression) AS mean_expr,
STDDEV(expression) AS std_expr
FROM gene_expression
WHERE condition = 'treatment'
),
high_expr AS (
-- 第二步:筛选高表达基因(引用上一个CTE)
SELECT gene_id, expression
FROM gene_expression, stats -- 笛卡尔积(stats只有1行)
WHERE condition = 'treatment'
AND expression > mean_expr + 2 * std_expr
)
-- 第三步:关联基因注释(引用high_expr)
SELECT h.gene_id, h.expression, a.gene_name, a.function
FROM high_expr h
JOIN gene_annotations a ON h.gene_id = a.gene_id;
-- ========== 2. 递归CTE:遍历GO本体树(找所有子节点) ==========
-- 找 GO:0009987(cellular process)下的所有子孙GO term
WITH RECURSIVE go_subtree AS (
-- 锚点:根节点(我们要查的起点)
SELECT
go_id,
go_name,
parent_id,
0 AS depth, -- 当前深度(根=0)
go_id::TEXT AS path -- 记录路径(用于调试)
FROM go_terms
WHERE go_id = 'GO:0009987' -- 从这个节点开始向下遍历
UNION ALL
-- 递归部分:找当前层的所有子节点
SELECT
t.go_id,
t.go_name,
t.parent_id,
sub.depth + 1, -- 深度+1
sub.path || ' -> ' || t.go_id -- 路径追加
FROM go_terms t
JOIN go_subtree sub ON t.parent_id = sub.go_id -- 子节点连接父节点
)
SELECT go_id, go_name, depth, path
FROM go_subtree
ORDER BY depth, go_id;
-- 结果示例:
-- GO:0009987 cellular process depth=0
-- GO:0009056 catabolic process depth=1
-- GO:0006914 autophagy depth=1
-- GO:0044248 cellular catabolic process depth=2
-- GO:0000045 autophagosome assembly depth=2
-- ========== 3. 递归CTE:向上查找所有祖先节点 ==========
-- 找 GO:0000045(autophagosome assembly)的所有祖先
WITH RECURSIVE go_ancestors AS (
-- 锚点:目标节点本身
SELECT go_id, go_name, parent_id, 0 AS level
FROM go_terms
WHERE go_id = 'GO:0000045'
UNION ALL
-- 递归:找父节点(方向是向上)
SELECT
t.go_id,
t.go_name,
t.parent_id,
anc.level + 1 -- level表示与起始节点的距离
FROM go_terms t
JOIN go_ancestors anc ON t.go_id = anc.parent_id -- 注意连接方向!
WHERE anc.parent_id IS NOT NULL -- 到根节点停止
)
SELECT go_id, go_name, level AS distance_from_leaf
FROM go_ancestors
ORDER BY level;
-- ========== 4. 递归CTE:带路径的层级展示 ==========
-- 生成物种分类树的缩进展示
WITH RECURSIVE taxonomy AS (
SELECT
taxon_id, taxon_name, parent_id, rank,
1 AS depth,
taxon_name AS full_path -- 完整路径
FROM taxonomy_table
WHERE parent_id IS NULL -- 根节点(Bacteria域等)
UNION ALL
SELECT
t.taxon_id, t.taxon_name, t.parent_id, t.rank,
tax.depth + 1,
tax.full_path || ' > ' || t.taxon_name -- 拼接路径
FROM taxonomy_table t
JOIN taxonomy tax ON t.parent_id = tax.taxon_id
)
SELECT
REPEAT(' ', depth - 1) || taxon_name AS indented_name, -- 根据深度缩进
rank,
full_path
FROM taxonomy
ORDER BY full_path; -- 按路径排序可以保持层级顺序
-- ========== 5. 防止无限循环(限制递归深度) ==========
WITH RECURSIVE safe_traverse AS (
SELECT go_id, go_name, parent_id, 1 AS depth
FROM go_terms WHERE parent_id IS NULL
UNION ALL
SELECT t.go_id, t.go_name, t.parent_id, s.depth + 1
FROM go_terms t
JOIN safe_traverse s ON t.parent_id = s.go_id
WHERE s.depth < 10 -- ★ 限制最大递归深度,防止有环时死循环
)
SELECT * FROM safe_traverse;
面试常问点¶
Q: CTE和子查询有什么区别? A: CTE可以被多次引用(子查询只能用一次)、代码更清晰易读、支持递归(子查询不支持)。部分数据库对CTE有缓存优化。
Q: 递归CTE什么时候会死循环? A: 数据中存在循环(A→B→A),或递归终止条件写错。加
WHERE depth < N或CYCLE DETECTION(PostgreSQL 14+)防止。Q: PostgreSQL和MySQL的递归CTE语法区别? A: PostgreSQL用
WITH RECURSIVE;MySQL 8.0+也用WITH RECURSIVE;MySQL 5.x不支持递归CTE。Q: GO富集分析中为什么需要递归查询? A: GO注释有传播性(一个基因注释了子节点GO term,自动也注释了所有祖先GO term),需递归查祖先。
速查表¶
-- 基本CTE
WITH cte AS (SELECT ...)
SELECT * FROM cte;
-- 多CTE链式(PostgreSQL/MySQL 8+)
WITH
a AS (SELECT ...),
b AS (SELECT ... FROM a)
SELECT * FROM b;
-- 递归CTE模板
WITH RECURSIVE r AS (
SELECT ... WHERE ... -- 锚点(根节点)
UNION ALL
SELECT ... FROM tbl JOIN r ON ... -- 递归步骤
WHERE r.depth < 20 -- 深度限制
)
SELECT * FROM r;