跳转至

410_SQL递归查询CTE


一句话说明

CTE(公共表表达式)就是SQL里的临时命名子查询,递归CTE则能处理树形/层级结构数据,比如物种分类树、基因本体GO树。


核心知识点

CTE基础语法

WITH cte_name AS (
    SELECT ...  -- 这里定义CTE内容
)
SELECT * FROM cte_name;  -- 像普通表一样使用

递归CTE结构

WITH RECURSIVE tree AS (
    -- 锚点部分(Base Case):查询根节点
    SELECT id, name, parent_id, 0 AS depth FROM nodes WHERE parent_id IS NULL

    UNION ALL

    -- 递归部分:引用自身,查询子节点
    SELECT n.id, n.name, n.parent_id, tree.depth + 1
    FROM nodes n
    JOIN tree ON n.parent_id = tree.id  -- 连接上一层结果
)
SELECT * FROM tree;

应用场景

场景说明
GO本体树找某GO term的所有祖先/子孙
物种分类界→门→纲→目→科→属→种
组织架构找某部门所有下级部门
路径查找找两节点间的路径

实战SQL示例

-- ========== 数据库:GO本体(基因本体论)层级结构 ==========

-- 建表
CREATE TABLE go_terms (
    go_id      VARCHAR(10) PRIMARY KEY,  -- GO:0001234
    go_name    TEXT NOT NULL,            -- 术语名称
    parent_id  VARCHAR(10),              -- 父节点GO ID(NULL=根节点)
    category   VARCHAR(20)               -- biological_process/molecular_function/cellular_component
);

-- 插入示例数据(生物过程子树)
INSERT INTO go_terms VALUES
('GO:0008150', 'biological_process', NULL, 'BP'),           -- 根节点
('GO:0009987', 'cellular process', 'GO:0008150', 'BP'),     -- 一级子节点
('GO:0044699', 'single-organism process', 'GO:0008150', 'BP'),
('GO:0009056', 'catabolic process', 'GO:0009987', 'BP'),    -- 二级
('GO:0044248', 'cellular catabolic process', 'GO:0009056', 'BP'),  -- 三级
('GO:0006914', 'autophagy', 'GO:0009987', 'BP'),
('GO:0000045', 'autophagosome assembly', 'GO:0006914', 'BP'); -- 四级

-- ========== 1. 基础CTE:用多个CTE简化复杂查询 ==========
-- 找表达量 > 均值 + 2标准差 的高表达基因(分步计算更清晰)
WITH 
stats AS (
    -- 第一步:计算统计量
    SELECT 
        AVG(expression) AS mean_expr,
        STDDEV(expression) AS std_expr
    FROM gene_expression
    WHERE condition = 'treatment'
),
high_expr AS (
    -- 第二步:筛选高表达基因(引用上一个CTE)
    SELECT gene_id, expression
    FROM gene_expression, stats  -- 笛卡尔积(stats只有1行)
    WHERE condition = 'treatment'
      AND expression > mean_expr + 2 * std_expr
)
-- 第三步:关联基因注释(引用high_expr)
SELECT h.gene_id, h.expression, a.gene_name, a.function
FROM high_expr h
JOIN gene_annotations a ON h.gene_id = a.gene_id;

-- ========== 2. 递归CTE:遍历GO本体树(找所有子节点) ==========
-- 找 GO:0009987(cellular process)下的所有子孙GO term
WITH RECURSIVE go_subtree AS (
    -- 锚点:根节点(我们要查的起点)
    SELECT 
        go_id, 
        go_name, 
        parent_id,
        0 AS depth,              -- 当前深度(根=0)
        go_id::TEXT AS path      -- 记录路径(用于调试)
    FROM go_terms
    WHERE go_id = 'GO:0009987'   -- 从这个节点开始向下遍历

    UNION ALL

    -- 递归部分:找当前层的所有子节点
    SELECT 
        t.go_id, 
        t.go_name, 
        t.parent_id,
        sub.depth + 1,           -- 深度+1
        sub.path || ' -> ' || t.go_id  -- 路径追加
    FROM go_terms t
    JOIN go_subtree sub ON t.parent_id = sub.go_id  -- 子节点连接父节点
)
SELECT go_id, go_name, depth, path
FROM go_subtree
ORDER BY depth, go_id;

-- 结果示例:
-- GO:0009987  cellular process          depth=0
-- GO:0009056  catabolic process         depth=1
-- GO:0006914  autophagy                 depth=1
-- GO:0044248  cellular catabolic process depth=2
-- GO:0000045  autophagosome assembly    depth=2

-- ========== 3. 递归CTE:向上查找所有祖先节点 ==========
-- 找 GO:0000045(autophagosome assembly)的所有祖先
WITH RECURSIVE go_ancestors AS (
    -- 锚点:目标节点本身
    SELECT go_id, go_name, parent_id, 0 AS level
    FROM go_terms
    WHERE go_id = 'GO:0000045'

    UNION ALL

    -- 递归:找父节点(方向是向上)
    SELECT 
        t.go_id, 
        t.go_name, 
        t.parent_id,
        anc.level + 1  -- level表示与起始节点的距离
    FROM go_terms t
    JOIN go_ancestors anc ON t.go_id = anc.parent_id  -- 注意连接方向!
    WHERE anc.parent_id IS NOT NULL  -- 到根节点停止
)
SELECT go_id, go_name, level AS distance_from_leaf
FROM go_ancestors
ORDER BY level;

-- ========== 4. 递归CTE:带路径的层级展示 ==========
-- 生成物种分类树的缩进展示
WITH RECURSIVE taxonomy AS (
    SELECT 
        taxon_id, taxon_name, parent_id, rank,
        1 AS depth,
        taxon_name AS full_path  -- 完整路径
    FROM taxonomy_table
    WHERE parent_id IS NULL  -- 根节点(Bacteria域等)

    UNION ALL

    SELECT 
        t.taxon_id, t.taxon_name, t.parent_id, t.rank,
        tax.depth + 1,
        tax.full_path || ' > ' || t.taxon_name  -- 拼接路径
    FROM taxonomy_table t
    JOIN taxonomy tax ON t.parent_id = tax.taxon_id
)
SELECT 
    REPEAT('  ', depth - 1) || taxon_name AS indented_name,  -- 根据深度缩进
    rank,
    full_path
FROM taxonomy
ORDER BY full_path;  -- 按路径排序可以保持层级顺序

-- ========== 5. 防止无限循环(限制递归深度) ==========
WITH RECURSIVE safe_traverse AS (
    SELECT go_id, go_name, parent_id, 1 AS depth
    FROM go_terms WHERE parent_id IS NULL

    UNION ALL

    SELECT t.go_id, t.go_name, t.parent_id, s.depth + 1
    FROM go_terms t
    JOIN safe_traverse s ON t.parent_id = s.go_id
    WHERE s.depth < 10  -- ★ 限制最大递归深度,防止有环时死循环
)
SELECT * FROM safe_traverse;

面试常问点

  1. Q: CTE和子查询有什么区别? A: CTE可以被多次引用(子查询只能用一次)、代码更清晰易读、支持递归(子查询不支持)。部分数据库对CTE有缓存优化。

  2. Q: 递归CTE什么时候会死循环? A: 数据中存在循环(A→B→A),或递归终止条件写错。加WHERE depth < NCYCLE DETECTION(PostgreSQL 14+)防止。

  3. Q: PostgreSQL和MySQL的递归CTE语法区别? A: PostgreSQL用WITH RECURSIVE;MySQL 8.0+也用WITH RECURSIVE;MySQL 5.x不支持递归CTE。

  4. Q: GO富集分析中为什么需要递归查询? A: GO注释有传播性(一个基因注释了子节点GO term,自动也注释了所有祖先GO term),需递归查祖先。


速查表

-- 基本CTE
WITH cte AS (SELECT ...)
SELECT * FROM cte;

-- 多CTE链式(PostgreSQL/MySQL 8+)
WITH
a AS (SELECT ...),
b AS (SELECT ... FROM a)
SELECT * FROM b;

-- 递归CTE模板
WITH RECURSIVE r AS (
    SELECT ... WHERE ...              -- 锚点(根节点)
    UNION ALL
    SELECT ... FROM tbl JOIN r ON ... -- 递归步骤
    WHERE r.depth < 20               -- 深度限制
)
SELECT * FROM r;