555_R语言面试高频考点¶

一句话说明¶

R 语言面试侧重数据框操作、统计检验、ggplot2 可视化，以及生信专用包（DESeq2/edgeR/vegan）的使用方法。

核心知识点¶

必掌握核心模块¶

基础 R：
  向量操作、apply家族、数据框操作
  apply/sapply/lapply/vapply/tapply

Tidyverse（现代 R 生态）：
  dplyr：数据操作（filter/select/mutate/group_by/summarise）
  tidyr：数据整理（pivot_longer/pivot_wider/separate/unite）
  ggplot2：可视化（图层语法）
  purrr：函数式编程（map系列）

生信专用：
  DESeq2/edgeR：RNA-seq差异分析
  vegan：生态学/微生物多样性
  limma：微阵列/蛋白组差异分析
  Bioconductor：生信包集合

实战代码/设计图/模板¶

高频考题1：apply 家族¶

# Expression matrix: rows = genes, columns = samples.
# matrix() fills column by column, so the first three values form "Control".
expr_matrix <- matrix(
  c(100, 200, 50, 300, 150, 80),
  nrow = 3, ncol = 2,
  dimnames = list(
    c("BRCA1", "TP53", "MYC"),
    c("Control", "Treatment")
  )
)

# apply: run a function over one margin of a matrix (1 = rows, 2 = columns).
# For plain means/sums, rowMeans()/colSums() return the same values faster.
row_means <- apply(expr_matrix, 1, mean)
col_sums  <- apply(expr_matrix, 2, sum)

# vapply: like sapply, but the template (numeric(1)) fixes the return type,
# so empty or malformed input cannot silently change the result's type.
# Result: GC content in percent, named by the input sequence.
gc_contents <- vapply(c("ATCG", "GCGC", "ATAT"), function(dna) {
  nchar(gsub("[^GC]", "", dna)) / nchar(dna) * 100
}, numeric(1))

# lapply: always returns a list (here one data frame per file).
# `pattern` is a regular expression, not a shell glob, so match the extension
# explicitly; full.names = TRUE keeps the directory prefix so that
# read.table() opens the file inside data/ rather than the working directory.
file_data <- lapply(
  list.files("data", pattern = "\\.tsv$", full.names = TRUE),
  function(f) {
    read.table(f, header = TRUE, sep = "\t")
  }
)

高频考题2：dplyr 管道操作¶

library(dplyr)
library(tidyr)

# Mock OTU abundance table: one row per sample, one column per OTU,
# plus the experimental group of each sample.
otu_data <- data.frame(
  sample_id = paste0("S", 1:4),
  OTU1 = c(100, 200, 50, 300),
  OTU2 = c(50, 0, 200, 100),
  group = rep(c("control", "treatment"), each = 2)
)

# Pipelines can be written with magrittr's %>% or the native |> (R 4.1+).
# Keep the treatment samples, compute the per-sample total and the share of
# OTU1 in it, then sort from the largest total to the smallest.
result <- otu_data |>
  filter(group == "treatment") |>
  select(sample_id, OTU1, OTU2) |>
  mutate(total = OTU1 + OTU2) |>
  mutate(OTU1_prop = OTU1 / total) |>
  arrange(desc(total))

# Per-group summary of OTU1: mean, standard deviation and sample count.
# .groups = "drop" returns an ungrouped tibble.
summary_stats <- otu_data |>
  group_by(group) |>
  summarise(
    mean_OTU1 = mean(OTU1),
    sd_OTU1 = sd(OTU1),
    n = n(),
    .groups = "drop"
  )

# Wide -> long: one row per (sample, OTU) pair, the layout ggplot2 expects.
long_data <- otu_data |>
  pivot_longer(
    cols = c(OTU1, OTU2),
    names_to = "OTU",
    values_to = "abundance"
  )

高频考题3：ggplot2 绘图¶

library(ggplot2)

# Box plot comparing abundance between groups, one facet per OTU.
# Expects `long_data` with columns group, abundance and OTU.
p <- ggplot(long_data, aes(x = group, y = abundance, fill = group)) +
  # Hide the box plot's own outlier markers: geom_jitter() below already
  # draws every observation, so outliers would otherwise appear twice.
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  # Independent y axes, because OTUs can differ widely in abundance.
  facet_wrap(~ OTU, scales = "free_y") +
  scale_fill_manual(values = c("control" = "#2196F3",
                                "treatment" = "#F44336")) +
  labs(title = "OTU在不同组别的丰度",
       x = "分组", y = "丰度", fill = "分组") +
  theme_bw() +
  # The x axis already names the groups, so the fill legend is redundant.
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

# The default pdf() device cannot render the CJK characters in the title and
# axis labels; cairo_pdf embeds them correctly.
# NOTE(review): requires an R build with cairo support (capabilities("cairo")).
ggsave("otu_boxplot.pdf", p, width = 8, height = 6, device = cairo_pdf)

# PCoA scatter plot on mock ordination coordinates.
# The first four rows are the original S1-S4 samples; S5-S8 are added because
# stat_ellipse() skips any group with too few points (it reports
# "Too few points to calculate an ellipse"), and two samples per group are
# not enough. Four per group is used here to be safely above the limit.
pcoa_df <- data.frame(
  PC1 = c(0.3, -0.2, 0.5, -0.4, 0.1, -0.1, 0.2, -0.3),
  PC2 = c(0.1, 0.4, -0.1, -0.3, 0.3, 0.2, -0.4, -0.2),
  group = c("control", "control", "treatment", "treatment",
            "control", "control", "treatment", "treatment"),
  sample = paste0("S", 1:8)
)

ggplot(pcoa_df, aes(x = PC1, y = PC2, color = group, label = sample)) +
  geom_point(size = 4) +
  geom_text(vjust = -0.8) +           # sample label just above each point
  stat_ellipse(type = "norm") +       # per-group ellipse (normal assumption)
  # Axis percentages are placeholders; with real data use the variance
  # explained by each axis of the ordination.
  labs(title = "Beta多样性 PCoA分析", x = "PC1 (32%)", y = "PC2 (18%)") +
  theme_bw()

高频考题4：DESeq2 差异分析¶

library(DESeq2)

# Simulated count matrix (rows = genes, columns = samples).
# Fix the RNG seed so the example yields the same counts, and therefore the
# same DESeq2 results, on every run.
set.seed(42)
count_data <- matrix(
  # DESeq2 expects non-negative integer counts: take the absolute value,
  # round, and store as integer.
  as.integer(round(abs(rnorm(60, mean = 100, sd = 50)))),
  nrow = 10, ncol = 6,
  dimnames = list(
    paste0("Gene", 1:10),
    c("Ctrl1", "Ctrl2", "Ctrl3", "Trt1", "Trt2", "Trt3")
  )
)

# Sample sheet: row names must match the column names of the count matrix.
# Levels are given explicitly so "Control" is the reference level regardless
# of alphabetical order.
col_data <- data.frame(
  condition = factor(
    rep(c("Control", "Treatment"), each = 3),
    levels = c("Control", "Treatment")
  ),
  row.names = colnames(count_data)
)

# Build the DESeqDataSet; the design formula names the variable to test.
dds <- DESeqDataSetFromMatrix(
  countData = count_data,
  colData   = col_data,
  design    = ~ condition
)

# Run the differential expression pipeline.
dds <- DESeq(dds)

# Extract Treatment vs Control results (positive log2FoldChange = higher in
# Treatment). alpha should equal the padj cutoff applied below.
res <- results(dds,
               contrast = c("condition", "Treatment", "Control"),
               alpha = 0.05)

# Significant genes: adjusted p-value < 0.05 and |log2FC| > 1.
# padj can be NA, so those rows are dropped explicitly.
sig_genes <- as.data.frame(res) |>
  tibble::rownames_to_column("gene") |>
  dplyr::filter(!is.na(padj), padj < 0.05, abs(log2FoldChange) > 1) |>
  dplyr::arrange(padj)

cat("显著差异基因数:", nrow(sig_genes), "\n")

高频考题5：向量化 vs 循环¶

# GC fraction of each sequence, computed three ways. Prefer the vectorized
# form; element-by-element loops are the slowest option in R.
# NOTE: the result is named `gc`, which shadows base::gc() as a variable.
sequences <- c("ATCG", "GCGC", "ATAT")

# Loop version (slowest). If a loop is unavoidable, preallocate the result
# and fill it by index: growing it with c(gc, ...) copies the whole vector on
# every iteration.
gc <- numeric(length(sequences))
for (i in seq_along(sequences)) {
  gc[i] <- nchar(gsub("[^GC]", "", sequences[i])) / nchar(sequences[i])
}

# Vectorized version (fast, idiomatic R): gsub() and nchar() already work on
# whole character vectors.
gc <- nchar(gsub("[^GC]", "", sequences)) / nchar(sequences)

# Functional version: vapply is the type-safe counterpart of sapply. It always
# returns a numeric vector, named by the input sequences.
gc <- vapply(sequences, function(s) {
  nchar(gsub("[^GC]", "", s)) / nchar(s)
}, numeric(1))

面试常问点¶

问题	简答
<- 和 = 的区别？	<- 是推荐的赋值运算符；= 在顶层也能赋值，但主要用于函数调用中给参数传值（如 mean(x = v)）。注意在函数调用里写 f(x <- v) 会在调用环境中创建变量 x
如何查看对象类型？	class(x)、typeof(x)、str(x)
list vs vector？	原子向量（atomic vector）的元素必须同类型，混合时会被强制转换；list 可包含不同类型的元素（也可嵌套 list）
NA vs NULL？	NA表示缺失值；NULL表示不存在
为什么用padj不用pvalue？	同时检验上万个基因时，按 p<0.05 筛选会产生大量假阳性；padj 是多重检验校正后的 p 值（DESeq2 默认用 Benjamini-Hochberg 方法），控制的是错误发现率 FDR

速查表¶

# Quick reference for common operations.
# NOTE: `df` shadows stats::df() as a variable name; it is kept here because
# it is the conventional placeholder in cheat sheets.

# Read files (stringsAsFactors = FALSE is already the default since R 4.0).
df <- read.csv("file.csv", header = TRUE, stringsAsFactors = FALSE)
df <- read.table("file.tsv", sep = "\t", header = TRUE)

# Write files (row.names = FALSE avoids an extra unnamed index column).
write.csv(df, "output.csv", row.names = FALSE)
write.table(df, "output.tsv", sep = "\t", row.names = FALSE, quote = FALSE)

# Inspect data.
head(df); tail(df); dim(df); str(df); summary(df)
nrow(df); ncol(df); colnames(df); rownames(df)

# Install packages: CRAN packages via install.packages(), Bioconductor
# packages via BiocManager, which itself must be installed from CRAN first.
install.packages("ggplot2")
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("DESeq2")

# Missing values.
sum(is.na(df))            # total number of NA cells
df[complete.cases(df), ]  # rows without any NA (returns a copy; assign to keep)