555_R语言面试高频考点¶
一句话说明¶
R 语言面试侧重数据框操作、统计检验、ggplot2 可视化,以及生信专用包(DESeq2/edgeR/vegan)的使用方法。
核心知识点¶
必掌握核心模块¶
基础 R:
向量操作、apply家族、数据框操作
apply/sapply/lapply/vapply/tapply
Tidyverse(现代 R 生态):
dplyr:数据操作(filter/select/mutate/group_by/summarise)
tidyr:数据整理(pivot_longer/pivot_wider/separate/unite)
ggplot2:可视化(图层语法)
purrr:函数式编程(map系列)
生信专用:
DESeq2/edgeR:RNA-seq差异分析
vegan:生态学/微生物多样性
limma:微阵列/蛋白组差异分析
Bioconductor:生信包集合
实战代码/设计图/模板¶
高频考题1:apply 家族¶
# Expression matrix: rows are genes, columns are samples.
# matrix() fills column by column, so the first three values become the
# "Control" column and the last three become the "Treatment" column.
expr_matrix <- matrix(
  c(100, 200, 50, 300, 150, 80),
  nrow = 3, ncol = 2,
  dimnames = list(
    c("BRCA1", "TP53", "MYC"),
    c("Control", "Treatment")
  )
)

# apply(): run a function over one margin of a matrix.
row_means <- apply(expr_matrix, 1, mean) # MARGIN = 1 -> one value per row
col_sums <- apply(expr_matrix, 2, sum)   # MARGIN = 2 -> one value per column

# GC content (in percent) of each sequence.
# vapply() is used instead of sapply(): it declares that every call returns
# exactly one number, so the result type cannot silently change (e.g. to a
# list) when the input is empty or irregular. Names are kept as with sapply().
gc_contents <- vapply(
  c("ATCG", "GCGC", "ATAT"),
  function(seq) {
    # Remove everything that is not G or C, then compare lengths.
    nchar(gsub("[^GC]", "", seq)) / nchar(seq) * 100
  },
  numeric(1)
)

# lapply(): always returns a list, here one data frame per TSV file.
# `pattern` is a regular expression (not a shell glob), hence "\\.tsv$".
# full.names = TRUE keeps the directory prefix so read.table() can open the
# files regardless of the current working directory.
file_data <- lapply(
  list.files("data", pattern = "\\.tsv$", full.names = TRUE),
  function(f) {
    read.table(f, header = TRUE, sep = "\t")
  }
)
高频考题2:dplyr 管道操作¶
library(dplyr)
library(tidyr)
# Toy OTU abundance table: one row per sample, one column per OTU, plus the
# experimental group each sample belongs to.
otu_data <- data.frame(
  sample_id = paste0("S", 1:4),
  OTU1 = c(100, 200, 50, 300),
  OTU2 = c(50, 0, 200, 100),
  group = rep(c("control", "treatment"), each = 2)
)

# dplyr pipeline. `|>` is the native pipe (R 4.1+); magrittr's `%>%` works
# the same way here.
result <- otu_data |>
  filter(group == "treatment") |>             # keep the treatment samples
  select(sample_id, OTU1, OTU2) |>            # keep only these columns
  mutate(total = OTU1 + OTU2) |>              # per-sample total abundance
  mutate(OTU1_prop = OTU1 / total) |>         # share of OTU1 in the total
  arrange(desc(total))                        # largest total first

# Per-group summary of OTU1: mean, standard deviation and sample count.
# .groups = "drop" returns an ungrouped result.
summary_stats <- otu_data |>
  group_by(group) |>
  summarise(
    mean_OTU1 = mean(OTU1),
    sd_OTU1 = sd(OTU1),
    n = n(),
    .groups = "drop"
  )

# Wide -> long: one row per (sample, OTU) pair, which is the layout ggplot2
# expects. The OTU column names go to `OTU`, their values to `abundance`.
long_data <- pivot_longer(
  otu_data,
  cols = starts_with("OTU"),
  names_to = "OTU",
  values_to = "abundance"
)
高频考题3:ggplot2 绘图¶
library(ggplot2)
# Box plot comparing abundance between groups, one panel per OTU.
# Expects `long_data` with columns `group`, `abundance` and `OTU`.
p <- ggplot(long_data, aes(x = group, y = abundance, fill = group)) +
  # The raw points are drawn by geom_jitter() below, so the boxplot's own
  # outlier markers are hidden to avoid plotting the same value twice.
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  # height = 0: jitter horizontally only. Without it geom_jitter() also
  # shifts points vertically, which would misrepresent the abundances.
  geom_jitter(width = 0.2, height = 0, alpha = 0.5) +
  facet_wrap(~ OTU, scales = "free_y") + # one panel per OTU, own y scale
  scale_fill_manual(values = c("control" = "#2196F3",
                               "treatment" = "#F44336")) +
  labs(title = "OTU在不同组别的丰度",
       x = "分组", y = "丰度", fill = "分组") +
  theme_bw() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5))

# The labels contain CJK characters, which the default pdf() device does not
# render reliably; cairo_pdf embeds the fonts correctly.
# NOTE(review): requires an R build with cairo support (see capabilities()).
ggsave("otu_boxplot.pdf", p, width = 8, height = 6, device = cairo_pdf)

# PCoA scatter plot (coordinates are hard-coded demo values).
pcoa_df <- data.frame(
  PC1 = c(0.3, -0.2, 0.5, -0.4),
  PC2 = c(0.1, 0.4, -0.1, -0.3),
  group = c("control", "control", "treatment", "treatment"),
  sample = c("S1", "S2", "S3", "S4")
)
ggplot(pcoa_df, aes(x = PC1, y = PC2, color = group, label = sample)) +
  geom_point(size = 4) +
  geom_text(vjust = -0.8) + # sample labels just above the points
  # Confidence ellipse per group. stat_ellipse() needs at least 4 points per
  # group; with the 2 demo samples per group it only reports
  # "Too few points to calculate an ellipse" and draws nothing.
  stat_ellipse(type = "norm") +
  labs(title = "Beta多样性 PCoA分析", x = "PC1 (32%)", y = "PC2 (18%)") +
  theme_bw()
高频考题4:DESeq2 差异分析¶
library(DESeq2)
# Simulated count matrix (rows = genes, columns = samples).
# A fixed seed makes the simulated counts, and therefore the reported
# result, reproducible between runs.
set.seed(42)
count_data <- round(matrix(
  abs(rnorm(60, mean = 100, sd = 50)),
  nrow = 10, ncol = 6
))
# DESeq2 works on integer counts; store them as integers explicitly instead
# of relying on an implicit conversion of doubles.
storage.mode(count_data) <- "integer"
rownames(count_data) <- paste0("Gene", 1:10)
colnames(count_data) <- c("Ctrl1", "Ctrl2", "Ctrl3", "Trt1", "Trt2", "Trt3")

# Sample annotation. Row names must match the column names of the count
# matrix. Levels are given explicitly so "Control" is the reference level
# by construction rather than by alphabetical accident.
col_data <- data.frame(
  condition = factor(
    rep(c("Control", "Treatment"), each = 3),
    levels = c("Control", "Treatment")
  ),
  row.names = colnames(count_data)
)

# Build the DESeqDataSet object.
dds <- DESeqDataSetFromMatrix(
  countData = count_data,
  colData = col_data,
  design = ~ condition # design formula: expression modelled by condition
)

# Run the differential expression analysis.
dds <- DESeq(dds)

# Extract Treatment-vs-Control results; alpha is the FDR threshold and
# matches the padj cut-off used below.
res <- results(
  dds,
  contrast = c("condition", "Treatment", "Control"),
  alpha = 0.05
)

# Significant genes: adjusted p-value available and below 0.05, and an
# absolute log2 fold change above 1. Sorted by adjusted p-value.
sig_genes <- as.data.frame(res) |>
  tibble::rownames_to_column("gene") |>
  dplyr::filter(!is.na(padj), padj < 0.05, abs(log2FoldChange) > 1) |>
  dplyr::arrange(padj)
cat("显著差异基因数:", nrow(sig_genes), "\n")
高频考题5:向量化 vs 循环¶
# Vectorisation vs. loops: avoid element-by-element loops over vectors in R.
# All three versions compute the GC fraction (0-1) of each sequence.
sequences <- c("ATCG", "GCGC", "ATAT")

# Loop version (slow). The result vector is preallocated and filled by
# index; growing it with c() on every iteration would copy it each time.
gc <- numeric(length(sequences))
for (i in seq_along(sequences)) {
  s <- sequences[[i]]
  gc[i] <- nchar(gsub("[^GC]", "", s)) / nchar(s)
}

# Vectorised version (fast, the recommended way in R): gsub() and nchar()
# already operate on whole character vectors.
gc <- nchar(gsub("[^GC]", "", sequences)) / nchar(sequences)

# Functional version (reads like a map over the elements). vapply() declares
# that each call returns one number, so the result is always a numeric
# vector; like sapply() it names the result after the input strings.
gc <- vapply(
  sequences,
  function(s) {
    nchar(gsub("[^GC]", "", s)) / nchar(s)
  },
  numeric(1)
)
面试常问点¶
| 问题 | 简答 |
|---|---|
| <- 和 = 的区别? | 两者都能在顶层赋值;<- 是推荐的赋值写法,= 主要用于函数调用中给参数传值 |
| 如何查看对象类型? | class(x)、typeof(x)、str(x) |
| list vs vector? | vector同类型;list可混合类型 |
| NA vs NULL? | NA表示缺失值;NULL表示不存在 |
| 为什么用padj不用pvalue? | 同时检验上万个基因时,假阳性会随检验次数累积;padj 是多重检验校正(DESeq2 默认 Benjamini-Hochberg,控制 FDR)后的 p 值 |
速查表¶
# Quick reference for common operations

# Read files
df <- read.csv("file.csv", header = TRUE, stringsAsFactors = FALSE)
df <- read.table("file.tsv", sep = "\t", header = TRUE)

# Write files
write.csv(df, "output.csv", row.names = FALSE)
write.table(df, "output.tsv", sep = "\t", row.names = FALSE, quote = FALSE)

# Inspect data
head(df); tail(df); dim(df); str(df); summary(df)
nrow(df); ncol(df); colnames(df); rownames(df)

# Install packages
install.packages("ggplot2")
# Bioconductor packages are installed through BiocManager, which itself has
# to be installed from CRAN first.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("DESeq2")

# Missing values
sum(is.na(df)) # total number of NA cells
# Drop rows containing any NA. drop = FALSE keeps the result a data frame
# even when `df` has a single column.
df[complete.cases(df), , drop = FALSE]