《ChAMP处理TCGA 450K甲基化数据的实践》
这个脚本是使用ChAMP对 TCGA 上的 methylation 450K 的数据进行分析。
1
原理部分,网上有很多,在网上找到两张图片,简单来说就是利用探针与 CpG 位点杂交,来检测一下这个 CpG 位点的甲基化状态。

image.png

image.png
2
总的流程就是:
1.使用TCGAbiolinks下载 methylation 450K的数据
2.去除 NA 比例大于 50% 的探针,剩余的探针使用 KNN 方法插补数据。
3.标准的 ChAMP 流程:filter ➡️ norm ➡️ QC ➡️ PCA ➡️ Differential analysis (DMP/DMR)
下载数据
library(TCGAbiolinks)
# 设置查询,选择TCGA的甲基化数据
query <- GDCquery(
project = "TCGA-BRCA", # 选择一个项目,这里使用乳腺癌数据(TCGA-BRCA)
data.category = "DNA Methylation",
data.type = "Methylation Beta Value", # 选择甲基化β值数据
platform = "Illumina Human Methylation 450"
)
# 下载数据
GDCdownload(query)
acc.met <- GDCprepare(
query = query,
save = TRUE,
save.filename = "accDNAmet.rda",
summarizedExperiment = TRUE
)
load("accDNAmet.rda")
acc.met = data
# 查看数据的前几行
head(assay(acc.met))
dim(assay(acc.met))
rm(list = ls())
setwd("~/workspace/methylation 450K/")
library("dplyr")
library("tibble")
# load data ---------------------------------------------------------------
load("accDNAmet.rda")
acc.met = data
# 查看数据的前几行
head(assay(acc.met))
dim(assay(acc.met))
# [1] 485577 895
# select ER+ and normal ---------------------------------------------------
# 提取 colData
a <- colData(acc.met)
# 筛选条件
selected_samples <- a$paper_BRCA_Subtype_PAM50 %in% c("LumA", "LumB") | a$sample_type == "Solid Tissue Normal"
# 筛选后的样本信息
filtered_colData <- a[selected_samples, ]
dim(filtered_colData)
# 筛选后的甲基化数据
filtered_beta <- assay(acc.met)[, selected_samples]
dim(filtered_beta)
# 将筛选后的数据保存到新的 SummarizedExperiment 对象
filtered_acc.met <- acc.met[, selected_samples]
# 检查筛选后的样本数量
dim(filtered_beta)
# [1] 485577 662
# 检查筛选后的样本分布
table(filtered_colData$paper_BRCA_Subtype_PAM50)
table(filtered_colData$sample_type)
# -------------------------------------------------------------------------
# remove more than 50% missing data probe
# 计算每个探针的缺失值比例
missing_ratio <- rowSums(is.na(assay(filtered_acc.met))) / ncol(filtered_acc.met)
# 保留缺失值比例小于等于 50% 的探针
acc.met_filtered <- filtered_acc.met[missing_ratio <= 0.5, ]
dim(assay(acc.met_filtered))
# [1] 416070 662
# KNN impute NA -----------------------------------------------------------
# The remaining probes with NAs were imputed by the K-nearest neighbor (KNN) imputation procedure.
# 加载 impute 包
library(impute)
# 提取数据矩阵
data_matrix <- assay(acc.met_filtered)
# 使用 KNN 插值填补缺失值
imputed_data <- impute.knn(data_matrix)
# 提取插值后的数据
imputed_matrix <- imputed_data$data
# 检查是否还有缺失值
sum(is.na(imputed_matrix)) # 应该返回 0
# 将插值后的数据保存回 SummarizedExperiment 对象
assay(acc.met_filtered) <- imputed_matrix
# Filter-------------------------------------------------------------------------
.libPaths(c(.libPaths(),"/home/u24211510018/.conda/envs/genomation/lib/R/library"))
library(minfi) # DNA甲基化分析
.libPaths(c(.libPaths(),"/home/u24211510018/.conda/envs/R_champ/lib/R/library"))
library(ChAMP)
{
load("~/workspace/methylation 450K/ER/Filtered_ER_myload.Rdata")
load("./ER/group_list.Rdata")
load("./ER/step2-champ_myNorm.Rdata")
load("./ER/step3-output-myDMP.Rdata")
load("./ER/myDMR.Rdata")
load("./ER/myBlock.Rdata")
}
# 提取插补后的 beta 值矩阵
beta_matrix <- assay(acc.met_filtered)
# 提取样本信息表
pd <- colData(acc.met_filtered)
beta_matrix[1:3,1:3]
myLoad=champ.filter(beta = beta_matrix ,pd = pd)
#save(myLoad, file = "~/workspace/methylation 450K/ER/Filtered_ER_myload.Rdata")
load("~/workspace/methylation 450K/ER/Filtered_ER_myload.Rdata")
head(myLoad$beta[1:3,1:3])
dim(myLoad$beta)
pd = myLoad$pd
# [1] 402993 662
# Norm and QC -------------------------------------------------------------------------
# Check the distribution of beta values after filtering
hist(myLoad$beta)
library(RPMM)
#install.packages("RPMM", dependencies=T)
myNorm <- champ.norm(beta=myLoad$beta,arraytype="450K",cores=1, method = "BMIQ")
#save(myNorm,file = './ER/step2-champ_myNorm.Rdata')
load('./ER/step2-champ_myNorm.Rdata')
dim(myNorm)
QC.GUI(beta = myNorm, pheno = myLoad$pd$sample_type,arraytype="450K")
champ.QC(beta = myNorm, pheno = myLoad$pd$barcode)
myNorm[1:3,1:3]
head(myLoad$pd$barcode)
head(pd[1:2,1:2])
dim(pd)
dim(myNorm)
# PCA分析-------------------------------------------------------------------------
library(ggplot2)
# 执行主成分分析
dat.pca <- prcomp(t(myNorm), center = TRUE, scale. = TRUE)
# 提取主成分的结果
pca_data <- as.data.frame(dat.pca$x)
head(pca_data)
# 获取样本分组信息
group_list <- pd$sample_type
# 可视化 PCA 结果
ggplot(pca_data, aes(x = PC1, y = PC2, color = group_list)) +
geom_point() +
stat_ellipse() +
labs(title = "PCA of Samples", x = "PC1", y = "PC2", color = "Groups")
# heatmap carrelation heatmap-------------------------------------------------------------------------
# 热图
# 计算标准差最大的的 1000 个 probe
cg=names(tail(sort(apply(myNorm,1,sd)),1000))
library(pheatmap)
ac=data.frame(group=group_list)
rownames(ac)=colnames(myNorm)
head(ac)
pheatmap(myNorm[cg,],show_colnames =F,show_rownames = F,
annotation_col=ac)
# 相关关系矩阵热图
# 组内的样本的相似性应该是要高于组间的!
pheatmap::pheatmap(cor(myNorm[cg,]),
annotation_col = ac,
show_rownames = F,
show_colnames = F)
# DMP and DMR-------------------------------------------------------------------------
group_list <- pd$sample_type
group_list <- gsub(" ", "_", group_list)
group_list <- factor(group_list, levels = c("Solid_Tissue_Normal", "Primary_Tumor"))
save(group_list,file = "./ER/group_list.Rdata")
myDMP <- champ.DMP(beta = myNorm, pheno = group_list)
head(myDMP[[1]])
DMP.GUI(DMP=myDMP[[1]],beta=myNorm,pheno=group_list)
#save(myDMP,file = './ER//step3-output-myDMP.Rdata')
load('./ER/step3-output-myDMP.Rdata')
myDMP$Primary_Tumor_to_Solid_Tissue_Normal[1:5,1:10]
## volcanol plot
df_DMP <- myDMP$Primary_Tumor_to_Solid_Tissue_Normal[,1:5]
head(df_DMP)
logFC_cutoff <- 0.45
df_DMP$change <- ifelse(df_DMP$adj.P.Val < 10^-15 & abs(df_DMP$logFC) > logFC_cutoff,
ifelse(df_DMP$logFC > logFC_cutoff ,'Hyper','Hypo'),'NOT')
table(df_DMP$change)
this_tile <- paste0('Cutoff for DeltaBeta is ',round(logFC_cutoff,3),
'/nThe number of Hypermethylation genes is ',nrow(df_DMP[df_DMP$change =='Hyper',]) ,
'/nThe number of Hypomethylation genes is ',nrow(df_DMP[df_DMP$change =='Hypo',]))
library(ggplot2)
g <- ggplot(data=df_DMP,
aes(x=logFC, y=-log10(adj.P.Val),
color=change)) +
geom_point(alpha=0.4, size=1) +
theme_set(theme_set(theme_bw(base_size=10)))+
xlab("DeltaBeta") + ylab("-log10 adj.P.Val") +
ggtitle( this_tile ) + theme(plot.title = element_text(size=10,hjust = 0.5))+
scale_colour_manual(values = c('red','blue','black'))
print(g)
# DMR ---------------------------------------------------------------------
myDMR <- champ.DMR(beta=myNorm,group_list,method="Bumphunter")
save(myDMR, file = "./ER/myDMR.Rdata")
load("./ER/myDMR.Rdata")
head(myDMR$BumphunterDMR)
DMR.GUI(DMR=myDMR,pheno = group_list)
# Block -------------------------------------------------------------------
myBlock <- champ.Block(beta=myNorm,pheno=group_list,arraytype="450K")
#save(myBlock, file = "./ER/myBlock.Rdata")
load("./ER/myBlock.Rdata")
head(myBlock$Block)
Block.GUI(Block=myBlock,beta=myNorm,pheno=group_list,runDMP=TRUE,compare.group=NULL,arraytype="450K")
# GSEA --------------------------------------------------------------------
myGSEA = champ.GSEA(beta = myNorm,
DMP = myDMP[[1]],
DMR = myDMR,
CpGlist=NULL,
Genelist=NULL,
pheno = group_list,
method = "fisher",
arraytype = "450K",
Rplot = TRUE,
adjPval = 0.01)
head(myGSEA$DMP)
head(myGSEA$DMR)
共有 0 条评论