转载来源https://blog.csdn.net/weixin_43569478/article/details/83745105 GSEA和常规的GO、KEGG的差异在于,GSEA使用的是基因集。基因集的差异分析。GSEA可以清除的说明上调基因和下调基因分别参加了哪些通路,生物学途径。 GO、KEGG只能给出差异基因的pathway,生物学通路、表达部位等。无法却分哪部分是上调基因,哪部分是下调基因。
同时说明Y叔的clusterProfiler的包中的gseGO和gseKEGG的genelist个格式。 https://github.com/GuangchuangYu/DOSE/wiki/how-to-prepare-your-own-geneList
gsea的原始数据分为三列,一列是geneid,一列是FoldChange,一列是根据FoldChange排序的结果。
其实gsea就是看foldchange的值的分布,如果是随机分布,那结果就不理想。我们想要的是在两端富集分布。 GSEA分析同样可以使用clusterprofiler包。Y叔的包真的强大,我还要用它做转录因子的富集分析。
GSEA的数据集是所有基因的entrez id,和Log2FoldChange的值。不是设置p-value<0.05和log2FoldChange>1过滤后的值。
library(DO.db)
require(DOSE)
library(clusterProfiler)
library(AnnotationHub)
library(readr)
library("genefilter")
library(pheatmap)
library(tidyverse)
library(DESeq2)
library(ggplot2)
library(export)
library(enrichplot)
library(Rgraphviz)
#构造图片输出函数,need input filename width height
#函数依赖export包
out_img <- function(filename,pic_width=5,pic_height=7){
graph2png(file=filename,width=pic_width,height=pic_height)
graph2ppt(file=filename,width=pic_width,height=pic_height)
graph2tif(file=filename,width=pic_width,height=pic_height)
}
#all_entrez.csv的第一列是entrezid,第二列是FoldChange的值。
#gse需要单独做数据格式
d <- read.csv("all_entrez.csv")
geneList <- d[,2]
names(geneList)=as.factor(d[,1])
geneList <- sort(geneList,decreasing = TRUE)
#gseGO进行GSEA分析
#参考连接https://yulab-smu.github.io/clusterProfiler-book/chapter12.html
###gseBP <- gseGO(geneList=geneList,ont="BP",OrgDb=maize,keyType = 'ENTREZID',nPerm = 50000,minGSSize = 100,maxGSSize = 6000,pvalueCutoff = 0.05,verbose = FALSE)
############# GSEA CC 模式 start
ego3 <- gseGO(geneList = geneList,OrgDb = maize,ont = "CC",nPerm = 1000,minGSSize = 100,maxGSSize = 1000,pvalueCutoff = 0.05,verbose = FALSE)
write.csv(ego3,file = "GESA-GO_CC.csv")
#ridgeline plot for expression distribution of GSEA result
ridgeplot(ego3)
out_img(filename = "ridgeplot_CC",pic_width = 12,pic_height = 12)
#只显示值最高的一组的信息
#gseaplot(ego3,geneSetID = 1,by="runningScore",title=ego3$Description[1])
#gseaplot(ego3,geneSetID = 1,by="preranked",title=ego3$Description[1])
#gseaplot(ego3,geneSetID = 1,title=ego3$Description[1])
#显示前4组信息
gseaplot2(ego3,geneSetID = 1:4, ES_geom = "dot",pvalue_table = TRUE)
out_img(filename = "gseaplot_CC",pic_width=12,pic_height = 10)
#gsearank(ego3,1,title=ego3[1,"Description"])
############GSEA CC 模式end
############# GSEA BP 模式 start
ego2 <- gseGO(geneList = geneList,OrgDb = maize,ont = "BP",pvalueCutoff = 0.05,verbose = FALSE)
write.csv(ego2,file = "GESA-GO_BP.csv")
#ridgeline plot for expression distribution of GSEA result
ridgeplot(ego2)
out_img(filename = "ridgeplot_BP",pic_width = 12,pic_height = 12)
#只显示值最高的一组的信息
#gseaplot(ego3,geneSetID = 1,by="runningScore",title=ego3$Description[1])
#gseaplot(ego3,geneSetID = 1,by="preranked",title=ego3$Description[1])
#gseaplot(ego3,geneSetID = 1,title=ego3$Description[1])
#显示前4组信息
gseaplot2(ego2,geneSetID = 1:4, ES_geom = "dot",pvalue_table = TRUE)
out_img(filename = "gseaplot_BP",pic_width=12,pic_height = 10)
#gsearank(ego3,1,title=ego3[1,"Description"])
############GSEA BP 模式end
############# GSEA MF 模式 start
ego4 <- gseGO(geneList = geneList,OrgDb = maize,ont = "MF",pvalueCutoff = 0.05,verbose = FALSE)
write.csv(ego4,file = "GESA-GO_MF.csv")
#ridgeline plot for expression distribution of GSEA result
ridgeplot(ego4)
out_img(filename = "ridgeplot_MF",pic_width = 12,pic_height = 12)
#只显示值最高的一组的信息
#gseaplot(ego3,geneSetID = 1,by="runningScore",title=ego3$Description[1])
#gseaplot(ego3,geneSetID = 1,by="preranked",title=ego3$Description[1])
#gseaplot(ego3,geneSetID = 1,title=ego3$Description[1])
#显示前4组信息
gseaplot2(ego4,geneSetID = 1:4, ES_geom = "dot",pvalue_table = TRUE)
out_img(filename = "gseaplot_MF",pic_width=12,pic_height = 10)
#gsearank(ego3,1,title=ego3[1,"Description"])
############GSEA MF 模式end
#gsaKEGG基因富集分析
kk2 <- gseKEGG(geneList = geneList,organism = 'zma',pvalueCutoff = 0.05,verbose = FALSE)
write.csv(kk2,file="GSEA_KEGG.csv")
gseaplot2(kk2,geneSetID = 1:4,ES_geom = "dot",pvalue_table = TRUE)
out_img(filename="GSEA_KEGG",pic_width = 12,pic_height = 10)
ridgeplot(kk2)
out_img(filename="ridgeplot_GSEA_KEGG",pic_width = 12,pic_height = 12)