跳转至

上游数据下载地址 https://cf.10xgenomics.com/samples/cell/pbmc3k/pbmc3k_filtered_gene_bc_matrices.tar.gz 下载后,解压里面有个hg19文件夹。 放在E:/cm/R/scRNA-seq这个目录(也是程序运行目录)里。同时新建一个output文件夹,用于保存中间数据。产出的图片也保存在运行目录下。 官网地址 如果你无法访问官网,原因只可能是404. 2020.08.10 编写:bugs:Seurat包的pheatmap包可能有问题,使用ggsave无法获取到画图的图层,原因不明。所以所有热图都需要手动保存。

rm(list = ls())
library(BiocManager)
####单细胞测序分析包Seurat
#install.packages('Seurat')
library(Seurat)  #Seurat需要python,本地一定要安装配置python,不然会提醒你安装conda.
# 查看Seurat包的版本信息
packageVersion("Seurat")
#第一次运行,需要安装hdf5r这个包
#install.packages("hdf5r")
library(hdf5r)
library(ggplot2)
library(patchwork)
library(dplyr)
setwd('E:/cm/R/scRNA-seq/')
# Load the PBMC dataset
pbmc.data <- Read10X(data.dir = "./hg19/")
# Initialize the Seurat object with the raw (non-normalized data).
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc

标准的预处理流程

质控和选择细胞进行进一步分析

The [[ operator can add columns to object metadata. This is a great place to stash QC stats

提取以MT开头的,线粒体的基因
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
##### Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
ggsave("QC1.feature count.png",width=8,height = 5)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used 
#for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
ggsave("Qc2.png",width=8,height = 5)
#根据上面的两个图,设置过滤参数对nFeature_RNA和percent.mt进行过滤
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)

归一化数据

#pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- NormalizeData(pbmc) #默认参数等价于上面的参数设定

#识别高度可变的特征(特征选择)
`pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)`

# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)

##variant可视化,,同时给前十个添加标签
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

plot2
ggsave("3.standstardizedVariance.png",width=7,height = 6)

数据缩放,使所有变化的细胞设置为1

all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
#pbmc[["RNA"]]@scale.data
##PCA
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

查看前5个dimensions,每个取5个feature

print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)

可以查看所有的dimensions的数量

pbmc[["pca"]]

可视化pca的结果

##散点图可视化前2个dimensions
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
ggsave("4.PCA-1.png")
##散点可视化2个dimensions
DimPlot(pbmc,dims = 2:3,reduction = "pca")
ggsave("4.PCA-2.png")
##热图可视化不同数量的细胞的不同dimension的分布
####Dimheatmap生成的图片在R中,用ggsave或者jpeg无法获取到绘图层
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
#ggsave("4.PCA-3.png",plot=last_plot())
DimHeatmap(pbmc,dims = 2,cells = 1000,balanced = TRUE)
DimHeatmap(pbmc,dims = 1:20,cells = 500,balanced = TRUE)
#ggsave("4.PCA-4.png")

确定数据集的维数(耗费时间较长)

NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in ElbowPlot() can be used to reduce computation time

pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:20)
ggsave("5.JackStrawPlot.png")

ElbowPlot(pbmc)
ggsave("6.ElbowPlot.png")

综合4,5,6三个部分的结果对比分析,进行PCs的选择,根据4可以看出,在PC12有一个明显的分界,根据5看出在PC7有一个明显的分界. 根据6可以看出在10有一个明显的分界,一般情况建议尽量选择较大的PCs值,这批数据选择7-12都合适,我们选择10.

#细胞聚类cluster cells
pbmc <- FindNeighbors(pbmc, dims = 1:10)  ##此处设置的dims=1:10,就是根据上面的结果选择的10.
pbmc <- FindClusters(pbmc, resolution = 0.5)  ##设置为0.4-1.2之间,
head(Idents(pbmc), 5)

#运行非线性降维(UMAP / tSNE)
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunUMAP(pbmc, dims = 1:10)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "umap")
#可视化选择的PCs的分布(散点图)
DimPlot(pbmc, reduction = "umap",label = TRUE)
ggsave("7.UMAP.png")
saveRDS(pbmc, file = "./output/pbmc_tutorial.rds")

##使用tSNE进行降维
pbmc2 <- RunTSNE(pbmc,dims=1:10)
DimPlot(pbmc2,reduction = "tsne",label=TRUE)
ggsave("7.tSNE.png")
#查找差异表达的基因(集群生物标记)
# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)

cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc, features = c("MS4A1", "CD79A","CCR7"))
ggsave("8.vlnplot_1.png")
##相当于GSEA的可视化细胞群的分布
RidgePlot(pbmc,features = c("MS4A1","GNLY","CD3E"))
ggsave("8.RidgePlot.png")
##点状图可视化,特定细胞群
DotPlot(pbmc,features = c("MS4A1","CD3E","LYZ","CD8A","PPBP","GNLY"))
ggsave("8.dotplot.png")
# you can plot raw counts as well
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP","CD8A"))
ggsave("8.FeaturePlot_2.png")
##绘制热图前10个PCs,展示前20个基因,如果少于20个则展示所有
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
#ggsave("8.Dopheatmap.png")  (可能跟使用的pheatmap包有关,仍然无法正常导出图片)

##与已知的细胞类型相匹配
new.cluster.ids <- c("Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono", 
                     "NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
ggsave("9.Dimplot.png")

saveRDS(pbmc, file = "./output/pbmc3k_final.rds")

1.QC.feature count.png

2.Qc.png

3.standstardizedVariance.png

4.PCA-1.png

4.PCA-2.png

4.PCA-3.png

4.PCA-4.png

5.JackStrawPlot.png

6.ElbowPlot.png

7.UMAP.png 8.Dopheatmap.png

8.dotplot.png

8.FeaturePlot_2.png

8.RidgePlot.png

8.vlnplot_1.png

9.Dimplot.png

回到页面顶部