library(tidyverse) # Data manipulation and visualization
library(Seurat) # Single-cell RNA sequencing
library(patchwork) # Combine plots
library(conflicted) # Resolve function name conflicts
library(presto) # DE analysis for large datasets
# Set function preferences
conflicts_prefer(dplyr::select)
conflicts_prefer(dplyr::filter)
# Define data directory
dataDir <- paste(getwd(), "data", "filtered_gene_bc_matrices", "hg19", sep = "/")
list.files(dataDir)
## [1] "barcodes.tsv" "genes.tsv" "matrix.mtx"
# Load dataset
pbmc.data <- Read10X(dataDir)
# Initialize Seurat object
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
pbmc
## An object of class Seurat
## 13714 features across 2700 samples within 1 assay
## Active assay: RNA (13714 features, 0 variable features)
## 1 layer present: counts
# sumber of genes (rows) by samples (columns)
dim(pbmc)
## [1] 13714 2700
# Sample names
head(rownames(pbmc))
## [1] "AL627309.1" "AP006222.2" "RP11-206L10.2" "RP11-206L10.9"
## [5] "LINC00115" "NOC2L"
# Gene barcodes
head(colnames(pbmc))
## [1] "AAACATACAACCAC-1" "AAACATTGAGCTAC-1" "AAACATTGATCAGC-1" "AAACCGTGCTTCCG-1"
## [5] "AAACCGTGTATGCG-1" "AAACGCACTGGTAC-1"
Common QC metrics include:
The number of unique genes detected in each cell. - Very low gene counts = Low-quality cells or empty droplets - Very high gene counts = Doublets or multiplets
Total number of RNA molecules per cell
The percentage of mitochondrial genome reads - High precentage = Low-quality / dying cells
# View number of unique genes and total molecules from metadata slot
head(pbmc@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA
## AAACATACAACCAC-1 pbmc3k 2419 779
## AAACATTGAGCTAC-1 pbmc3k 4903 1352
## AAACATTGATCAGC-1 pbmc3k 3147 1129
## AAACCGTGCTTCCG-1 pbmc3k 2639 960
## AAACCGTGTATGCG-1 pbmc3k 980 521
# Calculate mitochondrial QC metrics and save in Seurat object
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# View new QC metric
head(pbmc@meta.data, 5)
## orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACATACAACCAC-1 pbmc3k 2419 779 3.0177759
## AAACATTGAGCTAC-1 pbmc3k 4903 1352 3.7935958
## AAACATTGATCAGC-1 pbmc3k 3147 1129 0.8897363
## AAACCGTGCTTCCG-1 pbmc3k 2639 960 1.7430845
## AAACCGTGTATGCG-1 pbmc3k 980 521 1.2244898
# Vizualize QC metrics and cutoffs
CountPlot <- ggplot(pbmc@meta.data, aes("", nCount_RNA)) +
geom_violin(fill = "pink") +
geom_jitter(alpha = 0.2) +
geom_hline(yintercept = c(200, 7000), color = c("red")) +
labs(title = "Molecule Counts")
MTPlot <- ggplot(pbmc@meta.data, aes("", percent.mt)) +
geom_violin(fill = "pink") +
geom_jitter(alpha = 0.2) +
geom_hline(yintercept = c(0.5, 5), color = c("red", "red")) +
labs(title = "Percent Mitochondria")
FeaturePlot <- ggplot(pbmc@meta.data, aes("", nFeature_RNA)) +
geom_violin(fill = "pink") +
geom_jitter(alpha = 0.2) +
geom_hline(yintercept = c(300, 1700), color = c("red", "red")) +
labs(title = "Unique Genes")
# Arrange plts
CountPlot + MTPlot + FeaturePlot + plot_layout(ncol = 3)
CountMT <- ggplot(pbmc@meta.data, aes(nCount_RNA, percent.mt)) +
geom_point() +
geom_hline(yintercept = c(0.5, 5), color = c("red", "red")) +
geom_vline(xintercept = c(200, 7000), color = c("red", "red"))
CountFeature <- ggplot(pbmc@meta.data, aes(nCount_RNA, nFeature_RNA)) +
geom_point() +
geom_hline(yintercept = c(300, 1700), color = c("red", "red")) +
geom_vline(xintercept = c(200, 7000), color = c("red", "red"))
CountMT + CountFeature + plot_layout(ncol = 2)
Keep cells with:
300 - 1700 unique genes
200 - 7000 total RNA molecules
0.5 - 5% mitochondrial gene percentage
pbmc <- subset(pbmc, subset = nFeature_RNA > 300 & nFeature_RNA < 1700 &
nCount_RNA > 200 & nCount_RNA < 7000 &
percent.mt > 0.5 & percent.mt < 5)
# Violin plots
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, alpha = 0.2)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
# Scatter plots
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt", pt.size = 2) + FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", pt.size = 2) + plot_layout(ncol = 2)
Normalizes feature expression by total expression with “LogNormalize” method, multiply by a scaling factor (10,000 by default)
Normalized values are stored in pbmc[[“RNA”]]$data.
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts
# Return 2,000 features per dataset
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts
# top 20 most variable genes
top20 <- head(VariableFeatures(pbmc), 20)
# plot with labels
plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot1, top20,
repel = TRUE,
xnudge = 0,
ynudge = 0)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
Before dimensionality reduction methods such as UMAP and PCA plots,
use ScaleData() function to prevent domination by highly-expressed
genes.
- Scale expression of each gene, so the variance is 1
- Shift expression of each gene, so the mean is 0
Results are stored in pbmc[[“RNA”]]$scale.data
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
# Extract principle components & save into Seurat object
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
# Save TSNE
pbmc <- RunTSNE(pbmc, features = VariableFeatures(object = pbmc))
# Save UMAP
pbmc <- RunUMAP(pbmc, features = VariableFeatures(object = pbmc))
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
VizDimReduction(), DimPlot(), and DimHeatmap()
# print top 3 PC's most significant genes
print(pbmc[["pca"]], dims = 1:3, nfeatures = 5)
## PC_ 1
## Positive: MALAT1, LTB, IL32, IL7R, CD2
## Negative: CST3, TYROBP, LST1, AIF1, FTL
## PC_ 2
## Positive: NKG7, PRF1, GZMB, CST7, GZMA
## Negative: CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1
## PC_ 3
## Positive: IL7R, VIM, IL32, TMSB4X, S100A6
## Negative: CD79A, CD79B, HLA-DQA1, MS4A1, HLA-DQB1
# Plot top 3 PC's most significant genes
VizDimLoadings(pbmc, dims = 1:3, nfeatures = 10, reduction = "pca", balanced = TRUE, ncol = 3)
# Dimplot() will function for all these pots
# PCA plot
PCAPlot(pbmc, reduction = "pca") + NoLegend() + labs(title="PCA")
# TSNE plot
TSNEPlot(pbmc, reduction = "tsne") + NoLegend() + labs(title="tSNE")
# UMAP
UMAPPlot(pbmc, reduction = "umap") + NoLegend() + labs(title="UMAP")
DimHeatmap(pbmc, dims = c(1:4),
nfeatures = 10, cells = 500, ncol = 2,
balanced = TRUE, reduction = "pca")
This plot shows the variation (y) that can be described by a given number of PC’s (x). As the variation stabalizes, we can determine the number of principle components that acuratly describe most of the variation within a dataset. In this case i would choose 8-10 depending on my intentions for the analysis.
ElbowPlot(pbmc) +
geom_vline(xintercept = c(8, 9, 10), color = c("red", "blue", "green")) +
labs(title = "Elbow Plot")
Using the FindNeighbors() function:
- construct a KNN graph based on euclidean distances in PCA space
- refine the edge weights between any two cells based on the shared
overlap in their local neighborhoods (Jaccard similarity)
pbmc <- FindNeighbors(pbmc, dims = 1:9)
## Computing nearest neighbor graph
## Computing SNN
Using FindClusters() function:
- Calculate k-nearest neighbors and construct the SNN graph.
- Optimize the modularity function to determine clusters
A resolution parameter between 0.4-1.2 is ideal for ~3K single-cell
data.
Optimal resolution increases with dataset.
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2588
## Number of edges: 91404
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8745
## Number of communities: 8
## Elapsed time: 0 seconds
# View cluster assignments
head(Idents(pbmc), 5)
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1
## 1 3 1 5
## AAACCGTGTATGCG-1
## 6
## Levels: 0 1 2 3 4 5 6 7
# Extract principle components & save into Seurat object
pbmc <- RunPCA(pbmc, dims = 1:9)
# PCA plot
PCAPlot(pbmc, reduction = "pca") + labs(title="Clustered PCA")
# Save TSNE
pbmc <- RunTSNE(pbmc, dims = 1:9)
# TSNE plot
TSNEPlot(pbmc, reduction = "tsne") + labs(title="Clustered tSNE")
# Run UMAP calculations with newly clustered cells
pbmc <- RunUMAP(pbmc, dims = 1:9)
# Plot UMAP
UMAPPlot(pbmc, reduction = "umap") + labs(title="Clustered UMAP")
# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2)
head(cluster2.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## S100A8 0.000000e+00 6.421435 0.978 0.129 0.000000e+00
## LGALS2 0.000000e+00 5.482369 0.908 0.066 0.000000e+00
## S100A9 0.000000e+00 5.970127 0.996 0.222 0.000000e+00
## FCN1 5.017886e-305 3.951801 0.954 0.156 6.881529e-301
## CD14 4.745812e-301 5.920096 0.695 0.030 6.508406e-297
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))
head(cluster5.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## IFITM3 2.911025e-183 6.134180 0.941 0.048 3.992180e-179
## FCGR3A 4.842991e-182 6.760032 0.904 0.036 6.641677e-178
## CFD 1.154870e-178 5.983351 0.904 0.040 1.583789e-174
## CD68 1.561259e-172 5.362181 0.882 0.038 2.141110e-168
## FCER1G 5.473783e-171 6.460346 0.963 0.089 7.506746e-167
# find markers all cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
pbmc.markers |>
group_by(cluster) |>
filter(avg_log2FC > 1)
## # A tibble: 6,208 × 7
## # Groups: cluster [8]
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 1.55e-110 1.22 0.936 0.596 2.12e-106 0 LDHB
## 2 4.00e-101 2.54 0.494 0.109 5.49e- 97 0 CCR7
## 3 8.59e- 66 1.03 0.857 0.421 1.18e- 61 0 CD3D
## 4 3.38e- 55 2.16 0.368 0.105 4.63e- 51 0 LEF1
## 5 9.01e- 51 1.26 0.662 0.361 1.24e- 46 0 NOSIP
## 6 7.58e- 49 1.58 0.477 0.185 1.04e- 44 0 PIK3IP1
## 7 3.51e- 46 2.00 0.355 0.114 4.81e- 42 0 PRKCQ-AS1
## 8 2.75e- 45 2.79 0.218 0.041 3.77e- 41 0 FHIT
## 9 5.52e- 43 1.02 0.618 0.297 7.56e- 39 0 CD7
## 10 1.69e- 39 2.92 0.175 0.028 2.31e- 35 0 LINC00176
## # ℹ 6,198 more rows
# ROC test returns ‘classification power’ individual markers (0=random, to 1=perfect).
cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25,
test.use = "roc", only.pos = TRUE)
head(cluster0.markers)
## myAUC avg_diff power avg_log2FC pct.1 pct.2
## RPS6 0.830 0.4672260 0.660 0.6802275 1.000 0.997
## RPS12 0.830 0.4969190 0.660 0.7254151 1.000 0.994
## RPL32 0.826 0.4316360 0.652 0.6296283 0.998 0.997
## RPS27 0.824 0.4945441 0.648 0.7222220 0.998 0.993
## RPS14 0.813 0.4218663 0.626 0.6167222 1.000 0.996
## RPL31 0.810 0.5376612 0.620 0.7973715 1.000 0.965
# Violin Plot
VlnPlot(pbmc, features = c("CD3E", "CD8A", "CCR7"))
# Plot raw counts as well
VlnPlot(pbmc, features = c("CD3E", "CD8A", "CCR7"), slot = "counts", log = TRUE)
## Warning: The `slot` argument of `VlnPlot()` is deprecated as of Seurat 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Single marker UMAPS
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
top10 <- pbmc.markers |>
group_by(cluster) |>
filter(avg_log2FC > 1) |>
slice_head(n = 10) |>
ungroup()
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
# Extract top 3 cluster identifying markers
ClusID <- top10 |>
select(cluster, gene) |>
group_by(cluster) |>
slice_head(n=3) |>
ungroup()
# UMAP visualization
FeaturePlot(pbmc, features = ClusID$gene, ncol = 3)
# Violin plots
VlnPlot(pbmc, features = ClusID$gene, ncol = 3)
# Ridgeplots
RidgePlot(pbmc, features = ClusID$gene, ncol = 3)
# Dotplot
DotPlot(pbmc, features = ClusID$gene)
# # Plot something
# plot <- FeaturePlot(pbmc, features = "CD8A")
#
# # Run interactive plot
# HoverLocator(plot = plot, information = FetchData(pbmc, vars = c("ident", "PC_1", "nFeature_RNA")))
# Save label names as characters
clusterIDs <- c("0 = Naive CD4 T", "1 = CD4 Mem?",
"2 = Mono?", "3 = B",
"4 = NK?", "5 = ?",
"6 = ?", "7 = DC?")
# Assign cluster levels to names of ClusterID DF
names(clusterIDs) <- levels(pbmc)
# Rename Clusters
pbmc <- RenameIdents(pbmc, clusterIDs)
# Plot UMAP with labels
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 2,
label.size = 6, label.box = TRUE) + NoLegend()
# saveRDS(pbmc, file = paste(getwd(), "output/pbmc.rds", sep = "/"))
sessionInfo()
## R version 4.3.3 (2024-02-29 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] presto_1.0.0 data.table_1.15.4 Rcpp_1.0.12 conflicted_1.2.0
## [5] patchwork_1.2.0 Seurat_5.0.3 SeuratObject_5.0.1 sp_2.1-3
## [9] lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [13] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1 tibble_3.2.1
## [17] ggplot2_3.5.0 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.16.0 jsonlite_1.8.8
## [4] magrittr_2.0.3 ggbeeswarm_0.7.2 spatstat.utils_3.0-4
## [7] farver_2.1.1 rmarkdown_2.26 vctrs_0.6.5
## [10] ROCR_1.0-11 memoise_2.0.1 spatstat.explore_3.2-7
## [13] htmltools_0.5.8.1 sass_0.4.9 sctransform_0.4.1
## [16] parallelly_1.37.1 KernSmooth_2.23-22 bslib_0.7.0
## [19] htmlwidgets_1.6.4 ica_1.0-3 plyr_1.8.9
## [22] plotly_4.10.4 zoo_1.8-12 cachem_1.0.8
## [25] igraph_2.0.3 mime_0.12 lifecycle_1.0.4
## [28] pkgconfig_2.0.3 Matrix_1.6-5 R6_2.5.1
## [31] fastmap_1.1.1 fitdistrplus_1.1-11 future_1.33.2
## [34] shiny_1.8.1.1 digest_0.6.35 colorspace_2.1-0
## [37] tensor_1.5 RSpectra_0.16-1 irlba_2.3.5.1
## [40] labeling_0.4.3 progressr_0.14.0 fansi_1.0.6
## [43] spatstat.sparse_3.0-3 timechange_0.3.0 httr_1.4.7
## [46] polyclip_1.10-6 abind_1.4-5 compiler_4.3.3
## [49] withr_3.0.0 fastDummies_1.7.3 highr_0.10
## [52] R.utils_2.12.3 MASS_7.3-60.0.1 tools_4.3.3
## [55] vipor_0.4.7 lmtest_0.9-40 beeswarm_0.4.0
## [58] httpuv_1.6.15 future.apply_1.11.2 goftest_1.2-3
## [61] R.oo_1.26.0 glue_1.7.0 nlme_3.1-164
## [64] promises_1.3.0 grid_4.3.3 Rtsne_0.17
## [67] cluster_2.1.6 reshape2_1.4.4 generics_0.1.3
## [70] gtable_0.3.4 spatstat.data_3.0-4 tzdb_0.4.0
## [73] R.methodsS3_1.8.2 hms_1.1.3 utf8_1.2.4
## [76] spatstat.geom_3.2-9 RcppAnnoy_0.0.22 ggrepel_0.9.5
## [79] RANN_2.6.1 pillar_1.9.0 limma_3.58.1
## [82] spam_2.10-0 RcppHNSW_0.6.0 later_1.3.2
## [85] splines_4.3.3 lattice_0.22-6 survival_3.5-8
## [88] deldir_2.0-4 tidyselect_1.2.1 miniUI_0.1.1.1
## [91] pbapply_1.7-2 knitr_1.46 gridExtra_2.3
## [94] scattermore_1.2 xfun_0.43 statmod_1.5.0
## [97] matrixStats_1.3.0 stringi_1.8.3 lazyeval_0.2.2
## [100] yaml_2.3.8 evaluate_0.23 codetools_0.2-20
## [103] cli_3.6.2 uwot_0.2.1 xtable_1.8-4
## [106] reticulate_1.36.0 munsell_0.5.1 jquerylib_0.1.4
## [109] globals_0.16.3 spatstat.random_3.2-3 png_0.1-8
## [112] ggrastr_1.0.2 parallel_4.3.3 dotCall64_1.1-1
## [115] listenv_0.9.1 viridisLite_0.4.2 scales_1.3.0
## [118] ggridges_0.5.6 crayon_1.5.2 leiden_0.4.3.1
## [121] rlang_1.1.3 cowplot_1.1.3