Load libraries

library(tidyverse)  # Data manipulation and visualization

library(Seurat)  # Single-cell RNA sequencing

library(patchwork)  # Combine plots

library(conflicted)  # Resolve function name conflicts

library(presto) # DE analysis for large datasets

Conflicted functions

# Set function preferences
conflicts_prefer(dplyr::select)

conflicts_prefer(dplyr::filter)

Prep

Suerat Object

# Define data directory
dataDir <- paste(getwd(), "data", "filtered_gene_bc_matrices", "hg19", sep = "/")

list.files(dataDir)
## [1] "barcodes.tsv" "genes.tsv"    "matrix.mtx"
# Load dataset
pbmc.data <- Read10X(dataDir)

# Initialize Seurat object 
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
pbmc
## An object of class Seurat 
## 13714 features across 2700 samples within 1 assay 
## Active assay: RNA (13714 features, 0 variable features)
##  1 layer present: counts

Data information

# sumber of genes (rows) by samples (columns)
dim(pbmc)
## [1] 13714  2700
# Sample names
head(rownames(pbmc))
## [1] "AL627309.1"    "AP006222.2"    "RP11-206L10.2" "RP11-206L10.9"
## [5] "LINC00115"     "NOC2L"
# Gene barcodes
head(colnames(pbmc))
## [1] "AAACATACAACCAC-1" "AAACATTGAGCTAC-1" "AAACATTGATCAGC-1" "AAACCGTGCTTCCG-1"
## [5] "AAACCGTGTATGCG-1" "AAACGCACTGGTAC-1"

Quality Control

Common QC metrics include:

  • The number of unique genes detected in each cell. - Very low gene counts = Low-quality cells or empty droplets - Very high gene counts = Doublets or multiplets

  • Total number of RNA molecules per cell

  • The percentage of mitochondrial genome reads - High precentage = Low-quality / dying cells

MT Genes

# View number of unique genes and total molecules from metadata slot
head(pbmc@meta.data, 5)
##                  orig.ident nCount_RNA nFeature_RNA
## AAACATACAACCAC-1     pbmc3k       2419          779
## AAACATTGAGCTAC-1     pbmc3k       4903         1352
## AAACATTGATCAGC-1     pbmc3k       3147         1129
## AAACCGTGCTTCCG-1     pbmc3k       2639          960
## AAACCGTGTATGCG-1     pbmc3k        980          521
# Calculate mitochondrial QC metrics and save in Seurat object
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")

# View new QC metric
head(pbmc@meta.data, 5)
##                  orig.ident nCount_RNA nFeature_RNA percent.mt
## AAACATACAACCAC-1     pbmc3k       2419          779  3.0177759
## AAACATTGAGCTAC-1     pbmc3k       4903         1352  3.7935958
## AAACATTGATCAGC-1     pbmc3k       3147         1129  0.8897363
## AAACCGTGCTTCCG-1     pbmc3k       2639          960  1.7430845
## AAACCGTGTATGCG-1     pbmc3k        980          521  1.2244898

Visualize QC

violin plot

# Vizualize QC metrics and cutoffs
CountPlot <- ggplot(pbmc@meta.data, aes("", nCount_RNA)) +
  geom_violin(fill = "pink") +
  geom_jitter(alpha = 0.2) +
  geom_hline(yintercept = c(200, 7000), color = c("red")) +
  labs(title = "Molecule Counts")

MTPlot <- ggplot(pbmc@meta.data, aes("", percent.mt)) +
  geom_violin(fill = "pink") +
  geom_jitter(alpha = 0.2) +
  geom_hline(yintercept = c(0.5, 5), color = c("red", "red")) +
  labs(title = "Percent Mitochondria")

FeaturePlot <- ggplot(pbmc@meta.data, aes("", nFeature_RNA)) +
  geom_violin(fill = "pink") + 
  geom_jitter(alpha = 0.2) +
  geom_hline(yintercept = c(300, 1700), color = c("red", "red")) +
  labs(title = "Unique Genes")

# Arrange plts
CountPlot + MTPlot + FeaturePlot + plot_layout(ncol = 3)

Scatterplot

CountMT <- ggplot(pbmc@meta.data, aes(nCount_RNA, percent.mt)) +
  geom_point() +
  geom_hline(yintercept = c(0.5, 5), color = c("red", "red")) +
  geom_vline(xintercept = c(200, 7000), color = c("red", "red"))

CountFeature <- ggplot(pbmc@meta.data, aes(nCount_RNA, nFeature_RNA)) +
  geom_point() + 
  geom_hline(yintercept = c(300, 1700), color = c("red", "red")) +
  geom_vline(xintercept = c(200, 7000), color = c("red", "red"))

CountMT + CountFeature + plot_layout(ncol = 2)

Apply QC

Keep cells with:

  • 300 - 1700 unique genes

  • 200 - 7000 total RNA molecules

  • 0.5 - 5% mitochondrial gene percentage

pbmc <- subset(pbmc, subset = nFeature_RNA > 300 & nFeature_RNA < 1700 & 
                 nCount_RNA > 200 & nCount_RNA < 7000 &
                 percent.mt > 0.5 & percent.mt < 5)

View post QC

# Violin plots
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, alpha = 0.2)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

# Scatter plots
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt",  pt.size = 2) + FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", pt.size = 2) + plot_layout(ncol = 2)

Normalize & Transform data

Normalize

Normalizes feature expression by total expression with “LogNormalize” method, multiply by a scaling factor (10,000 by default)

Normalized values are stored in pbmc[[“RNA”]]$data.

pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
## Normalizing layer: counts

Feature Selection

Stabalize variance

# Return 2,000 features per dataset
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
## Finding variable features for layer counts

Plot mean by variance

# top 20 most variable genes
top20 <- head(VariableFeatures(pbmc), 20)

# plot with labels
plot1 <- VariableFeaturePlot(pbmc)

LabelPoints(plot1, top20, 
            repel = TRUE, 
            xnudge = 0,
            ynudge = 0)
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.

Scale data

Before dimensionality reduction methods such as UMAP and PCA plots, use ScaleData() function to prevent domination by highly-expressed genes.
- Scale expression of each gene, so the variance is 1
- Shift expression of each gene, so the mean is 0

Results are stored in pbmc[[“RNA”]]$scale.data

all.genes <- rownames(pbmc)

pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix

Dimensionailty reduction

Principle Components

# Extract principle components & save into Seurat object
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))

# Save TSNE
pbmc <- RunTSNE(pbmc, features = VariableFeatures(object = pbmc))

# Save UMAP
pbmc <- RunUMAP(pbmc, features = VariableFeatures(object = pbmc))
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

Explore PC’s

VizDimReduction(), DimPlot(), and DimHeatmap()

# print top 3 PC's most significant genes
print(pbmc[["pca"]], dims = 1:3, nfeatures = 5)
## PC_ 1 
## Positive:  MALAT1, LTB, IL32, IL7R, CD2 
## Negative:  CST3, TYROBP, LST1, AIF1, FTL 
## PC_ 2 
## Positive:  NKG7, PRF1, GZMB, CST7, GZMA 
## Negative:  CD79A, MS4A1, TCL1A, HLA-DQA1, HLA-DQB1 
## PC_ 3 
## Positive:  IL7R, VIM, IL32, TMSB4X, S100A6 
## Negative:  CD79A, CD79B, HLA-DQA1, MS4A1, HLA-DQB1
# Plot top 3 PC's most significant genes 
VizDimLoadings(pbmc, dims = 1:3, nfeatures = 10, reduction = "pca", balanced = TRUE, ncol = 3)

Dim Down Plots

# Dimplot() will function for all these pots
# PCA plot 
PCAPlot(pbmc, reduction = "pca") + NoLegend() + labs(title="PCA")

# TSNE plot
TSNEPlot(pbmc, reduction = "tsne") + NoLegend() + labs(title="tSNE")

# UMAP
UMAPPlot(pbmc, reduction = "umap") + NoLegend() + labs(title="UMAP")

Heatmaps

DimHeatmap(pbmc, dims = c(1:4),  
           nfeatures = 10, cells = 500, ncol = 2,
           balanced = TRUE, reduction = "pca")

Determine Clusters

Elbowplot

This plot shows the variation (y) that can be described by a given number of PC’s (x). As the variation stabalizes, we can determine the number of principle components that acuratly describe most of the variation within a dataset. In this case i would choose 8-10 depending on my intentions for the analysis.

ElbowPlot(pbmc) + 
  geom_vline(xintercept = c(8, 9, 10), color = c("red", "blue", "green")) + 
  labs(title = "Elbow Plot")

Distance metric

Using the FindNeighbors() function:
- construct a KNN graph based on euclidean distances in PCA space
- refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity)

pbmc <- FindNeighbors(pbmc, dims = 1:9)
## Computing nearest neighbor graph
## Computing SNN

Cluster Cells

Using FindClusters() function:
- Calculate k-nearest neighbors and construct the SNN graph.
- Optimize the modularity function to determine clusters

A resolution parameter between 0.4-1.2 is ideal for ~3K single-cell data.
Optimal resolution increases with dataset.

pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2588
## Number of edges: 91404
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8745
## Number of communities: 8
## Elapsed time: 0 seconds
# View cluster assignments
head(Idents(pbmc), 5)
## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 AAACCGTGCTTCCG-1 
##                1                3                1                5 
## AAACCGTGTATGCG-1 
##                6 
## Levels: 0 1 2 3 4 5 6 7

Clustered Dimed Down

# Extract principle components & save into Seurat object
pbmc <- RunPCA(pbmc, dims = 1:9)

# PCA plot
PCAPlot(pbmc, reduction = "pca") + labs(title="Clustered PCA")

# Save TSNE
pbmc <- RunTSNE(pbmc, dims = 1:9)

# TSNE plot
TSNEPlot(pbmc, reduction = "tsne") + labs(title="Clustered tSNE")

# Run UMAP calculations with newly clustered cells
pbmc <- RunUMAP(pbmc, dims = 1:9)

# Plot UMAP
UMAPPlot(pbmc, reduction = "umap") + labs(title="Clustered UMAP")

Differential Gene Expression

Identify cluster features

# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2)

head(cluster2.markers, n = 5)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## S100A8  0.000000e+00   6.421435 0.978 0.129  0.000000e+00
## LGALS2  0.000000e+00   5.482369 0.908 0.066  0.000000e+00
## S100A9  0.000000e+00   5.970127 0.996 0.222  0.000000e+00
## FCN1   5.017886e-305   3.951801 0.954 0.156 6.881529e-301
## CD14   4.745812e-301   5.920096 0.695 0.030 6.508406e-297
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))

head(cluster5.markers, n = 5)
##                p_val avg_log2FC pct.1 pct.2     p_val_adj
## IFITM3 2.911025e-183   6.134180 0.941 0.048 3.992180e-179
## FCGR3A 4.842991e-182   6.760032 0.904 0.036 6.641677e-178
## CFD    1.154870e-178   5.983351 0.904 0.040 1.583789e-174
## CD68   1.561259e-172   5.362181 0.882 0.038 2.141110e-168
## FCER1G 5.473783e-171   6.460346 0.963 0.089 7.506746e-167
# find markers all cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
pbmc.markers |> 
  group_by(cluster) |> 
  filter(avg_log2FC > 1)
## # A tibble: 6,208 × 7
## # Groups:   cluster [8]
##        p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene     
##        <dbl>      <dbl> <dbl> <dbl>     <dbl> <fct>   <chr>    
##  1 1.55e-110       1.22 0.936 0.596 2.12e-106 0       LDHB     
##  2 4.00e-101       2.54 0.494 0.109 5.49e- 97 0       CCR7     
##  3 8.59e- 66       1.03 0.857 0.421 1.18e- 61 0       CD3D     
##  4 3.38e- 55       2.16 0.368 0.105 4.63e- 51 0       LEF1     
##  5 9.01e- 51       1.26 0.662 0.361 1.24e- 46 0       NOSIP    
##  6 7.58e- 49       1.58 0.477 0.185 1.04e- 44 0       PIK3IP1  
##  7 3.51e- 46       2.00 0.355 0.114 4.81e- 42 0       PRKCQ-AS1
##  8 2.75e- 45       2.79 0.218 0.041 3.77e- 41 0       FHIT     
##  9 5.52e- 43       1.02 0.618 0.297 7.56e- 39 0       CD7      
## 10 1.69e- 39       2.92 0.175 0.028 2.31e- 35 0       LINC00176
## # ℹ 6,198 more rows
# ROC test returns ‘classification power’ individual markers (0=random, to 1=perfect).
cluster0.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, 
                                test.use = "roc", only.pos = TRUE)

head(cluster0.markers)
##       myAUC  avg_diff power avg_log2FC pct.1 pct.2
## RPS6  0.830 0.4672260 0.660  0.6802275 1.000 0.997
## RPS12 0.830 0.4969190 0.660  0.7254151 1.000 0.994
## RPL32 0.826 0.4316360 0.652  0.6296283 0.998 0.997
## RPS27 0.824 0.4945441 0.648  0.7222220 0.998 0.993
## RPS14 0.813 0.4218663 0.626  0.6167222 1.000 0.996
## RPL31 0.810 0.5376612 0.620  0.7973715 1.000 0.965

View results

# Violin Plot
VlnPlot(pbmc, features = c("CD3E", "CD8A", "CCR7"))

# Plot raw counts as well
VlnPlot(pbmc, features = c("CD3E", "CD8A", "CCR7"), slot = "counts", log = TRUE)
## Warning: The `slot` argument of `VlnPlot()` is deprecated as of Seurat 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Single marker UMAPS
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
    "CD8A"))

Heatmap

top10 <- pbmc.markers |> 
  group_by(cluster) |> 
  filter(avg_log2FC > 1) |> 
  slice_head(n = 10) |> 
  ungroup()

DoHeatmap(pbmc, features = top10$gene) + NoLegend()

Widefield View

# Extract top 3 cluster identifying markers
ClusID <- top10 |> 
  select(cluster, gene) |>
  group_by(cluster) |> 
  slice_head(n=3) |> 
  ungroup()

# UMAP visualization 
FeaturePlot(pbmc, features = ClusID$gene, ncol = 3)

# Violin plots
VlnPlot(pbmc, features = ClusID$gene, ncol = 3)

# Ridgeplots
RidgePlot(pbmc, features = ClusID$gene, ncol = 3)

# Dotplot
DotPlot(pbmc, features =  ClusID$gene)

Interactive exploration

# # Plot something 
# plot <- FeaturePlot(pbmc, features = "CD8A")
# 
# # Run interactive plot
# HoverLocator(plot = plot, information = FetchData(pbmc, vars = c("ident", "PC_1", "nFeature_RNA")))

Label Clusters

# Save label names as characters
clusterIDs <- c("0 = Naive CD4 T", "1 = CD4 Mem?", 
                     "2 = Mono?", "3 = B", 
                     "4 = NK?", "5 = ?", 
                     "6 = ?", "7 = DC?")

# Assign cluster levels to names of ClusterID DF
names(clusterIDs) <- levels(pbmc)

# Rename Clusters
pbmc <- RenameIdents(pbmc, clusterIDs)

# Plot UMAP with labels
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 2, 
        label.size = 6, label.box = TRUE) + NoLegend()

Save Seurat Object

# saveRDS(pbmc, file = paste(getwd(), "output/pbmc.rds", sep = "/"))

Session Information

sessionInfo()
## R version 4.3.3 (2024-02-29 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] presto_1.0.0       data.table_1.15.4  Rcpp_1.0.12        conflicted_1.2.0  
##  [5] patchwork_1.2.0    Seurat_5.0.3       SeuratObject_5.0.1 sp_2.1-3          
##  [9] lubridate_1.9.3    forcats_1.0.0      stringr_1.5.1      dplyr_1.1.4       
## [13] purrr_1.0.2        readr_2.1.5        tidyr_1.3.1        tibble_3.2.1      
## [17] ggplot2_3.5.0      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.16.0      jsonlite_1.8.8        
##   [4] magrittr_2.0.3         ggbeeswarm_0.7.2       spatstat.utils_3.0-4  
##   [7] farver_2.1.1           rmarkdown_2.26         vctrs_0.6.5           
##  [10] ROCR_1.0-11            memoise_2.0.1          spatstat.explore_3.2-7
##  [13] htmltools_0.5.8.1      sass_0.4.9             sctransform_0.4.1     
##  [16] parallelly_1.37.1      KernSmooth_2.23-22     bslib_0.7.0           
##  [19] htmlwidgets_1.6.4      ica_1.0-3              plyr_1.8.9            
##  [22] plotly_4.10.4          zoo_1.8-12             cachem_1.0.8          
##  [25] igraph_2.0.3           mime_0.12              lifecycle_1.0.4       
##  [28] pkgconfig_2.0.3        Matrix_1.6-5           R6_2.5.1              
##  [31] fastmap_1.1.1          fitdistrplus_1.1-11    future_1.33.2         
##  [34] shiny_1.8.1.1          digest_0.6.35          colorspace_2.1-0      
##  [37] tensor_1.5             RSpectra_0.16-1        irlba_2.3.5.1         
##  [40] labeling_0.4.3         progressr_0.14.0       fansi_1.0.6           
##  [43] spatstat.sparse_3.0-3  timechange_0.3.0       httr_1.4.7            
##  [46] polyclip_1.10-6        abind_1.4-5            compiler_4.3.3        
##  [49] withr_3.0.0            fastDummies_1.7.3      highr_0.10            
##  [52] R.utils_2.12.3         MASS_7.3-60.0.1        tools_4.3.3           
##  [55] vipor_0.4.7            lmtest_0.9-40          beeswarm_0.4.0        
##  [58] httpuv_1.6.15          future.apply_1.11.2    goftest_1.2-3         
##  [61] R.oo_1.26.0            glue_1.7.0             nlme_3.1-164          
##  [64] promises_1.3.0         grid_4.3.3             Rtsne_0.17            
##  [67] cluster_2.1.6          reshape2_1.4.4         generics_0.1.3        
##  [70] gtable_0.3.4           spatstat.data_3.0-4    tzdb_0.4.0            
##  [73] R.methodsS3_1.8.2      hms_1.1.3              utf8_1.2.4            
##  [76] spatstat.geom_3.2-9    RcppAnnoy_0.0.22       ggrepel_0.9.5         
##  [79] RANN_2.6.1             pillar_1.9.0           limma_3.58.1          
##  [82] spam_2.10-0            RcppHNSW_0.6.0         later_1.3.2           
##  [85] splines_4.3.3          lattice_0.22-6         survival_3.5-8        
##  [88] deldir_2.0-4           tidyselect_1.2.1       miniUI_0.1.1.1        
##  [91] pbapply_1.7-2          knitr_1.46             gridExtra_2.3         
##  [94] scattermore_1.2        xfun_0.43              statmod_1.5.0         
##  [97] matrixStats_1.3.0      stringi_1.8.3          lazyeval_0.2.2        
## [100] yaml_2.3.8             evaluate_0.23          codetools_0.2-20      
## [103] cli_3.6.2              uwot_0.2.1             xtable_1.8-4          
## [106] reticulate_1.36.0      munsell_0.5.1          jquerylib_0.1.4       
## [109] globals_0.16.3         spatstat.random_3.2-3  png_0.1-8             
## [112] ggrastr_1.0.2          parallel_4.3.3         dotCall64_1.1-1       
## [115] listenv_0.9.1          viridisLite_0.4.2      scales_1.3.0          
## [118] ggridges_0.5.6         crayon_1.5.2           leiden_0.4.3.1        
## [121] rlang_1.1.3            cowplot_1.1.3