Cluster sweep has already been run on the QC’d, filtered, normalised and batch corrected Caron data. Please see the script ClusterSweep.R for details.
Here the clusterSweep
results object, cluster behaviour
metrics and the SCE object containing corrected data and the clusters
are loaded.
out <- readRDS("R_objects/clusterSweep.out.rds")
df <- read_tsv("R_objects/clusterSweep.metrics_df.tsv",
show_col_types = FALSE)
sce <- readRDS("R_objects/clusterSweep.sce.rds")
Cluster sweep was run with the following combinations of parameters:
out$parameters %>%
as.data.frame() %>%
select(`Cluster function`=cluster.fun, k) %>%
rownames_to_column("colData column") %>%
datatable(rownames=FALSE)
We will consider the number of clusters, the mean silhouette width and the sum of the within-cluster sum of squares as an initial assessment of cluster behaviour. To assess the behaviour of the clusterings we can plot these metrics against the k.
nclPlot <- ggplot(df, aes(x = k, y = num.clusters)) +
geom_line(aes(colour=cluster.fun), lwd=2) +
ylim(c(0, 75))
silPlot <- ggplot(df, aes(x = k, y = silhouette)) +
geom_line(aes(colour=cluster.fun), lwd=2)
wcssPlot <- ggplot(df, aes(x = k, y = wcss)) +
geom_line(aes(colour=cluster.fun))
nclPlot + silPlot + wcssPlot + plot_layout(guides = "collect")
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
df %>%
filter(num.clusters>=10 & num.clusters<=15)
## # A tibble: 7 × 5
## k cluster.fun num.clusters silhouette wcss
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 20 louvain 15 0.182 1154.
## 2 30 louvain 13 0.181 1192.
## 3 40 louvain 12 0.183 1201.
## 4 50 louvain 13 0.189 1172.
## 5 70 leiden 14 0.267 1178.
## 6 80 leiden 12 0.269 1267.
## 7 90 leiden 10 0.180 1479.
I am going to be plotting the UMAP a lot, but I want to plot the UMAP
generated from the batch corrected values - “UMAP_corrected”. So that I
can use plotUMAP
instead of having to use
plotReducedDim(sce, dimred = "UMAP_corrected", ...)
, I will
first rename the reduced dimensions slots.
reducedDim(sce, "UMAP_uncorrected") <- reducedDim(sce, "UMAP")
reducedDim(sce, "UMAP") <- reducedDim(sce, "UMAP_corrected")
We can now visualise our selected clusterings of interest on tSNE plots.
plotUMAP(sce, colour_by = "k.60_cluster.fun.leiden")
plotUMAP(sce, colour_by = "k.70_cluster.fun.leiden")
plotUMAP(sce, colour_by = "k.20_cluster.fun.louvain")
plotUMAP(sce, colour_by = "k.30_cluster.fun.louvain")
table(sce$k.60_cluster.fun.leiden)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 11880 3555 4582 1273 1863 609 471 1198 749 1293 95 1199 44
## 14 15 16
## 1450 110 90
table(sce$k.70_cluster.fun.leiden)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 12127 3416 5785 1275 1867 544 283 955 1292 93 1185 1457 96
## 14
## 86
table(sce$k.20_cluster.fun.louvain)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3000 3045 4970 1277 3215 4153 2364 733 1862 1606 1391 97 1213 90 1445
table(sce$k.30_cluster.fun.louvain)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 3032 3979 729 4756 3580 4219 2155 1861 1543 1850 97 1214 1446
plotSilhouette <- function(clusters, plotTitle){
silTab <- approxSilhouette(reducedDim(sce, "corrected"),
clusters=clusters) %>%
as.data.frame() %>%
mutate(closestCluster = ifelse(width > 0, cluster, other) %>%
factor())
plt <- ggplot(silTab, aes(x=cluster, y=width, colour=closestCluster)) +
ggbeeswarm::geom_quasirandom(method="smiley") +
ggtitle(plotTitle)
plt <- scater:::.resolve_plot_colours(plt,
silTab$closestCluster,
"closestCluster",
colour=TRUE)
plt
}
plotSilhouette(sce$k.60_cluster.fun.leiden, "Leiden k=60")
plotSilhouette(sce$k.70_cluster.fun.leiden, "Leiden k=70")
plotSilhouette(sce$k.20_cluster.fun.louvain, "Louvain k=20")
plotSilhouette(sce$k.30_cluster.fun.louvain, "Louvain k=30")
rownames(sce) <- uniquifyFeatureNames(rowData(sce)$ID, rowData(sce)$Symbol)
p1 <- plotUMAP(sce,
by_exprs_values = "logcounts",
colour_by = "MS4A1",
text_by = "k.60_cluster.fun.leiden")
p2 <- plotUMAP(sce,
by_exprs_values = "logcounts",
colour_by = "CD79A",
text_by = "k.60_cluster.fun.leiden")
p3 <- plotUMAP(sce,
by_exprs_values = "logcounts",
colour_by = "TNFRSF13C",
text_by = "k.60_cluster.fun.leiden")
p4 <- plotUMAP(sce, by_exprs_values = "logcounts",
colour_by = "BCL6",
text_by = "k.60_cluster.fun.leiden")
(p1 + p2) / (p3 + p4)
We can look at how cells move between clusters using
clustree
combined <- cbind(k.60=sce$k.60_cluster.fun.leiden,
k.70=sce$k.70_cluster.fun.leiden,
k.80=sce$k.80_cluster.fun.leiden)
library(clustree)
## Loading required package: ggraph
set.seed(1111)
clustree(combined, prefix="k.", edge_arrow=FALSE)
jacc.mat <- linkClustersMatrix(sce$k.20_cluster.fun.louvain, sce$k.60_cluster.fun.leiden)
rownames(jacc.mat) <- paste("Louvain", rownames(jacc.mat))
colnames(jacc.mat) <- paste("Leiden", colnames(jacc.mat))
pheatmap(jacc.mat, color=viridis::viridis(100), cluster_cols=FALSE, cluster_rows=FALSE)
From this Leiden with k of 60-70 looks reasonable. Let’s try k between 61 and 69.
p1 <- plotUMAP(sce,
colour_by = "k.60_cluster.fun.leiden",
text_by = "k.60_cluster.fun.leiden")
p2 <- plotSilhouette(sce$k.60_cluster.fun.leiden, "Leiden k=60")
p1 + p2
out.leiden <- clusterSweep(reducedDim(sce, "corrected"),
BLUSPARAM = NNGraphParam(),
k = as.integer(61:69),
cluster.fun = "leiden",
BPPARAM=BiocParallel::MulticoreParam(5))
saveRDS(out.leiden, "R_objects/clusterSweep.out.leiden.rds")
out.leiden <- readRDS("R_objects/clusterSweep.out.leiden.rds")
Add the new clusters to the single cell experiment object.
colData(sce) <- cbind(colData(sce), DataFrame(out.leiden$clusters))
Now let’s compare some of the new clusterings.
u1 <- plotUMAP(sce, colour_by = "k.60_cluster.fun.leiden")
s1 <- plotSilhouette(sce$k.60_cluster.fun.leiden, "Leiden k=60")
u2 <- plotUMAP(sce, colour_by = "k.64_cluster.fun.leiden")
s2 <- plotSilhouette(sce$k.64_cluster.fun.leiden, "Leiden k=64")
u3 <- plotUMAP(sce, colour_by = "k.68_cluster.fun.leiden")
s3 <- plotSilhouette(sce$k.68_cluster.fun.leiden, "Leiden k=68")
u4 <- plotUMAP(sce, colour_by = "k.70_cluster.fun.leiden")
s4 <- plotSilhouette(sce$k.70_cluster.fun.leiden, "Leiden k=70")
(u1 + s1) / (u2 + s2) / (u3 + s3) / (u4 + s4)
I am going to move forward with Leiden & k=64 for now.
colLabels(sce) <- sce$k.64_cluster.fun.leiden
Add the clusters and then run scoreMarkers
.
markers <- scoreMarkers(sce,
groups = factor(sce$label),
block = sce$SampleName)
I asked you to identify this cluster:
Which is made up of clusters 3 and 8.
plotUMAP(sce, colour_by = "label", text_by = "label")
Let’s have a look at the top marker genes for cluster 3:
topRanked <- markers[["3"]] %>%
as.data.frame() %>%
select(contains("cohen")) %>%
filter(rank.logFC.cohen <= 5) %>%
arrange(desc(mean.logFC.cohen))
topRanked
## mean.logFC.cohen min.logFC.cohen median.logFC.cohen max.logFC.cohen
## RPS27 2.2086427 0.58998004 2.4171121 3.737874
## RPS29 1.9671993 0.63356438 1.8396375 3.556583
## IFITM1 1.9396747 0.05599095 1.9633703 3.089170
## TMSB4X 1.8919933 -0.72792641 1.9473786 4.761244
## RPL34 1.8808054 0.56792810 1.8430121 3.328266
## RPL13 1.8693746 0.54146107 1.9311971 3.417350
## RPL32 1.8456450 0.23836304 1.9823056 3.390924
## RPL21 1.8403180 -0.61061113 1.8900659 3.379281
## BCL11B 1.7168354 0.42123029 1.9468263 2.389297
## CD3D 1.6390227 0.18242057 1.8901915 2.058136
## RPS18 1.6284166 -0.59104384 1.8787889 3.316480
## RPL10 1.6270615 0.20504308 1.7983478 3.223473
## RPS6 1.6160312 -0.32319207 1.6593600 3.492856
## TRAC 1.6080331 0.30120700 1.7223556 2.273590
## RPL3 1.5102086 -0.35079261 1.6350930 3.224181
## SKAP1 1.4700874 -0.29503735 1.6553207 2.273273
## RPLP2 1.4660823 0.37875512 1.3032786 2.672618
## MALAT1 1.4599524 -0.38938315 1.7572542 2.525748
## RPL41 1.3922771 -0.07119977 1.5547280 3.024773
## IL32 1.3763449 -0.48538067 1.5216902 2.143791
## PRKCH 1.3612573 -0.51526874 1.6775238 2.170600
## TMSB10 1.2779581 -0.56882152 1.1915941 4.097014
## RPL12 1.2762230 -0.62562787 1.3925349 3.395872
## MAML2 1.2499578 -0.13443010 1.2057313 2.200663
## LEF1 1.2474189 0.41805977 1.5182721 1.868711
## ARHGAP15 1.2450001 0.20889182 0.9297429 3.181594
## CD52 1.1692037 -0.59178906 1.2417303 2.617555
## RPS8 1.1332849 -0.62999583 1.0984547 2.591411
## B2M 1.0322528 -0.97975774 1.0403301 2.731195
## BACH2 0.7535566 -2.19283062 1.4309170 2.310046
## FOXP1 0.7432128 -0.46657689 0.5324600 2.425049
## rank.logFC.cohen
## RPS27 1
## RPS29 1
## IFITM1 2
## TMSB4X 1
## RPL34 4
## RPL13 2
## RPL32 1
## RPL21 1
## BCL11B 2
## CD3D 3
## RPS18 5
## RPL10 4
## RPS6 2
## TRAC 5
## RPL3 5
## SKAP1 4
## RPLP2 5
## MALAT1 2
## RPL41 4
## IL32 1
## PRKCH 4
## TMSB10 2
## RPL12 4
## MAML2 3
## LEF1 2
## ARHGAP15 1
## CD52 1
## RPS8 4
## B2M 3
## BACH2 2
## FOXP1 1
There’s lots of ribosomal genes at the top of this list. Let’s check the expression of a few:
plotExpression(sce, features = "RPS27", x = "label")
plotExpression(sce, features = "RPL34", x = "label")
plotExpression(sce, features = "RPL21", x = "label")
These aren’t really going to be useful as marker genes.
On the other hand, further down we do have CD3D, CD3E and TRAC, which are T-cell markers, and IL32, which is an indicator of T-cell activation.
Let’s take a look at these:
p1 <- plotUMAP(sce, colour_by = "CD3D", text_by = "label")
p2 <- plotUMAP(sce, colour_by = "CD3E", text_by = "label")
p3 <- plotUMAP(sce, colour_by = "TRAC", text_by = "label")
p4 <- plotUMAP(sce, colour_by = "IL32", text_by = "label")
(p1 + p2) / (p3 + p4)
p1 <- plotExpression(sce, features = "CD3D", x = "label")
p2 <- plotExpression(sce, features = "CD3E", x = "label")
p3 <- plotExpression(sce, features = "TRAC", x = "label")
p4 <- plotExpression(sce, features = "IL32", x = "label")
(p1 + p2) / (p3 + p4)
Let’s take look at cluster 8’s marker genes.
topRanked <- markers[["8"]] %>%
as.data.frame() %>%
select(contains("cohen")) %>%
filter(rank.logFC.cohen <= 5) %>%
arrange(desc(mean.logFC.cohen))
topRanked
## mean.logFC.cohen min.logFC.cohen median.logFC.cohen max.logFC.cohen
## CCL5 2.401978 1.77192076 2.410006 2.818365
## NKG7 2.098841 1.72211735 2.145472 2.459877
## PRKCH 2.090526 0.51526874 2.293531 3.206973
## SKAP1 1.889324 0.29503735 1.948614 2.706289
## IFITM1 1.766693 -0.05599095 1.623900 3.209413
## FYN 1.756517 0.61307783 1.644280 2.671027
## IL32 1.738158 0.48538067 1.837424 2.315852
## GZMA 1.733396 1.61789398 1.748401 1.883363
## SRGN 1.694160 -1.07934978 1.952375 2.890256
## LINC-PINT 1.672463 0.06092954 1.619500 3.342376
## B2M 1.672070 -0.51568655 1.751439 3.338142
## CST7 1.577076 1.23539115 1.586985 1.861315
## HCST 1.567094 0.46131597 1.701663 2.168690
## PTPRC 1.469947 0.19655093 1.431387 3.099784
## TMSB4X 1.434874 -0.86432276 1.528512 3.670008
## HLA-A 1.199578 0.23310078 1.036929 2.040352
## CD69 1.105932 -0.33383431 1.083895 2.181632
## MALAT1 1.024304 -0.52211766 1.113982 2.322428
## rank.logFC.cohen
## CCL5 1
## NKG7 1
## PRKCH 1
## SKAP1 1
## IFITM1 1
## FYN 4
## IL32 4
## GZMA 3
## SRGN 1
## LINC-PINT 1
## B2M 1
## CST7 2
## HCST 5
## PTPRC 4
## TMSB4X 1
## HLA-A 5
## CD69 5
## MALAT1 4
IL32, NKG7, GZMA and CD69 might suggest that these are activated T-cells.
p1 <- plotUMAP(sce, colour_by = "IL32", text_by = "label")
p2 <- plotUMAP(sce, colour_by = "NKG7", text_by = "label")
p3 <- plotUMAP(sce, colour_by = "GZMA", text_by = "label")
p4 <- plotUMAP(sce, colour_by = "CD69", text_by = "label")
(p1 + p2) / (p3 + p4)
sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /opt/miniforge3/envs/R/lib/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] clustree_0.5.1 ggraph_2.1.0
## [3] DT_0.32 lubridate_1.9.3
## [5] forcats_1.0.0 stringr_1.5.1
## [7] dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.5 tidyr_1.3.1
## [11] tibble_3.2.1 tidyverse_2.0.0
## [13] patchwork_1.2.0 pheatmap_1.0.12
## [15] igraph_2.0.2 cluster_2.1.6
## [17] bluster_1.12.0 scran_1.30.0
## [19] scater_1.30.1 ggplot2_3.5.0
## [21] scuttle_1.12.0 SingleCellExperiment_1.24.0
## [23] SummarizedExperiment_1.32.0 Biobase_2.62.0
## [25] GenomicRanges_1.54.1 GenomeInfoDb_1.38.1
## [27] IRanges_2.36.0 S4Vectors_0.40.2
## [29] BiocGenerics_0.48.1 MatrixGenerics_1.14.0
## [31] matrixStats_1.2.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.15.0
## [3] jsonlite_1.8.8 magrittr_2.0.3
## [5] ggbeeswarm_0.7.2 farver_2.1.1
## [7] rmarkdown_2.26 zlibbioc_1.48.0
## [9] vctrs_0.6.5 DelayedMatrixStats_1.24.0
## [11] RCurl_1.98-1.14 htmltools_0.5.7
## [13] S4Arrays_1.2.0 BiocNeighbors_1.20.0
## [15] SparseArray_1.2.2 sass_0.4.9
## [17] bslib_0.6.1 htmlwidgets_1.6.4
## [19] cachem_1.0.8 lifecycle_1.0.4
## [21] pkgconfig_2.0.3 rsvd_1.0.5
## [23] Matrix_1.6-5 R6_2.5.1
## [25] fastmap_1.1.1 GenomeInfoDbData_1.2.11
## [27] digest_0.6.35 colorspace_2.1-0
## [29] dqrng_0.3.2 irlba_2.3.5.1
## [31] crosstalk_1.2.1 beachmat_2.18.0
## [33] labeling_0.4.3 fansi_1.0.6
## [35] timechange_0.3.0 polyclip_1.10-6
## [37] abind_1.4-5 compiler_4.3.3
## [39] bit64_4.0.5 withr_3.0.0
## [41] backports_1.4.1 BiocParallel_1.36.0
## [43] viridis_0.6.5 highr_0.10
## [45] ggforce_0.4.2 MASS_7.3-60
## [47] DelayedArray_0.28.0 tools_4.3.3
## [49] vipor_0.4.7 beeswarm_0.4.0
## [51] glue_1.7.0 grid_4.3.3
## [53] checkmate_2.3.1 generics_0.1.3
## [55] gtable_0.3.4 tzdb_0.4.0
## [57] hms_1.1.3 BiocSingular_1.18.0
## [59] tidygraph_1.3.0 ScaledMatrix_1.10.0
## [61] metapod_1.10.0 utf8_1.2.4
## [63] XVector_0.42.0 ggrepel_0.9.5
## [65] pillar_1.9.0 vroom_1.6.5
## [67] limma_3.58.1 tweenr_2.0.3
## [69] lattice_0.22-5 bit_4.0.5
## [71] tidyselect_1.2.0 locfit_1.5-9.9
## [73] knitr_1.45 gridExtra_2.3
## [75] edgeR_4.0.2 xfun_0.42
## [77] graphlayouts_1.1.0 statmod_1.5.0
## [79] stringi_1.8.3 yaml_2.3.8
## [81] evaluate_0.23 codetools_0.2-19
## [83] cli_3.6.2 munsell_0.5.0
## [85] jquerylib_0.1.4 Rcpp_1.0.12
## [87] parallel_4.3.3 ellipsis_0.3.2
## [89] sparseMatrixStats_1.14.0 bitops_1.0-7
## [91] viridisLite_0.4.2 scales_1.3.0
## [93] crayon_1.5.2 rlang_1.1.3
## [95] cowplot_1.1.3