library(scater)
library(scran)
library(batchelor)
library(bluster)
library(tidyverse)
library(pheatmap)
library(BiocSingular)
We are loading a single sce
object containing the 7 Caron samples. The data has been processed as detailed in previous weeks and downsampled to 500 cells per sample for demonstration purposes.
all.sce <- readRDS("../Robjects/DataIntegration_all_sce_dimred.Rds")
all.sce
## class: SingleCellExperiment
## dim: 17700 3500
## metadata(7): Samples Samples ... Samples Samples
## assays(2): counts logcounts
## rownames(17700): ENSG00000241860 ENSG00000237491 ... ENSG00000275063
## ENSG00000278817
## rowData names(9): ID Symbol ... detected gene_sparsity
## colnames(3500): 1_AAACGGGCAGTTCATG-1 1_AAACGGGGTTCACCTC-1 ...
## 12_TTTGTCACAATGAAAC-1 12_TTTGTCATCAGTTGAC-1
## colData names(20): SampleName Barcode ... sizeFactor cell_sparsity
## reducedDimNames(3): PCA TSNE UMAP
## mainExpName: NULL
## altExpNames(0):
Before we actually perform any correction, it is worth examining whether there is any batch effect in this dataset.The PCA, TSNE and UMAP have already been run but we can do some quick clustering just so we can see things a bit better and this will be needed for the diagnostics later.
uncorrected.snn.gr <- buildSNNGraph(all.sce, use.dimred="PCA")
uncorrected.clusters <- igraph::cluster_walktrap(uncorrected.snn.gr)$membership
colLabels(all.sce) <- factor(uncorrected.clusters)
all.sce$Cluster <- factor(uncorrected.clusters)
uncorrected.tab <- table(Cluster=uncorrected.clusters, Batch=all.sce$SampleName)
uncorrected.tab
## Batch
## Cluster ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1b PBMMC_2
## 1 0 1 27 110 0 129
## 2 0 0 0 2 30 37
## 3 0 0 0 0 68 18
## 4 0 0 0 0 105 12
## 5 3 469 0 0 2 0
## 6 0 2 1 1 5 37
## 7 0 0 217 2 2 6
## 8 0 0 5 337 0 0
## 9 0 0 13 11 1 38
## 10 2 4 13 5 12 52
## 11 446 1 0 0 0 0
## 12 0 0 0 0 72 0
## 13 45 0 0 0 0 0
## 14 0 8 47 11 2 44
## 15 3 15 177 16 3 0
## 16 0 0 0 1 12 17
## 17 0 0 0 4 0 110
## 18 1 0 0 0 186 0
## Batch
## Cluster PBMMC_3
## 1 7
## 2 83
## 3 84
## 4 29
## 5 1
## 6 44
## 7 29
## 8 0
## 9 7
## 10 39
## 11 0
## 12 1
## 13 0
## 14 63
## 15 99
## 16 5
## 17 9
## 18 0
plotTSNE(all.sce, colour_by="SampleName")
Reminder: Of course, the other explanation for batch-specific clusters is that there are cell types that are unique to each batch.
quick.corrected.all <- quickCorrect(all.sce, batch = all.sce$SampleName)
quick.sce.all <- quick.corrected.all$corrected
quick.sce.all
## class: SingleCellExperiment
## dim: 17700 3500
## metadata(2): merge.info pca.info
## assays(1): reconstructed
## rownames(17700): ENSG00000241860 ENSG00000237491 ... ENSG00000275063
## ENSG00000278817
## rowData names(1): rotation
## colnames(3500): 1_AAACGGGCAGTTCATG-1 1_AAACGGGGTTCACCTC-1 ...
## 12_TTTGTCACAATGAAAC-1 12_TTTGTCATCAGTTGAC-1
## colData names(1): batch
## reducedDimNames(1): corrected
## mainExpName: NULL
## altExpNames(0):
set.seed(0010101011)
quick.sce.all <- runTSNE(quick.sce.all, dimred="corrected")
quick.sce.all$SampleName <- factor(all.sce$SampleName)
plotTSNE(quick.sce.all, colour_by="SampleName")
meta <- colData(all.sce) %>%
as.data.frame() %>%
DataFrame()
colData(quick.sce.all) <- meta
Get the cluster information
all.snn.gr <- buildSNNGraph(quick.sce.all, use.dimred="corrected", k=20)
clusters.all <- igraph::cluster_walktrap(all.snn.gr)$membership
colLabels(quick.sce.all) <- factor(clusters.all)
quick.sce.all$Cluster <- factor(clusters.all)
clusterTab.all <- data.frame(clusters=clusters.all, batch=quick.sce.all$SampleName, source=quick.sce.all$SampleGroup)
all.tab <- table(Cluster=clusters.all, Batch=quick.sce.all$SampleName)
all.tab
## Batch
## Cluster ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1b PBMMC_2
## 1 0 1 24 45 0 94
## 2 0 0 11 15 6 38
## 3 95 26 19 84 98 27
## 4 0 0 0 1 27 25
## 5 352 394 192 202 69 28
## 6 6 18 179 21 189 119
## 7 0 0 0 1 9 18
## 8 47 49 13 52 60 10
## 9 0 10 47 13 23 53
## 10 0 2 10 4 13 39
## 11 0 0 0 1 6 15
## 12 0 0 5 61 0 34
## Batch
## Cluster PBMMC_3
## 1 4
## 2 7
## 3 41
## 4 33
## 5 114
## 6 113
## 7 4
## 8 22
## 9 76
## 10 33
## 11 50
## 12 3
plotTSNE(quick.sce.all, colour_by="Cluster") +
facet_wrap(~colData(quick.sce.all)$SampleName)
ClusterInfo.all <- data.frame(Cluster=clusters.all, Batch=quick.sce.all$SampleName, source=quick.sce.all$SampleGroup) %>%
group_by(Cluster,Batch) %>%
summarise(cells = n())
## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.
ggplot(data=ClusterInfo.all, aes(x=Cluster,y=cells, fill=Batch)) +
theme(axis.text.x=element_text(angle = 90, hjust = 0)) +
geom_col() +
theme(legend.text = element_text(size = 7))
var <- clusterAbundanceVar(all.tab)
# Also printing the percentage of cells in each cluster in each batch:
percent <- t(t(all.tab)/colSums(all.tab)) * 100
df <- DataFrame(Batch=unclass(percent), var=var)
df[order(df$var, decreasing=TRUE),]
## DataFrame with 12 rows and 8 columns
## Batch.ETV6.RUNX1_1 Batch.ETV6.RUNX1_2 Batch.ETV6.RUNX1_3 Batch.ETV6.RUNX1_4
## <numeric> <numeric> <numeric> <numeric>
## 6 1.2 3.6 35.8 4.2
## 1 0.0 0.2 4.8 9.0
## 5 70.4 78.8 38.4 40.4
## 12 0.0 0.0 1.0 12.2
## 9 0.0 2.0 9.4 2.6
## ... ... ... ... ...
## 10 0.0 0.4 2.0 0.8
## 3 19.0 5.2 3.8 16.8
## 2 0.0 0.0 2.2 3.0
## 8 9.4 9.8 2.6 10.4
## 7 0.0 0.0 0.0 0.2
## Batch.PBMMC_1b Batch.PBMMC_2 Batch.PBMMC_3 var
## <numeric> <numeric> <numeric> <numeric>
## 6 37.8 23.8 22.6 2.23780
## 1 0.0 18.8 0.8 1.86119
## 5 13.8 5.6 22.8 1.48641
## 12 0.0 6.8 0.6 1.33824
## 9 4.6 10.6 15.2 1.19316
## ... ... ... ... ...
## 10 2.6 7.8 6.6 0.787030
## 3 19.6 5.4 8.2 0.658431
## 2 1.2 7.6 1.4 0.624413
## 8 12.0 2.0 4.4 0.557709
## 7 1.8 3.6 0.8 0.333864
Comparing the clustering pre and post correction
# PBMMC 1b
only_PBMMC1b <- all.sce[,all.sce$SampleName == "PBMMC_1b"]
tab.sample.PBMMC1b <- nestedClusters(ref=paste("before", colLabels(only_PBMMC1b)),
alt=paste("after", clusters.all[quick.sce.all$SampleName=="PBMMC_1b"]))
tab.sample.PBMMC1b$alt.mapping
## DataFrame with 10 rows and 2 columns
## max which
## <numeric> <character>
## after 10 0.846154 before 10
## after 11 1.000000 before 2
## after 2 0.833333 before 16
## after 3 0.591837 before 12
## after 4 0.888889 before 2
## after 5 0.739130 before 3
## after 6 0.973545 before 18
## after 7 0.777778 before 16
## after 8 1.000000 before 4
## after 9 0.739130 before 3
# ETV6-RUNX1_1
only_ETV6RUNX1_1 <- all.sce[,all.sce$SampleName == "ETV6-RUNX1_1"]
tab.sample.ETV6RUNX1_1 <- nestedClusters(ref=paste("before", colLabels(only_ETV6RUNX1_1)),
alt=paste("after", clusters.all[quick.sce.all$SampleName=="ETV6-RUNX1_1"]))
tab.sample.ETV6RUNX1_1$alt.mapping
## DataFrame with 4 rows and 2 columns
## max which
## <numeric> <character>
## after 3 0.989474 before 11
## after 5 0.991477 before 11
## after 6 0.500000 before 15
## after 8 0.936170 before 13
We can visualize this mapping for the samples.
# For the first sample:
heat.sample.1 <- pheatmap(tab.sample.PBMMC1b$proportions, cluster_row=FALSE, cluster_col=FALSE,
main="Sample 1 comparison", silent=TRUE)
# For the second sample:
heat.sample.2 <- pheatmap(tab.sample.ETV6RUNX1_1$proportions, cluster_row=FALSE, cluster_col=FALSE,
main="Sample 2 comparison", silent=TRUE)
gridExtra::grid.arrange(heat.sample.1[[4]], heat.sample.2[[4]])
ri.sample.1 <- pairwiseRand(clusters.all[quick.sce.all$SampleName=="PBMMC_1b"], colLabels(only_PBMMC1b), mode="index")
ri.sample.1
## [1] 0.7733491
ri.sample.2 <- pairwiseRand(clusters.all[quick.sce.all$SampleName=="ETV6-RUNX1_1"], colLabels(only_ETV6RUNX1_1), mode="index")
ri.sample.2
## [1] 0.4026121
ARI on per-cluster level
# For the first batch.
tab1 <- pairwiseRand(uncorrected.clusters[all.sce$SampleName=="PBMMC_1b"], clusters.all[quick.sce.all$SampleName=="PBMMC_1b"])
heat.1 <- pheatmap(tab1, cluster_row=FALSE, cluster_col=FALSE,
col=rev(viridis::magma(100)), main="PBMMC_1b probabilities", silent=TRUE)
# For the second batch.
tab2 <- pairwiseRand(uncorrected.clusters[all.sce$SampleName=="ETV6-RUNX1_1"], clusters.all[quick.sce.all$SampleName=="ETV6-RUNX1_1"])
heat.2 <- pheatmap(tab2, cluster_row=FALSE, cluster_col=FALSE,
col=rev(viridis::magma(100)), main="ETV6-RUNX1_1 probabilities", silent=TRUE)
gridExtra::grid.arrange(heat.1[[4]], heat.2[[4]])
lost variance
metadata(quick.sce.all)$merge.info$lost.var
## ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1b
## [1,] 0.023736513 0.0329204451 0.0000000000 0.0000000000 0.000000000
## [2,] 0.004040773 0.0055962398 0.0358095086 0.0000000000 0.000000000
## [3,] 0.003472898 0.0029937349 0.0035287466 0.0446873580 0.000000000
## [4,] 0.028163123 0.0282391485 0.0289088645 0.0102509911 0.055229461
## [5,] 0.003125929 0.0033802894 0.0018166287 0.0015178279 0.003190129
## [6,] 0.001151427 0.0009835561 0.0008556258 0.0006980316 0.001346453
## PBMMC_2 PBMMC_3
## [1,] 0.000000000 0.00000000
## [2,] 0.000000000 0.00000000
## [3,] 0.000000000 0.00000000
## [4,] 0.000000000 0.00000000
## [5,] 0.059650697 0.00000000
## [6,] 0.001062763 0.05640212
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] BiocSingular_1.10.0 pheatmap_1.0.12
## [3] forcats_0.5.1 stringr_1.4.0
## [5] dplyr_1.0.8 purrr_0.3.4
## [7] readr_2.1.2 tidyr_1.2.0
## [9] tibble_3.1.6 tidyverse_1.3.1
## [11] bluster_1.4.0 batchelor_1.10.0
## [13] scran_1.22.1 scater_1.22.0
## [15] ggplot2_3.3.5 scuttle_1.4.0
## [17] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
## [19] Biobase_2.54.0 GenomicRanges_1.46.1
## [21] GenomeInfoDb_1.30.1 IRanges_2.28.0
## [23] S4Vectors_0.32.3 BiocGenerics_0.40.0
## [25] MatrixGenerics_1.6.0 matrixStats_0.61.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 ggbeeswarm_0.6.0
## [3] colorspace_2.0-2 ellipsis_0.3.2
## [5] XVector_0.34.0 BiocNeighbors_1.12.0
## [7] fs_1.5.2 rstudioapi_0.13
## [9] farver_2.1.0 ggrepel_0.9.1
## [11] fansi_1.0.2 lubridate_1.8.0
## [13] xml2_1.3.3 sparseMatrixStats_1.6.0
## [15] knitr_1.37 jsonlite_1.7.3
## [17] ResidualMatrix_1.4.0 broom_0.7.12
## [19] cluster_2.1.2 dbplyr_2.1.1
## [21] compiler_4.1.2 httr_1.4.2
## [23] dqrng_0.3.0 backports_1.4.1
## [25] assertthat_0.2.1 Matrix_1.4-0
## [27] fastmap_1.1.0 limma_3.50.0
## [29] cli_3.1.1 htmltools_0.5.2
## [31] tools_4.1.2 rsvd_1.0.5
## [33] igraph_1.2.11 gtable_0.3.0
## [35] glue_1.6.1 GenomeInfoDbData_1.2.7
## [37] Rcpp_1.0.8 cellranger_1.1.0
## [39] jquerylib_0.1.4 vctrs_0.3.8
## [41] DelayedMatrixStats_1.16.0 xfun_0.29
## [43] beachmat_2.10.0 rvest_1.0.2
## [45] lifecycle_1.0.1 irlba_2.3.5
## [47] statmod_1.4.36 edgeR_3.36.0
## [49] zlibbioc_1.40.0 scales_1.1.1
## [51] hms_1.1.1 parallel_4.1.2
## [53] RColorBrewer_1.1-2 yaml_2.2.2
## [55] gridExtra_2.3 sass_0.4.0
## [57] stringi_1.7.6 highr_0.9
## [59] ScaledMatrix_1.2.0 BiocParallel_1.28.3
## [61] rlang_1.0.1 pkgconfig_2.0.3
## [63] bitops_1.0-7 evaluate_0.14
## [65] lattice_0.20-45 labeling_0.4.2
## [67] cowplot_1.1.1 tidyselect_1.1.1
## [69] magrittr_2.0.2 R6_2.5.1
## [71] generics_0.1.2 metapod_1.2.0
## [73] DelayedArray_0.20.0 DBI_1.1.2
## [75] pillar_1.7.0 haven_2.4.3
## [77] withr_2.4.3 RCurl_1.98-1.6
## [79] modelr_0.1.8 crayon_1.4.2
## [81] utf8_1.2.2 tzdb_0.2.0
## [83] rmarkdown_2.11 viridis_0.6.2
## [85] locfit_1.5-9.4 grid_4.1.2
## [87] readxl_1.3.1 reprex_2.0.1
## [89] digest_0.6.29 munsell_0.5.0
## [91] beeswarm_0.4.0 viridisLite_0.4.0
## [93] vipor_0.4.5 bslib_0.3.1