1 Data integration - PBMMC and ETV6-RUNX samples

library(scater)
library(scran)
library(batchelor)
library(bluster)
library(tidyverse)
library(pheatmap)
library(BiocSingular)

1.1 Load the data

We are loading a single sce object containing the 7 Caron samples. The data has been processed as detailed in previous weeks and downsampled to 500 cells per sample for demonstration purposes.

all.sce <- readRDS("../Robjects/DataIntegration_all_sce_dimred.Rds")
all.sce

## class: SingleCellExperiment 
## dim: 17700 3500 
## metadata(7): Samples Samples ... Samples Samples
## assays(2): counts logcounts
## rownames(17700): ENSG00000241860 ENSG00000237491 ... ENSG00000275063
##   ENSG00000278817
## rowData names(9): ID Symbol ... detected gene_sparsity
## colnames(3500): 1_AAACGGGCAGTTCATG-1 1_AAACGGGGTTCACCTC-1 ...
##   12_TTTGTCACAATGAAAC-1 12_TTTGTCATCAGTTGAC-1
## colData names(20): SampleName Barcode ... sizeFactor cell_sparsity
## reducedDimNames(3): PCA TSNE UMAP
## mainExpName: NULL
## altExpNames(0):

1.2 View the uncorrected data

Before we actually perform any correction, it is worth examining whether there is any batch effect in this dataset.The PCA, TSNE and UMAP have already been run but we can do some quick clustering just so we can see things a bit better and this will be needed for the diagnostics later.

uncorrected.snn.gr <- buildSNNGraph(all.sce, use.dimred="PCA")
uncorrected.clusters <- igraph::cluster_walktrap(uncorrected.snn.gr)$membership
colLabels(all.sce) <- factor(uncorrected.clusters)
all.sce$Cluster <- factor(uncorrected.clusters)
uncorrected.tab <- table(Cluster=uncorrected.clusters, Batch=all.sce$SampleName)
uncorrected.tab

##        Batch
## Cluster ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1b PBMMC_2
##      1             0            1           27          110        0     129
##      2             0            0            0            2       30      37
##      3             0            0            0            0       68      18
##      4             0            0            0            0      105      12
##      5             3          469            0            0        2       0
##      6             0            2            1            1        5      37
##      7             0            0          217            2        2       6
##      8             0            0            5          337        0       0
##      9             0            0           13           11        1      38
##      10            2            4           13            5       12      52
##      11          446            1            0            0        0       0
##      12            0            0            0            0       72       0
##      13           45            0            0            0        0       0
##      14            0            8           47           11        2      44
##      15            3           15          177           16        3       0
##      16            0            0            0            1       12      17
##      17            0            0            0            4        0     110
##      18            1            0            0            0      186       0
##        Batch
## Cluster PBMMC_3
##      1        7
##      2       83
##      3       84
##      4       29
##      5        1
##      6       44
##      7       29
##      8        0
##      9        7
##      10      39
##      11       0
##      12       1
##      13       0
##      14      63
##      15      99
##      16       5
##      17       9
##      18       0

plotTSNE(all.sce, colour_by="SampleName")

Reminder: Of course, the other explanation for batch-specific clusters is that there are cell types that are unique to each batch.

1.3 Correct the data - quickCorrect wrapper

quick.corrected.all <- quickCorrect(all.sce, batch = all.sce$SampleName)

quick.sce.all <- quick.corrected.all$corrected

quick.sce.all

## class: SingleCellExperiment 
## dim: 17700 3500 
## metadata(2): merge.info pca.info
## assays(1): reconstructed
## rownames(17700): ENSG00000241860 ENSG00000237491 ... ENSG00000275063
##   ENSG00000278817
## rowData names(1): rotation
## colnames(3500): 1_AAACGGGCAGTTCATG-1 1_AAACGGGGTTCACCTC-1 ...
##   12_TTTGTCACAATGAAAC-1 12_TTTGTCATCAGTTGAC-1
## colData names(1): batch
## reducedDimNames(1): corrected
## mainExpName: NULL
## altExpNames(0):

1.4 Visualise the corrected data

set.seed(0010101011)
quick.sce.all <- runTSNE(quick.sce.all, dimred="corrected")
quick.sce.all$SampleName <- factor(all.sce$SampleName)
plotTSNE(quick.sce.all, colour_by="SampleName")

1.5 Retrieve the metadata

meta <- colData(all.sce) %>%
  as.data.frame() %>%
  DataFrame()

colData(quick.sce.all) <- meta

1.6 Correction Diagnostics

1.6.1 Mixing Between Batches

Get the cluster information

all.snn.gr <- buildSNNGraph(quick.sce.all, use.dimred="corrected", k=20)
clusters.all <- igraph::cluster_walktrap(all.snn.gr)$membership
colLabels(quick.sce.all) <- factor(clusters.all)
quick.sce.all$Cluster <- factor(clusters.all)

clusterTab.all <- data.frame(clusters=clusters.all, batch=quick.sce.all$SampleName, source=quick.sce.all$SampleGroup)

all.tab <- table(Cluster=clusters.all, Batch=quick.sce.all$SampleName)
all.tab

##        Batch
## Cluster ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1b PBMMC_2
##      1             0            1           24           45        0      94
##      2             0            0           11           15        6      38
##      3            95           26           19           84       98      27
##      4             0            0            0            1       27      25
##      5           352          394          192          202       69      28
##      6             6           18          179           21      189     119
##      7             0            0            0            1        9      18
##      8            47           49           13           52       60      10
##      9             0           10           47           13       23      53
##      10            0            2           10            4       13      39
##      11            0            0            0            1        6      15
##      12            0            0            5           61        0      34
##        Batch
## Cluster PBMMC_3
##      1        4
##      2        7
##      3       41
##      4       33
##      5      114
##      6      113
##      7        4
##      8       22
##      9       76
##      10      33
##      11      50
##      12       3

plotTSNE(quick.sce.all, colour_by="Cluster") + 
  facet_wrap(~colData(quick.sce.all)$SampleName)

ClusterInfo.all <- data.frame(Cluster=clusters.all, Batch=quick.sce.all$SampleName, source=quick.sce.all$SampleGroup) %>%
  group_by(Cluster,Batch) %>%
  summarise(cells = n())

## `summarise()` has grouped output by 'Cluster'. You can override using the
## `.groups` argument.

ggplot(data=ClusterInfo.all, aes(x=Cluster,y=cells, fill=Batch)) +
    theme(axis.text.x=element_text(angle = 90, hjust = 0)) +
    geom_col() +
    theme(legend.text = element_text(size = 7))

var <- clusterAbundanceVar(all.tab)

# Also printing the percentage of cells in each cluster in each batch:
percent <- t(t(all.tab)/colSums(all.tab)) * 100 
df <- DataFrame(Batch=unclass(percent), var=var)
df[order(df$var, decreasing=TRUE),]

## DataFrame with 12 rows and 8 columns
##     Batch.ETV6.RUNX1_1 Batch.ETV6.RUNX1_2 Batch.ETV6.RUNX1_3 Batch.ETV6.RUNX1_4
##              <numeric>          <numeric>          <numeric>          <numeric>
## 6                  1.2                3.6               35.8                4.2
## 1                  0.0                0.2                4.8                9.0
## 5                 70.4               78.8               38.4               40.4
## 12                 0.0                0.0                1.0               12.2
## 9                  0.0                2.0                9.4                2.6
## ...                ...                ...                ...                ...
## 10                 0.0                0.4                2.0                0.8
## 3                 19.0                5.2                3.8               16.8
## 2                  0.0                0.0                2.2                3.0
## 8                  9.4                9.8                2.6               10.4
## 7                  0.0                0.0                0.0                0.2
##     Batch.PBMMC_1b Batch.PBMMC_2 Batch.PBMMC_3       var
##          <numeric>     <numeric>     <numeric> <numeric>
## 6             37.8          23.8          22.6   2.23780
## 1              0.0          18.8           0.8   1.86119
## 5             13.8           5.6          22.8   1.48641
## 12             0.0           6.8           0.6   1.33824
## 9              4.6          10.6          15.2   1.19316
## ...            ...           ...           ...       ...
## 10             2.6           7.8           6.6  0.787030
## 3             19.6           5.4           8.2  0.658431
## 2              1.2           7.6           1.4  0.624413
## 8             12.0           2.0           4.4  0.557709
## 7              1.8           3.6           0.8  0.333864

1.6.2 Preserving Biological Heterogeneity

Comparing the clustering pre and post correction

# PBMMC 1b

only_PBMMC1b <- all.sce[,all.sce$SampleName == "PBMMC_1b"]

tab.sample.PBMMC1b <- nestedClusters(ref=paste("before", colLabels(only_PBMMC1b)),
    alt=paste("after", clusters.all[quick.sce.all$SampleName=="PBMMC_1b"]))
tab.sample.PBMMC1b$alt.mapping

## DataFrame with 10 rows and 2 columns
##                max       which
##          <numeric> <character>
## after 10  0.846154   before 10
## after 11  1.000000    before 2
## after 2   0.833333   before 16
## after 3   0.591837   before 12
## after 4   0.888889    before 2
## after 5   0.739130    before 3
## after 6   0.973545   before 18
## after 7   0.777778   before 16
## after 8   1.000000    before 4
## after 9   0.739130    before 3

# ETV6-RUNX1_1

only_ETV6RUNX1_1 <- all.sce[,all.sce$SampleName == "ETV6-RUNX1_1"]

tab.sample.ETV6RUNX1_1 <- nestedClusters(ref=paste("before", colLabels(only_ETV6RUNX1_1)),
    alt=paste("after", clusters.all[quick.sce.all$SampleName=="ETV6-RUNX1_1"]))
tab.sample.ETV6RUNX1_1$alt.mapping

## DataFrame with 4 rows and 2 columns
##               max       which
##         <numeric> <character>
## after 3  0.989474   before 11
## after 5  0.991477   before 11
## after 6  0.500000   before 15
## after 8  0.936170   before 13

We can visualize this mapping for the samples.

# For the first sample:
heat.sample.1 <- pheatmap(tab.sample.PBMMC1b$proportions, cluster_row=FALSE, cluster_col=FALSE,
                   main="Sample 1 comparison", silent=TRUE)

# For the second sample:
heat.sample.2 <- pheatmap(tab.sample.ETV6RUNX1_1$proportions, cluster_row=FALSE, cluster_col=FALSE,
                   main="Sample 2 comparison", silent=TRUE)

gridExtra::grid.arrange(heat.sample.1[[4]], heat.sample.2[[4]])

1.7 Adjusted Rand index

ri.sample.1 <- pairwiseRand(clusters.all[quick.sce.all$SampleName=="PBMMC_1b"], colLabels(only_PBMMC1b), mode="index")
ri.sample.1

## [1] 0.7733491

ri.sample.2 <- pairwiseRand(clusters.all[quick.sce.all$SampleName=="ETV6-RUNX1_1"], colLabels(only_ETV6RUNX1_1), mode="index")
ri.sample.2

## [1] 0.4026121

ARI on per-cluster level

# For the first batch.
tab1 <- pairwiseRand(uncorrected.clusters[all.sce$SampleName=="PBMMC_1b"], clusters.all[quick.sce.all$SampleName=="PBMMC_1b"])
heat.1 <- pheatmap(tab1, cluster_row=FALSE, cluster_col=FALSE,
    col=rev(viridis::magma(100)), main="PBMMC_1b probabilities", silent=TRUE)

# For the second batch.
tab2 <- pairwiseRand(uncorrected.clusters[all.sce$SampleName=="ETV6-RUNX1_1"], clusters.all[quick.sce.all$SampleName=="ETV6-RUNX1_1"])
heat.2 <- pheatmap(tab2, cluster_row=FALSE, cluster_col=FALSE,
    col=rev(viridis::magma(100)), main="ETV6-RUNX1_1 probabilities", silent=TRUE)

gridExtra::grid.arrange(heat.1[[4]], heat.2[[4]])

1.7.1 MNN specific test

lost variance

metadata(quick.sce.all)$merge.info$lost.var

##      ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4    PBMMC_1b
## [1,]  0.023736513 0.0329204451 0.0000000000 0.0000000000 0.000000000
## [2,]  0.004040773 0.0055962398 0.0358095086 0.0000000000 0.000000000
## [3,]  0.003472898 0.0029937349 0.0035287466 0.0446873580 0.000000000
## [4,]  0.028163123 0.0282391485 0.0289088645 0.0102509911 0.055229461
## [5,]  0.003125929 0.0033802894 0.0018166287 0.0015178279 0.003190129
## [6,]  0.001151427 0.0009835561 0.0008556258 0.0006980316 0.001346453
##          PBMMC_2    PBMMC_3
## [1,] 0.000000000 0.00000000
## [2,] 0.000000000 0.00000000
## [3,] 0.000000000 0.00000000
## [4,] 0.000000000 0.00000000
## [5,] 0.059650697 0.00000000
## [6,] 0.001062763 0.05640212

1.8 Session information

sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] BiocSingular_1.10.0         pheatmap_1.0.12            
##  [3] forcats_0.5.1               stringr_1.4.0              
##  [5] dplyr_1.0.8                 purrr_0.3.4                
##  [7] readr_2.1.2                 tidyr_1.2.0                
##  [9] tibble_3.1.6                tidyverse_1.3.1            
## [11] bluster_1.4.0               batchelor_1.10.0           
## [13] scran_1.22.1                scater_1.22.0              
## [15] ggplot2_3.3.5               scuttle_1.4.0              
## [17] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
## [19] Biobase_2.54.0              GenomicRanges_1.46.1       
## [21] GenomeInfoDb_1.30.1         IRanges_2.28.0             
## [23] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [25] MatrixGenerics_1.6.0        matrixStats_0.61.0         
## 
## loaded via a namespace (and not attached):
##  [1] Rtsne_0.15                ggbeeswarm_0.6.0         
##  [3] colorspace_2.0-2          ellipsis_0.3.2           
##  [5] XVector_0.34.0            BiocNeighbors_1.12.0     
##  [7] fs_1.5.2                  rstudioapi_0.13          
##  [9] farver_2.1.0              ggrepel_0.9.1            
## [11] fansi_1.0.2               lubridate_1.8.0          
## [13] xml2_1.3.3                sparseMatrixStats_1.6.0  
## [15] knitr_1.37                jsonlite_1.7.3           
## [17] ResidualMatrix_1.4.0      broom_0.7.12             
## [19] cluster_2.1.2             dbplyr_2.1.1             
## [21] compiler_4.1.2            httr_1.4.2               
## [23] dqrng_0.3.0               backports_1.4.1          
## [25] assertthat_0.2.1          Matrix_1.4-0             
## [27] fastmap_1.1.0             limma_3.50.0             
## [29] cli_3.1.1                 htmltools_0.5.2          
## [31] tools_4.1.2               rsvd_1.0.5               
## [33] igraph_1.2.11             gtable_0.3.0             
## [35] glue_1.6.1                GenomeInfoDbData_1.2.7   
## [37] Rcpp_1.0.8                cellranger_1.1.0         
## [39] jquerylib_0.1.4           vctrs_0.3.8              
## [41] DelayedMatrixStats_1.16.0 xfun_0.29                
## [43] beachmat_2.10.0           rvest_1.0.2              
## [45] lifecycle_1.0.1           irlba_2.3.5              
## [47] statmod_1.4.36            edgeR_3.36.0             
## [49] zlibbioc_1.40.0           scales_1.1.1             
## [51] hms_1.1.1                 parallel_4.1.2           
## [53] RColorBrewer_1.1-2        yaml_2.2.2               
## [55] gridExtra_2.3             sass_0.4.0               
## [57] stringi_1.7.6             highr_0.9                
## [59] ScaledMatrix_1.2.0        BiocParallel_1.28.3      
## [61] rlang_1.0.1               pkgconfig_2.0.3          
## [63] bitops_1.0-7              evaluate_0.14            
## [65] lattice_0.20-45           labeling_0.4.2           
## [67] cowplot_1.1.1             tidyselect_1.1.1         
## [69] magrittr_2.0.2            R6_2.5.1                 
## [71] generics_0.1.2            metapod_1.2.0            
## [73] DelayedArray_0.20.0       DBI_1.1.2                
## [75] pillar_1.7.0              haven_2.4.3              
## [77] withr_2.4.3               RCurl_1.98-1.6           
## [79] modelr_0.1.8              crayon_1.4.2             
## [81] utf8_1.2.2                tzdb_0.2.0               
## [83] rmarkdown_2.11            viridis_0.6.2            
## [85] locfit_1.5-9.4            grid_4.1.2               
## [87] readxl_1.3.1              reprex_2.0.1             
## [89] digest_0.6.29             munsell_0.5.0            
## [91] beeswarm_0.4.0            viridisLite_0.4.0        
## [93] vipor_0.4.5               bslib_0.3.1

CRUK CI Bitesize Introduction to Single Cell RNASeq Analysis

Dataset Integration and Batch Correction - clinic session

Abbi Edwards