1 Normalisation - Exercises

Exercise: apply the deconvolution and SCTransform normalisations methods on a single sample: ETV6-RUNX1_1 (aka GSM3872434).

library(scater)
library(scran)
library(tidyverse)
library(BiocSingular)
library(BiocParallel)

bpp <- MulticoreParam(7)

1.1 Load object

# Read object in:
# remember getwd() and dir()
sce <- readRDS("../Robjects/Caron_filtered.rds")
colData(sce)$SampleName <- colData(sce)$Sample

Select cells for ETV6-RUNX1_1:

# have new list of cell barcodes for each sample
sce.master <- sce
vec.bc <- colData(sce.master) %>%
    data.frame() %>%
    filter(SampleName == "ETV6-RUNX1_1") %>%
    group_by(SampleName) %>%
    pull(Barcode)

Number of cells in the sample:

table(colData(sce.master)$Barcode %in% vec.bc)

Subset cells from the SCE object:

tmpInd <- which(colData(sce.master)$Barcode %in% vec.bc)
sce <- sce.master[,tmpInd]
sce

Check columns data:

head(colData(sce))
table(colData(sce)$SampleName)

1.2 Exercise 1 : Deconvolution

Cluster cells then normalise.

1.2.1 Cluster cells

set.seed(100) # clusters with PCA from irlba with approximation
clust <- quickCluster(sce) # slow with all cells.
table(clust)

1.2.2 Compute size factors

# deconvolve


# set size factors


# size factors distribution summary

Plot deconvolution size factors against library size factors:

# compute library size factors


# make data frame keeping library and deconvolution size factors for plotting
# plot deconv.sf against lib.sf

# colour by library size

1.2.3 Apply size factors

1.3 Exercise 2 : sctransform

Get UMI counts matrix:

counts <- counts(sce)
colnames(counts) <- colData(sce)$Barcode

1.3.1 Inspect data

Derive gene and cell attributes from the UMI matrix.

Mean-variance relationship plot

Mean-detection-rate relationship plot

# add the expected detection rate under Poisson model

1.3.2 Transformation

Estimate model parameters and transform data

print(dim(counts))
# We use the Future API for parallel processing; set parameters here
future::plan(strategy = 'multicore', workers = 4)
options(future.globals.maxSize = 10 * 1024 ^ 3)

set.seed(44)

# now transform:

Inspect model:

print(vst_out$model_str)

We will look at several genes in more detail.

rowData(sce) %>%
  as.data.frame %>%
  filter(Symbol %in% c('RPL10', 'FTL'))

# plot model data for these two genes:

Distribution of residual mean:

ggplot(vst_out$gene_attr, aes(residual_mean)) +
  geom_histogram(binwidth=0.01)

Distribution of residual variance:

ggplot(vst_out$gene_attr, aes(residual_variance)) +
  geom_histogram(binwidth=0.1) +
  geom_vline(xintercept=1, color='red') +
  xlim(0, 10)

Variance against mean (genes):

ggplot(vst_out$gene_attr,
       aes(log10(gmean), residual_variance)) +
       geom_point(alpha=0.3, shape=16) +
       geom_density_2d(size = 0.3)

Check genes with large residual variance:

dd <- vst_out$gene_attr %>%
    arrange(-residual_variance) %>%
    slice_head(n = 22) %>%
    mutate(across(where(is.numeric), round, 2))

dd %>% tibble::rownames_to_column("ID") %>%
    left_join(as.data.frame(rowData(sce))[,c("ID", "Symbol")], "ID") %>%
    DT::datatable(rownames = FALSE)

Check transformed values:

print(dim(vst_out$y))
vst_out$y[1:10,1:5]

Genes that are expressed in fewer than 5 cells are not used and not returned, so to add vst_out$y as an assay we need to remove the missing genes.

sceOrig <- sce
sceOrig
geneOverlap <- rownames(sce) %in% rownames(vst_out$y)
if(!all(geneOverlap))
{
  table(rownames(sce) %in% rownames(vst_out$y))
  tmpInd <- which(rownames(sce) %in% rownames(vst_out$y))
  sce <- sce[tmpInd,]
  assayNames(sce)
}
rm(sceOrig)