1 Normalisation - Exercises

Exercise: apply the deconvolution and SCTransform normalisations methods on a single sample: ETV6-RUNX1_1 (aka GSM3872434).

1.1 Load object

# Read object in:
# remember getwd() and dir()
sce <- readRDS("../Robjects/Caron_filtered.rds")
colData(sce)$SampleName <- colData(sce)$Sample

Select cells for ETV6-RUNX1_1:

# have new list of cell barcodes for each sample
sce.master <- sce
vec.bc <- colData(sce.master) %>%
    data.frame() %>%
    filter(SampleName == "ETV6-RUNX1_1") %>%
    group_by(SampleName) %>%
    pull(Barcode)

Number of cells in the sample:

table(colData(sce.master)$Barcode %in% vec.bc)
## 
## FALSE  TRUE 
## 43841  2730

Subset cells from the SCE object:

tmpInd <- which(colData(sce.master)$Barcode %in% vec.bc)
sce <- sce.master[,tmpInd]
sce
## class: SingleCellExperiment 
## dim: 28377 2730 
## metadata(1): Samples
## assays(1): counts
## rownames(28377): ENSG00000238009 ENSG00000239945 ... ENSG00000278633
##   ENSG00000278817
## rowData names(4): ID Symbol Type Chromosome
## colnames(2730): 1_AAACCTGAGACTTTCG-1 1_AAACCTGGTCTTCAAG-1 ...
##   1_TTTGTCAAGGACGAAA-1 1_TTTGTCAGTTCGGCAC-1
## colData names(15): Sample Barcode ... high_Mito_percent SampleName
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):

Check columns data:

head(colData(sce))
## DataFrame with 6 rows and 15 columns
##                            Sample            Barcode    SampleId SampleGroup
##                       <character>        <character> <character> <character>
## 1_AAACCTGAGACTTTCG-1 ETV6-RUNX1_1 AAACCTGAGACTTTCG-1  SRR9264343  ETV6-RUNX1
## 1_AAACCTGGTCTTCAAG-1 ETV6-RUNX1_1 AAACCTGGTCTTCAAG-1  SRR9264343  ETV6-RUNX1
## 1_AAACCTGGTGTTGAGG-1 ETV6-RUNX1_1 AAACCTGGTGTTGAGG-1  SRR9264343  ETV6-RUNX1
## 1_AAACCTGTCCCAAGTA-1 ETV6-RUNX1_1 AAACCTGTCCCAAGTA-1  SRR9264343  ETV6-RUNX1
## 1_AAACCTGTCGAATGCT-1 ETV6-RUNX1_1 AAACCTGTCGAATGCT-1  SRR9264343  ETV6-RUNX1
## 1_AAACGGGCACCATCCT-1 ETV6-RUNX1_1 AAACGGGCACCATCCT-1  SRR9264343  ETV6-RUNX1
##                      DatasetName       sum  detected subsets_Mito_sum
##                      <character> <numeric> <integer>        <numeric>
## 1_AAACCTGAGACTTTCG-1       Caron      6677      2056              292
## 1_AAACCTGGTCTTCAAG-1       Caron     12064      3177              575
## 1_AAACCTGGTGTTGAGG-1       Caron      8175      2570              429
## 1_AAACCTGTCCCAAGTA-1       Caron      8638      2389              526
## 1_AAACCTGTCGAATGCT-1       Caron      1418       725               91
## 1_AAACGGGCACCATCCT-1       Caron      2231      1034              103
##                      subsets_Mito_detected subsets_Mito_percent     total
##                                  <integer>            <numeric> <numeric>
## 1_AAACCTGAGACTTTCG-1                    12              4.37322      6677
## 1_AAACCTGGTCTTCAAG-1                    12              4.76625     12064
## 1_AAACCTGGTGTTGAGG-1                    12              5.24771      8175
## 1_AAACCTGTCCCAAGTA-1                    13              6.08937      8638
## 1_AAACCTGTCGAATGCT-1                    11              6.41749      1418
## 1_AAACGGGCACCATCCT-1                    10              4.61676      2231
##                          low_lib_size   low_n_features high_Mito_percent
##                      <outlier.filter> <outlier.filter>  <outlier.filter>
## 1_AAACCTGAGACTTTCG-1            FALSE            FALSE             FALSE
## 1_AAACCTGGTCTTCAAG-1            FALSE            FALSE             FALSE
## 1_AAACCTGGTGTTGAGG-1            FALSE            FALSE             FALSE
## 1_AAACCTGTCCCAAGTA-1            FALSE            FALSE             FALSE
## 1_AAACCTGTCGAATGCT-1            FALSE            FALSE             FALSE
## 1_AAACGGGCACCATCCT-1            FALSE            FALSE             FALSE
##                        SampleName
##                       <character>
## 1_AAACCTGAGACTTTCG-1 ETV6-RUNX1_1
## 1_AAACCTGGTCTTCAAG-1 ETV6-RUNX1_1
## 1_AAACCTGGTGTTGAGG-1 ETV6-RUNX1_1
## 1_AAACCTGTCCCAAGTA-1 ETV6-RUNX1_1
## 1_AAACCTGTCGAATGCT-1 ETV6-RUNX1_1
## 1_AAACGGGCACCATCCT-1 ETV6-RUNX1_1
table(colData(sce)$SampleName)
## 
## ETV6-RUNX1_1 
##         2730

1.2 Exercise 1 : Deconvolution

Cluster cells then normalise.

1.2.1 Cluster cells

set.seed(100) # clusters with PCA from irlba with approximation
clust <- quickCluster(sce) # slow with all cells.
table(clust)
## clust
##   1   2   3   4   5   6   7 
## 242 176 401 676 334 592 309

1.2.2 Compute size factors

# deconvolve
sce <- computePooledFactors(sce, cluster=clust, min.mean=0.1)
# set size factors
deconv.sf <- sizeFactors(sce)
# size factors distribution summary
summary(deconv.sf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1970  0.6816  0.9131  1.0000  1.2012  3.7625

Plot deconvolution size factors against library size factors:

# compute library size factors
lib.sf <- librarySizeFactors(sce)

deconvDf <- data.frame(lib.sf, deconv.sf,
            "SampleGroup" = sce$SampleGroup,
            "sum" = sce$sum,
            "mito_content" = sce$subsets_Mito_percent)

1.2.3 Apply size factors

sce <- logNormCounts(sce) # adds logcounts
print(assayNames(sce))
## [1] "counts"    "logcounts"

1.3 Exercise 2 : sctransform

Get UMI counts matrix:

counts <- counts(sce)
colnames(counts) <- colData(sce)$Barcode

1.3.1 Inspect data

Derive gene and cell attributes from the UMI matrix.

gene_attr <- data.frame(mean = rowMeans(counts), 
                        detection_rate = rowMeans(counts > 0),
                        var = apply(counts, 1, var))
gene_attr$log_mean <- log10(gene_attr$mean)
gene_attr$log_var <- log10(gene_attr$var)
rownames(gene_attr) <- rownames(counts)
cell_attr <- data.frame(n_umi = colSums(counts),
                        n_gene = colSums(counts > 0))
rownames(cell_attr) <- colnames(counts)

Mean-variance relationship

ggplot(gene_attr, aes(log_mean, log_var)) + 
  geom_point(alpha=0.3, shape=16) + 
  geom_density_2d(size = 0.3) +
  geom_abline(intercept = 0, slope = 1, color='red')

Mean-detection-rate relationship

# add the expected detection rate under Poisson model
x = seq(from = -3, to = 2, length.out = 1000)
poisson_model <- data.frame(log_mean = x, detection_rate = 1 - dpois(0, lambda = 10^x))
ggplot(gene_attr, aes(log_mean, detection_rate)) + 
  geom_point(alpha=0.3, shape=16) + 
  geom_line(data=poisson_model, color='red') +
  theme_gray(base_size = 8)

1.3.2 Transformation

Estimate model parameters and transform data

print(dim(counts))
## [1] 28377  2730
# We use the Future API for parallel processing; set parameters here
future::plan(strategy = 'multicore', workers = 4)
options(future.globals.maxSize = 10 * 1024 ^ 3)

set.seed(44)
vst_out <- sctransform::vst(counts,
                            latent_var = c('log_umi'),
                            return_gene_attr = TRUE,
                            return_cell_attr = TRUE,
                            show_progress = FALSE)
sctransform::plot_model_pars(vst_out)

Inspect model:

print(vst_out$model_str)
## [1] "y ~ log_umi"

We will look at several genes in more detail.

rowData(sce) %>%
  as.data.frame %>%
  filter(Symbol %in% c('RPL10', 'FTL'))
sctransform::plot_model(vst_out,
                        counts,
                        c('ENSG00000147403', 'ENSG00000087086'),
                        plot_residual = TRUE)

Distribution of residual mean:

ggplot(vst_out$gene_attr, aes(residual_mean)) +
  geom_histogram(binwidth=0.01)

Distribution of residual variance:

ggplot(vst_out$gene_attr, aes(residual_variance)) +
  geom_histogram(binwidth=0.1) +
  geom_vline(xintercept=1, color='red') +
  xlim(0, 10)

Variance against mean (genes):

ggplot(vst_out$gene_attr,
       aes(log10(gmean), residual_variance)) +
       geom_point(alpha=0.3, shape=16) +
       geom_density_2d(size = 0.3)

Check genes with large residual variance:

dd <- vst_out$gene_attr %>%
    arrange(-residual_variance) %>%
    slice_head(n = 22) %>%
    mutate(across(where(is.numeric), round, 2))

dd %>% tibble::rownames_to_column("ID") %>%
    left_join(as.data.frame(rowData(sce))[,c("ID", "Symbol")], "ID") %>%
    DT::datatable(rownames = FALSE)

Check transformed values:

print(dim(vst_out$y))
## [1] 15134  2730
vst_out$y[1:10,1:5]
##                 AAACCTGAGACTTTCG-1 AAACCTGGTCTTCAAG-1 AAACCTGGTGTTGAGG-1
## ENSG00000241860        -0.05399615        -0.06799107        -0.05844518
## ENSG00000237491        -0.20630436        -0.26683055        -0.22551198
## ENSG00000228794        -0.26852079        -0.35175293         2.88379691
## ENSG00000225880        -0.08894054        -0.11061932        -0.09587264
## ENSG00000230368        -0.12373766        -0.15655685        -0.13420637
## ENSG00000230699        -0.04843311        -0.06022261        -0.05219726
## ENSG00000188976         1.06567541         1.64503545         0.84710871
## ENSG00000187961        -0.05399615        -0.06799107        -0.05844518
## ENSG00000188290        -0.09198579        -0.11429896        -0.09912500
## ENSG00000187608         2.47652968        -0.77399206        -0.65427609
##                 AAACCTGTCCCAAGTA-1 AAACCTGTCGAATGCT-1
## ENSG00000241860        -0.05971516        -0.02926041
## ENSG00000237491        -0.23100495        -0.10233795
## ENSG00000228794        -0.30240495        -0.12824866
## ENSG00000225880        -0.09784441        -0.04962978
## ENSG00000230368        -0.13718966        -0.06547507
## ENSG00000230699        -0.05326882        -0.02713647
## ENSG00000188976         0.78951910        -0.27477949
## ENSG00000187961        -0.05971516        -0.02926041
## ENSG00000188290        -0.10115496        -0.05142845
## ENSG00000187608        -0.67031481        -0.29001927

Genes that are expressed in fewer than 5 cells are not used and not returned, so to add vst_out$y as an assay we need to remove the missing genes.

sceOrig <- sce
sceOrig
## class: SingleCellExperiment 
## dim: 28377 2730 
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(28377): ENSG00000238009 ENSG00000239945 ... ENSG00000278633
##   ENSG00000278817
## rowData names(4): ID Symbol Type Chromosome
## colnames(2730): 1_AAACCTGAGACTTTCG-1 1_AAACCTGGTCTTCAAG-1 ...
##   1_TTTGTCAAGGACGAAA-1 1_TTTGTCAGTTCGGCAC-1
## colData names(16): Sample Barcode ... SampleName sizeFactor
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
geneOverlap <- rownames(sce) %in% rownames(vst_out$y)
if(!all(geneOverlap))
{
  table(rownames(sce) %in% rownames(vst_out$y))
  tmpInd <- which(rownames(sce) %in% rownames(vst_out$y))
  sce <- sce[tmpInd,]
  assayNames(sce)
}
## [1] "counts"    "logcounts"
rm(sceOrig)