Exercise: apply the deconvolution and SCTransform normalisations methods on a single sample: ETV6-RUNX1_1 (aka GSM3872434).
library(scater)
library(scran)
library(tidyverse)
library(BiocSingular)
library(BiocParallel)
bpp <- MulticoreParam(7)
# Read object in:
# remember getwd() and dir()
sce <- readRDS("../Robjects/Caron_filtered.rds")
colData(sce)$SampleName <- colData(sce)$Sample
Select cells for ETV6-RUNX1_1:
# have new list of cell barcodes for each sample
sce.master <- sce
vec.bc <- colData(sce.master) %>%
data.frame() %>%
filter(SampleName == "ETV6-RUNX1_1") %>%
group_by(SampleName) %>%
pull(Barcode)
Number of cells in the sample:
table(colData(sce.master)$Barcode %in% vec.bc)
Subset cells from the SCE object:
tmpInd <- which(colData(sce.master)$Barcode %in% vec.bc)
sce <- sce.master[,tmpInd]
sce
Check columns data:
head(colData(sce))
table(colData(sce)$SampleName)
Cluster cells then normalise.
set.seed(100) # clusters with PCA from irlba with approximation
clust <- quickCluster(sce) # slow with all cells.
table(clust)
# deconvolve
# set size factors
# size factors distribution summary
Plot deconvolution size factors against library size factors:
# compute library size factors
# make data frame keeping library and deconvolution size factors for plotting
# plot deconv.sf against lib.sf
# colour by library size
Get UMI counts matrix:
counts <- counts(sce)
colnames(counts) <- colData(sce)$Barcode
Derive gene and cell attributes from the UMI matrix.
Mean-variance relationship plot
Mean-detection-rate relationship plot
# add the expected detection rate under Poisson model
Estimate model parameters and transform data
print(dim(counts))
# We use the Future API for parallel processing; set parameters here
future::plan(strategy = 'multicore', workers = 4)
options(future.globals.maxSize = 10 * 1024 ^ 3)
set.seed(44)
# now transform:
Inspect model:
print(vst_out$model_str)
We will look at several genes in more detail.
rowData(sce) %>%
as.data.frame %>%
filter(Symbol %in% c('RPL10', 'FTL'))
# plot model data for these two genes:
Distribution of residual mean:
ggplot(vst_out$gene_attr, aes(residual_mean)) +
geom_histogram(binwidth=0.01)
Distribution of residual variance:
ggplot(vst_out$gene_attr, aes(residual_variance)) +
geom_histogram(binwidth=0.1) +
geom_vline(xintercept=1, color='red') +
xlim(0, 10)
Variance against mean (genes):
ggplot(vst_out$gene_attr,
aes(log10(gmean), residual_variance)) +
geom_point(alpha=0.3, shape=16) +
geom_density_2d(size = 0.3)
Check genes with large residual variance:
dd <- vst_out$gene_attr %>%
arrange(-residual_variance) %>%
slice_head(n = 22) %>%
mutate(across(where(is.numeric), round, 2))
dd %>% tibble::rownames_to_column("ID") %>%
left_join(as.data.frame(rowData(sce))[,c("ID", "Symbol")], "ID") %>%
DT::datatable(rownames = FALSE)
Check transformed values:
print(dim(vst_out$y))
vst_out$y[1:10,1:5]
Genes that are expressed in fewer than 5 cells are not used and not returned, so to add vst_out$y as an assay we need to remove the missing genes.
sceOrig <- sce
sceOrig
geneOverlap <- rownames(sce) %in% rownames(vst_out$y)
if(!all(geneOverlap))
{
table(rownames(sce) %in% rownames(vst_out$y))
tmpInd <- which(rownames(sce) %in% rownames(vst_out$y))
sce <- sce[tmpInd,]
assayNames(sce)
}
rm(sceOrig)