Exercise: apply the deconvolution and SCTransform normalisations methods on a single sample: ETV6-RUNX1_1 (aka GSM3872434).
# Read object in:
# remember getwd() and dir()
sce <- readRDS("../Robjects/Caron_filtered.rds")
colData(sce)$SampleName <- colData(sce)$Sample
Select cells for ETV6-RUNX1_1:
# have new list of cell barcodes for each sample
sce.master <- sce
vec.bc <- colData(sce.master) %>%
data.frame() %>%
filter(SampleName == "ETV6-RUNX1_1") %>%
group_by(SampleName) %>%
pull(Barcode)
Number of cells in the sample:
table(colData(sce.master)$Barcode %in% vec.bc)
##
## FALSE TRUE
## 43841 2730
Subset cells from the SCE object:
tmpInd <- which(colData(sce.master)$Barcode %in% vec.bc)
sce <- sce.master[,tmpInd]
sce
## class: SingleCellExperiment
## dim: 28377 2730
## metadata(1): Samples
## assays(1): counts
## rownames(28377): ENSG00000238009 ENSG00000239945 ... ENSG00000278633
## ENSG00000278817
## rowData names(4): ID Symbol Type Chromosome
## colnames(2730): 1_AAACCTGAGACTTTCG-1 1_AAACCTGGTCTTCAAG-1 ...
## 1_TTTGTCAAGGACGAAA-1 1_TTTGTCAGTTCGGCAC-1
## colData names(15): Sample Barcode ... high_Mito_percent SampleName
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
Check columns data:
head(colData(sce))
## DataFrame with 6 rows and 15 columns
## Sample Barcode SampleId SampleGroup
## <character> <character> <character> <character>
## 1_AAACCTGAGACTTTCG-1 ETV6-RUNX1_1 AAACCTGAGACTTTCG-1 SRR9264343 ETV6-RUNX1
## 1_AAACCTGGTCTTCAAG-1 ETV6-RUNX1_1 AAACCTGGTCTTCAAG-1 SRR9264343 ETV6-RUNX1
## 1_AAACCTGGTGTTGAGG-1 ETV6-RUNX1_1 AAACCTGGTGTTGAGG-1 SRR9264343 ETV6-RUNX1
## 1_AAACCTGTCCCAAGTA-1 ETV6-RUNX1_1 AAACCTGTCCCAAGTA-1 SRR9264343 ETV6-RUNX1
## 1_AAACCTGTCGAATGCT-1 ETV6-RUNX1_1 AAACCTGTCGAATGCT-1 SRR9264343 ETV6-RUNX1
## 1_AAACGGGCACCATCCT-1 ETV6-RUNX1_1 AAACGGGCACCATCCT-1 SRR9264343 ETV6-RUNX1
## DatasetName sum detected subsets_Mito_sum
## <character> <numeric> <integer> <numeric>
## 1_AAACCTGAGACTTTCG-1 Caron 6677 2056 292
## 1_AAACCTGGTCTTCAAG-1 Caron 12064 3177 575
## 1_AAACCTGGTGTTGAGG-1 Caron 8175 2570 429
## 1_AAACCTGTCCCAAGTA-1 Caron 8638 2389 526
## 1_AAACCTGTCGAATGCT-1 Caron 1418 725 91
## 1_AAACGGGCACCATCCT-1 Caron 2231 1034 103
## subsets_Mito_detected subsets_Mito_percent total
## <integer> <numeric> <numeric>
## 1_AAACCTGAGACTTTCG-1 12 4.37322 6677
## 1_AAACCTGGTCTTCAAG-1 12 4.76625 12064
## 1_AAACCTGGTGTTGAGG-1 12 5.24771 8175
## 1_AAACCTGTCCCAAGTA-1 13 6.08937 8638
## 1_AAACCTGTCGAATGCT-1 11 6.41749 1418
## 1_AAACGGGCACCATCCT-1 10 4.61676 2231
## low_lib_size low_n_features high_Mito_percent
## <outlier.filter> <outlier.filter> <outlier.filter>
## 1_AAACCTGAGACTTTCG-1 FALSE FALSE FALSE
## 1_AAACCTGGTCTTCAAG-1 FALSE FALSE FALSE
## 1_AAACCTGGTGTTGAGG-1 FALSE FALSE FALSE
## 1_AAACCTGTCCCAAGTA-1 FALSE FALSE FALSE
## 1_AAACCTGTCGAATGCT-1 FALSE FALSE FALSE
## 1_AAACGGGCACCATCCT-1 FALSE FALSE FALSE
## SampleName
## <character>
## 1_AAACCTGAGACTTTCG-1 ETV6-RUNX1_1
## 1_AAACCTGGTCTTCAAG-1 ETV6-RUNX1_1
## 1_AAACCTGGTGTTGAGG-1 ETV6-RUNX1_1
## 1_AAACCTGTCCCAAGTA-1 ETV6-RUNX1_1
## 1_AAACCTGTCGAATGCT-1 ETV6-RUNX1_1
## 1_AAACGGGCACCATCCT-1 ETV6-RUNX1_1
table(colData(sce)$SampleName)
##
## ETV6-RUNX1_1
## 2730
Cluster cells then normalise.
set.seed(100) # clusters with PCA from irlba with approximation
clust <- quickCluster(sce) # slow with all cells.
table(clust)
## clust
## 1 2 3 4 5 6 7
## 242 176 401 676 334 592 309
# deconvolve
sce <- computePooledFactors(sce, cluster=clust, min.mean=0.1)
# set size factors
deconv.sf <- sizeFactors(sce)
# size factors distribution summary
summary(deconv.sf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1970 0.6816 0.9131 1.0000 1.2012 3.7625
Plot deconvolution size factors against library size factors:
# compute library size factors
lib.sf <- librarySizeFactors(sce)
deconvDf <- data.frame(lib.sf, deconv.sf,
"SampleGroup" = sce$SampleGroup,
"sum" = sce$sum,
"mito_content" = sce$subsets_Mito_percent)
# colour by library size
sp <- ggplot(deconvDf, aes(x=lib.sf, y=deconv.sf, col=sum)) +
geom_point()
sp
sce <- logNormCounts(sce) # adds logcounts
print(assayNames(sce))
## [1] "counts" "logcounts"
Get UMI counts matrix:
counts <- counts(sce)
colnames(counts) <- colData(sce)$Barcode
Derive gene and cell attributes from the UMI matrix.
gene_attr <- data.frame(mean = rowMeans(counts),
detection_rate = rowMeans(counts > 0),
var = apply(counts, 1, var))
gene_attr$log_mean <- log10(gene_attr$mean)
gene_attr$log_var <- log10(gene_attr$var)
rownames(gene_attr) <- rownames(counts)
cell_attr <- data.frame(n_umi = colSums(counts),
n_gene = colSums(counts > 0))
rownames(cell_attr) <- colnames(counts)
Mean-variance relationship
ggplot(gene_attr, aes(log_mean, log_var)) +
geom_point(alpha=0.3, shape=16) +
geom_density_2d(size = 0.3) +
geom_abline(intercept = 0, slope = 1, color='red')
Mean-detection-rate relationship
# add the expected detection rate under Poisson model
x = seq(from = -3, to = 2, length.out = 1000)
poisson_model <- data.frame(log_mean = x, detection_rate = 1 - dpois(0, lambda = 10^x))
ggplot(gene_attr, aes(log_mean, detection_rate)) +
geom_point(alpha=0.3, shape=16) +
geom_line(data=poisson_model, color='red') +
theme_gray(base_size = 8)
Estimate model parameters and transform data
print(dim(counts))
## [1] 28377 2730
# We use the Future API for parallel processing; set parameters here
future::plan(strategy = 'multicore', workers = 4)
options(future.globals.maxSize = 10 * 1024 ^ 3)
set.seed(44)
vst_out <- sctransform::vst(counts,
latent_var = c('log_umi'),
return_gene_attr = TRUE,
return_cell_attr = TRUE,
show_progress = FALSE)
sctransform::plot_model_pars(vst_out)
Inspect model:
print(vst_out$model_str)
## [1] "y ~ log_umi"
We will look at several genes in more detail.
rowData(sce) %>%
as.data.frame %>%
filter(Symbol %in% c('RPL10', 'FTL'))
sctransform::plot_model(vst_out,
counts,
c('ENSG00000147403', 'ENSG00000087086'),
plot_residual = TRUE)
Distribution of residual mean:
ggplot(vst_out$gene_attr, aes(residual_mean)) +
geom_histogram(binwidth=0.01)
Distribution of residual variance:
ggplot(vst_out$gene_attr, aes(residual_variance)) +
geom_histogram(binwidth=0.1) +
geom_vline(xintercept=1, color='red') +
xlim(0, 10)
Variance against mean (genes):
ggplot(vst_out$gene_attr,
aes(log10(gmean), residual_variance)) +
geom_point(alpha=0.3, shape=16) +
geom_density_2d(size = 0.3)
Check genes with large residual variance:
dd <- vst_out$gene_attr %>%
arrange(-residual_variance) %>%
slice_head(n = 22) %>%
mutate(across(where(is.numeric), round, 2))
dd %>% tibble::rownames_to_column("ID") %>%
left_join(as.data.frame(rowData(sce))[,c("ID", "Symbol")], "ID") %>%
DT::datatable(rownames = FALSE)
Check transformed values:
print(dim(vst_out$y))
## [1] 15134 2730
vst_out$y[1:10,1:5]
## AAACCTGAGACTTTCG-1 AAACCTGGTCTTCAAG-1 AAACCTGGTGTTGAGG-1
## ENSG00000241860 -0.05399615 -0.06799107 -0.05844518
## ENSG00000237491 -0.20630436 -0.26683055 -0.22551198
## ENSG00000228794 -0.26852079 -0.35175293 2.88379691
## ENSG00000225880 -0.08894054 -0.11061932 -0.09587264
## ENSG00000230368 -0.12373766 -0.15655685 -0.13420637
## ENSG00000230699 -0.04843311 -0.06022261 -0.05219726
## ENSG00000188976 1.06567541 1.64503545 0.84710871
## ENSG00000187961 -0.05399615 -0.06799107 -0.05844518
## ENSG00000188290 -0.09198579 -0.11429896 -0.09912500
## ENSG00000187608 2.47652968 -0.77399206 -0.65427609
## AAACCTGTCCCAAGTA-1 AAACCTGTCGAATGCT-1
## ENSG00000241860 -0.05971516 -0.02926041
## ENSG00000237491 -0.23100495 -0.10233795
## ENSG00000228794 -0.30240495 -0.12824866
## ENSG00000225880 -0.09784441 -0.04962978
## ENSG00000230368 -0.13718966 -0.06547507
## ENSG00000230699 -0.05326882 -0.02713647
## ENSG00000188976 0.78951910 -0.27477949
## ENSG00000187961 -0.05971516 -0.02926041
## ENSG00000188290 -0.10115496 -0.05142845
## ENSG00000187608 -0.67031481 -0.29001927
Genes that are expressed in fewer than 5 cells are not used and not returned, so to add vst_out$y as an assay we need to remove the missing genes.
sceOrig <- sce
sceOrig
## class: SingleCellExperiment
## dim: 28377 2730
## metadata(1): Samples
## assays(2): counts logcounts
## rownames(28377): ENSG00000238009 ENSG00000239945 ... ENSG00000278633
## ENSG00000278817
## rowData names(4): ID Symbol Type Chromosome
## colnames(2730): 1_AAACCTGAGACTTTCG-1 1_AAACCTGGTCTTCAAG-1 ...
## 1_TTTGTCAAGGACGAAA-1 1_TTTGTCAGTTCGGCAC-1
## colData names(16): Sample Barcode ... SampleName sizeFactor
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
geneOverlap <- rownames(sce) %in% rownames(vst_out$y)
if(!all(geneOverlap))
{
table(rownames(sce) %in% rownames(vst_out$y))
tmpInd <- which(rownames(sce) %in% rownames(vst_out$y))
sce <- sce[tmpInd,]
assayNames(sce)
}
## [1] "counts" "logcounts"
rm(sceOrig)