1 Quality Control

1.1 Introduction

We will use two sets of Bone Marrow Mononuclear Cells (BMMC):

  • ‘CaronBourque2020’: pediatric samples
  • ‘Hca’: HCA Census of Immune Cells for adult BMMCs

Fastq files were retrieved from publicly available archives (SRA and HCA).

Sequencing quality was assessed and visualised using FastQC and MultiQC.

Reads were aligned against GRCh38 and features counted using cellranger (v3.1.0).

We will now check the quality of the data further:

  • mapping quality and amplification rate
  • cell counts
  • distribution of key quality metrics

We will then:

  • filter genes with very low expression
  • identify low-quality cells
  • filter and/or mark low-quality cells

1.2 Load packages

  • SingleCellExperiment - to store the data
  • Matrix - to deal with sparse and/or large matrices
  • DropletUtils - utilities for the analysis of droplet-based data, including cell counting
  • scater - QC
  • scran - normalisation
  • igraph - graphs
  • biomaRt - for gene annotation
  • ggplot2 - for plotting
  • irlba - for faster PCA
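
A minimal sketch of loading these packages, assuming they are all installed (dplyr and DT are added here for the %>% pipe and datatable() calls used below):

library(SingleCellExperiment) # data container
library(Matrix)               # sparse and/or large matrices
library(DropletUtils)         # droplet utilities, incl. cell calling
library(scater)               # QC metrics and plots
library(scran)                # normalisation
library(igraph)               # graphs
library(biomaRt)              # gene annotation
library(ggplot2)              # plotting
library(irlba)                # fast PCA
library(dplyr)                # %>% pipe and data manipulation
library(DT)                   # datatable()
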
#wrkDir <- "/mnt/scratchb/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020/CaronBourque2020/grch38300"
projDir <- "/home/ubuntu/Course_Materials/scRNAseq"
wrkDir <- sprintf("%s/CaronBourque2020/grch38300", projDir) 
#setwd(wrkDir)
outDirBit <- params$outDirBit # "AnaWiSeurat/Attempt1"
qcPlotDirBit <- "QcPlots"
poolBool <- FALSE # whether to read each sample in and pool them and write object to file, or just load that file.
biomartBool <- FALSE # biomaRt sometimes fails, do it once, write to file and use that copy.
addQcBool <- FALSE # biomaRt sometimes fails, do it once, write to file and use that copy.
runAll <- FALSE # TRUE

1.3 Sample sheet

We will load both the Caron and Hca data sets.

# CaronBourque2020
cb_sampleSheetFn <- file.path(projDir, "Data/CaronBourque2020/SraRunTable.txt")
# Human Cell Atlas
hca_sampleSheetFn <- file.path(projDir, "Data/Hca/accList_Hca.txt")

# read sample sheet in:
splShtColToKeep <- c("Run", "Sample.Name", "source_name")

cb_sampleSheet <- read.table(cb_sampleSheetFn, header=T, sep=",")
hca_sampleSheet <- read.table(hca_sampleSheetFn, header=F, sep=",")
colnames(hca_sampleSheet) <- "Sample.Name"
hca_sampleSheet$Run <- hca_sampleSheet$Sample.Name
hca_sampleSheet$source_name <- "ABMMC" # adult BMMC

sampleSheet <- rbind(cb_sampleSheet[,splShtColToKeep], hca_sampleSheet[,splShtColToKeep])

sampleSheet %>%
    as.data.frame() %>%
    datatable(rownames = FALSE)

1.4 Data representation

We will use a SingleCellExperiment object, which stores various data types:

  • the count matrix
  • feature (gene) annotation
  • droplet annotation
  • outcome of downstream analysis such as dimensionality reduction

[singleCellExperimentClass.png](/home/ubuntu/Course_Materials/scRNAseq/Images/singleCellExperimentClass.png)

1.5 Example

We will load the data for the first sample in the sample sheet: SRR9264343.

i <- 1
sample.path <- sprintf("%s/%s/%s/outs/raw_feature_bc_matrix/", wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
sce.raw <- read10xCounts(sample.path, col.names=TRUE)
sce.raw
## class: SingleCellExperiment 
## dim: 33538 737280 
## metadata(1): Samples
## assays(1): counts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
##   ENSG00000268674
## rowData names(3): ID Symbol Type
## colnames(737280): AAACCTGAGAAACCAT-1 AAACCTGAGAAACCGC-1 ...
##   TTTGTCATCTTTAGTC-1 TTTGTCATCTTTCCTC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):

We can access these different types of data with various functions.

Number of genes and droplets in the count matrix:

dim(counts(sce.raw))
## [1]  33538 737280

Features, with rowData():

head(rowData(sce.raw))
## DataFrame with 6 rows and 3 columns
##                              ID      Symbol            Type
##                     <character> <character>     <character>
## ENSG00000243485 ENSG00000243485 MIR1302-2HG Gene Expression
## ENSG00000237613 ENSG00000237613     FAM138A Gene Expression
## ENSG00000186092 ENSG00000186092       OR4F5 Gene Expression
## ENSG00000238009 ENSG00000238009  AL627309.1 Gene Expression
## ENSG00000239945 ENSG00000239945  AL627309.3 Gene Expression
## ENSG00000239906 ENSG00000239906  AL627309.2 Gene Expression

Samples, with colData():

head(colData(sce.raw))
## DataFrame with 6 rows and 2 columns
##                                                                                                                                 Sample
##                                                                                                                            <character>
## AAACCTGAGAAACCAT-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACCGC-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACCTA-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACGAG-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACGCC-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAAGTGG-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
##                               Barcode
##                           <character>
## AAACCTGAGAAACCAT-1 AAACCTGAGAAACCAT-1
## AAACCTGAGAAACCGC-1 AAACCTGAGAAACCGC-1
## AAACCTGAGAAACCTA-1 AAACCTGAGAAACCTA-1
## AAACCTGAGAAACGAG-1 AAACCTGAGAAACGAG-1
## AAACCTGAGAAACGCC-1 AAACCTGAGAAACGCC-1
## AAACCTGAGAAAGTGG-1 AAACCTGAGAAAGTGG-1

Compared to bulk RNA-seq, single-cell RNA-seq data are sparse, especially with droplet-based methods such as 10X, mostly because:

  • a given cell does not express every gene
  • the library preparation does not capture all the transcripts a cell does express
  • the sequencing depth per cell is far lower

Counts, with counts(). Given the large number of droplets in a sample, count matrices can be large. They are however very sparse and can be stored in a ‘sparse matrix’ that only keeps the non-zero values, for example a ‘dgCMatrix’ object from the Matrix package.

counts(sce.raw) <- as(counts(sce.raw), "dgCMatrix")
#class(counts(sce.raw))
counts(sce.raw)[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##                                    
## ENSG00000243485 . . . . . . . . . .
## ENSG00000237613 . . . . . . . . . .
## ENSG00000186092 . . . . . . . . . .
## ENSG00000238009 . . . . . . . . . .
## ENSG00000239945 . . . . . . . . . .
## ENSG00000239906 . . . . . . . . . .
## ENSG00000241599 . . . . . . . . . .
## ENSG00000236601 . . . . . . . . . .
## ENSG00000284733 . . . . . . . . . .
## ENSG00000235146 . . . . . . . . . .
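
As a quick illustration of how sparse these data are, here is a minimal sketch computing the fraction of non-zero entries in the raw count matrix:

# fraction of entries in the raw count matrix that are non-zero
Matrix::nnzero(counts(sce.raw)) / prod(dim(sce.raw))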

1.6 Mapping QC

1.6.1 Gene body coverage

Gene body coverage was assessed with RSeQC (scripts in /home/ubuntu/Course_Materials/scRNAseq/ScriptsForTut):

tmpFn <- sprintf("%s/Images/1_AAACCTGAGACTTTCG-1.rseqcGeneBodyCovCheck.txt.geneBodyCoverage.curves.png", projDir)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)

1.6.2 Amplification rate

We will use the information stored in the ‘molecule info’ file to count the number of UMIs and reads for each gene in each cell.

##mol.info.file <- sprintf("%s/%s/%s/outs/molecule_info.h5", wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
##mol.info <- read10xMolInfo(mol.info.file)
# or load a pre-saved mol.info object if H5Fopen fails:
i <- 1
mol.info.file <- sprintf("%s/%s/%s/outs/molecule_info_h5.Rds", wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
mol.info <- readRDS(mol.info.file)
# slow
mol.info$data
## DataFrame with 18544666 rows and 5 columns
##                      cell       umi gem_group      gene     reads
##               <character> <integer> <integer> <integer> <integer>
## 1        AAACCTGAGAAACCTA    467082         1      3287         1
## 2        AAACCTGAGAAACCTA    205888         1      3446         1
## 3        AAACCTGAGAAACCTA    866252         1      3896         3
## 4        AAACCTGAGAAACCTA    796027         1      3969         1
## 5        AAACCTGAGAAACCTA    542561         1      5008         1
## ...                   ...       ...       ...       ...       ...
## 18544662 TTTGTCATCTTTAGTC    927060         1     23634         1
## 18544663 TTTGTCATCTTTAGTC    975865         1     27143         1
## 18544664 TTTGTCATCTTTAGTC    364964         1     27467         4
## 18544665 TTTGTCATCTTTAGTC    152570         1     30125         7
## 18544666 TTTGTCATCTTTAGTC    383230         1     30283         5
head(mol.info$genes)
## [1] "ENSG00000243485" "ENSG00000237613" "ENSG00000186092" "ENSG00000238009"
## [5] "ENSG00000239945" "ENSG00000239906"
dd <- mol.info$data %>% data.frame()
dd$umi <- as.character(dd$umi)

tmpFn <- sprintf("%s/%s/Robjects/sce_preProc_ampDf1.Rds", projDir, outDirBit)
if(!FALSE) # slow
{
ampDf <- dd %>% group_by(cell, gene) %>%
    summarise(nUmis = n(), totReads=sum(reads)) %>%
    data.frame()

# Write object to file
saveRDS(ampDf, tmpFn)
} else {
  ampDf <- readRDS(tmpFn)
}
rm(tmpFn)


summary(ampDf$nUmis)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    1.000    1.000    1.000    2.377    1.000 7137.000
summary(ampDf$totReads)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     2.00     6.00    15.23    12.00 79275.00
summary(log2(ampDf$nUmis))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.4504  0.0000 12.8011
summary(log2(ampDf$totReads))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   2.585   2.401   3.585  16.275

We now plot the amplification rate.

# too slow
sp <- ggplot(ampDf, aes(x=nUmis, y=totReads)) +
  geom_point() +
  scale_x_continuous(trans='log2') +
  scale_y_continuous(trans='log2')
sp
#ggMarginal(sp)
sp2 <- ggplot(ampDf, aes(x=nUmis, y=totReads)) +
  geom_bin2d(bins = 50) +
  scale_fill_continuous(type = "viridis") +
  scale_x_continuous(trans='log10') +
  scale_y_continuous(trans='log10') +
  theme_bw()

sp2

#rm(mol.info)
gc(verbose=FALSE)
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells   8359027 446.5   35542973 1898.2  69419867 3707.5
## Vcells 105889427 807.9  277281324 2115.5 277281324 2115.5

1.7 Cell calling for droplet data

For a given sample, amongst the tens of thousands of droplets used in the assay, some will contain a cell while many others will not. The presence of RNA in a droplet shows as a non-zero UMI count. That alone is however not sufficient to infer that the droplet contains a cell: after sample preparation, some cell debris, including RNA, still floats in the mix. This ambient RNA is inevitably captured during library preparation and sequenced.

Cellranger generates a count matrix that includes all droplets analysed in the assay. We will now load this ‘raw matrix’ for one sample and draw the distribution of UMI counts.

Distribution of UMI counts:

dd <- mol.info$data %>% data.frame()
dd$umi <- as.character(dd$umi)
libSizeDf <- dd %>% group_by(cell) %>%
    summarise(nUmis = n(), totReads=sum(reads)) %>%
    data.frame()
ggplot(libSizeDf, aes(x=log10(nUmis))) + geom_histogram(bins = 50)

Library size varies widely, both amongst empty droplets and droplets carrying cells, mostly due to:

  • variation in droplet size,
  • amplification efficiency,
  • sequencing depth

Most cell counting methods try to identify the library size that best distinguishes empty from cell-carrying droplets.

1.7.1 Mixture model

This method by default fits a mixture of two normal distributions to the logged library sizes:

  • one with a small mean for empty droplets
  • the other with a higher mean for cell-carrying droplets
set.seed(100)
# get package
library("mixtools")
# have library sizes on a log10 scale
log10_lib_size <- log10(libSizeDf$nUmis)
# fit mixture
mix <- normalmixEM(log10_lib_size, maxrestarts=50, epsilon = 1e-03)
## One of the variances is going to zero;  trying new starting values.
## number of iterations= 19
# plot
plot(mix, which=2, xlab2="log(mol per cell)")
# get density for each distribution:
p1 <- dnorm(log10_lib_size, mean=mix$mu[1], sd=mix$sigma[1])
p2 <- dnorm(log10_lib_size, mean=mix$mu[2], sd=mix$sigma[2])
# find intersection:
if (mix$mu[1] < mix$mu[2]) {
    split <- min(log10_lib_size[p2 > p1])
} else {
    split <- min(log10_lib_size[p1 > p2])
}
# show split on plot:
#log10_lib_size <- log10(libSizeDf$nUmis)
abline(v=split, lwd=2)

1.7.2 Barcode rank plot

The barcode rank plot shows the library sizes against their rank in decreasing order.

barcode_rank <- rank(-libSizeDf$nUmis)
plot(barcode_rank, libSizeDf$nUmis, xlim=c(1,10000), ylab="library size")

Given the exponential shape of the curve above, library sizes can be shown on the log10 scale:

plot(barcode_rank, log10_lib_size, xlim=c(1,10000))

The point on the curve where it drops sharply may be used as the split point. Before that point library sizes are high, because droplets carry a cell. After that point, library sizes are far smaller because droplets do not carry a cell, only ambient RNA (… or do they?).

Here, reading the plot by eye, we would call about 2500 cells. There are however more robust and convenient methods.

1.7.3 Inflection point

We could also compute the inflection point.

o <- order(barcode_rank)
log10_lib_size <- log10_lib_size[o]
barcode_rank <- barcode_rank[o]

rawdiff <- diff(log10_lib_size)/diff(barcode_rank)
inflection <- which(rawdiff == min(rawdiff[100:length(rawdiff)], na.rm=TRUE))

plot(barcode_rank, log10_lib_size, xlim=c(1,10000))
abline(v=inflection, col="red", lwd=2)

The inflection is at 3279 UMIs and 2306 cells are detected.

1.7.4 Cellranger v1 and v2

Given an expected number of cells, cellranger (v1 and v2) used to assume a ~10-fold range of library sizes for real cells and estimate this range. The threshold was defined as the 99th percentile of the library sizes of the top expected cells, divided by 10.

n_cells <- 2500
# CellRanger
totals <- sort(libSizeDf$nUmis, decreasing = TRUE)
# 99th percentile of top n_cells divided by 10
thresh = totals[round(0.01*n_cells)]/10
plot(log10(totals), xlim=c(1,10000))
abline(h=log10(thresh), col="red", lwd=2)

# table(libSizeDf$nUmis >= thresh)

The threshold is at 1452 UMIs and 2773 cells are detected.

1.7.5 DropletUtils and EmptyDrops

The DropletUtils package offers utilities to analyse droplet-based data, including cell counting using the library size as seen above. These simple approaches may however exclude droplets containing small or quiet cells with low RNA content. The emptyDrops method calls cells by first computing the expression profile of droplets with RNA content so low that they almost certainly do not contain any cell: the ‘background’ or ‘ambient’ profile. The method then tests each non-background droplet for a significant deviation from this ambient profile.

Let’s first check the knee and inflection methods.

my.counts <- counts(sce.raw)
br.out <- barcodeRanks(my.counts)
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total UMI count")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")

abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"), 
    legend=c("knee", "inflection"))

Testing for empty droplets.

We will call cells with a false discovery rate (FDR) of 0.1%, so that at most 1 in 1000 of the droplets called as cells is expected to be empty.

# a bit slow
# significance is computed by simulation so we set a seed for reproducibility
set.seed(100)
# run analysis:
e.out <- emptyDrops(counts(sce.raw))
e.out
## DataFrame with 737280 rows and 5 columns
##                        Total   LogProb    PValue   Limited       FDR
##                    <integer> <numeric> <numeric> <logical> <numeric>
## AAACCTGAGAAACCAT-1         0        NA        NA        NA        NA
## AAACCTGAGAAACCGC-1         0        NA        NA        NA        NA
## AAACCTGAGAAACCTA-1        31        NA        NA        NA        NA
## AAACCTGAGAAACGAG-1         0        NA        NA        NA        NA
## AAACCTGAGAAACGCC-1         0        NA        NA        NA        NA
## ...                      ...       ...       ...       ...       ...
## TTTGTCATCTTTACAC-1         0        NA        NA        NA        NA
## TTTGTCATCTTTACGT-1         1        NA        NA        NA        NA
## TTTGTCATCTTTAGGG-1         0        NA        NA        NA        NA
## TTTGTCATCTTTAGTC-1        26        NA        NA        NA        NA
## TTTGTCATCTTTCCTC-1         0        NA        NA        NA        NA

NAs are assigned to droplets used to compute the ambient profile.
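
As a quick check, here is a sketch assuming the default lower threshold of 100 UMIs was used above: droplets at or below that threshold form the ambient pool and get NA results.

# droplets with at most 100 UMIs in total were used for the ambient profile and are not tested
table(ambient = is.na(e.out$PValue), lowCount = e.out$Total <= 100)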

Get summary:

summary(e.out$FDR <= 0.001)
##    Mode   FALSE    TRUE    NA's 
## logical     492    3070  733718

The test significance is computed by Monte Carlo simulation. For each droplet tested, the number of iterations limits how small the p-value can be. This information is available in the ‘Limited’ column. If ‘Limited’ is ‘TRUE’ for any non-significant droplet, the number of iterations was too low; it should be increased and the analysis re-run (see the sketch after the table below).

table(Sig=e.out$FDR <= 0.001, Limited=e.out$Limited)
##        Limited
## Sig     FALSE TRUE
##   FALSE   492    0
##   TRUE     71 2999
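
Here no non-significant droplet is limited, so nothing needs to change. If some were, a minimal sketch of a re-run with more Monte Carlo iterations would be (the niters value is an arbitrary example, larger than the default):

set.seed(100)
# re-run with more iterations so that smaller p-values can be reached
e.out2 <- emptyDrops(counts(sce.raw), niters=50000)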

Let’s check that the background comprises only empty droplets. If the droplets used to define the background profile are indeed empty, testing them should result in a flat distribution of p-values. Let’s test the ‘ambient’ droplets and draw the p-value distribution.

set.seed(100)
limit <- 100   
all.out <- emptyDrops(counts(sce.raw), lower=limit, test.ambient=TRUE)
hist(all.out$PValue[all.out$Total <= limit & all.out$Total > 0],
    xlab="P-value", main="", col="grey80") 

The distribution of p-values looks uniform with no large peak for small values: no cell in these droplets.

To evaluate the outcome of the analysis, we will plot the strength of the evidence against library size.

is.cell <- e.out$FDR <= 0.001
#is.cell <- e.out$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 3070
cellColour <- ifelse(is.cell, "red", "black")
tmpBoolInflex <- e.out$Total < metadata(br.out)$inflection # below the inflection point
tmpBoolSmall <- e.out$FDR <= 0.001 # significant, i.e. called as a cell
#table(tmpBoolInflex, tmpBoolSmall)
cellColour[tmpBoolInflex & tmpBoolSmall] <- "green" # 'recovered' cells: called despite a small library

plot(log10(e.out$Total), -e.out$LogProb, col=ifelse(is.cell, "red", "black"), xlim=c(2,max(log10(e.out$Total))),
    xlab="Total UMI count", ylab="-Log Probability")
points(log10(e.out$Total)[tmpBoolInflex & tmpBoolSmall], -e.out$LogProb[tmpBoolInflex & tmpBoolSmall], pch=16, col="green")

Let’s filter out empty droplets.

sce.ed <- sce.raw[,which(e.out$FDR <= 0.001)] # ed for empty droplet
sce.ed
## class: SingleCellExperiment 
## dim: 33538 3070 
## metadata(1): Samples
## assays(1): counts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
##   ENSG00000268674
## rowData names(3): ID Symbol Type
## colnames(3070): AAACCTGAGACTTTCG-1 AAACCTGGTCTTCAAG-1 ...
##   TTTGTCACAGGCTCAC-1 TTTGTCAGTTCGGCAC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):

Cell calling in cellranger v3 uses a method similar to emptyDrops(): it generates a ‘filtered matrix’ that only keeps droplets deemed to contain a cell. We will load these filtered matrices now.

1.8 Load filtered matrices

Each sample was analysed with cellranger separately. We load filtered matrices one sample at a time, showing for each the name and number of features and cells.

# a bit slow
# load data:
sce.list <- vector("list", length = nrow(sampleSheet))

for (i in 1:nrow(sampleSheet))
{
    print(sprintf("%s: %s", sampleSheet[i,"Run"], sampleSheet[i,"Sample.Name"]))
    sample.path <- sprintf("%s/%s/%s/outs/filtered_feature_bc_matrix/",
                  wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
    sce.list[[i]] <- read10xCounts(sample.path)
    #print(sce.list[[i]])
    print(dim(sce.list[[i]]))
}

Let’s combine all 20 samples into a single object.

We first check the feature lists are identical.

# combine
# check row names are the same

rowNames1 <- rownames(sce.list[[1]])
for (i in 2:nrow(sampleSheet))
{
    print(identical(rowNames1, rownames(sce.list[[i]])))
}

A cell barcode comprises the actual sequence and a ‘group ID’, e.g. AAACCTGAGAAACCAT-1. The latter helps distinguish cells that have identical barcode sequences but come from different samples. As each sample was analysed separately, the group ID is set to 1 in all data sets. To pool these data sets we first need to change the group IDs so that cell barcodes are unique across all samples. We will use the position of the sample in the sample sheet.

# subsample to at most 2000 cells per sample:
if(ncol(sce.list[[1]]) < 2000)
{
  sce <- sce.list[[1]]
} else {
  sce <- sce.list[[1]][, sample(ncol(sce.list[[1]]), 2000)]
}

colData(sce)$Barcode <- gsub("([0-9])$", 1, colData(sce)$Barcode)
print(head(colData(sce)$Barcode))
print(tail(colData(sce)$Barcode))
for (i in 2:nrow(sampleSheet))
{
  if(ncol(sce.list[[i]]) < 2000)
  {
      sce.tmp <- sce.list[[i]]
  } else {
    sce.tmp <- sce.list[[i]][, sample(ncol(sce.list[[i]]), 2000)]
  }
    colData(sce.tmp)$Barcode <- gsub("([0-9])$", i, colData(sce.tmp)$Barcode)
    sce <- cbind(sce, sce.tmp)
    #print(head(colData(sce)$Barcode))
    print(tail(colData(sce)$Barcode, 2))
}

We now add the sample sheet information to the object metadata.

colDataOrig <- colData(sce)
#colData(sce) <- colDataOrig[,1:2]

# split path:
tmpList <- strsplit(colDataOrig$Sample, split="/")
# get Run ID, to use to match sample in the meta data and sample sheet objects:
tmpVec <- unlist(lapply(tmpList, function(x){x[9]}))
colData(sce)$Run <- tmpVec
# merge:
colData(sce) <- colData(sce) %>% data.frame %>% left_join(sampleSheet[,splShtColToKeep], "Run") %>% DataFrame

Let’s save the object for future reference.

# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postPool.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postPool.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)

1.9 Properties of scRNA-seq data

# a bit slow

# TODO: first remove empty droplets from the mol.info data
mol.info$data
head(mol.info$genes)

dd <- mol.info$data %>% data.frame()
dd$umi <- as.character(dd$umi)

tmpFn <- sprintf("%s/%s/Robjects/sce_preProc_ampDf2.Rds", projDir, outDirBit)
if(!FALSE) # slow
{
ampDf <- dd %>%
    filter(paste(cell,gem_group, sep="-") %in% sce$Barcode) %>%
  group_by(cell, gene) %>%
    summarise(nUmis = n(), totReads=sum(reads)) %>%
    data.frame()
# Write object to file
saveRDS(ampDf, tmpFn)
} else {
  ampDf <- readRDS(tmpFn)
}
rm(tmpFn)

#summary(ampDf$nUmis)
#summary(ampDf$totReads)

#summary(log2(ampDf$nUmis))
#summary(log2(ampDf$totReads))

hist(log2(ampDf$nUmis), n=100)
hist(log2(ampDf$totReads), n=100)
sp2 <- ggplot(ampDf, aes(x=nUmis, y=totReads)) +
  geom_bin2d(bins = 50) +
  scale_fill_continuous(type = "viridis") +
  scale_x_continuous(trans='log10') +
  scale_y_continuous(trans='log10') +
  theme_bw()

sp2

#rm(mol.info)
gc()

The number and identity of genes detected in a cell vary across cells: the total number of genes detected across all cells is far larger than the number of genes per cell.

# for each gene, compute total number of UMIs across all cells,
# then counts genes with at least one UMI:
sum(rowSums(counts(sce)) > 0)
## [1] 25154
# for each cell count number of genes with at least 1 UMI
# then compute distribution moments:
summary(colSums(counts(sce) > 0))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##      29     717     939    1225    1523    6570

Now let’s plot, for each gene, the total number of UMIs and the proportion of cells in which it is not detected. Lowly expressed genes are undetected in a large proportion of cells; the higher the overall expression, the smaller that proportion.

# randomly subset 1000 cells:
tmpCounts <- counts(sce)[, sample(ncol(sce), 1000)]

plot(rowSums(tmpCounts),
     rowMeans(tmpCounts == 0),
     log = "x",
     xlab="total number of UMIs",
     ylab="proportion of cells not expressing the gene"
)

Count genes that are not ‘expressed’ (detected):

not.expressed <- rowSums(counts(sce)) == 0
table(not.expressed)
## not.expressed
## FALSE  TRUE 
## 25154  8384

Plot the percentage of counts per gene and show genes with the highest UMI counts:

#Compute the relative expression of each gene per cell
rel_expression <- t( t(counts(sce)) / Matrix::colSums(counts(sce))) * 100
rownames(rel_expression) <- rowData(sce)$Symbol
most_expressed <- sort(Matrix::rowSums(rel_expression), decreasing = TRUE)[20:1] / ncol(sce)

boxplot(as.matrix(t(rel_expression[names(most_expressed), ])),
        cex=.1, las=1, xlab="% total count per cell",
        col=scales::hue_pal()(20)[20:1], horizontal=TRUE)

Mind that we have combined two data sets here. It may be interesting to count non-expressed genes in each set separately.
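
A minimal sketch of such a per-set count, splitting cells into Hca (source_name ‘ABMMC’) and Caron (all other sample groups):

# count genes with no UMI at all, separately in each data set
isHca <- sce$source_name == "ABMMC"
table(Caron_notExpressed = Matrix::rowSums(counts(sce)[, !isHca]) == 0,
      Hca_notExpressed = Matrix::rowSums(counts(sce)[, isHca]) == 0)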

1.10 Quality control

Cell calling performed above does not inform on the quality of the library in each of the droplets kept. Poor-quality cells, or rather droplets, may be caused by cell damage during dissociation or failed library preparation. They usually have low UMI counts, few genes detected and/or high mitochondrial content. Their presence may affect normalisation, assessment of cell population heterogeneity, clustering and trajectory analysis:

  • Normalisation: contaminating genes, the ‘ambient RNA’, are detected at low levels in all libraries. In low-quality libraries with little RNA, scaling will inflate counts for these genes more than in better-quality cells, resulting in their apparent upregulation in these cells and increased variance overall.
  • Cell population heterogeneity: low-quality cells distort variance estimation and dimensionality reduction with PCA, where the first principal component will reflect library size rather than biology.
  • Clustering and trajectory: higher mitochondrial and/or nuclear RNA content may cause low-quality cells to cluster separately or to form spurious states or trajectories between distinct cell types.

We will now exclude lowly expressed features and identify low-quality cells, mostly using the following metrics:

  • library size, i.e. the total number of UMIs per cell
  • number of features detected per cell
  • mitochondrial content, i.e. the proportion of UMIs that map to mitochondrial genes; high values are consistent with loss of cytoplasmic RNA from damaged cells, whose mitochondria are retained

We will first annotate genes, to know which lie in the mitochondrial genome, then use scater’s addPerCellQC() to compute various metrics.

Annotate genes with biomaRt.

# retrieve the feature information
gene.info <- rowData(sce)

# setup the biomaRt connection to Ensembl using the correct species genome (hsapiens_gene_ensembl)
ensembl <- useEnsembl(biomart='ensembl', dataset='hsapiens_gene_ensembl')

# retrieve the attributes of interest from biomaRt using the Ensembl gene ID as the key
# beware that this will only retrieve information for matching IDs
gene_symbol <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'chromosome_name', 'start_position', 'end_position', 'strand'),
                     filters='ensembl_gene_id', mart=ensembl,
                     values=gene.info[, 1])

# create a new data frame of the feature information
gene.merge <- merge(gene_symbol, gene.info, by.x=c('ensembl_gene_id'), by.y=c('ID'), all.y=TRUE)
rownames(gene.merge) <- gene.merge$ensembl_gene_id

# set the order for the same as the original gene information
gene.merge <- gene.merge[gene.info[, 1], ]

# reset the rowdata on the SCE object to contain all of this information
rowData(sce) <- gene.merge
# slow
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postBiomart.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postBiomart.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)

Number of genes per chromosome, inc. 13 on the mitochondrial genome:

# number of genes per chromosome
table(rowData(sce)$chromosome_name) %>%
    as.data.frame() %>%
    datatable(rownames = FALSE)
# mitochondrial genes
is.mito <- which(rowData(sce)$chromosome_name=="MT")

Calculate and store QC metrics for genes with addPerFeatureQC() and for cells with addPerCellQC().

# long
# for genes
sce <- addPerFeatureQC(sce)
# colnames(rowData(sce))
head(rowData(sce)) %>%
    as.data.frame() %>%
    datatable(rownames = TRUE)

Three columns of interest for cells:

  • ‘sum’: total UMI count
  • ‘detected’: number of features (genes) detected
  • ‘subsets_Mito_percent’: percentage of UMIs mapped to mitochondrial genes
# for cells
sce <- addPerCellQC(sce, subsets=list(Mito=is.mito))
# colnames(colData(sce))
head(colData(sce)) %>%
    as.data.frame() %>%
    datatable(rownames = TRUE)
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postAddQc.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postAddQc.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)

1.10.1 QC metric distribution

Overall:

par(mfrow=c(1, 2))
hist(log10(sce$sum), breaks=20, col="grey80", xlab="Log-total UMI count", main="")
hist(sce$subsets_Mito_percent, breaks=20, col="grey80", xlab="Proportion of reads in mitochondrial genes", main="")
abline(v=20, lty=2, col='purple')

Per sample group:

sce$source_name <- factor(sce$source_name)
sce$block <- sce$source_name
sce$setName <- ifelse(grepl("ABMMC", sce$source_name), "Hca", "Caron")

Library size (‘Total count’):

p <- plotColData(sce, x="block", y="sum",
        other_fields="setName") + #facet_wrap(~setName) + 
        scale_y_log10() + ggtitle("Total count")
p

Number of genes (‘detected’):

p <- plotColData(sce, x="block", y="detected", 
        other_fields="setName") + #facet_wrap(~setName) + 
        scale_y_log10() + ggtitle("Detected features")
p

Mitochondrial content (‘subsets_Mito_percent’):

p <- plotColData(sce, x="block", y="subsets_Mito_percent", 
        other_fields="setName") + # facet_wrap(~setName) +
        ggtitle("Mito percent")
p

#tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData.png", projDir, outDirBit, qcPlotDirBit)
tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData2.png", projDir, outDirBit, qcPlotDirBit)
png(tmpFn)
# violin plots
gridExtra::grid.arrange(
    plotColData(sce, x="block", y="sum",
        other_fields="setName") + #facet_wrap(~setName) + 
        scale_y_log10() + ggtitle("Total count"),
    plotColData(sce, x="block", y="detected", 
        other_fields="setName") + #facet_wrap(~setName) + 
        scale_y_log10() + ggtitle("Detected features"),
    plotColData(sce, x="block", y="subsets_Mito_percent", 
        other_fields="setName") + # facet_wrap(~setName) +
        ggtitle("Mito percent"),
    ncol=1
)
dev.off()
#tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData.png", projDir, outDirBit, qcPlotDirBit)
tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData2.png", projDir, outDirBit, qcPlotDirBit)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)

rm(tmpFn)

Correlation between the number of genes detected and library size (‘detected’ against ‘sum’):

#sp <- ggplot(data.frame(colData(sce)), aes(x=sum, y=detected, col=source_name)) +
#  geom_point()
#sp + facet_wrap(~source_name)

sp <- ggplot(data.frame(colData(sce)), aes(x=sum, y=detected, col=subsets_Mito_percent)) +
  geom_point()
sp + facet_wrap(~source_name)

1.11 Identification of low-quality cells with adaptive thresholds

One can use hard thresholds for the library size, number of genes detected and mitochondrial content. These will however vary across runs. It may therefore be preferable to rely on outlier detection to identify cells that markedly differ from most cells.

We saw above that the distribution of the QC metrics is close to normal. Hence, we can detect outliers using the median and the median absolute deviation (MAD) from the median (rather than the mean and the standard deviation, which are both sensitive to outliers).

For a given metric, an outlier value is one that lies over some number of MADs away from the median. A cell will be excluded if it is an outlier in the part of the range to avoid, for example low gene counts, or high mitochondrial content. For a normal distribution, a threshold defined with a distance of 3 MADs from the median retains about 99% of values.
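
As an illustration, here is a minimal sketch of the computation underlying isOutlier() for a ‘lower’ threshold on the log scale, assuming the default of 3 MADs; compare with the thresholds reported in the next sections:

# lower threshold: median minus 3 MADs, computed on log-transformed values,
# then back-transformed to the original scale
x <- log2(sce$sum)
2^(median(x) - 3 * mad(x))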

1.11.1 Library size

For the library size we use the log scale to avoid a negative threshold at the lower end of the distribution.

qc.lib2 <- isOutlier(sce$sum, log=TRUE, type="lower")
table(qc.lib2)
## qc.lib2
## FALSE  TRUE 
## 37788   340

Threshold values:

attr(qc.lib2, "thresholds")
##    lower   higher 
## 571.1178      Inf

1.11.2 Number of genes

For the number of genes detected we also use the log scale to avoid a negative threshold at the lower end of the distribution.

qc.nexprs2 <- isOutlier(sce$detected, log=TRUE, type="lower")
table(qc.nexprs2)
## qc.nexprs2
## FALSE  TRUE 
## 37719   409

Threshold values:

attr(qc.nexprs2, "thresholds")
##    lower   higher 
## 201.0401      Inf

1.11.3 Mitochondrial content

For the mitochondrial content the exclusion zone is in the higher part of the distribution.

qc.mito2 <- isOutlier(sce$subsets_Mito_percent, type="higher")
table(qc.mito2)
## qc.mito2
## FALSE  TRUE 
## 35701  2427

Threshold values:

attr(qc.mito2, "thresholds")
##   lower  higher 
##    -Inf 8.72434

1.11.4 Summary

discard2 <- qc.lib2 | qc.nexprs2 | qc.mito2

# Summarize the number of cells removed for each reason.
DataFrame(LibSize=sum(qc.lib2),
      NExprs=sum(qc.nexprs2),
      MitoProp=sum(qc.mito2),
      Total=sum(discard2))
## DataFrame with 1 row and 4 columns
##     LibSize    NExprs  MitoProp     Total
##   <integer> <integer> <integer> <integer>
## 1       340       409      2427      2947

1.11.5 All steps at once

The steps above may be run at once with quickPerCellQC():

reasons <- quickPerCellQC(sce,
              percent_subsets=c("subsets_Mito_percent"))
colSums(as.matrix(reasons)) %>%
  as.data.frame() %>%
    datatable(rownames = TRUE)

1.11.6 Assumptions

Data quality depends on the tissue analysed; some tissues, e.g. brain, are difficult to dissociate, so one level of QC stringency will not fit all data sets.

Filtering based on QC metrics as done here assumes that these QC metrics are not correlated with biology. This may not necessarily be true in highly heterogeneous data sets where some cell types, represented by good-quality cells, may have low RNA content or high mitochondrial content.

1.12 Experimental factors

The two data sets analysed here may have been obtained in experiments with different settings, such as cell preparation or sequencing depth. Such differences between these two batches would affect the adaptive thresholds discussed above. We will now perform QC in each batch separately.

We will use the quickPerCellQC() ‘batch’ option.

batch.reasons <- quickPerCellQC(sce, percent_subsets=c("subsets_Mito_percent"), batch=sce$setName)
colSums(as.matrix(batch.reasons)) %>%
  as.data.frame() %>%
    datatable(rownames = TRUE)
sce$discard <- batch.reasons$discard

Fewer cells are discarded, in particular because of small library size and low gene number.

But the differences go deeper: the two sets of discarded cells only partially overlap:

table(reasons$discard, batch.reasons$discard)
##        
##         FALSE  TRUE
##   FALSE 34901   280
##   TRUE    773  2174

Library size:

plotColData(sce, x="block", y="sum", colour_by="discard",
        other_fields="setName") +
    #facet_wrap(~setName) + 
        scale_y_log10() + ggtitle("Total count")

Number of genes detected:

    plotColData(sce, x="block", y="detected", colour_by="discard", 
        other_fields="setName") +
    #facet_wrap(~setName) + 
        scale_y_log10() + ggtitle("Detected features")

plotColData(sce, x="block", y="detected", colour_by="discard", 
        other_fields="setName") +
          facet_wrap(~colour_by) + 
        scale_y_log10() + ggtitle("Detected features")

Mitochondrial content:

    plotColData(sce, x="block", y="subsets_Mito_percent", 
        colour_by="discard", other_fields="setName") + 
        #facet_wrap(~setName) +
    ggtitle("Mito percent")

1.12.1 Identify poor-quality batches

We will now consider the ‘sample’ batch to illustrate how to identify batches of overall low quality, or batches that differ from the others. Let’s compare thresholds across sample groups.

1.12.1.1 Number of genes detected

# compute
discard.nexprs <- isOutlier(sce$detected, log=TRUE, type="lower", batch=sce$Sample.Name)
nexprs.thresholds <- attr(discard.nexprs, "thresholds")["lower",]
nexprs.thresholds %>%
  round(0) %>%
  as.data.frame() %>%
    datatable(rownames = TRUE)

Without block:

# plots - without blocking
discard.nexprs.woBlock <- isOutlier(sce$detected, log=TRUE, type="lower")
without.blocking <- plotColData(sce, x="Sample.Name", y="detected",
    colour_by=I(discard.nexprs.woBlock))

without.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))

With block:

# plots - with blocking
with.blocking <- plotColData(sce, x="Sample.Name", y="detected",
    colour_by=I(discard.nexprs))
with.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))

1.12.1.2 Mitochondrial content

discard.mito <- isOutlier(sce$subsets_Mito_percent, type="higher", batch=sce$Sample.Name)
mito.thresholds <- attr(discard.mito, "thresholds")["higher",]
mito.thresholds %>%
  round(0) %>%
  as.data.frame() %>%
    datatable(rownames = TRUE)

Without block:

# plots - without blocking
discard.mito.woBlock <- isOutlier(sce$subsets_Mito_percent, type="higher")
without.blocking <- plotColData(sce, x="Sample.Name", y="subsets_Mito_percent",
    colour_by=I(discard.mito.woBlock))
without.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))

With block:

# plots - with blocking
with.blocking <- plotColData(sce, x="Sample.Name", y="subsets_Mito_percent",
    colour_by=I(discard.mito))
with.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))

1.12.1.3 Samples to check

Names of samples with a ‘low’ threshold for the number of genes detected:

# names
names(nexprs.thresholds)[isOutlier(nexprs.thresholds, type="lower")]
## character(0)

Names of samples with a ‘high’ threshold for mitochondrial content:

# names
names(mito.thresholds)[isOutlier(mito.thresholds, type="higher")]
## [1] "GSM3872434" "GSM3872437" "GSM3872438" "GSM3872443"

1.12.2 QC metrics space

A similar approach exists to identify outliers using a set of metrics together. We will use the same QC metrics as above.

# slow
stats <- cbind(log10(sce$sum),
           log10(sce$detected),
           sce$subsets_Mito_percent)

library(robustbase)
outlying <- adjOutlyingness(stats, only.outlyingness = TRUE)
multi.outlier <- isOutlier(outlying, type = "higher")
summary(multi.outlier)
##    Mode   FALSE    TRUE 
## logical   34323    3805

Compare with previous filtering:

table(sce$discard, multi.outlier)
##        multi.outlier
##         FALSE  TRUE
##   FALSE 33531  2143
##   TRUE    792  1662

1.12.3 QC PCA

One can also perform a principal components analysis (PCA) on cells, based on the column metadata in a SingleCellExperiment object. Here we will only use the library size, the number of genes detected (which is correlated with library size) and the mitochondrial content.

sce <- runColDataPCA(sce, variables=list(
    "sum", "detected", "subsets_Mito_percent"),
    outliers=TRUE)

#reducedDimNames(sce)
#head(reducedDim(sce))
#head(colData(sce))

#p <- plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample.Name")
p <- plotReducedDim(sce, dimred="PCA_coldata", colour_by = "outlier")
p + facet_wrap(~sce$discard)

Compare with previous filtering:

table(sce$discard, sce$outlier)
##        
##         FALSE  TRUE
##   FALSE 33989  1685
##   TRUE    785  1669
table(multi.outlier, sce$outlier)
##              
## multi.outlier FALSE  TRUE
##         FALSE 33207  1116
##         TRUE   1567  2238

1.12.4 Other diagnostic plots

Mitochondrial content against library size:

plotColData(sce, x="sum", y="subsets_Mito_percent", colour_by="discard")

Split by sample group:

sp <- ggplot(data.frame(colData(sce)), aes(x=detected, y=subsets_Mito_percent, col=discard)) +
  geom_point()
sp + facet_wrap(~source_name)

Mind the underlying density of points:

sp <- ggplot(data.frame(colData(sce)), aes(x=detected, y=subsets_Mito_percent)) +
  geom_point(size = 0.05, alpha = 0.0)
sp + 
  #geom_density_2d_filled(alpha = 0.5) +
  geom_density_2d(size = 0.5, colour = "black")

#sp <- ggplot(data.frame(colData(sce)), aes(x=detected, y=subsets_Mito_percent)) +
#  geom_point()
#sp + geom_density_2d_filled() + facet_wrap(~source_name)

1.12.5 Filter low-quality cells out

We will now exclude poor-quality cells.

scePreQc <- sce
sce <- scePreQc[,!scePreQc$discard]
sce
## class: SingleCellExperiment 
## dim: 33538 35674 
## metadata(20): Samples Samples ... Samples Samples
## assays(1): counts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
##   ENSG00000268674
## rowData names(10): ensembl_gene_id external_gene_name ... mean detected
## colnames: NULL
## colData names(19): Sample Barcode ... discard outlier
## reducedDimNames(1): PCA_coldata
## altExpNames(0):

We also write the R object to file to use later if need be.

# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postQc.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postQc.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)

1.13 Novelty

The number of genes per UMI for each cell informs on the level of sequencing saturation achieved (hbctraining). For a given cell, as sequencing depth increases, each extra UMI is less likely to correspond to a gene not already detected in that cell. Cells with a small library size tend to have higher overall ‘novelty’, i.e. they have not reached saturation for any given gene. Outlier cells may have low-complexity libraries, which may be characteristic of some cell types, e.g. red blood cells. The expected novelty is about 0.8.

Here we see that some PBMMCs have low novelty, i.e. overall fewer genes were detected for an equivalent number of UMIs in these cells than in others.

p <- colData(sce) %>%
  data.frame() %>%
  ggplot(aes(x=sum, y=detected, color=subsets_Mito_percent)) + 
  geom_point() + 
  stat_smooth(method=lm) +
  scale_x_log10() + 
  scale_y_log10() + 
  geom_vline(xintercept = 800) +
  facet_wrap(~source_name)
p

# write plot to file

tmpFn <- sprintf("%s/%s/%s/novelty_scat.png", projDir, outDirBit, qcPlotDirBit)
ggsave(plot=p, file=tmpFn)

# Novelty
# the number of genes per UMI for each cell,
# https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionIV/lessons/SC_quality_control_analysis.html

# Add the number of genes per UMI (log10 ratio) for each cell to the metadata
colData(sce)$log10GenesPerUMI <- log10(colData(sce)$detected) / log10(colData(sce)$sum)
# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI
p <- colData(sce) %>%
    data.frame() %>%
        ggplot(aes(x=log10GenesPerUMI, color = source_name, fill = source_name)) +
        geom_density()
p

tmpFn <- sprintf("%s/%s/%s/novelty_dens.png", projDir, outDirBit, qcPlotDirBit)
ggsave(plot=p, file=tmpFn)

1.14 QC based on sparsity

The approach above identified poor-quality cells using thresholds on the number of genes detected and mitochondrial content. We will here specifically look at the sparsity of the data, both at the gene and cell levels.

1.14.1 Remove genes that are not expressed at all

Genes that are not expressed at all are not informative, so we remove them.

not.expressed <- rowSums(counts(scePreQc)) == 0

# store the cell-wise information
cols.meta <- colData(scePreQc)
rows.meta <- rowData(scePreQc)

nz.counts <- counts(scePreQc)[!not.expressed, ]
sce.nz <- SingleCellExperiment(list(counts=nz.counts))

# reset the column data on the new object
colData(sce.nz) <- cols.meta
rowData(sce.nz) <- rows.meta[!not.expressed, ]

sce.nz
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_nz.Rds", projDir, outDirBit)
saveRDS(sce.nz, tmpFn)
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_nz.Rds", projDir, outDirBit)
sce.nz <- readRDS(tmpFn)

1.14.2 Sparsity plots

We will compute:

  • the cell sparsity: for each cell, the proportion of genes that are not detected
  • the gene sparsity: for each gene, the proportion of cells in which it is not detected
# compute - SLOW
cell_sparsity <- apply(counts(sce.nz) == 0, 2, sum)/nrow(counts(sce.nz))
gene_sparsity <- apply(counts(sce.nz) == 0, 1, sum)/ncol(counts(sce.nz))

colData(sce.nz)$cell_sparsity <- cell_sparsity
rowData(sce.nz)$gene_sparsity <- gene_sparsity

# write outcome to file for later use
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_sparsityCellGene.Rds", projDir, outDirBit)
saveRDS(list("colData" = colData(sce.nz),
         "rowData" = rowData(sce.nz)),
    tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_sparsityCellGene.Rds", projDir, outDirBit)
tmpList <- readRDS(tmpFn)
cell_sparsity <- tmpList$colData$cell_sparsity
gene_sparsity <- tmpList$rowData$gene_sparsity
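
The apply() calls above are slow on a large sparse matrix; here is a sketch of an equivalent, sparse-friendly computation (same definitions as above):

# count detected (non-zero) entries per cell and per gene,
# then convert to the proportion of zeros
cell_sparsity_fast <- 1 - Matrix::colSums(counts(sce.nz) > 0) / nrow(sce.nz)
gene_sparsity_fast <- 1 - Matrix::rowSums(counts(sce.nz) > 0) / ncol(sce.nz)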

We now plot the distribution of these two metrics.

The cell sparsity plot shows that cells have between 85% and 99% 0’s, which is typical.

hist(cell_sparsity, breaks=50, col="grey80", xlab="Cell sparsity", main="")

The gene sparsity plot shows that a large number of genes are almost never detected, which is also regularly observed.

hist(gene_sparsity, breaks=50, col="grey80", xlab="Gene sparsity", main="")

1.14.3 Filters

We also remove cells with sparsity higher than 0.99, and/or mitochondrial content higher than 20%.

Genes detected in a few cells only are unlikely to be informative and would hinder normalisation. We will remove genes that are expressed in fewer than 20 cells.

# filter
sparse.cells <- cell_sparsity > 0.99
mito.cells <- sce.nz$subsets_Mito_percent > 20

min.cells <- 1 - (20/length(cell_sparsity))
sparse.genes <- gene_sparsity > min.cells
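
The sparse.genes filter above is equivalent to flagging genes detected in fewer than 20 cells; a minimal sketch of the direct computation, which should agree with it:

# genes with a non-zero count in fewer than 20 cells
sparse.genes.direct <- Matrix::rowSums(counts(sce.nz) > 0) < 20
table(sparse.genes, sparse.genes.direct)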

Number of genes removed:

table(sparse.genes)
## sparse.genes
## FALSE  TRUE 
## 17875  7279

Number of cells removed:

table(sparse.cells, mito.cells)
##             mito.cells
## sparse.cells FALSE  TRUE
##        FALSE 36816   687
##        TRUE    557    68
# remove cells from the SCE object that are poor quality
# remove the sparse genes, then re-set the counts and row data accordingly
cols.meta <- colData(sce.nz)
rows.meta <- rowData(sce.nz)

counts.nz <- counts(sce.nz)[!sparse.genes, !(sparse.cells | mito.cells)]
sce.nz <- SingleCellExperiment(assays=list(counts=counts.nz))
colData(sce.nz) <- cols.meta[!(sparse.cells | mito.cells),]
rowData(sce.nz) <- rows.meta[!sparse.genes, ]
sce.nz
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_postQc.Rds", projDir, outDirBit)
saveRDS(sce.nz, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_postQc.Rds", projDir, outDirBit)
sce.nz <- readRDS(tmpFn)

Compare with filter above (mind that the comparison is not fair because we used a less stringent, hard filtering on mitochondrial content):

table(scePreQc$discard, (sparse.cells | mito.cells))
##        
##         FALSE  TRUE
##   FALSE 35302   372
##   TRUE   1514   940

1.14.4 Separate Caron and Hca batches

We will now check sparsity for each batch separately.

sce.nz.caron <- sce.nz[,sce.nz$setName=="Caron"]
sce.nz.hca <- sce.nz[,sce.nz$setName=="Hca"]

1.14.5 Caron only

setName <- "caron"
sce.x <- sce.nz.caron
# compute - SLOW
cell_sparsity <- apply(counts(sce.x) == 0, 2, sum)/nrow(counts(sce.x))
gene_sparsity <- apply(counts(sce.x) == 0, 1, sum)/ncol(counts(sce.x))

colData(sce.x)$cell_sparsity <- cell_sparsity
rowData(sce.x)$gene_sparsity <- gene_sparsity

# write outcome to file for later use
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
saveRDS(list("colData" = colData(sce.x),
         "rowData" = rowData(sce.x)),
    tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
tmpList <- readRDS(tmpFn)
cell_sparsity <- tmpList$colData$cell_sparsity
gene_sparsity <- tmpList$rowData$gene_sparsity
# plot
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
png(tmpFn)
par(mfrow=c(1, 2))
hist(cell_sparsity, breaks=50, col="grey80", xlab="Cell sparsity", main="")
hist(gene_sparsity, breaks=50, col="grey80", xlab="Gene sparsity", main="")
abline(v=40, lty=2, col='purple')
dev.off()
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
#tmpFn <- sprintf("../%s/%s_sparsity.png", qcPlotDirBit, setName)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)

rm(tmpFn)
# filter
sparse.cells <- cell_sparsity > 0.99
mito.cells <- sce.x$subsets_Mito_percent > 20

min.cells <- 1 - (20/length(cell_sparsity))
sparse.genes <- gene_sparsity > min.cells
# remove cells from the SCE object that are poor quality
# remove the sparse genes, then re-set the counts and row data accordingly
cols.meta <- colData(sce.x)
rows.meta <- rowData(sce.x)

counts.x <- counts(sce.x)[!sparse.genes, !(sparse.cells | mito.cells)]
sce.x <- SingleCellExperiment(assays=list(counts=counts.x))
colData(sce.x) <- cols.meta[!(sparse.cells | mito.cells),]
rowData(sce.x) <- rows.meta[!sparse.genes, ]
sce.x

We write the R object to ‘caron_sce_nz_postQc.Rds’.

# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
saveRDS(sce.x, tmpFn)
rm(sce.x)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
sce.nz.caron <- readRDS(tmpFn)

1.14.6 Hca only

setName <- "hca"
sce.x <- sce.nz.hca
# compute - SLOW
cell_sparsity <- apply(counts(sce.x) == 0, 2, sum)/nrow(counts(sce.x))
gene_sparsity <- apply(counts(sce.x) == 0, 1, sum)/ncol(counts(sce.x))

colData(sce.x)$cell_sparsity <- cell_sparsity
rowData(sce.x)$gene_sparsity <- gene_sparsity

# write outcome to file for later use
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
saveRDS(list("colData" = colData(sce.x),
         "rowData" = rowData(sce.x)),
    tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
tmpList <- readRDS(tmpFn)
cell_sparsity <- tmpList$colData$cell_sparsity
gene_sparsity <- tmpList$rowData$gene_sparsity
# plot
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
png(tmpFn)
par(mfrow=c(1, 2))
hist(cell_sparsity, breaks=50, col="grey80", xlab="Cell sparsity", main="")
hist(gene_sparsity, breaks=50, col="grey80", xlab="Gene sparsity", main="")
abline(v=40, lty=2, col='purple')
dev.off()
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
#tmpFn <- sprintf("../%s/%s_sparsity.png", qcPlotDirBit, setName)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)

rm(tmpFn)
# filter
sparse.cells <- cell_sparsity > 0.99
mito.cells <- sce.x$subsets_Mito_percent > 20

min.cells <- 1 - (20/length(cell_sparsity))
sparse.genes <- gene_sparsity > min.cells
# remove cells from the SCE object that are poor quality
# remove the sparse genes, then re-set the counts and row data accordingly
cols.meta <- colData(sce.x)
rows.meta <- rowData(sce.x)

counts.x <- counts(sce.x)[!sparse.genes, !(sparse.cells | mito.cells)]
sce.x <- SingleCellExperiment(assays=list(counts=counts.x))
colData(sce.x) <- cols.meta[!(sparse.cells | mito.cells),]
rowData(sce.x) <- rows.meta[!sparse.genes, ]
sce.x

We write the R object to ‘hca_sce_nz_postQc.Rds’.

# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
saveRDS(sce.x, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
sce.nz.hca <- readRDS(tmpFn)