We will use two sets of Bone Marrow Mononuclear Cells (BMMC):
Fastq files were retrieved from publicly available archive (SRA and HCA).
Sequencing quality was assessed and visualised using fastQC and MultiQC.
Reads were aligned against GRCh38 and features counted using cellranger (v3.1.0).
We will now check the quality of the data further:
We will then:
#wrkDir <- "/mnt/scratchb/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020/CaronBourque2020/grch38300"
projDir <- "/home/ubuntu/Course_Materials/scRNAseq"
wrkDir <- sprintf("%s/CaronBourque2020/grch38300", projDir)
#setwd(wrkDir)
outDirBit <- params$outDirBit # "AnaWiSeurat/Attempt1"
qcPlotDirBit <- "QcPlots"
poolBool <- FALSE # whether to read each sample in and pool them and write object to file, or just load that file.
biomartBool <- FALSE # biomaRt sometimes fails, do it once, write to file and use that copy.
addQcBool <- FALSE # biomaRt sometimes fails, do it once, write to file and use that copy.
runAll <- FALSE # TRUE
We will load both the Caron and Hca data sets.
# CaronBourque2020
cb_sampleSheetFn <- file.path(projDir, "Data/CaronBourque2020/SraRunTable.txt")
# Human Cell Atlas
hca_sampleSheetFn <- file.path(projDir, "Data/Hca/accList_Hca.txt")
# read sample sheet in:
splShtColToKeep <- c("Run", "Sample.Name", "source_name")
cb_sampleSheet <- read.table(cb_sampleSheetFn, header=T, sep=",")
hca_sampleSheet <- read.table(hca_sampleSheetFn, header=F, sep=",")
colnames(hca_sampleSheet) <- "Sample.Name"
hca_sampleSheet$Run <- hca_sampleSheet$Sample.Name
hca_sampleSheet$source_name <- "ABMMC" # adult BMMC
sampleSheet <- rbind(cb_sampleSheet[,splShtColToKeep], hca_sampleSheet[,splShtColToKeep])
sampleSheet %>%
as.data.frame() %>%
datatable(rownames = FALSE)
We will use a SingleCellExperiment object that is described here and stores various data types:
[singleCellExperimentClass.png](/home/ubuntu/Course_Materials/scRNAseq/Images/singleCellExperimentClass.png
We will load the data for the first sample in the sample sheet: SRR9264343.
i <- 1
sample.path <- sprintf("%s/%s/%s/outs/raw_feature_bc_matrix/", wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
sce.raw <- read10xCounts(sample.path, col.names=TRUE)
sce.raw
## class: SingleCellExperiment
## dim: 33538 737280
## metadata(1): Samples
## assays(1): counts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
## ENSG00000268674
## rowData names(3): ID Symbol Type
## colnames(737280): AAACCTGAGAAACCAT-1 AAACCTGAGAAACCGC-1 ...
## TTTGTCATCTTTAGTC-1 TTTGTCATCTTTCCTC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):
We can access these different types of data with various functions.
Number of genes and droplets in the count matrix:
dim(counts(sce.raw))
## [1] 33538 737280
Features, with rowData():
head(rowData(sce.raw))
## DataFrame with 6 rows and 3 columns
## ID Symbol Type
## <character> <character> <character>
## ENSG00000243485 ENSG00000243485 MIR1302-2HG Gene Expression
## ENSG00000237613 ENSG00000237613 FAM138A Gene Expression
## ENSG00000186092 ENSG00000186092 OR4F5 Gene Expression
## ENSG00000238009 ENSG00000238009 AL627309.1 Gene Expression
## ENSG00000239945 ENSG00000239945 AL627309.3 Gene Expression
## ENSG00000239906 ENSG00000239906 AL627309.2 Gene Expression
Samples, with colData():
head(colData(sce.raw))
## DataFrame with 6 rows and 2 columns
## Sample
## <character>
## AAACCTGAGAAACCAT-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACCGC-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACCTA-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACGAG-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAACGCC-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## AAACCTGAGAAAGTGG-1 /home/ubuntu/Course_Materials/scRNAseq/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/raw_feature_bc_matrix/
## Barcode
## <character>
## AAACCTGAGAAACCAT-1 AAACCTGAGAAACCAT-1
## AAACCTGAGAAACCGC-1 AAACCTGAGAAACCGC-1
## AAACCTGAGAAACCTA-1 AAACCTGAGAAACCTA-1
## AAACCTGAGAAACGAG-1 AAACCTGAGAAACGAG-1
## AAACCTGAGAAACGCC-1 AAACCTGAGAAACGCC-1
## AAACCTGAGAAAGTGG-1 AAACCTGAGAAAGTGG-1
Single-cell RNA-seq data compared to bulk RNA-seq is sparse, especially with droplet-based methods such as 10X, mostly because:
Counts, with counts(). Given the large number of droplets in a sample, count matrices can be large. They are however very sparse and can be stored in a ‘sparse matrix’ that only stores non-zero values, for example a ‘dgCMatrix’ object (‘DelayedArray’ class).
counts(sce.raw) <- as(counts(sce.raw), "dgCMatrix")
#class(counts(sce.raw))
counts(sce.raw)[1:10, 1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##
## ENSG00000243485 . . . . . . . . . .
## ENSG00000237613 . . . . . . . . . .
## ENSG00000186092 . . . . . . . . . .
## ENSG00000238009 . . . . . . . . . .
## ENSG00000239945 . . . . . . . . . .
## ENSG00000239906 . . . . . . . . . .
## ENSG00000241599 . . . . . . . . . .
## ENSG00000236601 . . . . . . . . . .
## ENSG00000284733 . . . . . . . . . .
## ENSG00000235146 . . . . . . . . . .
In /home/ubuntu/Course_Materials/scRNAseq/ScriptsForTut:
tmpFn <- sprintf("%s/Images/1_AAACCTGAGACTTTCG-1.rseqcGeneBodyCovCheck.txt.geneBodyCoverage.curves.png", projDir)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)
We will use the information stored in the ‘molecule info’ file to count the number of UMI and reads for each gene in each cell.
##mol.info.file <- sprintf("%s/%s/%s/outs/molecule_info.h5", wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
##mol.info <- read10xMolInfo(mol.info.file)
# or mol.info object if issue with H5Fopen
i <- 1
mol.info.file <- sprintf("%s/%s/%s/outs/molecule_info_h5.Rds", wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
mol.info <- readRDS(mol.info.file)
# slow
mol.info$data
## DataFrame with 18544666 rows and 5 columns
## cell umi gem_group gene reads
## <character> <integer> <integer> <integer> <integer>
## 1 AAACCTGAGAAACCTA 467082 1 3287 1
## 2 AAACCTGAGAAACCTA 205888 1 3446 1
## 3 AAACCTGAGAAACCTA 866252 1 3896 3
## 4 AAACCTGAGAAACCTA 796027 1 3969 1
## 5 AAACCTGAGAAACCTA 542561 1 5008 1
## ... ... ... ... ... ...
## 18544662 TTTGTCATCTTTAGTC 927060 1 23634 1
## 18544663 TTTGTCATCTTTAGTC 975865 1 27143 1
## 18544664 TTTGTCATCTTTAGTC 364964 1 27467 4
## 18544665 TTTGTCATCTTTAGTC 152570 1 30125 7
## 18544666 TTTGTCATCTTTAGTC 383230 1 30283 5
head(mol.info$genes)
## [1] "ENSG00000243485" "ENSG00000237613" "ENSG00000186092" "ENSG00000238009"
## [5] "ENSG00000239945" "ENSG00000239906"
dd <- mol.info$data %>% data.frame()
dd$umi <- as.character(dd$umi)
tmpFn <- sprintf("%s/%s/Robjects/sce_preProc_ampDf1.Rds", projDir, outDirBit)
if(!FALSE) # slow
{
ampDf <- dd %>% group_by(cell, gene) %>%
summarise(nUmis = n(), totReads=sum(reads)) %>%
data.frame()
# Write object to file
saveRDS(ampDf, tmpFn)
} else {
ampDf <- readRDS(tmpFn)
}
rm(tmpFn)
summary(ampDf$nUmis)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.377 1.000 7137.000
summary(ampDf$totReads)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 6.00 15.23 12.00 79275.00
summary(log2(ampDf$nUmis))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.4504 0.0000 12.8011
summary(log2(ampDf$totReads))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.585 2.401 3.585 16.275
We now plot the amplification rate.
# too slow
sp <- ggplot(ampDf, aes(x=nUmis, y=totReads)) +
geom_point() +
scale_x_continuous(trans='log2') +
scale_y_continuous(trans='log2')
sp
#ggMarginal(sp)
sp2 <- ggplot(ampDf, aes(x=nUmis, y=totReads)) +
geom_bin2d(bins = 50) +
scale_fill_continuous(type = "viridis") +
scale_x_continuous(trans='log10') +
scale_y_continuous(trans='log10') +
theme_bw()
sp2
#rm(mol.info)
gc(verbose=FALSE)
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 8359027 446.5 35542973 1898.2 69419867 3707.5
## Vcells 105889427 807.9 277281324 2115.5 277281324 2115.5
For a given sample, amongst the tens of thousands of droplets used in the assay, some will contain a cell while many others will not. The presence of RNA in a droplet will show with non-zero UMI count. This is however not sufficient to infer that the droplet does contain a cell. Indeed, after sample preparation, some cell debris including RNA will still float in the mix. This ambient RNA is unwillignly captured during library preparation and sequenced.
Cellranger generates a count matrix that includes all droplets analysed in the assay. We will now load this ‘raw matrix’ for one sample and draw the distribution of UMI counts.
Distribution of UMI counts:
dd <- mol.info$data %>% data.frame()
dd$umi <- as.character(dd$umi)
libSizeDf <- dd %>% group_by(cell) %>%
summarise(nUmis = n(), totReads=sum(reads)) %>%
data.frame()
ggplot(libSizeDf, aes(x=log10(nUmis))) + geom_histogram(bins = 50)
Library size varies widely, both amongst empty droplets and droplets carrying cells, mostly due to:
Most cell counting methods try to identify the library size that best distinguishes empty from cell-carrying droplets.
This method by default fits a mixture of two normal distributions to the logged library sizes:
set.seed(100)
# get package
library("mixtools")
# have library sizes on a log10 scale
log10_lib_size <- log10(libSizeDf$nUmis)
# fit mixture
mix <- normalmixEM(log10_lib_size, maxrestarts=50, epsilon = 1e-03)
## One of the variances is going to zero; trying new starting values.
## number of iterations= 19
# plot
plot(mix, which=2, xlab2="log(mol per cell)")
# get density for each distribution:
p1 <- dnorm(log10_lib_size, mean=mix$mu[1], sd=mix$sigma[1])
p2 <- dnorm(log10_lib_size, mean=mix$mu[2], sd=mix$sigma[2])
# find intersection:
if (mix$mu[1] < mix$mu[2]) {
split <- min(log10_lib_size[p2 > p1])
} else {
split <- min(log10_lib_size[p1 > p2])
}
# show split on plot:
#log10_lib_size <- log10(libSizeDf$nUmis)
abline(v=split, lwd=2)
The barcode rank plot shows the library sizes against their rank in decreasing order.
barcode_rank <- rank(-libSizeDf$nUmis)
plot(barcode_rank, libSizeDf$nUmis, xlim=c(1,10000), ylab="library size")
Given the exponential shape of the curve above, library sizes can be shown on the log10 scale:
plot(barcode_rank, log10_lib_size, xlim=c(1,10000))
The point on the curve where it drops sharply may be used as the split point. Before that point library sizes are high, because droplets carry a cell. After that point, library sizes are far smaller because droplets do not carry a cell, only ambient RNA (… or do they?).
Here, we could manually count 2500 cells. There are however more robust and convenient methods.
We could also compute the inflection point.
o <- order(barcode_rank)
log10_lib_size <- log10_lib_size[o]
barcode_rank <- barcode_rank[o]
rawdiff <- diff(log10_lib_size)/diff(barcode_rank)
inflection <- which(rawdiff == min(rawdiff[100:length(rawdiff)], na.rm=TRUE))
plot(barcode_rank, log10_lib_size, xlim=c(1,10000))
abline(v=inflection, col="red", lwd=2)
The inflection is at 3279 UMIs and 2306 cells are detected.
Given an expected number of cells, cellranger used to assume a ~10-fold range of library sizes for real cells and estimate this range (cellranger (v1 and v2). The threshold was defined as the 99th quantile of the library size, divided by 10.
n_cells <- 2500
# CellRanger
totals <- sort(libSizeDf$nUmis, decreasing = TRUE)
# 99th percentile of top n_cells divided by 10
thresh = totals[round(0.01*n_cells)]/10
plot(log10(totals), xlim=c(1,10000))
abline(h=log10(thresh), col="red", lwd=2)
# table(libSizeDf$nUmis >= thresh)
The threshold is at 1452 UMIs and 2773 cells are detected.
The DropletUtils
package offers utilities to analyse droplet-based data, including cell counting using the library size as seen above. These simple approaches may exclude droplets with small or quiet cells with low RNA content. The emptyDrops
method calls cells by first computing the expression profile for droplets with RNA content so low they almost certainly do not contain any cell: the ‘background’ or ‘ambient’ profile. The method then tests each non-background droplet for significant difference in expression profile.
Let’s first check the knee and inflection methods.
my.counts <- counts(sce.raw)
br.out <- barcodeRanks(my.counts)
plot(br.out$rank, br.out$total, log="xy", xlab="Rank", ylab="Total UMI count")
o <- order(br.out$rank)
lines(br.out$rank[o], br.out$fitted[o], col="red")
abline(h=metadata(br.out)$knee, col="dodgerblue", lty=2)
abline(h=metadata(br.out)$inflection, col="forestgreen", lty=2)
legend("bottomleft", lty=2, col=c("dodgerblue", "forestgreen"),
legend=c("knee", "inflection"))
Testing for empty droplets.
We will call cells with a false discovery rate (FDR) of 0.1% so that at most 1 in 1000 droplets called may be empty.
# a bit slow
# significance is computed by simulation so we set a seed for reproducibility
set.seed(100)
# run analysis:
e.out <- emptyDrops(counts(sce.raw))
e.out
## DataFrame with 737280 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## AAACCTGAGAAACCAT-1 0 NA NA NA NA
## AAACCTGAGAAACCGC-1 0 NA NA NA NA
## AAACCTGAGAAACCTA-1 31 NA NA NA NA
## AAACCTGAGAAACGAG-1 0 NA NA NA NA
## AAACCTGAGAAACGCC-1 0 NA NA NA NA
## ... ... ... ... ... ...
## TTTGTCATCTTTACAC-1 0 NA NA NA NA
## TTTGTCATCTTTACGT-1 1 NA NA NA NA
## TTTGTCATCTTTAGGG-1 0 NA NA NA NA
## TTTGTCATCTTTAGTC-1 26 NA NA NA NA
## TTTGTCATCTTTCCTC-1 0 NA NA NA NA
NAs are assigned to droplets used to compute the ambient profile.
Get summary:
summary(e.out$FDR <= 0.001)
## Mode FALSE TRUE NA's
## logical 492 3070 733718
The test significance is computed by permutation. For each droplet tested, the number of permutations may limit the value of the p-value. This information is available in the ‘Limited’ column. If ‘Limited’ is ‘TRUE’ for any non-significant droplet, the number of permutations was too low, should be increased and the analysis re-run.
table(Sig=e.out$FDR <= 0.001, Limited=e.out$Limited)
## Limited
## Sig FALSE TRUE
## FALSE 492 0
## TRUE 71 2999
Let’s check that the background comprises only empty droplets. If the droplets used to define the background profile are indeed empty, testing them should result in a flat distribution of p-values. Let’s test the ‘ambient’ droplets and draw the p-value distribution.
set.seed(100)
limit <- 100
all.out <- emptyDrops(counts(sce.raw), lower=limit, test.ambient=TRUE)
hist(all.out$PValue[all.out$Total <= limit & all.out$Total > 0],
xlab="P-value", main="", col="grey80")
The distribution of p-values looks uniform with no large peak for small values: no cell in these droplets.
To evaluate the outcome of the analysis, we will plot the strength of the evidence against library size.
is.cell <- e.out$FDR <= 0.001
#is.cell <- e.out$FDR <= 0.01
sum(is.cell, na.rm=TRUE)
## [1] 3070
cellColour <- ifelse(is.cell, "red", "black") # rep("black", nrow((e.out)))
tmpBoolInflex <- e.out$Total < metadata(br.out)$inflection #
tmpBoolSmall <- e.out$FDR <= 0.001 # small cell
#table(tmpBoolInflex, tmpBoolSmall)
cellColour[tmpBoolInflex & tmpBoolSmall] <- "green" # 'recovered' cells
plot(log10(e.out$Total), -e.out$LogProb, col=ifelse(is.cell, "red", "black"), xlim=c(2,max(log10(e.out$Total))),
xlab="Total UMI count", ylab="-Log Probability")
points(log10(e.out$Total)[tmpBoolInflex & tmpBoolSmall], -e.out$LogProb[tmpBoolInflex & tmpBoolSmall], pch=16, col="green")
Let’s filter out empty droplets.
sce.ed <- sce.raw[,which(e.out$FDR <= 0.001)] # ed for empty droplet
sce.ed
## class: SingleCellExperiment
## dim: 33538 3070
## metadata(1): Samples
## assays(1): counts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
## ENSG00000268674
## rowData names(3): ID Symbol Type
## colnames(3070): AAACCTGAGACTTTCG-1 AAACCTGGTCTTCAAG-1 ...
## TTTGTCACAGGCTCAC-1 TTTGTCAGTTCGGCAC-1
## colData names(2): Sample Barcode
## reducedDimNames(0):
## altExpNames(0):
Cell calling in cellranger v3 uses a method similar to emptyDrops() and a ‘filtered matrix’ is generated that only keeps droplets deemed to contain a cell. We will load these filtered matrices now.
Each sample was analysed with cellranger separately. We load filtered matrices one sample at a time, showing for each the name and number of features and cells.
# a bit slow
# load data:
sce.list <- vector("list", length = nrow(sampleSheet))
for (i in 1:nrow(sampleSheet))
{
print(sprintf("%s: %s", sampleSheet[i,"Run"], sampleSheet[i,"Sample.Name"]))
sample.path <- sprintf("%s/%s/%s/outs/filtered_feature_bc_matrix/",
wrkDir, sampleSheet[i,"Run"], sampleSheet[i,"Run"])
sce.list[[i]] <- read10xCounts(sample.path)
#print(sce.list[[i]])
print(dim(sce.list[[i]]))
}
Let’s combine all 20 samples into a single object.
We first check the feature lists are identical.
# combine
# check row names are the same
rowNames1 <- rownames(sce.list[[1]])
for (i in 2:nrow(sampleSheet))
{
print(identical(rowNames1, rownames(sce.list[[i]])))
}
A cell barcode comprises the actual sequence and a ‘group ID’, e.g. AAACCTGAGAAACCAT-1. The latter helps distinguish cells with identical barcode sequence but come from different samples. As each sample was analysed separately, the group ID is set to 1 in all data sets. To pool these data sets we first need to change group IDs so cell barcodes are unique across all samples. We will use the position of the sample in the sample sheet.
# subset 2000 cells:
if(ncol(sce.list[[1]]) < 2000)
{
sce <- sce.list[[1]]
} else {
sce <- sce.list[[1]][, sample(ncol(sce.list[[1]]), 2000)]
}
colData(sce)$Barcode <- gsub("([0-9])$", 1, colData(sce)$Barcode)
print(head(colData(sce)$Barcode))
print(tail(colData(sce)$Barcode))
for (i in 2:nrow(sampleSheet))
{
if(ncol(sce.list[[i]]) < 2000)
{
sce.tmp <- sce.list[[i]]
} else {
sce.tmp <- sce.list[[i]][, sample(ncol(sce.list[[i]]), 2000)]
}
colData(sce.tmp)$Barcode <- gsub("([0-9])$", i, colData(sce.tmp)$Barcode)
sce <- cbind(sce, sce.tmp)
#print(head(colData(sce)$Barcode))
print(tail(colData(sce)$Barcode, 2))
}
We now add the sample sheet information to the object metadata.
colDataOrig <- colData(sce)
#colData(sce) <- colDataOrig[,1:2]
# split path:
tmpList <- strsplit(colDataOrig$Sample, split="/")
# get Run ID, to use to match sample in the meta data and sample sheet objects:
tmpVec <- unlist(lapply(tmpList, function(x){x[9]}))
colData(sce)$Run <- tmpVec
# merge:
colData(sce) <- colData(sce) %>% data.frame %>% left_join(sampleSheet[,splShtColToKeep], "Run") %>% DataFrame
Let’s save the object for future reference.
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postPool.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postPool.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)
# a bit slow
# TODO first remove emtpy droplets the mol.info data
mol.info$data
head(mol.info$genes)
dd <- mol.info$data %>% data.frame()
dd$umi <- as.character(dd$umi)
tmpFn <- sprintf("%s/%s/Robjects/sce_preProc_ampDf2.Rds", projDir, outDirBit)
if(!FALSE) # slow
{
ampDf <- dd %>%
filter(paste(cell,gem_group, sep="-") %in% sce$Barcode) %>%
group_by(cell, gene) %>%
summarise(nUmis = n(), totReads=sum(reads)) %>%
data.frame()
# Write object to file
saveRDS(ampDf, tmpFn)
} else {
ampDf <- readRDS(tmpFn)
}
rm(tmpFn)
#summary(ampDf$nUmis)
#summary(ampDf$totReads)
#summary(log2(ampDf$nUmis))
#summary(log2(ampDf$totReads))
hist(log2(ampDf$nUmis), n=100)
hist(log2(ampDf$totReads), n=100)
sp2 <- ggplot(ampDf, aes(x=nUmis, y=totReads)) +
geom_bin2d(bins = 50) +
scale_fill_continuous(type = "viridis") +
scale_x_continuous(trans='log10') +
scale_y_continuous(trans='log10') +
theme_bw()
sp2
e#rm(mol.info)
gc()
The number and identity of genes detected in a cell vary across cells: the total number of genes detected across all cells is far larger than the number of genes per cell.
# for each gene, compute total number of UMIs across all cells,
# then counts genes with at least one UMI:
sum(rowSums(counts(sce)) > 0)
## [1] 25154
# for each cell count number of genes with at least 1 UMI
# then compute distribution moments:
summary(colSums(counts(sce) > 0))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 29 717 939 1225 1523 6570
Now let’s plot for each gene, the total number of UMIs and the proportion of cells that express it. Lowly expressed genes tend to be detected in a large proportion of cells. The higher the overall expression the lower the proportion of cells.
# randomly subset 1000 cells.
tmpCounts <- counts(sce)[,sample(1000)]
plot(rowSums(tmpCounts),
rowMeans(tmpCounts == 0),
log = "x",
xlab="total number of UMIs",
ylab="proportion of cells expressing the gene"
)
Count genes that are not ‘expressed’ (detected):
not.expressed <- rowSums(counts(sce)) == 0
table(not.expressed)
## not.expressed
## FALSE TRUE
## 25154 8384
Plot the percentage of counts per gene and show genes with the highest UMI counts:
#Compute the relative expression of each gene per cell
rel_expression <- t( t(counts(sce)) / Matrix::colSums(counts(sce))) * 100
rownames(rel_expression) <- rowData(sce)$Symbol
most_expressed <- sort(Matrix::rowSums( rel_expression ),T)[20:1] / ncol(sce)
boxplot( as.matrix(t(rel_expression[names(most_expressed),])),cex=.1, las=1, xlab="% total count per cell",col=scales::hue_pal()(20)[20:1],horizontal=TRUE)
Mind that we have combined two data sets here. It may be interesting to count non-expressed genes in each set separately.
Cell calling performed above does not inform on the quality of the library in each of the droplets kept. Poor-quality cells, or rather droplets, may be caused by cell damage during dissociation or failed library preparation. They usually have low UMI counts, few genes detected and/or high mitochondrial content. The presence may affect normalisation, assessment of cell population heterogeneity, clustering and trajectory:
We will now exclude lowly expressed features and identify low-quality cells using the following metrics mostly:
We will first annotate genes, to know which lie in the mitochondrial genome, then use scater’s addPerCellQC()
to compute various metrics.
Annotate genes with biomaRt.
# retrieve the feature information
gene.info <- rowData(sce)
# setup the biomaRt connection to Ensembl using the correct species genome (hsapiens_gene_ensembl)
ensembl <- useEnsembl(biomart='ensembl', dataset='hsapiens_gene_ensembl')
# retrieve the attributes of interest from biomaRt using the Ensembl gene ID as the key
# beware that this will only retrieve information for matching IDs
gene_symbol <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name', 'chromosome_name', 'start_position', 'end_position', 'strand'),
filters='ensembl_gene_id', mart=ensembl,
values=gene.info[, 1])
# create a new data frame of the feature information
gene.merge <- merge(gene_symbol, gene.info, by.x=c('ensembl_gene_id'), by.y=c('ID'), all.y=TRUE)
rownames(gene.merge) <- gene.merge$ensembl_gene_id
# set the order for the same as the original gene information
gene.merge <- gene.merge[gene.info[, 1], ]
# reset the rowdata on the SCE object to contain all of this information
rowData(sce) <- gene.merge
# slow
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postBiomart.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postBiomart.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)
Number of genes per chromosome, inc. 13 on the mitochondrial genome:
# number of genes per chromosome
table(rowData(sce)$chromosome_name) %>%
as.data.frame() %>%
datatable(rownames = FALSE)
# mitochondrial genes
is.mito <- which(rowData(sce)$chromosome_name=="MT")
Calculate and store QC metrics for genes with addPerFeatureQC() and for cells with addPerCellQC().
# long
# for genes
sce <- addPerFeatureQC(sce)
# colnames(rowData(sce))
head(rowData(sce)) %>%
as.data.frame() %>%
datatable(rownames = TRUE)
Three columns of interest for cells:
# for cells
sce <- addPerCellQC(sce, subsets=list(Mito=is.mito))
# colnames(colData(sce))
head(colData(sce)) %>%
as.data.frame() %>%
datatable(rownames = TRUE)
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postAddQc.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postAddQc.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)
Overall:
par(mfrow=c(1, 2))
hist(log10(sce$sum), breaks=20, col="grey80", xlab="Log-total UMI count", main="")
hist(sce$subsets_Mito_percent, breaks=20, col="grey80", xlab="Proportion of reads in mitochondrial genes", main="")
abline(v=20, lty=2, col='purple')
Per sample group:
sce$source_name <- factor(sce$source_name)
sce$block <- sce$source_name
sce$setName <- ifelse(grepl("ABMMC", sce$source_name), "Hca", "Caron")
Library size (‘Total count’):
p <- plotColData(sce, x="block", y="sum",
other_fields="setName") + #facet_wrap(~setName) +
scale_y_log10() + ggtitle("Total count")
p
Number of genes (‘detected’):
p <- plotColData(sce, x="block", y="detected",
other_fields="setName") + #facet_wrap(~setName) +
scale_y_log10() + ggtitle("Detected features")
p
Mitochondrial content (‘subsets_Mito_percent’):
p <- plotColData(sce, x="block", y="subsets_Mito_percent",
other_fields="setName") + # facet_wrap(~setName) +
ggtitle("Mito percent")
p
#tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData.png", projDir, outDirBit, qcPlotDirBit)
tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData2.png", projDir, outDirBit, qcPlotDirBit)
png(tmpFn)
# violin plots
gridExtra::grid.arrange(
plotColData(sce, x="block", y="sum",
other_fields="setName") + #facet_wrap(~setName) +
scale_y_log10() + ggtitle("Total count"),
plotColData(sce, x="block", y="detected",
other_fields="setName") + #facet_wrap(~setName) +
scale_y_log10() + ggtitle("Detected features"),
plotColData(sce, x="block", y="subsets_Mito_percent",
other_fields="setName") + # facet_wrap(~setName) +
ggtitle("Mito percent"),
ncol=1
)
dev.off()
#tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData.png", projDir, outDirBit, qcPlotDirBit)
tmpFn <- sprintf("%s/%s/%s/qc_metricDistrib_plotColData2.png", projDir, outDirBit, qcPlotDirBit)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)
rm(tmpFn)
Correlation between the number of genes detected and library size (‘detected’ against ‘sum’):
#sp <- ggplot(data.frame(colData(sce)), aes(x=sum, y=detected, col=source_name)) +
# geom_point()
#sp + facet_wrap(~source_name)
sp <- ggplot(data.frame(colData(sce)), aes(x=sum, y=detected, col=subsets_Mito_percent)) +
geom_point()
sp + facet_wrap(~source_name)
One can use hard threshold for the library size, number of genes detected and mitochondrial content. These will however vary across runs. It may therefore be preferable to rely on outlier detection to identify cells that markerdly differ from most cells.
We saw above that the distribution of the QC metrics is close to Normal. Hence, we can detect outlier using the median and the median absolute deviation (MAD) from the median (not the mean and the standard deviation that both are sensitive to outliers).
For a given metric, an outlier value is one that lies over some number of MADs away from the median. A cell will be excluded if it is an outlier in the part of the range to avoid, for example low gene counts, or high mitochondrial content. For a normal distribution, a threshold defined with a distance of 3 MADs from the median retains about 99% of values.
For the library size we use the log scale to avoid negative values for lower part of the distribution.
qc.lib2 <- isOutlier(sce$sum, log=TRUE, type="lower")
table(qc.lib2)
## qc.lib2
## FALSE TRUE
## 37788 340
Threshold values:
attr(qc.lib2, "thresholds")
## lower higher
## 571.1178 Inf
For the number of genes detected we also use the log scale to avoid negative values for lower part of the distribution.
qc.nexprs2 <- isOutlier(sce$detected, log=TRUE, type="lower")
table(qc.nexprs2)
## qc.nexprs2
## FALSE TRUE
## 37719 409
Threshold values:
attr(qc.nexprs2, "thresholds")
## lower higher
## 201.0401 Inf
For the mitochondrial content the exclusion zone is in the higher part of the distribution.
qc.mito2 <- isOutlier(sce$subsets_Mito_percent, type="higher")
table(qc.mito2)
## qc.mito2
## FALSE TRUE
## 35701 2427
Threshold values:
attr(qc.mito2, "thresholds")
## lower higher
## -Inf 8.72434
discard2 <- qc.lib2 | qc.nexprs2 | qc.mito2
# Summarize the number of cells removed for each reason.
DataFrame(LibSize=sum(qc.lib2),
NExprs=sum(qc.nexprs2),
MitoProp=sum(qc.mito2),
Total=sum(discard2))
## DataFrame with 1 row and 4 columns
## LibSize NExprs MitoProp Total
## <integer> <integer> <integer> <integer>
## 1 340 409 2427 2947
The steps above may be run at once with quickPerCellQC():
reasons <- quickPerCellQC(sce,
percent_subsets=c("subsets_Mito_percent"))
colSums(as.matrix(reasons)) %>%
as.data.frame() %>%
datatable(rownames = TRUE)
Data quality depends on the tissue analysed, some being difficult to dissociate, e.g. brain, so that one level of QC stringency will not fit all data sets.
Filtering based on QC metrics as done here assumes that these QC metrics are not correlated with biology. This may not necessarily be true in highly heterogenous data sets where some cell types represented by good-quality cells may have low RNA content or high mitochondrial content.
The two data sets analysed here may have been obtained in experiments with different settings, such as cell preparation or sequencing depth. Such differences between these two batches would affect the adaptive thesholds discussed above. We will now perform QC in each batch separately.
We will use the quickPerCellQC() ‘batch’ option.
batch.reasons <- quickPerCellQC(sce, percent_subsets=c("subsets_Mito_percent"), batch=sce$setName)
colSums(as.matrix(batch.reasons)) %>%
as.data.frame() %>%
datatable(rownames = TRUE)
sce$discard <- batch.reasons$discard
Fewer cells are discarded, in particular because of small library size and low gene number.
But the differences are deeper as the two sets only partially overlap:
table(reasons$discard, batch.reasons$discard)
##
## FALSE TRUE
## FALSE 34901 280
## TRUE 773 2174
Library size:
plotColData(sce, x="block", y="sum", colour_by="discard",
other_fields="setName") +
#facet_wrap(~setName) +
scale_y_log10() + ggtitle("Total count")
Number of genes detected:
plotColData(sce, x="block", y="detected", colour_by="discard",
other_fields="setName") +
#facet_wrap(~setName) +
scale_y_log10() + ggtitle("Detected features")
plotColData(sce, x="block", y="detected", colour_by="discard",
other_fields="setName") +
facet_wrap(~colour_by) +
scale_y_log10() + ggtitle("Detected features")
Mitochondrial content:
plotColData(sce, x="block", y="subsets_Mito_percent",
colour_by="discard", other_fields="setName") +
#facet_wrap(~setName) +
ggtitle("Mito percent")
We will now consider the ‘sample’ batch to illustrate how to identify batches with overall low quality or different from other batches. Let’s compare thresholds across sample groups.
# compute
discard.nexprs <- isOutlier(sce$detected, log=TRUE, type="lower", batch=sce$Sample.Name)
nexprs.thresholds <- attr(discard.nexprs, "thresholds")["lower",]
nexprs.thresholds %>%
round(0) %>%
as.data.frame() %>%
datatable(rownames = TRUE)
Without block:
# plots - without blocking
discard.nexprs.woBlock <- isOutlier(sce$detected, log=TRUE, type="lower")
without.blocking <- plotColData(sce, x="Sample.Name", y="detected",
colour_by=I(discard.nexprs.woBlock))
without.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))
With block:
# plots - with blocking
with.blocking <- plotColData(sce, x="Sample.Name", y="detected",
colour_by=I(discard.nexprs))
with.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))
discard.mito <- isOutlier(sce$subsets_Mito_percent, type="higher", batch=sce$Sample.Name)
mito.thresholds <- attr(discard.mito, "thresholds")["higher",]
mito.thresholds %>%
round(0) %>%
as.data.frame() %>%
datatable(rownames = TRUE)
Without block:
# plots - without blocking
discard.mito.woBlock <- isOutlier(sce$subsets_Mito_percent, type="higher")
without.blocking <- plotColData(sce, x="Sample.Name", y="subsets_Mito_percent",
colour_by=I(discard.mito.woBlock))
without.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))
With block:
# plots - with blocking
with.blocking <- plotColData(sce, x="Sample.Name", y="subsets_Mito_percent",
colour_by=I(discard.mito))
with.blocking + theme(axis.text.x = element_text(angle = 90, hjust = 1))
Names of samples with a ‘low’ threshold for the number of genes detected:
# names
names(nexprs.thresholds)[isOutlier(nexprs.thresholds, type="lower")]
## character(0)
Names of samples with a ‘high’ threshold for mitocondrial content:
# names
names(mito.thresholds)[isOutlier(mito.thresholds, type="higher")]
## [1] "GSM3872434" "GSM3872437" "GSM3872438" "GSM3872443"
A similar approach exists to identify outliers using a set of metrics together. We will the same QC metrics as above.
# slow
stats <- cbind(log10(sce$sum),
log10(sce$detected),
sce$subsets_Mito_percent)
library(robustbase)
outlying <- adjOutlyingness(stats, only.outlyingness = TRUE)
multi.outlier <- isOutlier(outlying, type = "higher")
summary(multi.outlier)
## Mode FALSE TRUE
## logical 34323 3805
Compare with previous filtering:
table(sce$discard, multi.outlier)
## multi.outlier
## FALSE TRUE
## FALSE 33531 2143
## TRUE 792 1662
One can also perform a principal components analysis (PCA) on cells, based on the column metadata in a SingleCellExperiment object. Here we will only use the library size, the number of genes detected (which is correlated with library size) and the mitochondrial content.
sce <- runColDataPCA(sce, variables=list(
"sum", "detected", "subsets_Mito_percent"),
outliers=TRUE)
#reducedDimNames(sce)
#head(reducedDim(sce))
#head(colData(sce))
#p <- plotReducedDim(sce, dimred="PCA_coldata", colour_by = "Sample.Name")
p <- plotReducedDim(sce, dimred="PCA_coldata", colour_by = "outlier")
p + facet_wrap(~sce$discard)
Compare with previous filtering:
table(sce$discard, sce$outlier)
##
## FALSE TRUE
## FALSE 33989 1685
## TRUE 785 1669
table(multi.outlier, sce$outlier)
##
## multi.outlier FALSE TRUE
## FALSE 33207 1116
## TRUE 1567 2238
Mitochondrial content against library size:
plotColData(sce, x="sum", y="subsets_Mito_percent", colour_by="discard")
Split by sample group:
sp <- ggplot(data.frame(colData(sce)), aes(x=detected, y=subsets_Mito_percent, col=discard)) +
geom_point()
sp + facet_wrap(~source_name)
Mind distributions:
sp <- ggplot(data.frame(colData(sce)), aes(x=detected, y=subsets_Mito_percent)) +
geom_point(size = 0.05, alpha = 0.0)
sp +
#geom_density_2d_filled(alpha = 0.5) +
geom_density_2d(size = 0.5, colour = "black")
#sp <- ggplot(data.frame(colData(sce)), aes(x=detected, y=subsets_Mito_percent)) +
# geom_point()
#sp + geom_density_2d_filled() + facet_wrap(~source_name)
We will now exclude poor-quality cells.
scePreQc <- sce
sce <- scePreQc[,!scePreQc$discard]
sce
## class: SingleCellExperiment
## dim: 33538 35674
## metadata(20): Samples Samples ... Samples Samples
## assays(1): counts
## rownames(33538): ENSG00000243485 ENSG00000237613 ... ENSG00000277475
## ENSG00000268674
## rowData names(10): ensembl_gene_id external_gene_name ... mean detected
## colnames: NULL
## colData names(19): Sample Barcode ... discard outlier
## reducedDimNames(1): PCA_coldata
## altExpNames(0):
We also write the R object to file to use later if need be.
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_postQc.Rds", projDir, outDirBit)
saveRDS(sce, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_postQc.Rds", projDir, outDirBit)
sce <- readRDS(tmpFn)
The number of genes per UMI for each cell informs on the level of sequencing saturation achieved ( hbctraining). For a given cell, as sequencing depth increases each extra UMI is less likely to correspnf to a gene not already detected in that cell. Cells with small library size tend to have higher overall ‘novelty’ i.e. they have not reached saturation for any given gene. Outlier cell may have a library with low complexity. This may suggest the some cell types, e.g. red blood cells. The expected novelty is about 0.8.
Here we see that some PBMMCs have low novelty, ie overall fewer genes were detected for an equivalent number of UMIs in these cells than in others.
p <- colData(sce) %>%
data.frame() %>%
ggplot(aes(x=sum, y=detected, color=subsets_Mito_percent)) +
geom_point() +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
geom_vline(xintercept = 800) +
facet_wrap(~source_name)
p
# write plot to file
tmpFn <- sprintf("%s/%s/%s/novelty_scat.png", projDir, outDirBit, qcPlotDirBit)
ggsave(plot=p, file=tmpFn)
# Novelty
# the number of genes per UMI for each cell,
# https://hbctraining.github.io/In-depth-NGS-Data-Analysis-Course/sessionIV/lessons/SC_quality_control_analysis.html
# Add number of UMIs per gene for each cell to metadata
colData(sce)$log10GenesPerUMI <- log10(colData(sce)$detected) / log10(colData(sce)$sum)
# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI
p <- colData(sce) %>%
data.frame() %>%
ggplot(aes(x=log10GenesPerUMI, color = source_name, fill = source_name)) +
geom_density()
p
tmpFn <- sprintf("%s/%s/%s/novelty_dens.png", projDir, outDirBit, qcPlotDirBit)
ggsave(plot=p, file=tmpFn)
The approach above identified poor-quality using thresholds on the number of genes detected and mitochondrial content. We will here specifically look at the sparsity of the data, both at the gene and cell levels.
Genes that are not expressed at all are not informative, so we remove them
not.expressed <- rowSums(counts(scePreQc)) == 0
# store the cell-wise information
cols.meta <- colData(scePreQc)
rows.meta <- rowData(scePreQc)
nz.counts <- counts(scePreQc)[!not.expressed, ]
sce.nz <- SingleCellExperiment(list(counts=nz.counts))
# reset the column data on the new object
colData(sce.nz) <- cols.meta
rowData(sce.nz) <- rows.meta[!not.expressed, ]
sce.nz
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_nz.Rds", projDir, outDirBit)
saveRDS(sce.nz, tmpFn)
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_nz.Rds", projDir, outDirBit)
sce.nz <- readRDS(tmpFn)
We will compute:
# compute - SLOW
cell_sparsity <- apply(counts(sce.nz) == 0, 2, sum)/nrow(counts(sce.nz))
gene_sparsity <- apply(counts(sce.nz) == 0, 1, sum)/ncol(counts(sce.nz))
colData(sce.nz)$cell_sparsity <- cell_sparsity
rowData(sce.nz)$gene_sparsity <- gene_sparsity
# write outcome to file for later use
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_sparsityCellGene.Rds", projDir, outDirBit)
saveRDS(list("colData" = colData(sce.nz),
"rowData" = rowData(sce.nz)),
tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_sparsityCellGene.Rds", projDir, outDirBit)
tmpList <- readRDS(tmpFn)
cell_sparsity <- tmpList$colData$cell_sparsity
gene_sparsity <- tmpList$rowData$gene_sparsity
We now plot the distribution of these two metrics.
The cell sparsity plot shows that cells have between 85% and 99% 0’s, which is typical.
hist(cell_sparsity, breaks=50, col="grey80", xlab="Cell sparsity", main="")
The gene sparsity plot shows that a large number of genes are almost never detected, which is also regularly observed.
hist(gene_sparsity, breaks=50, col="grey80", xlab="Gene sparsity", main="")
We also remove cells with sparsity higher than 0.99, and/or mitochondrial content higher than 20%.
Genes detected in a few cells only are unlikely to be informative and would hinder normalisation. We will remove genes that are expressed in fewer than 20 cells.
# filter
sparse.cells <- cell_sparsity > 0.99
mito.cells <- sce.nz$subsets_Mito_percent > 20
min.cells <- 1 - (20/length(cell_sparsity))
sparse.genes <- gene_sparsity > min.cells
Number of genes removed:
table(sparse.genes)
## sparse.genes
## FALSE TRUE
## 17875 7279
Number of cells removed:
table(sparse.cells, mito.cells)
## mito.cells
## sparse.cells FALSE TRUE
## FALSE 36816 687
## TRUE 557 68
# remove cells from the SCE object that are poor quality
# remove the sparse genes, then re-set the counts and row data accordingly
cols.meta <- colData(sce.nz)
rows.meta <- rowData(sce.nz)
counts.nz <- counts(sce.nz)[!sparse.genes, !(sparse.cells | mito.cells)]
sce.nz <- SingleCellExperiment(assays=list(counts=counts.nz))
colData(sce.nz) <- cols.meta[!(sparse.cells | mito.cells),]
rowData(sce.nz) <- rows.meta[!sparse.genes, ]
sce.nz
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_postQc.Rds", projDir, outDirBit)
saveRDS(sce.nz, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/sce_nz_postQc.Rds", projDir, outDirBit)
sce.nz <- readRDS(tmpFn)
Compare with filter above (mind that the comparison is not fair because we used a less stringent, hard filtering on mitochondrial content):
table(scePreQc$discard, (sparse.cells | mito.cells))
##
## FALSE TRUE
## FALSE 35302 372
## TRUE 1514 940
We will now check sparsity for each batch separately.
sce.nz.caron <- sce.nz[,sce.nz$setName=="Caron"]
sce.nz.hca <- sce.nz[,sce.nz$setName=="Hca"]
setName <- "caron"
sce.x <- sce.nz.caron
# compute - SLOW
cell_sparsity <- apply(counts(sce.x) == 0, 2, sum)/nrow(counts(sce.x))
gene_sparsity <- apply(counts(sce.x) == 0, 1, sum)/ncol(counts(sce.x))
colData(sce.x)$cell_sparsity <- cell_sparsity
rowData(sce.x)$gene_sparsity <- gene_sparsity
# write outcome to file for later use
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
saveRDS(list("colData" = colData(sce.x),
"rowData" = rowData(sce.x)),
tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
tmpList <- readRDS(tmpFn)
cell_sparsity <- tmpList$colData$cell_sparsity
gene_sparsity <- tmpList$rowData$gene_sparsity
# plot
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
png(tmpFn)
par(mfrow=c(1, 2))
hist(cell_sparsity, breaks=50, col="grey80", xlab="Cell sparsity", main="")
hist(gene_sparsity, breaks=50, col="grey80", xlab="Gene sparsity", main="")
abline(v=40, lty=2, col='purple')
dev.off()
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
#tmpFn <- sprintf("../%s/%s_sparsity.png", qcPlotDirBit, setName)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)
rm(tmpFn)
# filter
sparse.cells <- cell_sparsity > 0.99
mito.cells <- sce.x$subsets_Mito_percent > 20
min.cells <- 1 - (20/length(cell_sparsity))
sparse.genes <- gene_sparsity > min.cells
# remove cells from the SCE object that are poor quality
# remove the sparse genes, then re-set the counts and row data accordingly
cols.meta <- colData(sce.x)
rows.meta <- rowData(sce.x)
counts.x <- counts(sce.x)[!sparse.genes, !(sparse.cells | mito.cells)]
sce.x <- SingleCellExperiment(assays=list(counts=counts.x))
colData(sce.x) <- cols.meta[!(sparse.cells | mito.cells),]
rowData(sce.x) <- rows.meta[!sparse.genes, ]
sce.x
We write the R object to ‘caron_sce_nz_postQc.Rds’.
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
saveRDS(sce.x, tmpFn)
rm(sce.x)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
sce.nz.caron <- readRDS(tmpFn)
setName <- "hca"
sce.x <- sce.nz.hca
# compute - SLOW
cell_sparsity <- apply(counts(sce.x) == 0, 2, sum)/nrow(counts(sce.x))
gene_sparsity <- apply(counts(sce.x) == 0, 1, sum)/ncol(counts(sce.x))
colData(sce.x)$cell_sparsity <- cell_sparsity
rowData(sce.x)$gene_sparsity <- gene_sparsity
# write outcome to file for later use
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
saveRDS(list("colData" = colData(sce.x),
"rowData" = rowData(sce.x)),
tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_sparsityCellGene.Rds", projDir, outDirBit, setName)
tmpList <- readRDS(tmpFn)
cell_sparsity <- tmpList$colData$cell_sparsity
gene_sparsity <- tmpList$rowData$gene_sparsity
# plot
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
png(tmpFn)
par(mfrow=c(1, 2))
hist(cell_sparsity, breaks=50, col="grey80", xlab="Cell sparsity", main="")
hist(gene_sparsity, breaks=50, col="grey80", xlab="Gene sparsity", main="")
abline(v=40, lty=2, col='purple')
dev.off()
tmpFn <- sprintf("%s/%s/%s/%s_sparsity.png", projDir, outDirBit, qcPlotDirBit, setName)
#tmpFn <- sprintf("../%s/%s_sparsity.png", qcPlotDirBit, setName)
knitr::include_graphics(tmpFn, auto_pdf = TRUE)
rm(tmpFn)
# filter
sparse.cells <- cell_sparsity > 0.99
mito.cells <- sce.x$subsets_Mito_percent > 20
min.cells <- 1 - (20/length(cell_sparsity))
sparse.genes <- gene_sparsity > min.cells
# remove cells from the SCE object that are poor quality
# remove the sparse genes, then re-set the counts and row data accordingly
cols.meta <- colData(sce.x)
rows.meta <- rowData(sce.x)
counts.x <- counts(sce.x)[!sparse.genes, !(sparse.cells | mito.cells)]
sce.x <- SingleCellExperiment(assays=list(counts=counts.x))
colData(sce.x) <- cols.meta[!(sparse.cells | mito.cells),]
rowData(sce.x) <- rows.meta[!sparse.genes, ]
sce.x
We write the R object to ‘hca_sce_nz_postQc.Rds’.
# Write object to file
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
saveRDS(sce.x, tmpFn)
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postQc.Rds", projDir, outDirBit, setName)
sce.nz.hca <- readRDS(tmpFn)