projDir <- "/mnt/scratcha/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020"
outDirBit <- "AnaWiSce/Attempt1"
nbPcToComp <- 50
suppressMessages(library(ggplot2))
suppressMessages(library(scater))
suppressMessages(library(scran))
suppressMessages(library(dplyr))
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))
Source: Cell cycle assignment of the OSCA book.
We will load the R file keeping the SCE object with the normalised counts for 500 cells per sample.
setName <- "caron"
setSuf <- "_5hCellPerSpl"
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postDeconv%s_clustered.Rds", projDir, outDirBit, setName, setSuf)
print(tmpFn)
[1] "/mnt/scratcha/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020/AnaWiSce/Attempt1/Robjects/caron_sce_nz_postDeconv_5hCellPerSpl_clustered.Rds"
if(!file.exists(tmpFn))
{
knitr::knit_exit()
}
sce <- readRDS(tmpFn)
sce
class: SingleCellExperiment
dim: 18372 5500
metadata(0):
assays(2): counts logcounts
rownames(18372): ENSG00000238009 ENSG00000237491 ... ENSG00000275063
ENSG00000271254
rowData names(11): ensembl_gene_id external_gene_name ... detected
gene_sparsity
colnames: NULL
colData names(22): Sample Barcode ... cluster kmeans10
reducedDimNames(3): PCA TSNE UMAP
altExpNames(0):
sce$clusterStg <- factor(paste0("c", sce$cluster),
levels = paste0("c", levels( sce$cluster)) )
On occasion, it can be desirable to determine cell cycle activity from scRNA-seq data. In and of itself, the distribution of cells across phases of the cell cycle is not usually informative, but we can use this to determine if there are differences in proliferation between subpopulations or across treatment conditions. Many of the key events in the cell cycle (e.g., passage through checkpoints) are post-translational and thus not directly visible in transcriptomic data; nonetheless, there are enough changes in expression that can be exploited to determine cell cycle phase. We demonstrate using the 416B dataset, which is known to contain actively cycling cells after oncogene induction.
The cyclins control progression through the cell cycle and have well-characterized patterns of expression across cell cycle phases. Cyclin D is expressed throughout but peaks at G1; cyclin E is expressed highest in the G1/S transition; cyclin A is expressed across S and G2; and cyclin B is expressed highest in late G2 and mitosis. Inspection of the relative expression of cyclins across the population can often be sufficient to determine the relative cell cycle activity in each cluster (Figure 16.1). For example, cluster 1 is likely to be in G1 while the other clusters are scattered across the later phases.
#cyclin.genes <- grep("^CCN[ABDE][0-9]$", rowData(sce)$SYMBOL)
cyclin.genes <- grep("^CCN[ABDE]+", rowData(sce)$Symbol)
cyclin.genes <- rownames(sce)[cyclin.genes]
rowData(sce)[cyclin.genes,]
DataFrame with 11 rows and 11 columns
ensembl_gene_id external_gene_name chromosome_name
<character> <character> <character>
ENSG00000145386 ENSG00000145386 CCNA2 4
ENSG00000134057 ENSG00000134057 CCNB1 5
ENSG00000112576 ENSG00000112576 CCND3 6
ENSG00000175305 ENSG00000175305 CCNE2 8
ENSG00000110092 ENSG00000110092 CCND1 11
ENSG00000118971 ENSG00000118971 CCND2 12
ENSG00000133101 ENSG00000133101 CCNA1 13
ENSG00000100814 ENSG00000100814 CCNB1IP1 14
ENSG00000166946 ENSG00000166946 CCNDBP1 15
ENSG00000157456 ENSG00000157456 CCNB2 15
ENSG00000105173 ENSG00000105173 CCNE1 19
start_position end_position strandNum Symbol
<integer> <integer> <integer> <character>
ENSG00000145386 121816444 121823933 -1 CCNA2
ENSG00000134057 69167135 69178245 1 CCNB1
ENSG00000112576 41934934 42050357 -1 CCND3
ENSG00000175305 94879770 94896678 -1 CCNE2
ENSG00000110092 69641156 69654474 1 CCND1
ENSG00000118971 4273762 4305353 1 CCND2
ENSG00000133101 36431520 36442870 1 CCNA1
ENSG00000100814 20311368 20333312 -1 CCNB1IP1
ENSG00000166946 43185118 43197177 1 CCNDBP1
ENSG00000157456 59105126 59125045 1 CCNB2
ENSG00000105173 29811991 29824312 1 CCNE1
Type mean detected gene_sparsity
<character> <numeric> <numeric> <numeric>
ENSG00000145386 Gene Expression 0.18827907 6.582173 0.916136
ENSG00000134057 Gene Expression 0.24689175 6.112821 0.926208
ENSG00000112576 Gene Expression 0.89323057 42.111037 0.404038
ENSG00000175305 Gene Expression 0.03021450 2.333095 0.975673
ENSG00000110092 Gene Expression 0.00974467 0.790423 0.987453
ENSG00000118971 Gene Expression 0.16729488 11.804110 0.885239
ENSG00000133101 Gene Expression 0.00209762 0.173194 0.993431
ENSG00000100814 Gene Expression 0.14573204 11.180853 0.844533
ENSG00000166946 Gene Expression 0.37205349 23.893930 0.698132
ENSG00000157456 Gene Expression 0.27627042 7.518866 0.907662
ENSG00000105173 Gene Expression 0.08911249 4.335072 0.976746
# only use the 10 most variable cyclins:
tmpVar <- DelayedMatrixStats::rowVars( DelayedArray(assay(sce[cyclin.genes,], "logcounts")))
names(tmpVar) <- cyclin.genes
cyclin.genes.sub <- names(tmpVar[order(tmpVar, decreasing=T)])[1:10]
rowData(sce)[cyclin.genes.sub,c("ensembl_gene_id", "Symbol")]
DataFrame with 10 rows and 2 columns
ensembl_gene_id Symbol
<character> <character>
ENSG00000112576 ENSG00000112576 CCND3
ENSG00000166946 ENSG00000166946 CCNDBP1
ENSG00000100814 ENSG00000100814 CCNB1IP1
ENSG00000157456 ENSG00000157456 CCNB2
ENSG00000118971 ENSG00000118971 CCND2
ENSG00000134057 ENSG00000134057 CCNB1
ENSG00000145386 ENSG00000145386 CCNA2
ENSG00000105173 ENSG00000105173 CCNE1
ENSG00000175305 ENSG00000175305 CCNE2
ENSG00000110092 ENSG00000110092 CCND1
library(scater)
plotHeatmap(sce, order_columns_by="clusterStg",
cluster_rows=TRUE, features=sort(cyclin.genes.sub))
For example, we can use standard DE methods (Chapter 11) to look for upregulation of each cyclin, which would imply that a subpopulation contains more cells in the corresponding cell cycle phase. The same logic applies to comparisons between treatment conditions, as described in Chapter 14.
library(scran)
markers <- findMarkers(sce, groups=sce$clusterStg, subset.row=cyclin.genes,
test.type="wilcox", direction="up")
# We can infer that cluster 4 has more cells in G2/M than the other clusters,
# based on higher expression of the cyclin B's.
markers[[4]] %>% data.frame() %>%
tibble::rownames_to_column("ensembl_gene_id") %>%
left_join(data.frame(rowData(sce))[, c("ensembl_gene_id", "Symbol")], by="ensembl_gene_id") %>%
select(Symbol, Top, p.value, FDR, summary.AUC)
Direct examination of cyclin expression is easily to understand, interpret and experimentally validate. However, it is best suited for statements about relative cell cycle activity; for example, we would find it difficult to assign cell cycle phase in Figure 16.1 without the presence of clusters spanning all phases to provide benchmarks for “high” and “low” expression of each cyclin. We also assume that cyclin expression is not affected by biological processes other than the cell cycle, which may be a strong assumption in some cases, e.g., malignant cells. This strategy is strongly dependent on having good sequencing coverage of the cyclins, which is less of an issue for the whole-of-transcriptome methods described below that use information from more genes.
The prediction method described by Scialdone et al. (2015) is another approach for classifying cells into cell cycle phases. Using a reference dataset, we first compute the sign of the difference in expression between each pair of genes. Pairs with changes in the sign across cell cycle phases are chosen as markers. Cells in a test dataset can then be classified into the appropriate phase, based on whether the observed sign for each marker pair is consistent with one phase or another. This approach is implemented in the cyclone() function from the scran package, which also contains pre-trained set of marker pairs for mouse and human data.
set.seed(100)
library(scran)
hs.pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds",
package="scran"))
# Using Ensembl IDs to match up with the annotation in 'mm.pairs'.
assignments <- cyclone(sce, hs.pairs, gene.names=rowData(sce)$ensembl_gene_id) # SLOW
Write assignments to file.
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postDeconv%s_cyclone.Rds", projDir, outDirBit, setName, setSuf)
print(tmpFn)
saveRDS(assignments, file=tmpFn)
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postDeconv%s_cyclone.Rds", projDir, outDirBit, setName, setSuf)
print(tmpFn)
[1] "/mnt/scratcha/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020/AnaWiSce/Attempt1/Robjects/caron_sce_nz_postDeconv_5hCellPerSpl_cyclone.Rds"
assignments <- readRDS(file=tmpFn)
Copy cell cycle assignments to SCE object:
colData(sce)$phases <- assignments$phases
For each cell, a higher score for a phase corresponds to a higher probability that the cell is in that phase. We focus on the G1 and G2/M scores as these are the most informative for classification.
The plot below show the cell cycle phase scores obtained by applying the pair-based classifier on the dataset. Each point represents a cell, plotted according to its scores for G1 and G2/M phases.
plot(assignments$score$G1, assignments$score$G2M,
xlab="G1 score", ylab="G2/M score", pch=16)
Cells are classified as being in G1 phase if the G1 score is above 0.5 and greater than the G2/M score; in G2/M phase if the G2/M score is above 0.5 and greater than the G1 score; and in S phase if neither score is above 0.5.
colLabels(sce) <- colData(sce)$source_name
table(assignments$phases, colLabels(sce))
ABMMC ETV6-RUNX1 HHD PBMMC PRE-T
G1 0 1597 650 1085 641
G2M 0 118 78 232 132
S 0 285 272 183 227
# prop per phase
round(prop.table(table(assignments$phases, colLabels(sce)),1),2)
ABMMC ETV6-RUNX1 HHD PBMMC PRE-T
G1 0.00 0.40 0.16 0.27 0.16
G2M 0.00 0.21 0.14 0.41 0.24
S 0.00 0.29 0.28 0.19 0.23
# prop per type
round(prop.table(table(assignments$phases, colLabels(sce)),2),2)
ABMMC ETV6-RUNX1 HHD PBMMC PRE-T
G1 0.80 0.65 0.72 0.64
G2M 0.06 0.08 0.15 0.13
S 0.14 0.27 0.12 0.23
tsne1 <- plotTSNE(sce, colour_by="phases") + fontsize
tsne1 + facet_wrap(. ~ sce$source_name)
tsne1 + facet_wrap(. ~ sce$phases)
umap1 <- plotUMAP(sce, colour_by="phases") + fontsize
umap1 + facet_wrap(. ~ sce$source_name)
umap1 + facet_wrap(. ~ sce$phases)
For some time, it was popular to regress out the cell cycle phase prior to downstream analyses. The aim was to remove uninteresting variation due to cell cycle, thus improving resolution of other biological processes of interest. We could implement this by performing cell cycle phase assignment as described above, treating each phase as a separate batch and applying any of the batch correction strategies. The most common approach is to use a linear model to simply regress out the phase effect, e.g., via regressBatches().
library(batchelor)
sce.nocycle <- regressBatches(sce, batch=sce$phases)
# PCA
#plotPCA(sce, colour_by = "Sample.Name")
sce.nocycle <- runPCA(
sce.nocycle,
exprs_values = "corrected"
)
p <- plotPCA(
sce.nocycle,
colour_by = "batch"
)
p
# TSNE
sce.nocycle <- runTSNE(sce.nocycle, exprs_values = "corrected")
p <- plotTSNE(
sce.nocycle,
colour_by = "batch"
)
p
That said, we do not consider adjusting for cell cycle to be a necessary step in routine scRNA-seq analyses. In most applications, the cell cycle is a minor factor of variation, secondary to differences between cell types. Any attempt at removal would also need to assume that the cell cycle effect is orthogonal to other biological processes. For example, regression would potentially remove interesting signal if cell cycle activity varied across clusters or conditions, with a prime example being the increased proliferation of activated T cells (Richard et al. 2018). We suggest only performing cell cycle adjustment on an as-needed basis in populations with clear cell cycle effects.
Challenge Remove the cell cycle genes listed in the ‘cell cycle’ GO term, perform PCA and plot t-SNE.
library(org.Hs.eg.db)
Loading required package: AnnotationDbi
Attaching package: 'AnnotationDbi'
The following object is masked from 'package:dplyr':
select
cc.genes <- select(org.Hs.eg.db, keys="GO:0007049", keytype="GOALL", column="ENSEMBL")
'select()' returned 1:many mapping between keys and columns
length(cc.genes)
[1] 4
sce.uncycle <- sce[!rowData(sce)$ensembl_gene_id %in% cc.genes$ENSEMBL,]
# PCA
sce.uncycle <- runPCA(
sce.uncycle,
exprs_values = "logcounts"
)
p <- plotPCA(
sce.uncycle,
colour_by = "phases",
size_by = "sum",
shape_by = "source_name"
)
p
# TSNE
sce.uncycle <- runTSNE(sce.uncycle, exprs_values = "logcounts")
p <- plotTSNE(
sce.uncycle,
colour_by = "phases",
size_by = "sum",
shape_by = "source_name"
)
p
# UMAP
sce.uncycle <- runUMAP(sce.uncycle, exprs_values = "logcounts")
p <- plotUMAP(
sce.uncycle,
colour_by = "phases",
size_by = "sum",
shape_by = "source_name"
)
p
Write SCE object to file.
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postDeconv%s_cellCycle.Rds", projDir, outDirBit, setName, setSuf)
print(tmpFn)
saveRDS(sce, file=tmpFn)