projDir <- "/mnt/scratcha/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020"
outDirBit <- "AnaWiSce/Attempt1"
nbPcToComp <- 50
suppressMessages(library(ggplot2))
suppressMessages(library(scater))
suppressMessages(library(scran))
suppressMessages(library(dplyr))
suppressMessages(library(dynamicTreeCut))
suppressMessages(library(cluster)) # for silhouette
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))
Source: clustering methods in the Hemberg group material and its ‘biocellgen variant’, with some of its text copied with few edits only. Also see the OSCA chapter on clustering.
Once we have normalized the data and removed confounders we can carry out analyses that are relevant to the biological questions at hand. The exact nature of the analysis depends on the dataset. One of the most promising applications of scRNA-seq is de novo discovery and annotation of cell-types based on transcription profiles. This requires the identification of groups of cells based on the similarities of the transcriptomes without any prior knowledge of the labels, or unsupervised clustering. To avoid the challenges caused byt the noise and high dimensionality of the scRNA-seq data, clusring is performed after feature selection and dimensionality reduction, usually on the PCA output.
We will introduce three widely used clustering methods: 1) hierarchical, 2) k-means and 3) graph-based clustering, apply them on the data set studied and measure their quality.
We will use the denoised log-expression values to cluster cells.
We will load the R file keeping the SCE (SingleCellExperiment) object with the normalised counts for 500 cells per sample and the outcome of feature selection followed by dimensionality reduction.
setName <- "caron"
setSuf <- "_5hCellPerSpl"
# Read object in:
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postDeconv%s_denoised.Rds", projDir, outDirBit, setName, setSuf)
print(tmpFn)
[1] "/mnt/scratcha/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020/AnaWiSce/Attempt1/Robjects/caron_sce_nz_postDeconv_5hCellPerSpl_denoised.Rds"
if(!file.exists(tmpFn))
{
knitr::knit_exit()
}
sce <- readRDS(tmpFn)
sce
class: SingleCellExperiment
dim: 18372 5500
metadata(0):
assays(2): counts logcounts
rownames(18372): ENSG00000238009 ENSG00000237491 ... ENSG00000275063
ENSG00000271254
rowData names(11): ensembl_gene_id external_gene_name ... detected
gene_sparsity
colnames: NULL
colData names(20): Sample Barcode ... cell_sparsity sizeFactor
reducedDimNames(3): PCA TSNE UMAP
altExpNames(0):
# head(rowData(sce))
#any(duplicated(rowData(sce)$ensembl_gene_id))
# some function(s) used below complain about 'strand' already being used in row data,
# so rename that column now:
#colnames(rowData(sce))[colnames(rowData(sce)) == "strand"] <- "strandNum"
#assayNames(sce)
#reducedDimNames(sce)
Hierarchical clustering builds a hierarchy of clusters yielding a dendrogram that groups together cells with similar expression patterns across the chosen genes.
There are two types of strategies: - Agglomerative (bottom-up): each observation starts in its own cluster, and pairs of clusters are merged as one moves up the hierarchy. - Divisive (top-down): all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.
Here we will use hierarchical clustering on the Euclidean distances between cells, using Ward D2 criterion to minimize the total variance within each cluster.
# get PCs
pcs <- reducedDim(sce, "PCA")
# compute distance:
my.dist <- dist(pcs)
# derive tree:
my.tree <- hclust(my.dist, method="ward.D2")
Show tree:
plot(my.tree, labels = FALSE)
Clusters are identified in the dendrogram using a dynamic tree cut [@doi:10.1093/bioinformatics/btm563].
# identify clustering by cutting branches, requesting a minimum cluster size of 20 cells.
my.clusters <- unname(cutreeDynamic(my.tree, distM=as.matrix(my.dist), minClusterSize=20, verbose=0))
Let us count cells for each cluster and each sample group and for each sample.
# per sample group
table(my.clusters, sce$source_name)
my.clusters ABMMC ETV6-RUNX1 HHD PBMMC PRE-T
1 0 1078 1 8 0
2 0 0 883 0 3
3 0 0 0 0 808
4 0 177 51 429 76
5 0 128 50 417 61
6 0 463 0 0 1
7 0 4 3 249 6
8 0 106 0 116 5
9 0 7 7 180 30
10 0 37 5 101 10
# per sample
table(my.clusters, sce$Sample.Name)
my.clusters GSM3872434 GSM3872435 GSM3872436 GSM3872437 GSM3872438 GSM3872439
1 37 462 230 349 1 0
2 0 0 0 0 404 479
3 0 0 0 0 0 0
4 3 12 140 22 41 10
5 2 13 94 19 44 6
6 451 9 3 0 0 0
7 2 1 0 1 3 0
8 2 2 17 85 0 0
9 1 1 0 5 7 0
10 2 0 16 19 0 5
my.clusters GSM3872440 GSM3872441 GSM3872442 GSM3872443 GSM3872444
1 0 0 0 0 8
2 3 0 0 0 0
3 486 322 0 0 0
4 4 72 190 114 125
5 4 57 103 135 179
6 1 0 0 0 0
7 0 6 164 27 58
8 0 5 0 113 3
9 2 28 30 47 103
10 0 10 13 64 24
Clusters mostly include cells from one sample or the other. This suggests that the samples differ, and/or the presence of batch effect.
Let us show cluster assignments on the t-SNE.
# store cluster assignemnt in SCE object:
sce$cluster <- factor(my.clusters)
# make, store and show TSNE plot:
g <- plotTSNE(sce, colour_by = "cluster", size_by = "sum")
g
Split by sample group:
# split by sample and show:
g <- g + facet_wrap(. ~ sce$source_name)
g
In some areas cells are not all assigned to the same cluster.
The congruence of clusters may be assessed by computing the sillhouette for each cell. The larger the value the closer the cell to cells in its cluster than to cells in other clusters. Cells closer to cells in other clusters have a negative value. Good cluster separation is indicated by clusters whose cells have large silhouette values.
Compute silhouette:
sil <- silhouette(my.clusters, dist = my.dist)
Plot silhouettes with one color per cluster and cells with a negative silhouette with the color of their closest cluster. Add the average silhouette for each cluster and all cells.
# prepare colours:
clust.col <- scater:::.get_palette("tableau10medium") # hidden scater colours
sil.cols <- clust.col[ifelse(sil[,3] > 0, sil[,1], sil[,2])]
sil.cols <- sil.cols[order(-sil[,1], sil[,3])]
#
plot(sil, main = paste(length(unique(my.clusters)), "clusters"),
border=sil.cols, col=sil.cols, do.col.sort=FALSE)
The plot shows cells with negative silhoutette indicating too many clusters were defined. The method and parameters used defined clusters with properties that may not fit the data set, eg clusters with the same diameter.
In k-means clustering, the goal is to partition N cells into k different clusters. In an iterative manner, cluster centers are assigned and each cell is assigned to its nearest cluster:
This approach assumes a pre-determined number of round equally-sized clusters.
The dendogram built above suggests there may be 6 large populations.
Let us define 6 clusters.
# define clusters:
kclust <- kmeans(pcs, centers=6)
# compute silhouette
sil <- silhouette(kclust$cluster, dist(pcs))
# plot silhouette:
clust.col <- scater:::.get_palette("tableau10medium") # hidden scater colours
sil.cols <- clust.col[ifelse(sil[,3] > 0, sil[,1], sil[,2])]
sil.cols <- sil.cols[order(-sil[,1], sil[,3])]
plot(sil, main = paste(length(unique(kclust$cluster)), "clusters"),
border=sil.cols, col=sil.cols, do.col.sort=FALSE)
Show clusters on t-SNE:
tSneCoord <- as.data.frame(reducedDim(sce, "TSNE"))
colnames(tSneCoord) <- c("x", "y")
p2 <- ggplot(tSneCoord, aes(x, y)) +
geom_point(aes(color = as.factor(kclust$cluster)))
p2
Split by sample type:
p2 + facet_wrap(~ sce$source_name)
To find the most appropriate number of clusters, one performs the analysis for a series of k values, computes a measure of fit of the clusters defined: the within cluster sum-of-square. This value decreases as k increases, by an amount that decreases with k. Choose k at the inflexion point of the curve.
library(broom)
require(tibble)
Loading required package: tibble
require(dplyr)
require(tidyr)
Loading required package: tidyr
Attaching package: 'tidyr'
The following object is masked from 'package:S4Vectors':
expand
library(purrr)
Attaching package: 'purrr'
The following object is masked from 'package:DelayedArray':
simplify
The following object is masked from 'package:GenomicRanges':
reduce
The following object is masked from 'package:IRanges':
reduce
points <- as_tibble(pcs)
kclusts <- tibble(k = 1:20) %>%
mutate(
kclust = map(k, ~kmeans(points, .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, points)
)
clusters <- kclusts %>%
unnest(tidied)
assignments <- kclusts %>%
unnest(augmented)
clusterings <- kclusts %>%
unnest(glanced)
Plot the total within cluster sum-of-squares and decide on k.
ggplot(clusterings, aes(k, tot.withinss)) +
geom_line()
Copy the cluster assignment to the SCE object.
df <- as.data.frame(assignments)
sce$kmeans10 <- as.numeric(df[df$k == 10, ".cluster"])
Check silhouette for a k of 10.
library(cluster)
clust.col <- scater:::.get_palette("tableau10medium") # hidden scater colours
sil <- silhouette(sce$kmeans10, dist = my.dist)
sil.cols <- clust.col[ifelse(sil[,3] > 0, sil[,1], sil[,2])]
sil.cols <- sil.cols[order(-sil[,1], sil[,3])]
plot(sil, main = paste(length(unique(sce$kmeans10)), "clusters"),
border=sil.cols, col=sil.cols, do.col.sort=FALSE)
Graph-based clustering entails building a shared nearest-neighbour graph using cells as nodes and their similarity as edges, then identifying ‘communities’ of cells within the network.
We will: build the graph, define clusters, check membership across samples, show membership on t-SNE and assess its quality.
#compute graph
snn.gr <- buildSNNGraph(sce, use.dimred="PCA")
# derive clusters
cluster.out <- igraph::cluster_walktrap(snn.gr)
# count cell in each cluster for each sample
my.clusters <- cluster.out$membership
table(my.clusters, sce$source_name)
my.clusters ABMMC ETV6-RUNX1 HHD PBMMC PRE-T
1 0 0 687 1 4
2 0 3 1 162 0
3 0 0 0 0 288
4 0 0 0 0 206
5 0 186 53 253 79
6 0 106 0 119 5
7 0 5 9 193 16
8 0 0 0 0 259
9 0 83 37 115 25
10 0 547 0 14 0
11 0 0 197 0 0
12 0 402 0 0 0
13 0 86 0 0 0
14 0 2 2 155 2
15 0 504 0 0 0
16 0 17 1 57 5
17 0 3 0 30 5
18 0 35 7 97 28
19 0 5 1 121 23
20 0 0 0 0 55
21 0 0 1 179 0
22 0 16 4 4 0
table(my.clusters, sce$Sample.Name)
my.clusters GSM3872434 GSM3872435 GSM3872436 GSM3872437 GSM3872438 GSM3872439
1 0 0 0 0 212 475
2 0 0 3 0 1 0
3 0 0 0 0 0 0
4 0 0 0 0 0 0
5 4 12 147 23 43 10
6 2 2 17 85 0 0
7 1 1 2 1 9 0
8 0 0 0 0 0 0
9 1 7 66 9 32 5
10 4 71 130 342 0 0
11 0 0 0 0 193 4
12 388 13 1 0 0 0
13 85 1 0 0 0 0
14 1 0 0 1 2 0
15 12 386 99 7 0 0
16 0 0 7 10 0 1
17 1 0 1 1 0 0
18 0 6 20 9 6 1
19 0 1 0 4 1 0
20 0 0 0 0 0 0
21 0 0 0 0 1 0
22 1 0 7 8 0 4
my.clusters GSM3872440 GSM3872441 GSM3872442 GSM3872443 GSM3872444
1 3 1 0 0 1
2 0 0 71 25 66
3 6 282 0 0 0
4 176 30 0 0 0
5 3 76 14 112 127
6 0 5 0 116 3
7 0 16 96 41 56
8 255 4 0 0 0
9 3 22 6 46 63
10 0 0 0 4 10
11 0 0 0 0 0
12 0 0 0 0 0
13 0 0 0 0 0
14 0 2 99 17 39
15 0 0 0 0 0
16 0 5 2 42 13
17 0 5 10 13 7
18 3 25 19 46 32
19 2 21 4 36 81
20 49 6 0 0 0
21 0 0 179 0 0
22 0 0 0 2 2
# store membership
sce$cluster <- factor(my.clusters)
# shoe clusters on TSNE
p <- plotTSNE(sce, colour_by="cluster") + fontsize
p
p + facet_wrap(~ sce$source_name)
Compute modularity to assess clusters quality. The closer to 1 the better.
igraph::modularity(cluster.out)
[1] 0.8546914
mod.out <- clusterModularity(snn.gr, my.clusters, get.weights=TRUE)
ratio <- mod.out$observed/mod.out$expected
lratio <- log10(ratio + 1)
library(pheatmap)
pheatmap(lratio, cluster_rows=FALSE, cluster_cols=FALSE,
color=colorRampPalette(c("white", "blue"))(100))
Show similarity between clusters on a network.
cluster.gr <- igraph::graph_from_adjacency_matrix(ratio,
mode="undirected", weighted=TRUE, diag=FALSE)
plot(cluster.gr, edge.width=igraph::E(cluster.gr)$weight*10)
Write SCE object to file.
tmpFn <- sprintf("%s/%s/Robjects/%s_sce_nz_postDeconv%s_clustered.Rds", projDir, outDirBit, setName, setSuf)
print(tmpFn)
[1] "/mnt/scratcha/bioinformatics/baller01/20200511_FernandesM_ME_crukBiSs2020/AnaWiSce/Attempt1/Robjects/caron_sce_nz_postDeconv_5hCellPerSpl_clustered.Rds"
saveRDS(sce, file=tmpFn)
Challenge Apply graph based clustering on a single sample (group?).
# eg SRR9264354, GSM3872444, a PBMMC
SplToGet <- "GSM3872444"
# extract cells for this sample:
cellsToGet <- colData(sce) %>%
data.frame() %>%
filter(Sample.Name == SplToGet) %>%
pull(Barcode)
sce_c <- sce[, cellsToGet]
# normalise
# PCA
# cluster