splSetToGet <- "PBMMC,ETV6-RUNX1"
splSetVec <- unlist(strsplit(splSetToGet, ","))
splSetToGet2 <- gsub(",", "_", splSetToGet)
nbPcToComp <- 50
figSize <- 7
library(scater)
library(scran)
library(batchelor)
library(edgeR)
library(tidyverse)
library(patchwork)
library(DT)
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))
Source: Multi-sample comparisons of the OSCA book.
Identify label-specific DE genes that are significant in ‘c10’ yet not DE in any other label.
Plot the top-ranked gene for inspection.
Load the SCE object (with 1200 cells per sample):
# Read object in:
merged <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged.Rds")
# also get raw counts that were written to a separate file
# (to help file sharing)
merged_counts <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged_counts.Rds")
# put raw counts back:
counts(merged) <- merged_counts
# tidy:
rm(merged_counts)
A brief inspection of the results shows clusters contain varying contributions from samples:
colLabels(merged) <- merged$clusters.mnn
tab <- table(colLabels(merged), merged$SampleName)
tab
##
## ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1 PBMMC_3
## c1 159 6 1 6 82 26
## c2 310 94 67 140 7 2
## c3 275 465 191 275 17 9
## c4 35 324 213 81 6 1
## c5 0 4 1 2 10 16
## c6 3 13 108 25 55 112
## c7 375 189 73 329 214 47
## c8 2 4 50 204 5 273
## c9 0 0 2 2 254 51
## c10 34 61 22 12 205 55
## c11 1 19 107 26 32 106
## c12 5 18 333 44 111 225
## c13 0 2 27 40 34 149
## c14 1 1 5 14 168 128
##
## PBMMC_4
## c1 40
## c2 9
## c3 39
## c4 30
## c5 10
## c6 108
## c7 106
## c8 20
## c9 3
## c10 202
## c11 116
## c12 253
## c13 41
## c14 223
On the t-SNE plots below, cells are coloured by type or sample (‘batch of origin’). Cluster numbers are superimposed based on the median coordinate of cells assigned to that cluster.
p1 <- plotTSNE(merged, colour_by="SampleGroup", text_by="label", point_size=0.3)
p2 <- plotTSNE(merged, colour_by="SampleName", point_size=0.3) +
facet_wrap(~colData(merged)$SampleName)
p1
p2
Sum counts together for all cells with the same combination of label and sample, with aggregateAcrossCells
.
# Using 'label' and 'sample' as our two factors; each column of the output
# corresponds to one unique combination of these two factors.
columnsToUse <- c("batch", "SampleName", "SampleGroup", "clusters.mnn")
colData(merged) <- colData(merged) %>% data.frame() %>% select(all_of(columnsToUse)) %>% DataFrame
summed <- aggregateAcrossCells(merged,
id = DataFrame(
label = merged$clusters.mnn,
sample = merged$SampleName
)
)
colData(summed) %>% head(3)
## DataFrame with 3 rows and 7 columns
## batch SampleName SampleGroup clusters.mnn label sample
## <character> <factor> <factor> <factor> <factor> <factor>
## 1 ETV6-RUNX1_1 ETV6-RUNX1_1 ETV6-RUNX1 c1 c1 ETV6-RUNX1_1
## 2 ETV6-RUNX1_2 ETV6-RUNX1_2 ETV6-RUNX1 c1 c1 ETV6-RUNX1_2
## 3 ETV6-RUNX1_3 ETV6-RUNX1_3 ETV6-RUNX1 c1 c1 ETV6-RUNX1_3
## ncells
## <integer>
## 1 159
## 2 6
## 3 1
Filter out all sample-label combinations with insufficient cells, 20 cells or less.
summed.filt <- summed[,summed$ncells >= 20]
Construct a common design matrix that will be used in the analysis for each label.
# Pulling out a sample-level 'targets' data.frame:
targets <- colData(merged)[!duplicated(merged$SampleName),] %>%
data.frame() %>%
select(-clusters.mnn)
# Constructing the design matrix:
design <- model.matrix(~factor(SampleGroup), data=targets)
rownames(design) <- targets$SampleName
Apply the pseudoBulkDGE
function to obtain a list of DE genes for each label.
summed.filt$SampleGroup <- factor(summed.filt$SampleGroup)
de.results <- pseudoBulkDGE(summed.filt,
label = summed.filt$label,
design = ~SampleGroup,
coef = "SampleGroupPBMMC",
condition = summed.filt$SampleGroup
)
Examine the numbers of DEGs at a FDR of 5% for each label using the decideTestsPerLabel
function.
is.de <- decideTestsPerLabel(de.results, threshold=0.05)
summarizeTestsPerLabel(is.de)
## -1 0 1 NA
## c1 428 3903 378 10262
## c10 138 682 135 14016
## c11 0 2549 3 12419
## c12 2 4529 6 10434
## c13 50 3862 21 11038
## c3 0 8494 0 6477
## c4 0 2629 2 12340
## c6 0 2346 2 12623
## c7 685 6148 520 7618
## c8 67 1909 48 12947
For each gene, we compute the percentage of cell types in which that gene is upregulated or downregulated. (Here, we consider a gene to be non-DE if it is not retained after filtering.).
# Upregulated across most cell types.
up.de <- is.de > 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)
## LYZ S100A8 S100A9 IGKC IGLC2 RGS2 IGLL1 RSL24D1 HINT1 LRRC75A
## 0.7 0.7 0.6 0.5 0.4 0.3 0.3 0.3 0.3 0.3
# Downregulated across cell types.
down.de <- is.de < 0 & !is.na(is.de)
head(sort(rowMeans(down.de), decreasing=TRUE), 10)
## DNTT MDK SOCS2 CD99 CD74 ID3 TERF2 HHEX SLC35E3 MME
## 0.6 0.5 0.5 0.4 0.4 0.4 0.4 0.4 0.4 0.4
Identify label-specific DE genes that are significant in ‘c10’ yet not DE in any other label (FDR threshold of 50%).
Plot the top-ranked gene for inspection.
remotely.de <- decideTestsPerLabel(de.results, threshold=0.5)
not.de <- remotely.de==0 | is.na(remotely.de)
# get c10's 'unique.degs':
# 2nd cluster in is.de
cx <- "c10"
other.labels <- setdiff(colnames(not.de), cx)
unique.degs <- is.de[,cx]!=0 & rowMeans(not.de[,other.labels])==1
unique.degs <- names(which(unique.degs))
head(unique.degs)
## [1] "RBM6" "CD9" "CELF2" "LUC7L3" "RTN4" "ILF3"
# Choosing the top-ranked gene for inspection:
de.inspec <- list()
de.inspec[[cx]] <- de.results[[cx]]
de.inspec[[cx]] <- de.inspec[[cx]][order(de.inspec[[cx]]$PValue),]
de.inspec[[cx]] <- de.inspec[[cx]][rownames(de.inspec[[cx]]) %in% unique.degs,]
# plot expression of top gene
# use plotExpression()
sizeFactors(summed.filt) <- NULL
plotExpression(logNormCounts(summed.filt),
features=rownames(de.inspec[[cx]])[1],
x="SampleGroup", colour_by="SampleGroup",
other_fields="label") +
facet_wrap(~label) +
ggtitle(glue::glue("{cx}: {rownames(de.inspec[[cx]])[1]}"))