1 Differential expression and abundance between conditions

splSetToGet <- "PBMMC,ETV6-RUNX1"
splSetVec <- unlist(strsplit(splSetToGet, ","))
splSetToGet2 <- gsub(",", "_", splSetToGet)
nbPcToComp <- 50
figSize <- 7
library(scater)
library(scran)
library(batchelor)
library(edgeR)
library(tidyverse)
library(patchwork)
library(DT)
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))

Source: Multi-sample comparisons of the OSCA book.

1.1 Exercise 1 - differential expression

Identify label-specific DE genes that are significant in ‘c10’ yet not DE in any other label.

Plot the top-ranked gene for inspection.

1.2 Setting up the data

Load the SCE object (with 1200 cells per sample):

# Read object in:
merged <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged.Rds")
# also get raw counts that were written to a separate file
# (to help file sharing)
merged_counts <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged_counts.Rds")
# put raw counts back:
counts(merged) <- merged_counts
# tidy:
rm(merged_counts)

A brief inspection of the results shows clusters contain varying contributions from samples:

colLabels(merged) <- merged$clusters.mnn
tab <- table(colLabels(merged), merged$SampleName)
tab
##      
##       ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 ETV6-RUNX1_4 PBMMC_1 PBMMC_3
##   c1           159            6            1            6      82      26
##   c2           310           94           67          140       7       2
##   c3           275          465          191          275      17       9
##   c4            35          324          213           81       6       1
##   c5             0            4            1            2      10      16
##   c6             3           13          108           25      55     112
##   c7           375          189           73          329     214      47
##   c8             2            4           50          204       5     273
##   c9             0            0            2            2     254      51
##   c10           34           61           22           12     205      55
##   c11            1           19          107           26      32     106
##   c12            5           18          333           44     111     225
##   c13            0            2           27           40      34     149
##   c14            1            1            5           14     168     128
##      
##       PBMMC_4
##   c1       40
##   c2        9
##   c3       39
##   c4       30
##   c5       10
##   c6      108
##   c7      106
##   c8       20
##   c9        3
##   c10     202
##   c11     116
##   c12     253
##   c13      41
##   c14     223

On the t-SNE plots below, cells are coloured by type or sample (‘batch of origin’). Cluster numbers are superimposed based on the median coordinate of cells assigned to that cluster.

p1 <- plotTSNE(merged, colour_by="SampleGroup", text_by="label", point_size=0.3)
p2 <- plotTSNE(merged, colour_by="SampleName", point_size=0.3) +
  facet_wrap(~colData(merged)$SampleName)
p1

p2

1.3 Creating pseudo-bulk samples

Sum counts together for all cells with the same combination of label and sample, with aggregateAcrossCells.

# Using 'label' and 'sample' as our two factors; each column of the output
# corresponds to one unique combination of these two factors.
columnsToUse <- c("batch", "SampleName", "SampleGroup", "clusters.mnn")
colData(merged) <- colData(merged) %>% data.frame() %>% select(all_of(columnsToUse)) %>% DataFrame
summed <- aggregateAcrossCells(merged, 
                    id = DataFrame(
                        label = merged$clusters.mnn,
                        sample = merged$SampleName
                    )
)
colData(summed) %>% head(3)
## DataFrame with 3 rows and 7 columns
##          batch   SampleName SampleGroup clusters.mnn    label       sample
##    <character>     <factor>    <factor>     <factor> <factor>     <factor>
## 1 ETV6-RUNX1_1 ETV6-RUNX1_1  ETV6-RUNX1           c1       c1 ETV6-RUNX1_1
## 2 ETV6-RUNX1_2 ETV6-RUNX1_2  ETV6-RUNX1           c1       c1 ETV6-RUNX1_2
## 3 ETV6-RUNX1_3 ETV6-RUNX1_3  ETV6-RUNX1           c1       c1 ETV6-RUNX1_3
##      ncells
##   <integer>
## 1       159
## 2         6
## 3         1

1.4 Performing the DE analysis

Filter out all sample-label combinations with insufficient cells, 20 cells or less.

summed.filt <- summed[,summed$ncells >= 20]

Construct a common design matrix that will be used in the analysis for each label.

# Pulling out a sample-level 'targets' data.frame:
targets <- colData(merged)[!duplicated(merged$SampleName),] %>%
  data.frame() %>%
  select(-clusters.mnn)

# Constructing the design matrix:
design <- model.matrix(~factor(SampleGroup), data=targets)
rownames(design) <- targets$SampleName

Apply the pseudoBulkDGE function to obtain a list of DE genes for each label.

summed.filt$SampleGroup <- factor(summed.filt$SampleGroup)

de.results <- pseudoBulkDGE(summed.filt, 
    label = summed.filt$label,
    design = ~SampleGroup,
    coef = "SampleGroupPBMMC",
    condition = summed.filt$SampleGroup
)

Examine the numbers of DEGs at a FDR of 5% for each label using the decideTestsPerLabel function.

is.de <- decideTestsPerLabel(de.results, threshold=0.05)
summarizeTestsPerLabel(is.de)
##      -1    0   1    NA
## c1  428 3903 378 10262
## c10 138  682 135 14016
## c11   0 2549   3 12419
## c12   2 4529   6 10434
## c13  50 3862  21 11038
## c3    0 8494   0  6477
## c4    0 2629   2 12340
## c6    0 2346   2 12623
## c7  685 6148 520  7618
## c8   67 1909  48 12947

For each gene, we compute the percentage of cell types in which that gene is upregulated or downregulated. (Here, we consider a gene to be non-DE if it is not retained after filtering.).

# Upregulated across most cell types.
up.de <- is.de > 0 & !is.na(is.de)
head(sort(rowMeans(up.de), decreasing=TRUE), 10)
##     LYZ  S100A8  S100A9    IGKC   IGLC2    RGS2   IGLL1 RSL24D1   HINT1 LRRC75A 
##     0.7     0.7     0.6     0.5     0.4     0.3     0.3     0.3     0.3     0.3
# Downregulated across cell types.
down.de <- is.de < 0 & !is.na(is.de)
head(sort(rowMeans(down.de), decreasing=TRUE), 10)
##    DNTT     MDK   SOCS2    CD99    CD74     ID3   TERF2    HHEX SLC35E3     MME 
##     0.6     0.5     0.5     0.4     0.4     0.4     0.4     0.4     0.4     0.4

Identify label-specific DE genes that are significant in ‘c10’ yet not DE in any other label (FDR threshold of 50%).

Plot the top-ranked gene for inspection.

remotely.de <- decideTestsPerLabel(de.results, threshold=0.5)
not.de <- remotely.de==0 | is.na(remotely.de)
# get c10's 'unique.degs':

# 2nd cluster in is.de
cx <- "c10"
other.labels <- setdiff(colnames(not.de), cx)
unique.degs <- is.de[,cx]!=0 & rowMeans(not.de[,other.labels])==1
unique.degs <- names(which(unique.degs))
head(unique.degs)
## [1] "RBM6"   "CD9"    "CELF2"  "LUC7L3" "RTN4"   "ILF3"
# Choosing the top-ranked gene for inspection:
de.inspec <- list()
de.inspec[[cx]] <- de.results[[cx]] 
de.inspec[[cx]] <- de.inspec[[cx]][order(de.inspec[[cx]]$PValue),]
de.inspec[[cx]] <- de.inspec[[cx]][rownames(de.inspec[[cx]]) %in% unique.degs,]

# plot expression of top gene
# use plotExpression()
sizeFactors(summed.filt) <- NULL
plotExpression(logNormCounts(summed.filt), 
    features=rownames(de.inspec[[cx]])[1],
    x="SampleGroup", colour_by="SampleGroup", 
    other_fields="label") + 
    facet_wrap(~label) +
  ggtitle(glue::glue("{cx}: {rownames(de.inspec[[cx]])[1]}"))