1 Differential expression and abundance between conditions

splSetToGet <- "PBMMC,ETV6-RUNX1"
splSetVec <- unlist(strsplit(splSetToGet, ","))
splSetToGet2 <- gsub(",", "_", splSetToGet)
nbPcToComp <- 50
figSize <- 7
library(scater)
library(scran)
library(batchelor)
library(edgeR)
library(tidyverse)
library(patchwork)
library(DT)
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))

Source: Multi-sample comparisons of the OSCA book.

1.1 Exercise 2 - differential abundance

Imagine ETV6-RUNX1_4 failed, leaving you with three ETV6-RUNX1 replicates … but all else remains as above, including the clusters identified.

Identify clusters whose abundance differ between conditions.

1.2 Setting up the data

Load the SCE object (with 1200 cells per sample):

# Read object in:
merged <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged.Rds")
# also get raw counts that were written to a separate file
# (to help file sharing)
merged_counts <- readRDS("../Robjects/caron_sce_nz_postDeconv_1p2kcps_dsi_PBMMC_ETV6-RUNX1_merged_counts.Rds")
# put raw counts back:
counts(merged) <- merged_counts
# tidy:
rm(merged_counts)
bcToKeep <- colData(merged) %>%
  data.frame() %>%
  filter(!SampleName == "ETV6-RUNX1_4") %>%
  pull(Barcode)
indToKeep <- which(colnames(merged) %in% bcToKeep)
merged <- merged[,indToKeep]

merged$SampleName <- factor(merged$SampleName)

A brief inspection of the results shows clusters contain varying contributions from samples:

colLabels(merged) <- merged$clusters.mnn
tab <- table(colLabels(merged), merged$SampleName)
tab
##      
##       ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 PBMMC_1 PBMMC_3 PBMMC_4
##   c1           159            6            1      82      26      40
##   c2           310           94           67       7       2       9
##   c3           275          465          191      17       9      39
##   c4            35          324          213       6       1      30
##   c5             0            4            1      10      16      10
##   c6             3           13          108      55     112     108
##   c7           375          189           73     214      47     106
##   c8             2            4           50       5     273      20
##   c9             0            0            2     254      51       3
##   c10           34           61           22     205      55     202
##   c11            1           19          107      32     106     116
##   c12            5           18          333     111     225     253
##   c13            0            2           27      34     149      41
##   c14            1            1            5     168     128     223

Count cells assigned to each cluster:

abundances <- table(merged$clusters.mnn, merged$SampleName) 
abundances <- unclass(abundances) 
head(abundances)
##     
##      ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 PBMMC_1 PBMMC_3 PBMMC_4
##   c1          159            6            1      82      26      40
##   c2          310           94           67       7       2       9
##   c3          275          465          191      17       9      39
##   c4           35          324          213       6       1      30
##   c5            0            4            1      10      16      10
##   c6            3           13          108      55     112     108

Prepare edgeR object

# think 'DGEList'
extra.info <- colData(merged)[match(colnames(abundances), merged$SampleName),]
y.ab <- DGEList(abundances, samples=extra.info)
y.ab
## An object of class "DGEList"
## $counts
##     
##      ETV6-RUNX1_1 ETV6-RUNX1_2 ETV6-RUNX1_3 PBMMC_1 PBMMC_3 PBMMC_4
##   c1          159            6            1      82      26      40
##   c2          310           94           67       7       2       9
##   c3          275          465          191      17       9      39
##   c4           35          324          213       6       1      30
##   c5            0            4            1      10      16      10
## 9 more rows ...
## 
## $samples
##              group lib.size norm.factors        batch
## ETV6-RUNX1_1     1     1200            1 ETV6-RUNX1_1
## ETV6-RUNX1_2     1     1200            1 ETV6-RUNX1_2
## ETV6-RUNX1_3     1     1200            1 ETV6-RUNX1_3
## PBMMC_1          1     1200            1      PBMMC_1
## PBMMC_3          1     1200            1      PBMMC_3
## PBMMC_4          1     1200            1      PBMMC_4
##                                                                                                                                                         Sample
## ETV6-RUNX1_1 /ssd/personal/baller01/20200511_FernandesM_ME_crukBiSs2020/Data/CaronBourque2020/grch38300/SRR9264343/SRR9264343/outs/filtered_feature_bc_matrix/
## ETV6-RUNX1_2 /ssd/personal/baller01/20200511_FernandesM_ME_crukBiSs2020/Data/CaronBourque2020/grch38300/SRR9264344/SRR9264344/outs/filtered_feature_bc_matrix/
## ETV6-RUNX1_3 /ssd/personal/baller01/20200511_FernandesM_ME_crukBiSs2020/Data/CaronBourque2020/grch38300/SRR9264345/SRR9264345/outs/filtered_feature_bc_matrix/
## PBMMC_1      /ssd/personal/baller01/20200511_FernandesM_ME_crukBiSs2020/Data/CaronBourque2020/grch38300/SRR9264352/SRR9264352/outs/filtered_feature_bc_matrix/
## PBMMC_3      /ssd/personal/baller01/20200511_FernandesM_ME_crukBiSs2020/Data/CaronBourque2020/grch38300/SRR9264353/SRR9264353/outs/filtered_feature_bc_matrix/
## PBMMC_4      /ssd/personal/baller01/20200511_FernandesM_ME_crukBiSs2020/Data/CaronBourque2020/grch38300/SRR9264354/SRR9264354/outs/filtered_feature_bc_matrix/
##                          Barcode        Run Sample.Name source_name  sum
## ETV6-RUNX1_1  GGACGTCCATCGACGC-1 SRR9264343  GSM3872434  ETV6-RUNX1 2887
## ETV6-RUNX1_2  GTACGTAAGTCAATAG-2 SRR9264344  GSM3872435  ETV6-RUNX1 4759
## ETV6-RUNX1_3  CCTCAGTAGAGCTGCA-3 SRR9264345  GSM3872436  ETV6-RUNX1 2357
## PBMMC_1      CCGTACTGTAGGCATG-10 SRR9264352  GSM3872442       PBMMC 4156
## PBMMC_3      CAGGTGCAGCGCCTTG-11 SRR9264353  GSM3872443       PBMMC 2744
## PBMMC_4      GTTTCTAGTGAGGGTT-12 SRR9264354  GSM3872444       PBMMC 3893
##              detected subsets_Mito_sum subsets_Mito_detected
## ETV6-RUNX1_1     1266              196                    12
## ETV6-RUNX1_2     2012              118                    11
## ETV6-RUNX1_3      853               74                     9
## PBMMC_1           816               33                     8
## PBMMC_3           893               99                    11
## PBMMC_4          1502              241                    10
##              subsets_Mito_percent total      block setName discard outlier
## ETV6-RUNX1_1            6.7890544  2887 ETV6-RUNX1   Caron   FALSE   FALSE
## ETV6-RUNX1_2            2.4795125  4759 ETV6-RUNX1   Caron   FALSE   FALSE
## ETV6-RUNX1_3            3.1395842  2357 ETV6-RUNX1   Caron   FALSE   FALSE
## PBMMC_1                 0.7940327  4156      PBMMC   Caron   FALSE   FALSE
## PBMMC_3                 3.6078717  2744      PBMMC   Caron   FALSE   FALSE
## PBMMC_4                 6.1905985  3893      PBMMC   Caron   FALSE   FALSE
##              cell_sparsity sizeFactor Sample.Name2 label   SampleName
## ETV6-RUNX1_1     0.9413319  0.9317278 ETV6-RUNX1_1    c3 ETV6-RUNX1_1
## ETV6-RUNX1_2     0.9067612  1.7736537 ETV6-RUNX1_2    c2 ETV6-RUNX1_2
## ETV6-RUNX1_3     0.9604708  0.5814998 ETV6-RUNX1_3   c12 ETV6-RUNX1_3
## PBMMC_1          0.9621855  0.5579368      PBMMC_1    c5      PBMMC_1
## PBMMC_3          0.9586172  0.5643228      PBMMC_3   c12      PBMMC_3
## PBMMC_4          0.9303953  1.1336564      PBMMC_4   c14      PBMMC_4
##              SampleGroup clusters.mnn
## ETV6-RUNX1_1  ETV6-RUNX1           c3
## ETV6-RUNX1_2  ETV6-RUNX1           c2
## ETV6-RUNX1_3  ETV6-RUNX1          c12
## PBMMC_1            PBMMC           c5
## PBMMC_3            PBMMC          c12
## PBMMC_4            PBMMC          c14

Filter out low-abundance labels

# think 'filterByExpr'
keep <- filterByExpr(y.ab, group=y.ab$samples$SampleGroup)
y.ab <- y.ab[keep,]
summary(keep)
##    Mode   FALSE    TRUE 
## logical       1      13

Make design matrix

design <- model.matrix(~factor(SampleGroup), y.ab$samples)

Estimate the NB dispersion

# think 'estimateDisp'
y.ab <- estimateDisp(y.ab, design, trend="none")

Estimate the QL dispersion

# think 'glmQLFit'
fit.ab <- glmQLFit(y.ab, design, robust=TRUE, abundance.trend=FALSE)

Test for differences in abundance between sample groups using glmQLFTest

res <- glmQLFTest(fit.ab, coef=ncol(design))

Get summary

# think 'decideTests'
summary(decideTests(res))
##        factor(SampleGroup)PBMMC
## Down                          1
## NotSig                       11
## Up                            1

Display whole table:

# think 'topTags'
topTags(res)
## Coefficient:  factor(SampleGroup)PBMMC 
##          logFC   logCPM          F       PValue        FDR
## c14  6.1379897 16.18445 12.5003158 0.0008647146 0.01124129
## c2  -4.6810591 16.08165  8.4873772 0.0052604443 0.03419289
## c4  -3.9368152 16.39144  6.4877527 0.0138598745 0.05037579
## c3  -3.8325512 17.09027  6.2642705 0.0155002433 0.05037579
## c13  2.9332512 15.16283  3.8958438 0.0537296832 0.13969718
## c5   2.7586106 12.84091  3.1687027 0.0809052505 0.17529471
## c8   2.4039992 15.62869  2.7508652 0.1032226412 0.19169919
## c10  1.9779382 16.32000  1.9479700 0.1687371320 0.27419784
## c6   1.1467011 15.79598  0.6827729 0.4124088822 0.59570172
## c11  0.9978747 15.73137  0.5193625 0.4743411764 0.61664353