Section 1: Contrast matrices

One 3-level factor:
Two categorical predictors:
Two categorical predictors:

Section 2: DESeq2

Introduction slide
Section 2 slides dedicated to dispersion

Section 3: Large Scale Hypothesis testing: FDR

Section 1: Contrast matrices

One 3-level factor:

one3levelfactor = data.frame(condition =
rep(c("TreatmentA", "TreatmentB", "Control"), 2))
# model without intercept and default levels:
model.matrix(~ condition - 1, data = one3levelfactor)

##   conditionControl conditionTreatmentA conditionTreatmentB
## 1                0                   1                   0
## 2                0                   0                   1
## 3                1                   0                   0
## 4                0                   1                   0
## 5                0                   0                   1
## 6                1                   0                   0
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"

# model with intercept and default levels
model.matrix(~ condition, data = one3levelfactor)

##   (Intercept) conditionTreatmentA conditionTreatmentB
## 1           1                   1                   0
## 2           1                   0                   1
## 3           1                   0                   0
## 4           1                   1                   0
## 5           1                   0                   1
## 6           1                   0                   0
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"

# model with intercept and self-defined levels
levels(one3levelfactor$condition)

## [1] "Control"    "TreatmentA" "TreatmentB"

levels(one3levelfactor$condition) = c("TreatmentB", "TreatmentA", "Control") 
model.matrix(~ condition, data = one3levelfactor)

##   (Intercept) conditionTreatmentA conditionControl
## 1           1                   1                0
## 2           1                   0                1
## 3           1                   0                0
## 4           1                   1                0
## 5           1                   0                1
## 6           1                   0                0
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"

Two categorical predictors:

# create dataset
two2levelfactor = data.frame(treatment = rep(c("TreatA","NoTreat"),4), er = rep(c("+","-"),each=4))
# design matrix without interaction
model.matrix(~ treatment + er, data=two2levelfactor)

##   (Intercept) treatmentTreatA er+
## 1           1               1   1
## 2           1               0   1
## 3           1               1   1
## 4           1               0   1
## 5           1               1   0
## 6           1               0   0
## 7           1               1   0
## 8           1               0   0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$er
## [1] "contr.treatment"

# design matrix with interaction
model.matrix(~ treatment + er + treatment:er, data=two2levelfactor)

##   (Intercept) treatmentTreatA er+ treatmentTreatA:er+
## 1           1               1   1                   1
## 2           1               0   1                   0
## 3           1               1   1                   1
## 4           1               0   1                   0
## 5           1               1   0                   0
## 6           1               0   0                   0
## 7           1               1   0                   0
## 8           1               0   0                   0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$er
## [1] "contr.treatment"

model.matrix(~ treatment * er, data=two2levelfactor)

##   (Intercept) treatmentTreatA er+ treatmentTreatA:er+
## 1           1               1   1                   1
## 2           1               0   1                   0
## 3           1               1   1                   1
## 4           1               0   1                   0
## 5           1               1   0                   0
## 6           1               0   0                   0
## 7           1               1   0                   0
## 8           1               0   0                   0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$er
## [1] "contr.treatment"

Two categorical predictors:

# create dataset
two2levelfactor = data.frame(treatment = rep(c("TreatA","NoTreat"),4), er = rep(c("+","-"),each=4))
# design matrix without interaction
model.matrix(~ treatment + er, data=two2levelfactor)

##   (Intercept) treatmentTreatA er+
## 1           1               1   1
## 2           1               0   1
## 3           1               1   1
## 4           1               0   1
## 5           1               1   0
## 6           1               0   0
## 7           1               1   0
## 8           1               0   0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$er
## [1] "contr.treatment"

# design matrix with interaction
model.matrix(~ treatment + er + treatment:er, data=two2levelfactor)

##   (Intercept) treatmentTreatA er+ treatmentTreatA:er+
## 1           1               1   1                   1
## 2           1               0   1                   0
## 3           1               1   1                   1
## 4           1               0   1                   0
## 5           1               1   0                   0
## 6           1               0   0                   0
## 7           1               1   0                   0
## 8           1               0   0                   0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$er
## [1] "contr.treatment"

model.matrix(~ treatment * er, data=two2levelfactor)

##   (Intercept) treatmentTreatA er+ treatmentTreatA:er+
## 1           1               1   1                   1
## 2           1               0   1                   0
## 3           1               1   1                   1
## 4           1               0   1                   0
## 5           1               1   0                   0
## 6           1               0   0                   0
## 7           1               1   0                   0
## 8           1               0   0                   0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
## 
## attr(,"contrasts")$er
## [1] "contr.treatment"

Section 2: DESeq2

Introduction slide

Let’s generate

cnts, a toy matrix of counts of 1000 genes for 20 samples,
cond, a vector indicating to which condition each sample belongs (1 for treatment 1, 2 for treatment 2),

set.seed(777)
cnts <- matrix(rnbinom(n=20000, mu=100, size=1/.25), ncol=20)
cond <- factor(rep(1:2, each=10))

Let’s

combine the count matrix, the sample information and the assumed model in an object of class DESeqDataSet,
perform the DE analysis via the function DESeq
print the results

library(DESeq2)
dds <- DESeqDataSetFromMatrix(cnts, DataFrame(cond), ~ cond)
dds <- DESeq(dds)
results(dds)

## log2 fold change (MLE): cond 2 vs 1 
## Wald test p-value: cond 2 vs 1 
## DataFrame with 1000 rows and 6 columns
##              baseMean      log2FoldChange             lfcSE
##             <numeric>           <numeric>         <numeric>
## 1    97.3140070275624  -0.682067456083922  0.34452548280397
## 2    109.986001522326  -0.228819342511955 0.450719570130495
## 3    98.8111339049887   0.104290983234424 0.462113448191991
## 4    103.261468650616   0.306399638856188 0.297682184947304
## 5    97.9405986819028   0.316338122846712 0.357241885530114
## ...               ...                 ...               ...
## 996   86.805727240247    0.04677025495103 0.287042149795611
## 997  101.443692661503  -0.207080563623306 0.339886451284232
## 998  78.1356102563776  -0.637278950950912  0.36951488929412
## 999  89.2919596938848   0.755472532550507 0.306192302512061
## 1000 103.556924422375 -0.0728875235997886 0.348655022016602
##                    stat             pvalue              padj
##               <numeric>          <numeric>         <numeric>
## 1     -1.97973006389199 0.0477338685071651 0.745841695424454
## 2    -0.507675631758581  0.611680840528566 0.944353784116972
## 3     0.225682640577678  0.821448260598229  0.97838220015929
## 4      1.02928443269263  0.303346036357983 0.944353784116972
## 5     0.885501212651186  0.375886365866081 0.944353784116972
## ...                 ...                ...               ...
## 996   0.162938631083738  0.870566754045311 0.980044486284683
## 997  -0.609263955185239  0.542349494692315 0.944353784116972
## 998   -1.72463673160315  0.084592959500015 0.824309867118894
## 999    2.46731392772602  0.013613095213581 0.614613332611614
## 1000 -0.209053416693129  0.834406539253083  0.97838220015929

Section 2 slides dedicated to dispersion

Let’s print the relevant information to deduce the estimated NB distribution assumed for each gene and condition:

mcols(dds)[,c("Intercept","cond_2_vs_1","dispGeneEst","dispFit","dispersion")]

## DataFrame with 1000 rows and 5 columns
##             Intercept         cond_2_vs_1       dispGeneEst
##             <numeric>           <numeric>         <numeric>
## 1    6.90565301909102  -0.682067456083922 0.294082154716229
## 2    6.89102316136555  -0.228819342511955 0.479230606512793
## 3    6.57355070389213   0.104290983234424 0.503276086973451
## 4    6.52875472002919   0.306399638856188 0.189798622917205
## 5    6.44716073806192   0.316338122846712 0.327392889229233
## ...               ...                 ...               ...
## 996  6.41604810736421    0.04677025495103 0.167776129518199
## 997  6.76441531168783  -0.207080563623306   0.2825326850941
## 998  6.57183941992272  -0.637278950950912 0.363912466734467
## 999  6.05379830726471   0.755472532550507  0.20664384771742
## 1000 6.73029484971501 -0.0728875235997886 0.304930318666185
##                dispFit        dispersion
##              <numeric>         <numeric>
## 1    0.234623522081347 0.274708262519391
## 2    0.230524862455347 0.479230606512793
## 3    0.235958890942538 0.503276086973451
## 4    0.235749211622816 0.203479032667289
## 5    0.235236483769548 0.296668289860503
## ...                ...               ...
## 996  0.230672517849539 0.186869944860045
## 997   0.23678896943264 0.268003091126513
## 998   0.22268683529524 0.315104699589792
## 999  0.229561562524407 0.213730041838963
## 1000 0.235483221659877 0.282744569338977

Let’s reproduce the plot showing the fitted probability mass functions per condition for gene 1:

axe.x = seq(0,400)
f.x1  = dnbinom(axe.x, mu=2^6.90565, size=1/0.274708)
f.x2  = dnbinom(axe.x, mu=2^(6.90565-0.682067), size=1/0.274708)

par(mfrow=c(1,1),mar=c(4,4,0,0))
ylimw = max(c(f.x1,f.x2))
plot(1,1,ylim=c(0,ylimw),xlim=c(0,max(axe.x)),pch="",xlab="Counts",ylab="Probability",
     axes=FALSE)
lines(axe.x,f.x1,col=.cruk$col[1])
lines(axe.x,f.x2,col=.cruk$col[3])
axis(1,pos=0)
axis(2,las=2,pos=0)
legend("topright",bg="light gray",lty=1,col=.cruk$col[c(1,3)],
       legend=c("Condition 1","Condition 2"),title="Estimated distributions",box.lwd=NA)
abline(v=2^6.90565,col=.cruk$col[1],lty=3)
abline(v=2^(6.90565-0.682067),col=.cruk$col[3],lty=3)

Section 3: Large Scale Hypothesis testing: FDR

When we are doing thousands of tests for differential expression, the overall significance level of a test is very difficult to control. Let’s see why: First, we simulate 40,000 genes not differentially expressed (with a mean of zero). We assume that we have 10 replicates of this experiment:

X <- matrix(rnorm(N* R, 0, 1), nrow=N)

Now we assume that we run a t-test under the null hypothesis that the mean is zero for each of these genes, that is each row in the matrix:

t.test(X[1,])$p.value

## [1] 0.5723298

pvals <- apply(X, 1, function(y) t.test(y)$p.value)

Because we have generated this data with mean zero, we know that none of these genes are differentially expressed, so we would like to be able to not reject any of the hypothesis. However, if you choose a significance level of 0.05 we get

sum(pvals<0.05)

## [1] 1967

Too many rejections!!! In fact, if we look at the distributions of the p-values obtained we get:

hist(pvals)

That is, if the null hypothesis is true, the p-values will follow a uniform distribution. This is the key to all methods that aim to control the proportion of false positives amongs the genes that we call differentially expressed. Let’s add 1000 genes to our set that are really differentially expressed (mean of 1):

Y <- matrix(rnorm(df* R, 1, 1), nrow=df)
Z <- rbind(X, Y)
pvals <- apply(Z, 1, function(y) t.test(y)$p.value)

plot(pvals,col=rep(1:2,c(40000,1000)))

plot(p.adjust(pvals, method="BH"),col=rep(1:2,c(40000,1000)))

tapply(p.adjust(pvals, method="BH")<0.05,rep(1:2,c(40000,1000)),mean)

##     1     2 
## 0.000 0.002

Let’s look at the distribution of p-values now:

hist(pvals)

What would be the number of false positives now? How many would we expect if we reject p-values samller than our significance level, 0.05?

exp.sig<- (nrow(Z))*0.05
obs.sig <- sum(pvals<0.05)
FDR <- exp.sig / obs.sig

## [1] 0.7337151

We can compare this with the Benjamini-Hochberg method:

pvals.adj <- p.adjust(pvals, method="BH")
plot(pvals, pvals.adj)
abline(v=0.05, col=2)

Statistical analysis of RNAseq data

D.-L. Couturier and O. Rueda

Last modified: 20 May 2020

Section 1: Contrast matrices

One 3-level factor:

Two categorical predictors:

Two categorical predictors:

Section 2: DESeq2

Introduction slide

Section 2 slides dedicated to dispersion

Section 3: Large Scale Hypothesis testing: FDR