one2levelfactor = data.frame(
sample = 1:6,
condition = rep(c("TreatmentA", "Control"), 3),
stringsAsFactors = TRUE)
# model without intercept and default levels:
X1 = model.matrix(~ condition - 1, data = one2levelfactor)
X1
## conditionControl conditionTreatmentA
## 1 0 1
## 2 1 0
## 3 0 1
## 4 1 0
## 5 0 1
## 6 1 0
## attr(,"assign")
## [1] 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"
# model with intercept and default levels
X2 = model.matrix(~ condition, data = one2levelfactor)
X2
## (Intercept) conditionTreatmentA
## 1 1 1
## 2 1 0
## 3 1 1
## 4 1 0
## 5 1 1
## 6 1 0
## attr(,"assign")
## [1] 0 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"
# matrix multiplication: let s assume a mean of 2 for control and of 3 for group A
X1%*%c(2,3)
## [,1]
## 1 3
## 2 2
## 3 3
## 4 2
## 5 3
## 6 2
X2%*%c(2,1)
## [,1]
## 1 3
## 2 2
## 3 3
## 4 2
## 5 3
## 6 2
one3levelfactor = data.frame(
sample = 1:6,
condition = rep(c("TreatmentA", "TreatmentB", "Control"), 2),
stringsAsFactors = TRUE)
# model without intercept and default levels:
X1 = model.matrix(~ condition - 1, data = one3levelfactor)
X1
## conditionControl conditionTreatmentA conditionTreatmentB
## 1 0 1 0
## 2 0 0 1
## 3 1 0 0
## 4 0 1 0
## 5 0 0 1
## 6 1 0 0
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"
# model with intercept and default levels
X2 = model.matrix(~ condition, data = one3levelfactor)
X2
## (Intercept) conditionTreatmentA conditionTreatmentB
## 1 1 1 0
## 2 1 0 1
## 3 1 0 0
## 4 1 1 0
## 5 1 0 1
## 6 1 0 0
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition
## [1] "contr.treatment"
# model with intercept and self-defined levels
one3levelfactor$condition2 = factor(one3levelfactor$condition,
levels=c("TreatmentB","TreatmentA","Control"))
X3 = model.matrix(~ condition2, data = one3levelfactor)
X3
## (Intercept) condition2TreatmentA condition2Control
## 1 1 1 0
## 2 1 0 0
## 3 1 0 1
## 4 1 1 0
## 5 1 0 0
## 6 1 0 1
## attr(,"assign")
## [1] 0 1 1
## attr(,"contrasts")
## attr(,"contrasts")$condition2
## [1] "contr.treatment"
# matrix multiplication: let s assume a mean of
# > 2 for control
# > 3 for group A
# > 4 for group B
# ! CHALLENGE !
# ! FIND the values of the BETA vector corresponding to X1, X2 and X3 !
# create dataset
two2levelfactor = data.frame(
sample = 1:8,
treatment = rep(c("TreatA","NoTreat"),4),
er = rep(c("+","-"),each=4),
stringsAsFactors = TRUE)
# design matrix without interaction
X1 = model.matrix(~ treatment + er, data=two2levelfactor)
X1
## (Intercept) treatmentTreatA er+
## 1 1 1 1
## 2 1 0 1
## 3 1 1 1
## 4 1 0 1
## 5 1 1 0
## 6 1 0 0
## 7 1 1 0
## 8 1 0 0
## attr(,"assign")
## [1] 0 1 2
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
##
## attr(,"contrasts")$er
## [1] "contr.treatment"
# design matrix with interaction
X2 = model.matrix(~ treatment * er, data=two2levelfactor)
X2
## (Intercept) treatmentTreatA er+ treatmentTreatA:er+
## 1 1 1 1 1
## 2 1 0 1 0
## 3 1 1 1 1
## 4 1 0 1 0
## 5 1 1 0 0
## 6 1 0 0 0
## 7 1 1 0 0
## 8 1 0 0 0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
##
## attr(,"contrasts")$er
## [1] "contr.treatment"
model.matrix(~ treatment + er + treatment:er, data=two2levelfactor)
## (Intercept) treatmentTreatA er+ treatmentTreatA:er+
## 1 1 1 1 1
## 2 1 0 1 0
## 3 1 1 1 1
## 4 1 0 1 0
## 5 1 1 0 0
## 6 1 0 0 0
## 7 1 1 0 0
## 8 1 0 0 0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$treatment
## [1] "contr.treatment"
##
## attr(,"contrasts")$er
## [1] "contr.treatment"
# matrix multiplication: let s assume a mean of
# > 2 for control and ER-
# > 3 for group A and ER-
# > 4 for control and ER+
# > 6 for group A and ER+
# ! CHALLENGE !
# ! FIND the values of the BETA vector corresponding to X1, X2 and X3 !
Let's generate
set.seed(777)
cnts <- matrix(rnbinom(n=20000, mu=100, size=1/.25), ncol=20)
cond <- factor(rep(1:2, each=10))
Let's
library(DESeq2)
dds <- DESeqDataSetFromMatrix(cnts, DataFrame(cond), ~ cond)
dds <- DESeq(dds)
results(dds)
## log2 fold change (MLE): cond 2 vs 1
## Wald test p-value: cond 2 vs 1
## DataFrame with 1000 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## 1 97.3140 -0.682067 0.344525 -1.979730 0.0477339 0.745842
## 2 109.9860 -0.228819 0.450720 -0.507676 0.6116808 0.944354
## 3 98.8111 0.104291 0.462113 0.225683 0.8214483 0.978382
## 4 103.2615 0.306400 0.297682 1.029284 0.3033460 0.944354
## 5 97.9406 0.316338 0.357242 0.885501 0.3758864 0.944354
## ... ... ... ... ... ... ...
## 996 86.8057 0.0467703 0.287042 0.162939 0.8705668 0.980044
## 997 101.4437 -0.2070806 0.339886 -0.609264 0.5423495 0.944354
## 998 78.1356 -0.6372790 0.369515 -1.724637 0.0845930 0.824310
## 999 89.2920 0.7554725 0.306192 2.467314 0.0136131 0.614613
## 1000 103.5569 -0.0728875 0.348655 -0.209053 0.8344065 0.978382
Let's print the relevant information to deduce the estimated NB distribution assumed for each gene and condition:
mcols(dds)[,c("Intercept","cond_2_vs_1","dispGeneEst","dispFit","dispersion")]
## DataFrame with 1000 rows and 5 columns
## Intercept cond_2_vs_1 dispGeneEst dispFit dispersion
## <numeric> <numeric> <numeric> <numeric> <numeric>
## 1 6.90565 -0.682067 0.294082 0.234624 0.274708
## 2 6.89102 -0.228819 0.479231 0.230525 0.479231
## 3 6.57355 0.104291 0.503276 0.235959 0.503276
## 4 6.52875 0.306400 0.189799 0.235749 0.203479
## 5 6.44716 0.316338 0.327393 0.235236 0.296668
## ... ... ... ... ... ...
## 996 6.41605 0.0467703 0.167776 0.230673 0.186870
## 997 6.76442 -0.2070806 0.282533 0.236789 0.268003
## 998 6.57184 -0.6372790 0.363912 0.222687 0.315105
## 999 6.05380 0.7554725 0.206644 0.229562 0.213730
## 1000 6.73029 -0.0728875 0.304930 0.235483 0.282745
Let's reproduce the plot showing the fitted probability mass functions per condition for gene 1:
axe.x = seq(0,400)
f.x1 = dnbinom(axe.x, mu=2^6.90565, size=1/0.274708)
f.x2 = dnbinom(axe.x, mu=2^(6.90565-0.682067), size=1/0.274708)
par(mfrow=c(1,1),mar=c(4,4,0,0))
ylimw = max(c(f.x1,f.x2))
plot(1,1,ylim=c(0,ylimw),xlim=c(0,max(axe.x)),pch="",xlab="Counts",ylab="Probability",
axes=FALSE)
lines(axe.x,f.x1,col=.cruk$col[1])
lines(axe.x,f.x2,col=.cruk$col[3])
axis(1,pos=0)
axis(2,las=2,pos=0)
legend("topright",bg="light gray",lty=1,col=.cruk$col[c(1,3)],
legend=c("Condition 1","Condition 2"),title="Estimated distributions",box.lwd=NA)
abline(v=2^6.90565,col=.cruk$col[1],lty=3)
abline(v=2^(6.90565-0.682067),col=.cruk$col[3],lty=3)
When we are doing thousands of tests for differential expression, the overall significance level of a test is very difficult to control. Let's see why: First, we simulate 40,000 genes not differentially expressed (with a mean of zero). We assume that we have 10 replicates of this experiment:
N <- 40000
R <- 10
X <- matrix(rnorm(N* R, 0, 1), nrow=N)
Now we assume that we run a t-test under the null hypothesis that the mean is zero for each of these genes, that is each row in the matrix:
t.test(X[1,])$p.value
## [1] 0.5723298
pvals <- apply(X, 1, function(y) t.test(y)$p.value)
Because we have generated this data with mean zero, we know that none of these genes are differentially expressed, so we would like to be able to not reject any of the hypothesis. However, if you choose a significance level of 0.05 we get
sum(pvals<0.05)
## [1] 1967
Too many rejections!!! In fact, if we look at the distributions of the p-values obtained we get:
hist(pvals)
That is, if the null hypothesis is true, the p-values will follow a uniform distribution. This is the key to all methods that aim to control the proportion of false positives amongs the genes that we call differentially expressed. Let's add 1000 genes to our set that are really differentially expressed (mean of 1):
df <- 1000
Y <- matrix(rnorm(df* R, 1, 1), nrow=df)
Z <- rbind(X, Y)
pvals <- apply(Z, 1, function(y) t.test(y)$p.value)
#
plot(pvals,col=rep(1:2,c(40000,1000)))
plot(p.adjust(pvals, method="BH"),col=rep(1:2,c(40000,1000)))
#
tapply(p.adjust(pvals, method="BH")<0.05,rep(1:2,c(40000,1000)),mean)
## 1 2
## 0.000 0.002
Let's look at the distribution of p-values now:
hist(pvals)
What would be the number of false positives now? How many would we expect if we reject p-values samller than our significance level, 0.05?
exp.sig<- (nrow(Z))*0.05
obs.sig <- sum(pvals<0.05)
FDR <- exp.sig / obs.sig
FDR
## [1] 0.7337151
We can compare this with the Benjamini-Hochberg method:
pvals.adj <- p.adjust(pvals, method="BH")
plot(pvals, pvals.adj)
abline(v=0.05, col=2)