Mark Dunning
27/07/2015
You’ll probably see this quote in every stats course you go to
To consult the statistician after an experiment is finished is often merely to ask him to conduct a post mortem examination. He can perhaps say what the experiment died of.”. R.A. Fisher, 1938
If you haven’t designed your experiment properly, then all the Bioinformatics we teach you won’t help: Make friends with your local statistician
?
or help.start()
browseVignettes()
to see package user guides (‘vignettes’).csv
or .txt
file
<-
x <- sqrt(25)
x + 5
## [1] 10
y <- x +5
y
## [1] 10
[]
operatorvec <- c(1,2,3,6)
vec[1]
## [1] 1
vec <- c("A"," B","C","D")
vec[1]
## [1] "A"
vec <- c(1,2,3,6)
vec+2
## [1] 3 4 5 8
vec*2
## [1] 2 4 6 12
mean(vec)
## [1] 3
sum(vec)
## [1] 12
df <- data.frame(A = c(1,2,3,6), B = c(7,8,10,12))
df
## A B
## 1 1 7
## 2 2 8
## 3 3 10
## 4 6 12
Don’t need the same data type in each column
df <- data.frame(A = c(1,2,3,6),
B = month.name[c(7,8,10,12)])
df
## A B
## 1 1 July
## 2 2 August
## 3 3 October
## 4 6 December
read.csv
, read.delim
read.xls
from gdata
for Excel datahead
prints the first few linesdim
will print the dimensionsOnce we have imported our data into R, we can start to explore it
[]
, but need to specify row and column indicesdf[1,2]
## [1] July
## Levels: August December July October
df[2,1]
## [1] 2
Or leave the row or column index blank to get all rows and columns respectively
df[1,]
## A B
## 1 1 July
df[,2]
## [1] July August October December
## Levels: August December July October
Can also subset using the column name - the result is a vector
df$A
## [1] 1 2 3 6
df$B
## [1] July August October December
## Levels: August December July October
c
:
makes a sequence with start and end valueseq
function can also be used
?seq
df[c(1,2,3),]
## A B
## 1 1 July
## 2 2 August
## 3 3 October
df[1:3,]
## A B
## 1 1 July
## 2 2 August
## 3 3 October
example
function (runs example code that is defined for the function)
example(plot)
example(barplot)
example(boxplot)
example(hist)
x <- 1:10
y <- 2*x
plot(x,y)
x <- runif(100)
hist(x)
dd <- data.frame(A=rnorm(100, mean = 2),
B = rnorm(100, mean=5))
boxplot(dd)
plot(x,y,xlab="My X label",ylab="My Y label"
,main="My Title",col="steelblue",pch=16,xlim=c(0,20))
points(x,x,col="red",pch=17)
grid()
abline(0,2,lty=2)
abline(0,1,lty=2)
text(12,10, label="y = x")
text(12,20, label="y = 2x")
<
,>
, ==
, !=
TRUE
or FALSE
logical or boolean valuevalues <- rnorm(10)
values < 0
## [1] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
values > 0
## [1] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
data <- data.frame(Counts = values, Name = rep(c("A","B")))
data
## Counts Name
## 1 -0.47782883 A
## 2 0.50436773 B
## 3 0.55725571 A
## 4 -1.09057047 B
## 5 -0.38772987 A
## 6 -0.06819884 B
## 7 -1.27982231 A
## 8 -0.95813690 B
## 9 -0.94292422 A
## 10 -0.98992467 B
data[values>0,]
## Counts Name
## 2 0.5043677 B
## 3 0.5572557 A
data$Name == "A"
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
data[data$Name == "A",]
## Counts Name
## 1 -0.4778288 A
## 3 0.5572557 A
## 5 -0.3877299 A
## 7 -1.2798223 A
## 9 -0.9429242 A
data[data$Name !="A",]
## Counts Name
## 2 0.50436773 B
## 4 -1.09057047 B
## 6 -0.06819884 B
## 8 -0.95813690 B
## 10 -0.98992467 B
&
and |
when we want all tests, or any test, to be truedata$Name == "A" & values > 0
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
data[data$Name == "A" & values > 0,]
## Counts Name
## 3 0.5572557 A
data$Name == "A" | values > 0
## [1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
data[data$Name == "A" | values > 0,]
## Counts Name
## 1 -0.4778288 A
## 2 0.5043677 B
## 3 0.5572557 A
## 5 -0.3877299 A
## 7 -1.2798223 A
## 9 -0.9429242 A
if (some.condition.holds){
do.this...
} else {
do.this.instead.....
}
file.exists
will return TRUE
or FALSE
if a specified file name could be foundstop
will stop and give a message to the userif(file.exists(myfile)){
data <- read.delim(myfile)
...rest of your code here...
...
} else{
stop("Could not find input file")
}
hist(data[,1])
hist(data[,2])
hist(data[,3])
....
for
loophist(data[,i])
for
loopi <- 1
hist(data[,i])
i <- 2
hist(data[,i])
i <- 3
hist(data[,i])
for
loop{}
will be run for each value of i in turnfor(i in 1:3){
hist(data[,i])
}
for
loop{}
for(i in 1:3){
...process the data....
hist(data[,i])
...customise the plot...
...export...
...etc...
}
install.packages("your.package.name.here")
source("http://www.bioconductor.org/biocLite.R")
biocLite("my.bioc.package")
library
functionlibrary(your.package.name.here)
library(my.bioc.package)
Each package has its own landing page. e.g. http://bioconductor.org/packages/release/bioc/html/beadarray.html. Here you’ll find;
library
function. e.g. library(beadarray)
Recall that data can be read into R using read.csv
, read.delim
, read.table
etc. Several packages provided special modifications of these to read raw data from different manufacturers
limma
for various two-colour platformsaffy
for Affymetrix databeadarray
, lumi
, limma
for Illumina BeadArray dataA dataset may be split into different components
In Bioconductor we will often put these data the same object for easy referencing. The Biobase
package has all the code to do this.
library(Biobase)
data(sample.ExpressionSet)
sample.ExpressionSet
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 26 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: A B ... Z (26 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
accessor
functions are providedevals <- exprs(sample.ExpressionSet)
dim(evals)
## [1] 500 26
evals[1:4,1:3]
## A B C
## AFFX-MurIL2_at 192.7420 85.75330 176.7570
## AFFX-MurIL10_at 97.1370 126.19600 77.9216
## AFFX-MurIL4_at 45.8192 8.83135 33.0632
## AFFX-MurFAS_at 22.5445 3.60093 14.6883
sampleMat <- pData(sample.ExpressionSet)
dim(sampleMat)
## [1] 26 3
head(sampleMat)
## sex type score
## A Female Control 0.75
## B Male Case 0.40
## C Male Control 0.73
## D Male Case 0.42
## E Female Case 0.93
## F Male Control 0.22
ExpressionSet
objects are designed to behave like data frames. e.g. to subset the first 10 genes
sample.ExpressionSet[1:10,]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 10 features, 26 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: A B ... Z (26 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
What does this do?
sample.ExpressionSet[,1:10]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 10 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: A B ... J (10 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
males <- sampleMat[,1] == "Male"
sample.ExpressionSet[,males]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 15 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: B C ... X (15 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
maleData <- sample.ExpressionSet[,males]
sample.ExpressionSet[,
sampleMat$score < 0.5
]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 14 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: B D ... Z (14 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
Recall that several plots can be created from a vector of numerical values
hist(evals[,1])
Or from a data frame
boxplot(evals[,1:5])
One sample against another
plot(evals[,1],evals[,2])
One gene against another
plot(evals[1,],evals[2,])
We often work with M and A values as defined
M <- log2(evals[,1]) - log2(evals[,2])
A <- 0.5*(log2(evals[,1]) + log2(evals[,2]))
plot(A,M)
R
started as a language for statisticians, made by statisticianst.test
wilcox.test
var.test
anova
mygene
## A B C D E F
## 11.069500 -26.100600 14.165500 8.759730 4.473810 9.857120
## G H I J K L
## 2.129690 -3.160350 23.917000 8.841620 -13.306100 6.991630
## M N O P Q R
## 4.625380 -7.571780 23.905600 11.327000 10.738700 12.639800
## S T U V W X
## 9.737230 12.834800 -0.203757 22.000500 12.463800 2.936350
## Y Z
## 10.891500 12.021200
myfactor
## [1] Female Male Male Male Female Male Male Male Female Male
## [11] Male Female Male Male Female Female Female Male Male Female
## [21] Male Female Male Male Female Female
## Levels: Female Male
The tilde (~
) is R’s way of creating a formula
boxplot(mygene~myfactor)
t.test(mygene~myfactor)
##
## Welch Two Sample t-test
##
## data: mygene by myfactor
## t = 3.215, df = 23.216, p-value = 0.003808
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4.020114 18.508597
## sample estimates:
## mean in group Female mean in group Male
## 13.651931 2.387576
hgu95av2.db
which can be installed in the same manner as other Bioconductor packageslibrary(hgu95av2.db)
mget("31553_at",hgu95av2SYMBOL)
## $`31553_at`
## [1] "ZNF460"
mget("31553_at",hgu95av2ENTREZID)
## $`31553_at`
## [1] "10794"
mget("31553_at",hgu95av2GO)
## $`31553_at`
## $`31553_at`$`GO:0006351`
## $`31553_at`$`GO:0006351`$GOID
## [1] "GO:0006351"
##
## $`31553_at`$`GO:0006351`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0006351`$Ontology
## [1] "BP"
##
##
## $`31553_at`$`GO:0006355`
## $`31553_at`$`GO:0006355`$GOID
## [1] "GO:0006355"
##
## $`31553_at`$`GO:0006355`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0006355`$Ontology
## [1] "BP"
##
##
## $`31553_at`$`GO:0005634`
## $`31553_at`$`GO:0005634`$GOID
## [1] "GO:0005634"
##
## $`31553_at`$`GO:0005634`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0005634`$Ontology
## [1] "CC"
##
##
## $`31553_at`$`GO:0003677`
## $`31553_at`$`GO:0003677`$GOID
## [1] "GO:0003677"
##
## $`31553_at`$`GO:0003677`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0003677`$Ontology
## [1] "MF"
##
##
## $`31553_at`$`GO:0046872`
## $`31553_at`$`GO:0046872`$GOID
## [1] "GO:0046872"
##
## $`31553_at`$`GO:0046872`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0046872`$Ontology
## [1] "MF"
##
##
## $`31553_at`$`GO:0005515`
## $`31553_at`$`GO:0005515`$GOID
## [1] "GO:0005515"
##
## $`31553_at`$`GO:0005515`$Evidence
## [1] "IPI"
##
## $`31553_at`$`GO:0005515`$Ontology
## [1] "MF"