Mark Dunning
27/07/2015
You’ll probably see this quote in every stats course you go to
To consult the statistician after an experiment is finished is often merely to ask him to conduct a post mortem examination. He can perhaps say what the experiment died of.”. R.A. Fisher, 1938
If you haven’t designed your experiment properly, then all the Bioinformatics we teach you won’t help: Make friends with your local statistician
? or help.start()browseVignettes() to see package user guides (‘vignettes’).csv or .txt file
<-x <- sqrt(25)
x + 5
## [1] 10
y <- x +5
y
## [1] 10
[] operatorvec <- c(1,2,3,6)
vec[1]
## [1] 1
vec <- c("A"," B","C","D")
vec[1]
## [1] "A"
vec <- c(1,2,3,6)
vec+2
## [1] 3 4 5 8
vec*2
## [1] 2 4 6 12
mean(vec)
## [1] 3
sum(vec)
## [1] 12
df <- data.frame(A = c(1,2,3,6), B = c(7,8,10,12))
df
## A B
## 1 1 7
## 2 2 8
## 3 3 10
## 4 6 12
Don’t need the same data type in each column
df <- data.frame(A = c(1,2,3,6),
B = month.name[c(7,8,10,12)])
df
## A B
## 1 1 July
## 2 2 August
## 3 3 October
## 4 6 December
read.csv, read.delimread.xls from gdata for Excel datahead prints the first few linesdim will print the dimensionsOnce we have imported our data into R, we can start to explore it
[], but need to specify row and column indicesdf[1,2]
## [1] July
## Levels: August December July October
df[2,1]
## [1] 2
Or leave the row or column index blank to get all rows and columns respectively
df[1,]
## A B
## 1 1 July
df[,2]
## [1] July August October December
## Levels: August December July October
Can also subset using the column name - the result is a vector
df$A
## [1] 1 2 3 6
df$B
## [1] July August October December
## Levels: August December July October
c: makes a sequence with start and end valueseq function can also be used
?seqdf[c(1,2,3),]
## A B
## 1 1 July
## 2 2 August
## 3 3 October
df[1:3,]
## A B
## 1 1 July
## 2 2 August
## 3 3 October
example function (runs example code that is defined for the function)
example(plot)example(barplot)example(boxplot)example(hist)x <- 1:10
y <- 2*x
plot(x,y)
x <- runif(100)
hist(x)
dd <- data.frame(A=rnorm(100, mean = 2),
B = rnorm(100, mean=5))
boxplot(dd)
plot(x,y,xlab="My X label",ylab="My Y label"
,main="My Title",col="steelblue",pch=16,xlim=c(0,20))
points(x,x,col="red",pch=17)
grid()
abline(0,2,lty=2)
abline(0,1,lty=2)
text(12,10, label="y = x")
text(12,20, label="y = 2x")
<,>, ==, !=TRUE or FALSE logical or boolean valuevalues <- rnorm(10)
values < 0
## [1] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
values > 0
## [1] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
data <- data.frame(Counts = values, Name = rep(c("A","B")))
data
## Counts Name
## 1 -0.47782883 A
## 2 0.50436773 B
## 3 0.55725571 A
## 4 -1.09057047 B
## 5 -0.38772987 A
## 6 -0.06819884 B
## 7 -1.27982231 A
## 8 -0.95813690 B
## 9 -0.94292422 A
## 10 -0.98992467 B
data[values>0,]
## Counts Name
## 2 0.5043677 B
## 3 0.5572557 A
data$Name == "A"
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
data[data$Name == "A",]
## Counts Name
## 1 -0.4778288 A
## 3 0.5572557 A
## 5 -0.3877299 A
## 7 -1.2798223 A
## 9 -0.9429242 A
data[data$Name !="A",]
## Counts Name
## 2 0.50436773 B
## 4 -1.09057047 B
## 6 -0.06819884 B
## 8 -0.95813690 B
## 10 -0.98992467 B
& and | when we want all tests, or any test, to be truedata$Name == "A" & values > 0
## [1] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
data[data$Name == "A" & values > 0,]
## Counts Name
## 3 0.5572557 A
data$Name == "A" | values > 0
## [1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE
data[data$Name == "A" | values > 0,]
## Counts Name
## 1 -0.4778288 A
## 2 0.5043677 B
## 3 0.5572557 A
## 5 -0.3877299 A
## 7 -1.2798223 A
## 9 -0.9429242 A
if (some.condition.holds){
do.this...
} else {
do.this.instead.....
}
file.exists will return TRUE or FALSE if a specified file name could be foundstop will stop and give a message to the userif(file.exists(myfile)){
data <- read.delim(myfile)
...rest of your code here...
...
} else{
stop("Could not find input file")
}
hist(data[,1])
hist(data[,2])
hist(data[,3])
....
for loophist(data[,i])
for loopi <- 1
hist(data[,i])
i <- 2
hist(data[,i])
i <- 3
hist(data[,i])
for loop{} will be run for each value of i in turnfor(i in 1:3){
hist(data[,i])
}
for loop{}for(i in 1:3){
...process the data....
hist(data[,i])
...customise the plot...
...export...
...etc...
}
install.packages("your.package.name.here")
source("http://www.bioconductor.org/biocLite.R")
biocLite("my.bioc.package")
library functionlibrary(your.package.name.here)
library(my.bioc.package)
Each package has its own landing page. e.g. http://bioconductor.org/packages/release/bioc/html/beadarray.html. Here you’ll find;
library function. e.g. library(beadarray)Recall that data can be read into R using read.csv, read.delim, read.table etc. Several packages provided special modifications of these to read raw data from different manufacturers
limma for various two-colour platformsaffy for Affymetrix databeadarray, lumi, limma for Illumina BeadArray dataA dataset may be split into different components
In Bioconductor we will often put these data the same object for easy referencing. The Biobase package has all the code to do this.
library(Biobase)
data(sample.ExpressionSet)
sample.ExpressionSet
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 26 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: A B ... Z (26 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
accessor functions are providedevals <- exprs(sample.ExpressionSet)
dim(evals)
## [1] 500 26
evals[1:4,1:3]
## A B C
## AFFX-MurIL2_at 192.7420 85.75330 176.7570
## AFFX-MurIL10_at 97.1370 126.19600 77.9216
## AFFX-MurIL4_at 45.8192 8.83135 33.0632
## AFFX-MurFAS_at 22.5445 3.60093 14.6883
sampleMat <- pData(sample.ExpressionSet)
dim(sampleMat)
## [1] 26 3
head(sampleMat)
## sex type score
## A Female Control 0.75
## B Male Case 0.40
## C Male Control 0.73
## D Male Case 0.42
## E Female Case 0.93
## F Male Control 0.22
ExpressionSet objects are designed to behave like data frames. e.g. to subset the first 10 genes
sample.ExpressionSet[1:10,]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 10 features, 26 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: A B ... Z (26 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
What does this do?
sample.ExpressionSet[,1:10]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 10 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: A B ... J (10 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
males <- sampleMat[,1] == "Male"
sample.ExpressionSet[,males]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 15 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: B C ... X (15 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
maleData <- sample.ExpressionSet[,males]
sample.ExpressionSet[,
sampleMat$score < 0.5
]
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 14 samples
## element names: exprs, se.exprs
## protocolData: none
## phenoData
## sampleNames: B D ... Z (14 total)
## varLabels: sex type score
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
Recall that several plots can be created from a vector of numerical values
hist(evals[,1])
Or from a data frame
boxplot(evals[,1:5])
One sample against another
plot(evals[,1],evals[,2])
One gene against another
plot(evals[1,],evals[2,])
We often work with M and A values as defined
M <- log2(evals[,1]) - log2(evals[,2])
A <- 0.5*(log2(evals[,1]) + log2(evals[,2]))
plot(A,M)
R started as a language for statisticians, made by statisticianst.testwilcox.testvar.testanovamygene
## A B C D E F
## 11.069500 -26.100600 14.165500 8.759730 4.473810 9.857120
## G H I J K L
## 2.129690 -3.160350 23.917000 8.841620 -13.306100 6.991630
## M N O P Q R
## 4.625380 -7.571780 23.905600 11.327000 10.738700 12.639800
## S T U V W X
## 9.737230 12.834800 -0.203757 22.000500 12.463800 2.936350
## Y Z
## 10.891500 12.021200
myfactor
## [1] Female Male Male Male Female Male Male Male Female Male
## [11] Male Female Male Male Female Female Female Male Male Female
## [21] Male Female Male Male Female Female
## Levels: Female Male
The tilde (~) is R’s way of creating a formula
boxplot(mygene~myfactor)
t.test(mygene~myfactor)
##
## Welch Two Sample t-test
##
## data: mygene by myfactor
## t = 3.215, df = 23.216, p-value = 0.003808
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4.020114 18.508597
## sample estimates:
## mean in group Female mean in group Male
## 13.651931 2.387576
hgu95av2.db which can be installed in the same manner as other Bioconductor packageslibrary(hgu95av2.db)
mget("31553_at",hgu95av2SYMBOL)
## $`31553_at`
## [1] "ZNF460"
mget("31553_at",hgu95av2ENTREZID)
## $`31553_at`
## [1] "10794"
mget("31553_at",hgu95av2GO)
## $`31553_at`
## $`31553_at`$`GO:0006351`
## $`31553_at`$`GO:0006351`$GOID
## [1] "GO:0006351"
##
## $`31553_at`$`GO:0006351`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0006351`$Ontology
## [1] "BP"
##
##
## $`31553_at`$`GO:0006355`
## $`31553_at`$`GO:0006355`$GOID
## [1] "GO:0006355"
##
## $`31553_at`$`GO:0006355`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0006355`$Ontology
## [1] "BP"
##
##
## $`31553_at`$`GO:0005634`
## $`31553_at`$`GO:0005634`$GOID
## [1] "GO:0005634"
##
## $`31553_at`$`GO:0005634`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0005634`$Ontology
## [1] "CC"
##
##
## $`31553_at`$`GO:0003677`
## $`31553_at`$`GO:0003677`$GOID
## [1] "GO:0003677"
##
## $`31553_at`$`GO:0003677`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0003677`$Ontology
## [1] "MF"
##
##
## $`31553_at`$`GO:0046872`
## $`31553_at`$`GO:0046872`$GOID
## [1] "GO:0046872"
##
## $`31553_at`$`GO:0046872`$Evidence
## [1] "IEA"
##
## $`31553_at`$`GO:0046872`$Ontology
## [1] "MF"
##
##
## $`31553_at`$`GO:0005515`
## $`31553_at`$`GO:0005515`$GOID
## [1] "GO:0005515"
##
## $`31553_at`$`GO:0005515`$Evidence
## [1] "IPI"
##
## $`31553_at`$`GO:0005515`$Ontology
## [1] "MF"