# Load the cleaned patient table, keeping only the two columns needed to
# compute BMI. Selecting Height and Weight by name already leaves every other
# column (including BMI) out, so no explicit `-BMI` exclusion is needed.
patients <- read_tsv("data/patient-data-cleaned.txt") %>%
  select(Height, Weight)
Let’s look at a tidyverse function definition
top_n
function
So the general syntax is:
# Template only: Function_Name, arg1..arg3 and Code_Line_* are placeholders
# showing the shape of a definition; this is not meant to be run as-is.
# The name on the left of `<-` is bound to the function; the arguments go
# inside function(...); the body goes between the braces.
Function_Name <- function(arg1, arg2, arg3, ...){
Code_Line_1
Code_Line_2
...
}
Let’s create a simple function. This one has no arguments.
# Minimal example of a function: it takes no arguments and has a single
# side effect, emitting a fixed greeting with message().
my_first_function <- function() {
  greeting <- "Hello World!"
  message(greeting)
}

my_first_function()
## Hello World!
Let’s make it a bit more useful by adding an argument
# Greet someone by name.
#   name: the value placed between "Hello " and "!" in the greeting.
# The greeting is emitted with message(); it is not returned as a value.
sayHello <- function(name) {
  message(
    "Hello ",
    name,
    "!"
  )
}

sayHello("Ash")
## Hello Ash!
sayHello("Matt")
## Hello Matt!
So far our function just printed a message to the console, but most functions return a value.
In R, a function can only return a single object.
Let’s create a function to calculate BMI: $\mathrm{BMI} = \mathrm{weight\,(kg)} / \mathrm{height\,(m)}^2$, giving a value in $\mathrm{kg/m^2}$.
# Body mass index from height and weight.
#   height: height, expected in metres
#   weight: weight, expected in kilograms
# Returns weight divided by height squared; works element-wise on vectors.
bmi <- function(height, weight) {
  height_squared <- height^2
  weight / height_squared
}

bmi(1.75, 78)
## [1] 25.46939
Let’s try that with our patients table.
# Add a BMI column computed from the Height and Weight columns.
# Height is passed through unchanged, so the result is only a real BMI if
# Height is already in metres.
mutate(patients, BMI = bmi(Height, Weight))
## # A tibble: 100 x 3
## Height Weight BMI
## <dbl> <dbl> <dbl>
## 1 183. 76.6 0.00229
## 2 179. 80.4 0.00251
## 3 169. 75.5 0.00264
## # … with 97 more rows
This isn’t right because the height should be in metres. Let’s add some additional arguments to define the units and then some extra code to adjust the values depending on the units provided.
# Body mass index (kg/m^2) with optional unit conversion.
#   height:       numeric height, in the unit given by `height_units`
#   weight:       numeric weight, in the unit given by `weight_units`
#   height_units: "m" (default) or "cm"
#   weight_units: "kg" (default) or "g"
# Returns weight (kg) / height (m)^2, element-wise for vector input.
# Any other unit string is an error: silently treating e.g. inches as metres
# would return a plausible-looking but wrong number.
bmi <- function(height, weight, height_units = "m", weight_units = "kg") {
  if (!isTRUE(height_units %in% c("m", "cm"))) {
    stop("height_units must be \"m\" or \"cm\"", call. = FALSE)
  }
  if (!isTRUE(weight_units %in% c("kg", "g"))) {
    stop("weight_units must be \"kg\" or \"g\"", call. = FALSE)
  }

  # Convert to metres / kilograms before applying the formula.
  if (height_units == "cm") {
    height <- height / 100
  }
  if (weight_units == "g") {
    weight <- weight / 1000
  }

  weight / (height^2)
}

bmi(1.75, 78)
## [1] 25.46939
# Same BMI column as before, but telling bmi() that Height is in centimetres
# so it is converted to metres first.
mutate(patients, BMI = bmi(Height, Weight, height_units = "cm"))
## # A tibble: 100 x 3
## Height Weight BMI
## <dbl> <dbl> <dbl>
## 1 183. 76.6 22.9
## 2 179. 80.4 25.1
## 3 169. 75.5 26.4
## # … with 97 more rows
There is a function, return(), that explicitly returns a value, but we don’t usually need it because a function returns the value of its last expression.
# Body mass index (kg/m^2) with optional unit conversion, written with an
# explicit return() to show the long-hand form.
#   height:       numeric height, in the unit given by `height_units`
#   weight:       numeric weight, in the unit given by `weight_units`
#   height_units: "m" (default) or "cm"
#   weight_units: "kg" (default) or "g"
# Returns weight (kg) / height (m)^2, element-wise for vector input.
# Any other unit string is an error rather than being silently treated as
# metres / kilograms.
bmi <- function(height, weight, height_units = "m", weight_units = "kg") {
  if (!isTRUE(height_units %in% c("m", "cm"))) {
    stop("height_units must be \"m\" or \"cm\"", call. = FALSE)
  }
  if (!isTRUE(weight_units %in% c("kg", "g"))) {
    stop("weight_units must be \"kg\" or \"g\"", call. = FALSE)
  }

  if (height_units == "cm") {
    height <- height / 100
  }
  if (weight_units == "g") {
    weight <- weight / 1000
  }

  # Named differently from the function so the local value does not shadow
  # `bmi` inside its own body.
  bmi_value <- weight / (height^2)
  return(bmi_value)
}

# 165 is a height in centimetres, so the unit has to be stated.
bmi(165, 67, height_units = "cm")
## [1] 24.60973
# Read the tab-separated diabetes table.
diabetes <- read_tsv("data/diabetes.txt")

# Glucose against Date for a single patient, drawn as a blue line.
ggplot(
  data = filter(diabetes, ID == "AC/AH/001"),
  mapping = aes(x = Date, y = Glucose)
) +
  geom_line(colour = "blue") +
  labs(title = "Patient: AC/AH/001")
How can we turn this into a function? Step by step…
# Step 1 towards a function: pull the patient ID out into a variable and
# derive the plot title from it, so the plotting code no longer hard-codes
# either of them.
patientID <- "AC/AH/001"
plotTitle <- str_c("Patient: ", patientID)

ggplot(
  data = filter(diabetes, ID == patientID),
  mapping = aes(x = Date, y = Glucose)
) +
  geom_line(colour = "blue") +
  labs(title = plotTitle)
# Line plot of Glucose against Date for one patient.
#   patientID: value matched against the ID column of `dat`
#   dat:       table with ID, Date and Glucose columns (defaults to `diabetes`)
#   myColour:  colour of the line (defaults to "blue")
# Returns the ggplot object, titled "Patient: <patientID>".
plotGlucose <- function(patientID, dat = diabetes, myColour = "blue") {
  plotTitle <- str_c("Patient: ", patientID)
  one_patient <- filter(dat, ID == patientID)

  ggplot(data = one_patient, mapping = aes(x = Date, y = Glucose)) +
    geom_line(colour = myColour) +
    labs(title = plotTitle)
}

# Called at top level, the returned plot is printed straight away ...
plotGlucose("AC/AH/104")

# ... or it can be stored and printed later.
plot1 <- plotGlucose("AC/AH/017", myColour = "purple")
plot1
R works nicely with vectors
# log2() is vectorised: it is applied to every element of x and returns a
# vector of the same length.
x <- c(1, 2, 3, 4, 5)
log2(x)
## [1] 0.000000 1.000000 1.584963 2.000000 2.321928
but this is not always the behaviour we want
# sum() adds all of its arguments into one total (1 + 2 + 3 + 4 + 5 + 5);
# it does not add 5 to each element of x.
sum(x, 5)
## [1] 20
so we could do…
# Adding 5 to each value separately by writing out one call per element.
# This is the kind of repetition a loop replaces.
sum(1, 5)
sum(2, 5)
sum(3, 5)
sum(4, 5)
sum(5, 5)
If you are repeating code like this, you should use a loop.
The basic syntax of a for loop is:
# Template only (not runnable): a_variable takes each element of a_vector in
# turn, and the body between the braces is run once per element.
for(a_variable in a_vector){
some code operating on a_variable
}
Let’s look at a simple example:
# Run the body once for each of the five numbers.
# Nothing is shown on the console: the value of sum() is neither printed nor
# stored, and expressions inside a loop are not auto-printed.
for (a_number in c(11, 21, 16, 4, 500)) {
  sum(a_number, 5)
}
In this case the code has been run 5 times, but nothing is printed because results are not automatically printed inside a loop.
More commonly we would define the vector first:
# Same loop, with the values held in a named vector first.
some_data <- c(11, 21, 16, 4, 500)

# a_number is each value of some_data in turn; the sum is still discarded.
for (a_number in some_data) {
  sum(a_number, 5)
}
often we will use a sequence
# Loop over positions 1..5 instead of over the values themselves, so the
# position can be reported alongside each result.
some_data <- c(11, 21, 16, 4, 500)

for (a_number in seq_len(5)) {
  new_number <- sum(some_data[[a_number]], 5)
  message("Result number ", a_number, " is ", new_number)
}
## Result number 1 is 16
## Result number 2 is 26
## Result number 3 is 21
## Result number 4 is 9
## Result number 5 is 505
A better way is to use seq_along, which generates the indices from the vector itself instead of hard-coding its length
# Loop over the positions of some_data; seq_along() derives them from the
# vector, so the loop still covers every element if its length changes.
some_data <- c(11, 21, 16, 4, 500)

for (a_number in seq_along(some_data)) {
  new_number <- sum(some_data[[a_number]], 5)
  message("Result number ", a_number, " is ", new_number)
}
## Result number 1 is 16
## Result number 2 is 26
## Result number 3 is 21
## Result number 4 is 9
## Result number 5 is 505
Let’s say that with our patients data we want to export a separate table for each state, containing just the men, with ID, Name, Smokes, BMI and their average BP from the diabetes table.
# Write one TSV per state containing the male patients of that state, with
# their ID, Name, Smokes, BMI and mean BP.

# Distinct values of the State column; the loop below runs once per value.
states <- unique(patients$State)

# Mean of BP per patient ID, computed once up front rather than per state.
avg_bp <- diabetes %>%
  group_by(ID) %>%
  summarise(AvgBP = mean(BP)) %>%
  ungroup()

for (state in states) {
  # Replace every space, not just the first, so a state name with several
  # words still gives a file name without spaces.
  outputFile <- str_c("data/", state, "_male_patient_data.tsv") %>%
    str_replace_all(" ", "_")

  patients %>%
    filter(State == state, Sex == "Male") %>%
    left_join(avg_bp, by = "ID") %>%
    select(ID, Name, Smokes, BMI, AvgBP) %>%
    write_tsv(outputFile)
}
There are many ways to solve this problem; this is just one of them.
# Distinct patient IDs, how many there are, and the position at which each
# page of 16 patients starts (1, 17, 33, ...).
all_patients <- unique(diabetes$ID)
len <- length(all_patients)
start_index <- seq(from = 1, to = len, by = 16)
# Glucose against Date for every patient in `dat`, one facet per ID.
#   dat: table with ID, Date and Glucose columns
# Returns a ggplot object with red lines and red-filled points, laid out
# four facets per row.
plotGlucose <- function(dat) {
  base_plot <- ggplot(data = dat, mapping = aes(x = Date, y = Glucose))

  base_plot +
    geom_line(colour = "red") +
    geom_point(shape = 21, fill = "red") +
    facet_wrap(~ID, ncol = 4)
}
# Write the glucose plots to a PDF, 16 patients per page.
pdf("data/My_plots.pdf")
for (stt in start_index) {
  # Clamp the last index of the page to the real number of patients, so the
  # final page is handled correctly however many patients there are.
  end <- min(stt + 15, length(all_patients))
  these_patients <- all_patients[seq(stt, end)]

  # Inside a loop a ggplot is not auto-printed, so print() is what actually
  # draws the page on the open PDF device.
  diabetes %>%
    filter(ID %in% these_patients) %>%
    plotGlucose() %>%
    print()
}
# Close the device so the file is flushed to disk.
dev.off()
The last page isn’t ideal perhaps. We could solve this with some more advanced plotting, using gridExtra to create the grid of plots from a list of individual plots. We will also need a nested loop - a loop within a loop.
library(gridExtra)
# Write the glucose plots to a PDF as a fixed 4 x 4 grid per page, built from
# one individual plot per patient.
pdf("data/My_plots.pdf")
for (stt in start_index) {
  # Clamp the last index of the page to the real number of patients instead
  # of assuming there are exactly 100.
  end <- min(stt + 15, length(all_patients))
  these_patients <- all_patients[seq(stt, end)]

  # One slot per patient on this page, allocated up front rather than growing
  # the list on every iteration.
  plotList <- vector("list", length(these_patients))
  for (i in seq_along(these_patients)) {
    plotList[[i]] <- diabetes %>%
      filter(ID %in% these_patients[i]) %>%
      plotGlucose()
  }

  # A fixed 4 x 4 layout keeps every plot the same size, even on a final page
  # with fewer than 16 patients.
  grid.arrange(grobs = plotList, ncol = 4, nrow = 4)
}
# Close the device so the file is flushed to disk.
dev.off()