In this session we will:
- import our counts into R
- look at the effects of variance and how to mitigate this with data transformation
- do some initial exploration of the raw count data using principle component analysis
November 2024
In this session we will:
First we load a table of sample metadata
library(tidyverse) sampleinfo <- read_tsv("data/samplesheet.tsv", col_types = c("cccc")) arrange(sampleinfo, Status, TimePoint, Replicate)
## # A tibble: 12 × 4 ## SampleName Replicate Status TimePoint ## <chr> <chr> <chr> <chr> ## 1 SRR7657878 1 Infected d11 ## 2 SRR7657881 2 Infected d11 ## 3 SRR7657880 3 Infected d11 ## 4 SRR7657874 1 Infected d33 ## 5 SRR7657882 2 Infected d33 ## 6 SRR7657872 3 Infected d33 ## 7 SRR7657877 1 Uninfected d11 ## 8 SRR7657876 2 Uninfected d11 ## 9 SRR7657879 3 Uninfected d11 ## 10 SRR7657883 1 Uninfected d33 ## 11 SRR7657873 2 Uninfected d33 ## 12 SRR7657875 3 Uninfected d33
quant.sf
filestximport
package to do thistx2gene <- read_tsv("references/tx2gene.tsv") tx2gene
## # A tibble: 119,414 × 2 ## TxID GeneID ## <chr> <chr> ## 1 ENSMUST00000177564.1 ENSMUSG00000096176 ## 2 ENSMUST00000196221.1 ENSMUSG00000096749 ## 3 ENSMUST00000179664.1 ENSMUSG00000096749 ## 4 ENSMUST00000178537.1 ENSMUSG00000095668 ## 5 ENSMUST00000178862.1 ENSMUSG00000094569 ## 6 ENSMUST00000179520.1 ENSMUSG00000094028 ## # ℹ 119,408 more rows
library(tximport) txi <- tximport(salmon_files, type = "salmon", tx2gene = tx2gene) str(txi)
## List of 4 ## $ abundance : num [1:35896, 1:12] 20.39 0 1.97 1.06 0.95 ... ## ..- attr(*, "dimnames")=List of 2 ## .. ..$ : chr [1:35896] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000037" ... ## .. ..$ : chr [1:12] "SRR7657878" "SRR7657881" "SRR7657880" "SRR7657874" ... ## $ counts : num [1:35896, 1:12] 1039 0 65 39 8 ... ## ..- attr(*, "dimnames")=List of 2 ## .. ..$ : chr [1:35896] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000037" ... ## .. ..$ : chr [1:12] "SRR7657878" "SRR7657881" "SRR7657880" "SRR7657874" ... ## $ length : num [1:35896, 1:12] 2903 541 1883 2098 480 ... ## ..- attr(*, "dimnames")=List of 2 ## .. ..$ : chr [1:35896] "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000037" ... ## .. ..$ : chr [1:12] "SRR7657878" "SRR7657881" "SRR7657880" "SRR7657874" ... ## $ countsFromAbundance: chr "no"
class(txi$counts)
## [1] "matrix" "array"
typeof(txi$counts)
## [1] "double"
txi$counts[1:8, 1:5]
## SRR7657878 SRR7657881 SRR7657880 SRR7657874 SRR7657882 ## ENSMUSG00000000001 1039.000 1005.889 892.000 917.360 1136.690 ## ENSMUSG00000000003 0.000 0.000 0.000 0.000 0.000 ## ENSMUSG00000000028 65.000 73.999 72.000 44.000 46.000 ## ENSMUSG00000000037 39.000 47.000 29.000 53.999 67.000 ## ENSMUSG00000000049 8.000 9.000 4.000 4.000 4.000 ## ENSMUSG00000000056 2163.469 2067.819 2006.925 1351.675 2367.801 ## ENSMUSG00000000058 555.000 470.000 619.000 434.326 725.263 ## ENSMUSG00000000078 1324.000 1456.202 1752.178 1125.101 1288.000
rawCounts <- round(txi$counts, 0) keep <- rowSums(rawCounts) > 5 filtCounts <- rawCounts[keep,]
There are a large number of genes with very low counts, we first removed these before carrying on with data exploration.
We also need to tranform the data because:
summary(filtCounts[,1:4])
## SRR7657878 SRR7657881 SRR7657880 SRR7657874 ## Min. : 0 Min. : 0 Min. : 0 Min. : 0 ## 1st Qu.: 14 1st Qu.: 17 1st Qu.: 15 1st Qu.: 22 ## Median : 327 Median : 351 Median : 333 Median : 346 ## Mean : 1387 Mean : 1346 Mean : 1330 Mean : 1200 ## 3rd Qu.: 1305 3rd Qu.: 1297 3rd Qu.: 1268 3rd Qu.: 1193 ## Max. :652318 Max. :590723 Max. :435516 Max. :444448
There are a large number of genes with very low counts, we first removed these before carrying on with data exploration.
We also need to tranform the data because:
There are a large number of genes with very low counts, we first removed these before carrying on with data exploration.
We also need to tranform the data because:
To avoid problems posed by raw counts, they can be transformed. DESeq2 provides 2 transformation methods:
Both log2 transform the data and attempt to stabilise the variance with regard to the mean expression of the gene.
rlogCounts <- rlog(filtCounts)
sampleinfo <- mutate(sampleinfo, Status=case_when( SampleName=="SRR7657882" ~ "Uninfected", SampleName=="SRR7657873" ~ "Infected", TRUE ~ Status))
See the Extended Materials …