Introduction to Bulk RNAseq data analysis

Exercise 1 - pathview
Exercise 2 - GSEA

Exercise 1 - pathview

Load the required packages and data for Day 11 if you have not already done so.

library(msigdbr)
library(clusterProfiler)

##

## Registered S3 methods overwritten by 'treeio':
##   method              from    
##   MRCA.phylo          tidytree
##   MRCA.treedata       tidytree
##   Nnode.treedata      tidytree
##   Ntip.treedata       tidytree
##   ancestor.phylo      tidytree
##   ancestor.treedata   tidytree
##   child.phylo         tidytree
##   child.treedata      tidytree
##   full_join.phylo     tidytree
##   full_join.treedata  tidytree
##   groupClade.phylo    tidytree
##   groupClade.treedata tidytree
##   groupOTU.phylo      tidytree
##   groupOTU.treedata   tidytree
##   inner_join.phylo    tidytree
##   inner_join.treedata tidytree
##   is.rooted.treedata  tidytree
##   nodeid.phylo        tidytree
##   nodeid.treedata     tidytree
##   nodelab.phylo       tidytree
##   nodelab.treedata    tidytree
##   offspring.phylo     tidytree
##   offspring.treedata  tidytree
##   parent.phylo        tidytree
##   parent.treedata     tidytree
##   root.treedata       tidytree
##   rootnode.phylo      tidytree
##   sibling.phylo       tidytree

## clusterProfiler v4.8.3  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, 2(3):100141

## 
## Attaching package: 'clusterProfiler'

## The following object is masked from 'package:stats':
## 
##     filter

library(pathview)

##

## ##############################################################################
## Pathview is an open source software package distributed under GNU General
## Public License version 3 (GPLv3). Details of GPLv3 is available at
## http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
## formally cite the original Pathview paper (not just mention it) in publications
## or products. For details, do citation("pathview") within R.
## 
## The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
## license agreement (details at http://www.kegg.jp/kegg/legal.html).
## ##############################################################################

library(tidyverse)

## ── Attaching core tidyverse packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()   masks clusterProfiler::filter(), stats::filter()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ purrr::simplify() masks clusterProfiler::simplify()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

shrink.d11 <- readRDS("RObjects/Shrunk_Results.d11.rds")

Use pathview to export a figure for “mmu04659”or “mmu04658”, but this time only use genes that are statistically significant at FDR < 0.01

logFC <- shrink.d11 %>% 
  drop_na(padj, Entrez) %>% 
  filter(padj < 0.01) %>% 
  pull(log2FoldChange, Entrez) 

pathview(gene.data = logFC, 
         pathway.id = "mmu04659", 
         species = "mmu", 
         limit = list(gene=5, cpd=1))

## Loading required namespace: org.Mm.eg.db

##

## 'select()' returned 1:1 mapping between keys and columns

## Info: Working in directory /mnt/c/Users/hugot/Documents/BTF/course_materials/Bulk_RNAseq_Course_Base/course_files

## Info: Writing image file mmu04659.pathview.png

mmu04659.pathview.png:

mmu04659 - Th17 cell differentiation

Exercise 2 - GSEA

Another common way to rank the genes is to order by pvalue, but also, sorting so that upregulated genes are at the start and downregulated at the end - you can do this combining the sign of the fold change and the pvalue.

First load the pathway details if you have not already done so.

library(msigdbr)
term2gene <- msigdbr(species = "Mus musculus", category = "H") %>% 
  dplyr::select(gs_name, ensembl_gene)
term2name <- msigdbr(species = "Mus musculus", category = "H") %>% 
  dplyr::select(gs_name, gs_description) %>% 
  distinct()

Rank the genes by statistical significance - you will need to create a new ranking value using -log10({p value}) * sign({Fold Change}).

# rank genes
rankedGenes.e11 <- shrink.d11 %>%
  drop_na(GeneID, pvalue, log2FoldChange) %>%
  mutate(rank = -log10(pvalue) * sign(log2FoldChange)) %>%
  arrange(desc(rank)) %>%
  pull(rank, GeneID)

Run GSEA using the new ranked genes and the Hallmark pathways.

# conduct analysis:
gseaRes.e11 <- GSEA(rankedGenes.e11,
                TERM2GENE = term2gene,
                TERM2NAME = term2name,
                pvalueCutoff = 1.00, 
                minGSSize = 15,
                maxGSSize = 500)

## preparing geneSet collections...

## GSEA analysis...

## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (0% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.

## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.

## leading edge analysis...

## done...

View the results:

as_tibble(gseaRes.e11) %>% 
  arrange(desc(abs(NES))) %>% 
  top_n(10, wt=-p.adjust) %>% 
  dplyr::select(-core_enrichment) %>%
  mutate(across(c("enrichmentScore", "NES"), round, digits=3)) %>% 
  mutate(across(c("pvalue", "p.adjust", "qvalue"), scales::scientific))

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(c("enrichmentScore", "NES"), round, digits = 3)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

Conduct the same analysis for the day 33 Infected vs Uninfected contrast.

# read d33 data in:
shrink.d33 <- readRDS("RObjects/Shrunk_Results.d33.rds")

# rank genes
rankedGenes.e33 <- shrink.d33 %>%
  drop_na(GeneID, pvalue, log2FoldChange) %>%
  mutate(rank = -log10(pvalue) * sign(log2FoldChange)) %>%
  arrange(desc(rank)) %>%
  pull(rank,GeneID)

# perform analysis
gseaRes.e33 <- GSEA(rankedGenes.e33,
                TERM2GENE = term2gene,
                TERM2NAME = term2name,
                pvalueCutoff = 1.00, 
                minGSSize = 15,
                maxGSSize = 500)

## preparing geneSet collections...

## GSEA analysis...

## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : There were 3 pathways for which P-values were not calculated
## properly due to unbalanced (positive and negative) gene-level statistic values.
## For such pathways pval, padj, NES, log2err are set to NA. You can try to
## increase the value of the argument nPermSimple (for example set it nPermSimple
## = 10000)

## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.

## leading edge analysis...

## done...

View the results:

as_tibble(gseaRes.e33) %>% 
  arrange(desc(abs(NES))) %>% 
  top_n(10, wt=-p.adjust) %>% 
  dplyr::select(-core_enrichment) %>%
  mutate(across(c("enrichmentScore", "NES"), round, digits=3)) %>% 
  mutate(across(c("pvalue", "p.adjust", "qvalue"), scales::scientific))

Introduction to Bulk RNAseq data analysis

Gene Set Testing for RNA-seq - Solutions

Exercise 1 - pathview

Exercise 2 - GSEA