Contents

1 Peak Analysis

1.1 Set working directory

bash-script

cd /home/participant/Course_Materials/ChIPSeq/Materials/

#mkdir motifs if it doesn't exist

cd motifs

1.2 Extract region around peak summit

R-script

# read in the peak summit file
 
setwd("/home/participant/Course_Materials/ChIPSeq/Materials/motifs")
getwd()


peakfile <-read.table("/home/participant/Course_Materials/ChIPSeq/Materials/Results/Peaks/TAp73beta_r2.fastq_trimmed.fastq_sorted_standard_summits.bed")
head(peakfile)

# get a 400bp window around summit (-200 and +200)
Motif_Peaks <- data.frame((peakfile$V1), (peakfile$V2-200), (peakfile$V2+200), stringsAsFactors = FALSE)
head(Motif_Peaks)

# create bed file
options(scipen=999)

write.table(Motif_Peaks, file= "/home/participant/Course_Materials/ChIPSeq/Materials/motifs/Peaks_for_motif_detection.bed", 
            row.names=FALSE,col.names = FALSE, sep="\t", quote =FALSE)

# list files in directory and check if the file you created is there.
dir()

1.3 ChIP peak annotation

R-script

library("ChIPpeakAnno")
library("GenomicRanges")

#library("rtracklayer")
#bed <- import("/home/participant/Course_Materials/ChIPSeq/Materials/motifs/Peaks_for_motif_detection.bed")
#gr1 <- GRanges(seqnames=bed$V1, ranges=IRanges(start=bed$V2, end=bed$V3)) 

#read in the bed file
df1<-read.table("Peaks_for_motif_detection.bed", header=FALSE)

#convert the peaks to a GRanges object
gr1 <- GRanges(seqnames=df1$V1, ranges=IRanges(start=df1$V2, end=df1$V3))



##annotation
library("EnsDb.Hsapiens.v86")


## create annotation file from EnsDb (Ensembl) or TxDb (transcript annotation) packages

annoData <- toGRanges(EnsDb.Hsapiens.v86, feature="gene")
annoData[1:2]

# annotate the peak GRanges object with peaks mapped to gene with a -2000 and 500 bp window around the TSS

anno.gr1 <- annotatePeakInBatch(gr1, 
AnnotationData=annoData, 
output="nearestBiDirectionalPromoters",
bindingRegion=c(-2000, 500))

#trim out of bound ranges
anno.gr1 <- trim(anno.gr1)




#annotate with Entrez IDs

library("org.Hs.eg.db")
anno.gr1 <- addGeneIDs(anno.gr1,"org.Hs.eg.db",IDs2Add = "entrez_id")

# list annotated peaks
head(anno.gr1)

1.4 Motif detection

MEME-ChIP performs several motif analysis steps on a set of user provided DNA sequences in fasta format. It is especially useful for analyzing peaks from ChIP-seq experiments. MEME-ChIP can

  1. Discover novel DNA-binding motifs (MEME and DREME)
  2. Analyze them for similarity to known binding motifs (TOMTOM)
  3. Visualize the arrangement of the predicted motif sites in your input sequences (SPAMO, Fimo)
  4. Identify motif enrichment relative to a background model (Centrimo)

1.4.1 Get fasta for peak regions

bash-script

# If strand information is known use -s option as well
bedtools getfasta -fi "/home/participant/Course_Materials/Introduction/SS_DB/Reference/BWA/hg38_chr3.fa" \
-bed "/home/participant/Course_Materials/ChIPSeq/Materials/motifs/Peaks_for_motif_detection.bed" -fo "/home/participant/Course_Materials/ChIPSeq/Materials/motifs/Tp73_peaks.fa"

less Tp73_peaks.fa


# usefull sed commands
# find a line (3rd line in this case) and see its contents
# sed -n '3p'  Tp73_peaks.fa

# delete a line (DON'T RUN unless there is a need to remove a line)
# sed -i".bak" '3d' filename

1.4.2 Meme and Dreme

bash-script


#look at the options in meme-chip, meme and dreme

meme-chip --help

meme --help

dreme --help

Meme is an Expectation Maximisation (EM) algorithm and Dreme uses pattern matching to finds de novo motifs.

The fasta file should be 500bp for each peak, Meme looks at a 100bp window around the peak center for motifs. Repeat masking is recommended.

  • TOMTOM compares motifs discovered with known motifs from PWM databases.
  • Centrimo looks for motif enrichment.
  • MAST can visualize the location of the discovered motifs within a sequence.
  • AMA measures how strongly a motif is associated with each sequence
  • AME discovers subtly enriched known bindng motifs in the sequences.
  • FIMO detects motif locations

1.5 Motif Enrichment Analysis

Motif database for meme analysis has been setup. If it’s NOT there, follow the instructions below:

Download meme motif PWM databases onto your working folder PWM databases

bash-script

Run only IF motif database is missing!

cd "/home/participant/Course_Materials/ChIPSeq/Materials/motifs"

wget "http://meme-suite.org/meme-software/Databases/motifs/motif_databases.12.18.tgz"
chmod 755 motif_databases.12.18.tgz

#unzip them

tar -zxvf motif_databases.12.18.tgz

1.5.1 MemeChIP

bash-script


# If the meme directory is missing create it
# mkdir meme

meme-chip -oc /home/participant/Course_Materials/ChIPSeq/Materials/motifs/meme/ \
-db /home/participant/Course_Materials/ChIPSeq/Materials/motifs/motif_databases/EUKARYOTE/jolma2013.meme \
-db /home/participant/Course_Materials/ChIPSeq/Materials/motifs/motif_databases/JASPAR/JASPAR_CORE_2016_vertebrates.meme \
-meme-mod zoops -meme-minw 6 -order 1 \
-meme-maxw 30 -meme-nmotifs 10 -dreme-e 0.05 -centrimo-score 5.0 -centrimo-ethresh 10.0 /home/participant/Course_Materials/ChIPSeq/Materials/motifs/Tp73_peaks.fa

or upload fasta file meme-chip

1.5.2 PScanChIP

Demonstration on web version of PScanChIP

PScanChIP website

  • Upload your bed file

2 Network Analysis

2.1 STRING

STRING website

STRING

STRING

R-script

#install

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("STRINGdb")


#load package

library(STRINGdb)
string_db <- STRINGdb$new( version="10", species=9606, 
                           score_threshold=0, input_directory="" )
                           
STRINGdb$methods()              # To list all the methods available.
STRINGdb$help("get_graph")      # To visualize their documentation.

data(diff_exp_example1)
head(diff_exp_example1)


example1_mapped <- string_db$map( diff_exp_example1, "gene", removeUnmappedRows = TRUE )

options(SweaveHooks=list(fig=function()
par(mar=c(2.1, 0.1, 4.1, 2.1))))
#par(mar=c(1.1, 0.1, 4.1, 2.1))))

hits <- example1_mapped$STRING_id[1:200]

#plot network

getOption("SweaveHooks")[["fig"]]()
string_db$plot_network( hits )

# filter by p-value and add a color column 
# (i.e. green down-regulated gened and red for up-regulated genes)
example1_mapped_pval05 <- string_db$add_diff_exp_color( subset(example1_mapped, pvalue<0.05), 
                                                            logFcColStr="logFC" ) 
                                                            
# post payload information to the STRING server
payload_id <- string_db$post_payload( example1_mapped_pval05$STRING_id, 
                                        colors=example1_mapped_pval05$color)     
getOption("SweaveHooks")[["fig"]]()
# display a STRING network png with the "halo"
string_db$plot_network( hits, payload_id=payload_id )

options(SweaveHooks=list(fig=function()
par(mar=c(2.1, 2.1, 4.1, 2.1))))

#plot enrichment
getOption("SweaveHooks")[["fig"]]()
# plot the enrichment for the best 1000 genes
string_db$plot_ppi_enrichment( example1_mapped$STRING_id[1:1000], quiet=TRUE ) 

2.2 GeneMANIA

GeneMANIA website

GeneMANIA (web app and Cytoscape module) finds connectivity between a genelist or other genes that are related to a set of input genes, using a very large set of functional association data. Association data include protein and genetic interactions, pathways, co-expression, co-localization and protein domain similarity. You can use GeneMANIA to find new members of a pathway or complex, find additional genes you may have missed in your screen or find new genes with a specific function, such as protein kinases. Your question is defined by the set of genes you input.

GeneMANIA

GeneMANIA

3 Ontology Enrichment Analysis

3.1 GREAT (Genomic Regions Enrichment of Annotations Tool)

Genes co-regulated by TFs or associated with certain epigenomic modifications may belong to common functional groups. Gene Ontology enrichment analysis, GSEA, pathway and interaction analysis helps identify these enriched biological features and their connectivity.

GREAT website

Ontology enrichment analysis may be performed on the sets of genes with peaks associated to them. ChIPpeakAnno can map peaks to genes. In this example we will consider genes with peaks within 5000bp of a gene’s TSS.

How to associate genomic regions to genes: * basalPlusExt: mode ‘Basal plus extension’. Gene regulatory domain definition: Each gene is assigned a basal regulatory domain of a minimum distance upstream and downstream of the TSS (regardless of other nearby genes). The gene regulatory domain is extended in both directions to the nearest gene’s basal domain but no more than the maximum extension in one direction. * adv_upstream: proximal extension to upstream (unit: kb) * adv_downstream: proximal extension to downstream (unit: kb) * adv_span: maximum extension (unit: kb) * twoClosest: mode ‘Two nearest genes’. Gene regulatory domain definition: Each gene is assigned a regulatory domain that extends in both directions to the nearest gene’s TSS but no more than the maximum extension in one direction. adv_twoDistance: maximum extension (unit: kb) * oneClosest: mode ‘Single nearest gene’. Gene regulatory domain definition: Each gene is assigned a regulatory domain that extends in both directions to the midpoint between the gene’s TSS and the nearest gene’s TSS but no more than the maximum extension in one direction. * adv_oneDistance: maximum extension (unit: kb)

R-script

library("rGREAT")
library("circlize")

#setwd(THE/FOLDER/WHERE/YOUR/PEAKFILES/ARE)


# If you want to use one of the peak files you generated use the LiftOver tool to conver your hg38
# peak file to hg19.

# liftOver input.bed /home/participant/Course_Materials/ChIPSeq/Materials/RawData/hg38ToHg19.over.chain.gz output.bed unlifted.bed

# example:
peakfile <- read.table("/home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.narrowPeak")
head(peakfile)
P73_Peaks <- data.frame((peakfile$V1), (peakfile$V2), (peakfile$V3), stringsAsFactors = FALSE)
options(scipen=999)
write.table(P73_Peaks, file= "/home/participant/Course_Materials/ChIPSeq/Materials/motifs/Peaks_for_Great.bed", 
            row.names=FALSE,col.names = FALSE, sep="\t", quote =FALSE)

3.1.1 Using the liftOver tool

liftOver tool converts genome coordinates and genome annotation files between assemblies (reference genomes)

bash-script

#This is an example of how to convert hg38 coordinates to hg19 without re-aligning 

cd "/home/participant/Course_Materials/ChIPSeq/Materials/motifs/"


liftOver Peaks_for_Great.bed "/home/participant/Course_Materials/hg38ToHg19.over.chain.gz" Peaks_for_Great_liftedTohg19.bed unlifted.bed

R-script

# Toy example on randomly generated bed file
set.seed(123)
bed = circlize::generateRandomBed(nr = 1000, nc = 0)
bed[1:2, ]


# submit job (set for once every 300seconds)
job <-  submitGreatJob(bed,request_interval = 300, species="hg19", version = "default")

## job <-  submitGreatJob(bed, rule = "oneClosest", adv_oneDistance=5000, version = "3.0")


# list of dataframes, each one a table of single ontology
tb  <-  getEnrichmentTables(job)
names(tb)
tb[[1]][1:2, ]
job

# list available ontologies or categories

availableOntologies(job)
availableCategories(job)

availableOntologies(job, category = "Pathway Data")

# You can get results by either specifying the ontologies or by the pre-defined categories (categories already contains pre-defined sets of ontologies)

tb = getEnrichmentTables(job, ontology = c("GO Molecular Function", "BioCyc Pathway"))
tb = getEnrichmentTables(job, category = c("GO"))

Association between genomic regions and genes

R-script

par(mfrow = c(1, 3))

#res is a GenomicRanges object
res = plotRegionGeneAssociationGraphs(job)

R-script

res[1:2, ]

By specifying ontology and term ID, you can get the association in a certain term. Here the term ID is from the first column of the data frame which is returned by getEnrichmentTables().

R-script

par(mfrow = c(1, 3))
res = plotRegionGeneAssociationGraphs(job, ontology = "GO Molecular Function",
    termID = "GO:0004984")

Alternatively:

  • Use liftOver tool at UCSC convert your hg38 bed file to hg19 coordinates.
  • Download the converted file and name it “Peaks_for_Great_liftedTohg19.bed”
  • upload this to web version of GREAT