bash
pwd
cd "/home/participant/Course_Materials/ChIPSeq/Materials"
#mkdir Utils
cd Utils
R
# show current directory
getwd()
# set new working directory in R
setwd("/home/participant/Course_Materials/ChIPSeq/Materials/Utils")
# shows directory contents
dir()
Install and load packages needed for the tutorial. Uncomment install commands if your computer doesn’t have the packages.
R
#load libraries
library("GenomicRanges")
library("TxDb.Hsapiens.UCSC.hg38.knownGene")
library("EnsDb.Hsapiens.v86")
library("org.Hs.eg.db")
library("ChIPseeker")
library("ChIPpeakAnno")
library("chipenrich")
library ("rtracklayer")
Read the peak files into GRanges objects.
A complete collection of peak files are in the “../Course_Materials/ChIPSeq/Preprocessed”" folder. We will read the (Excel) xls file for one of the TP73 replicate into a R dataframe.
R
library("GenomicRanges")
peakfile1 <- "~/Course_Materials/ChIPSeq/Preprocessed/Peaks/TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.xls"
peakfile2 <- "~/Course_Materials/ChIPSeq/Preprocessed/Peaks/TAp73alpha_r2.fastq_trimmed.fastq_sorted_peaks.xls"
# Process the two peakfiles or look at peak calling practical from day3
# or readPeakFile from ChIPseeker
peaks_DF1 <- read.delim2(peakfile1, comment.char="#")
peaks_DF2 <- read.delim2(peakfile2, comment.char="#")
# look at the first 3 rows
peaks_DF1[1:3,]
peaks_DF2[1:3,]
library(GenomicRanges)
peaks_GR1 <- GRanges(
seqnames=peaks_DF1[,"chr"],
IRanges(peaks_DF1[,"start"],
peaks_DF1[,"end"]
)
)
# GRanges object
peaks_GR1
# dataframe
df1 <- data.frame(seqnames=seqnames(peaks_GR1),
starts=start(peaks_GR1)-1,
ends=end(peaks_GR1),
names=c(rep(".", length(peaks_GR1))),
scores=c(rep(".", length(peaks_GR1))),
strands=strand(peaks_GR1))
# second file
library(GenomicRanges)
peaks_GR2 <- GRanges(
seqnames=peaks_DF2[,"chr"],
IRanges(peaks_DF2[,"start"],
peaks_DF2[,"end"]
)
)
df2 <- data.frame(seqnames=seqnames(peaks_GR2),
starts=start(peaks_GR2)-1,
ends=end(peaks_GR2),
names=c(rep(".", length(peaks_GR2))),
scores=c(rep(".", length(peaks_GR2))),
strands=strand(peaks_GR2))
# change the name according to sample and create bed files
write.table(df1, file="TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed", quote=F, sep="\t", row.names=F, col.names=F)
write.table(df2, file="TAp73alpha_r2.fastq_trimmed.fastq_sorted_peaks.bed", quote=F, sep="\t", row.names=F, col.names=F)
These commands need to be run in a bash shell. Create any bed files needed using the previous code chunk.
cd into working directory in bash shell
bash
pwd
cd /home/participant/Course_Materials/ChIPSeq/Materials/Utils
Bedtools is a command-line tool. To bring up the help, just type
bash
bedtools
Use bedtools with the appropriate subcommand.
Examples:
bash
bedtools intersect -h
bedtools merge -h
bedtools subtract -h
Version:
bash
bedtools --version
bash
# Download the CpG island track from UCSC table browser and name it hg38_CpG_Islands.bed
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed | \
head -5
#write to file
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed > \
TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks_CPG.bed
head -n 5 TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks_CPG.bed
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -wa -wb | \
head -5
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -wa -wb > \
TAp73alpha_r1_CPG_all.bed
bedtools intersect -wo -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed | head -10
bedtools intersect -wo -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed > TAp73alpha_r1_CPG_overlap_nt.bed
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -c | head
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -c > \
TAp73alpha_r1_CPG_overlap_ft.bed
bash
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -v | head -10
bedtools intersect -a TAp73alpha_r1.fastq_trimmed.fastq_sorted_peaks.bed -b /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -v > \
TAp73alpha_r1_CPG_overlap_notoverlap.bed
bash
pwd
# various sort options
bedtools sort -i /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed -g /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38.chrom.sizes > \ /home/participant/Course_Materials/ChIPSeq/Materials/Utils/hg38_CpG_Islands_sorted3.bed
# unix sort by chromosome and then by start position
# sort -k 1,1 -k2,2n /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38_CpG_Islands.bed > /home/participant/Course_Materials/ChIPSeq/Materials/Utils/hg38_CpG_Islands_sorted2.bed
# complement
bedtools complement -i /home/participant/Course_Materials/ChIPSeq/Materials/Utils/hg38_CpG_Islands_sorted3.bed \
-g /home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/hg38.chrom.sizes > non_cpg_regions.bed
R
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(ChIPseeker)
library(clusterProfiler)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)
# use GRanges object generated in the previous section
tagMatrix <- getTagMatrix(peaks_GR1, windows=promoter)
R
tagHeatmap(tagMatrix, xlim=c(-3000, 3000), color="red")
Average Profile of ChIP peaks binding to TSS region
R
plotAvgProf(tagMatrix, xlim=c(-3000, 3000), xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
# or use this single function (without generating a tag matrix)
plotAvgProf2(peaks_GR1, TxDb=txdb, upstream=3000, downstream=3000,
xlab="Genomic Region (5'->3')", ylab = "Read Count Frequency")
# confidance intervals by resampling
plotAvgProf(tagMatrix, xlim=c(-3000, 3000), conf = 0.95, resample = 1000)
R
peakAnno <- annotatePeak(peaks_GR1, tssRegion=c(-3000, 3000), TxDb=txdb, annoDb="org.Hs.eg.db")
R
#pie chart
plotAnnoPie(peakAnno)
#bar plot
plotAnnoBar(peakAnno)
R
upsetplot(peakAnno, vennpie=TRUE)
R
plotDistToTSS(peakAnno, title="Distribution of transcription factor-binding loci\nrelative to TSS")
R
library("EnsDb.Hsapiens.v86")
library("ChIPpeakAnno")
# Prepare annotation data with toGRanges
annoData <- toGRanges(EnsDb.Hsapiens.v86)
annoData[1:2]
# Annotate the peaks with annotatePeakInBatch
## keep the seqnames in the same style
seqlevelsStyle(peaks_GR1) <- seqlevelsStyle(annoData)
## do annotation by nearest TSS
anno <- annotatePeakInBatch(peaks_GR1, AnnotationData=annoData)
anno[1:2]
R
pie1(table(anno$insideFeature))
R
library(org.Hs.eg.db)
anno <- addGeneIDs(anno, orgAnn="org.Hs.eg.db",
feature_id_type="ensembl_gene_id",
IDs2Add=c("symbol"))
head(anno)
R
df <- data.frame(seqnames=seqnames(anno),
starts=start(anno)-1,
ends=end(anno),
names=c(rep(".", length(anno))),
scores=c(rep(".", length(anno))),
strands=strand(anno))
write.table(df, file="TP73_anno.bed", quote=F, sep="\t", row.names=F, col.names=F)
str("TP73_anno.bed")
Gene Set Enrichment analysis of genmic regions
R
library("chipenrich")
# This practical gives a brief overview of the chipenrich package
# A locus definition is a way of defining a gene regulatory region, and enables us to associate peaks with genes.
# Nearest TSS, Nearest gene, exon, intron, proximal promoter etc
head(supported_locusdefs())
# Can define custome locus definitions (in the format of: chr, start, end, entrez_id)
# Gene sets
head(supported_genesets())
# Can define custom gene sets (gene_set_id, entrez_gene_id)
# Enrichment testing
## broadenrich() -use with broad reagions that intersect multiple features
data(peaks_E2F4, package = 'chipenrich.data')
data(peaks_H3K4me3_GM12878, package = 'chipenrich.data')
gs_path = system.file('extdata','vignette_genesets.txt', package='chipenrich')
results = broadenrich(peaks = peaks_H3K4me3_GM12878, genome = 'hg19', genesets = gs_path,
locusdef = "nearest_tss", qc_plots = FALSE, out_name = NULL, n_cores=1)
results.be = results$results
print(results.be[1:5,1:5])
## chipenrich()- use with 1-10k narrow regions (ex: TF ChIP-seq)
# Without mappability
gs_path = system.file('extdata','vignette_genesets.txt', package='chipenrich')
results = chipenrich(peaks = peaks_E2F4, genome = 'hg19', genesets = gs_path,
locusdef = "nearest_tss", qc_plots = FALSE, out_name = NULL, n_cores = 1)
results.ce = results$results
print(results.ce[1:5,1:5])
# With mappability
gs_path = system.file('extdata','vignette_genesets.txt', package='chipenrich')
results = chipenrich(peaks = peaks_E2F4, genome = 'hg19', genesets = gs_path,
locusdef = "nearest_tss", mappability=24, qc_plots = FALSE,
out_name = NULL,n_cores=1)
results.cem = results$results
print(results.cem[1:5,1:5])
## polyenrich() -narrow regions mappiing to > 10k features
gs_path = system.file('extdata','vignette_genesets.txt', package='chipenrich')
results = polyenrich(peaks = peaks_E2F4, genome = 'hg19', genesets = gs_path, method = 'polyenrich',
locusdef = "nearest_tss", qc_plots = FALSE, out_name = NULL, n_cores = 1)
results.pe = results$results
print(results.pe[1:5,1:5])
# If many gene loci have multiple peaks assigned to them, polyenrich is likely an appropriate method. If there are a low number of peaks per gene, then chipenrich() may be the appropriate method
plot_polyenrich_spline(peaks = peaks_E2F4, locusdef = 'nearest_tss', genome = 'hg19')
## hybridenrich() - intermediate between chipenrich and polyenrich
gs_path = system.file('extdata','vignette_genesets.txt', package='chipenrich')
results = hybridenrich(peaks = peaks_E2F4, genome = 'hg19', genesets = gs_path,
locusdef = "nearest_tss", qc_plots = F, out_name = NULL, n_cores = 1)
results.hybrid = results$results
print(results.hybrid[1:5,1:5])
deepTools2 support a wide range of functions, such as various quality controls, different normalization schemes and genome-wide visualizations. A brief example is given below:
bash
heatmapFolder="/home/participant/Course_Materials/ChIPSeq/Preprocessed/deeptools_heatmap/"
bigWigFolder="/home/participant/Course_Materials/ChIPSeq/Preprocessed/differential_binding/"
preprocessedPeakFolder="/home/participant/Course_Materials/ChIPSeq/Preprocessed/Peaks/"
cat ${preprocessedPeakFolder}*_summits.bed > ${heatmapFolder}allPeakSummits.bed
computeMatrix reference-point -S ${bigWigFolder}TAp73beta_versus_tp53_-s1-rep0.bw \
${bigWigFolder}TAp73beta_versus_tp53_-s1-rep1.bw \
${bigWigFolder}TAp73beta_versus_tp53_-s2-rep0.bw \
${bigWigFolder}TAp73beta_versus_tp53_-s2-rep1.bw \
-R ${heatmapFolder}allPeakSummits.bed \
--referencePoint "center" \
-a 3000 -b 3000 \
-p max/2 \
-out ${heatmapFolder}TAp73beta_versus_tp53.mat.gz
plotHeatmap -m ${heatmapFolder}TAp73beta_versus_tp53.mat.gz \
-out ${heatmapFolder}TAp73beta_versus_tp53.png \
--plotFileFormat "png" \
--colorMap coolwarm \
--kmeans 1 \
--whatToShow 'plot, heatmap and colorbar' \
--startLabel "peak start" --endLabel "peak end" \
--regionsLabel 1 \
--refPointLabel "Peak center" \
--outFileSortedRegions ${heatmapFolder}TAp73beta_versus_tp53.bed \
--outFileNameMatrix ${heatmapFolder}TAp73beta_versus_tp53.tab \
--samplesLabel "TAp73beta r1" "TAp73beta r2" "tp53 r1" "tp53 r2"
sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Fedora 23 (Workstation Edition)
##
## Matrix products: default
## BLAS: /home/SS861/src/R_3.5.0/lib64/R/lib/libRblas.so
## LAPACK: /home/SS861/src/R_3.5.0/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] BiocStyle_2.8.2
##
## loaded via a namespace (and not attached):
## [1] compiler_3.5.0 backports_1.1.2 bookdown_0.7 magrittr_1.5
## [5] rprojroot_1.3-2 tools_3.5.0 htmltools_0.3.6 yaml_2.1.19
## [9] Rcpp_0.12.17 stringi_1.2.4 rmarkdown_1.10 knitr_1.20
## [13] xfun_0.3 stringr_1.3.1 digest_0.6.15 evaluate_0.11