Mark Dunning
Last modified: 23 Jul 2015
IRanges
package in Bioconductor allows us to work with intervals
IRanges
is an example of thisJust some of the packages that depend on IRanges
Suppose we want to capture information on the following intervals
IRanges
function from the IRanges
package is used to construct a new object
data.frame
, vector
or matrix
library(IRanges)
ir <- IRanges(
start = c(7,9,12,14,22:24),
end=c(15,11,13,18,26,27,28))
str(ir)
## Formal class 'IRanges' [package "IRanges"] with 6 slots
## ..@ start : int [1:7] 7 9 12 14 22 23 24
## ..@ width : int [1:7] 9 3 2 5 5 5 5
## ..@ NAMES : NULL
## ..@ elementType : chr "integer"
## ..@ elementMetadata: NULL
## ..@ metadata : list()
[]
should give a hint about how to access the data…ir
## IRanges of length 7
## start end width
## [1] 7 15 9
## [2] 9 11 3
## [3] 12 13 2
## [4] 14 18 5
## [5] 22 26 5
## [6] 23 27 5
## [7] 24 28 5
We can give our ranges names
ir <- IRanges(
start = c(7,9,12,14,22:24),
end=c(15,11,13,18,26,27,28),names=LETTERS[1:7])
ir
## IRanges of length 7
## start end width names
## [1] 7 15 9 A
## [2] 9 11 3 B
## [3] 12 13 2 C
## [4] 14 18 5 D
## [5] 22 26 5 E
## [6] 23 27 5 F
## [7] 24 28 5 G
IRanges
can be treated as if they were vectors
[
to subsetindices
that you want values for
:
shortcutir[1:2]
## IRanges of length 2
## start end width names
## [1] 7 15 9 A
## [2] 9 11 3 B
ir[c(2,4,6)]
## IRanges of length 3
## start end width names
## [1] 9 11 3 B
## [2] 14 18 5 D
## [3] 23 27 5 F
start(ir)
## [1] 7 9 12 14 22 23 24
end(ir)
## [1] 15 11 13 18 26 27 28
width(ir)
## [1] 9 3 2 5 5 5 5
TRUE
or FALSE
<
, >
, ==
width(ir) == 5
## [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE
ir[width(ir)==5]
## IRanges of length 4
## start end width names
## [1] 14 18 5 D
## [2] 22 26 5 E
## [3] 23 27 5 F
## [4] 24 28 5 G
start(ir) > 10
## [1] FALSE FALSE TRUE TRUE TRUE TRUE TRUE
end(ir) < 27
## [1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE
ir[start(ir) > 10]
## IRanges of length 5
## start end width names
## [1] 12 13 2 C
## [2] 14 18 5 D
## [3] 22 26 5 E
## [4] 23 27 5 F
## [5] 24 28 5 G
&
(and), |
(or)
ir[end(ir) < 27]
## IRanges of length 5
## start end width names
## [1] 7 15 9 A
## [2] 9 11 3 B
## [3] 12 13 2 C
## [4] 14 18 5 D
## [5] 22 26 5 E
ir[start(ir) > 10 & end(ir) < 27]
## IRanges of length 3
## start end width names
## [1] 12 13 2 C
## [2] 14 18 5 D
## [3] 22 26 5 E
e.g. sliding windows
ir
## IRanges of length 7
## start end width names
## [1] 7 15 9 A
## [2] 9 11 3 B
## [3] 12 13 2 C
## [4] 14 18 5 D
## [5] 22 26 5 E
## [6] 23 27 5 F
## [7] 24 28 5 G
shift(ir, 5)
## IRanges of length 7
## start end width names
## [1] 12 20 9 A
## [2] 14 16 3 B
## [3] 17 18 2 C
## [4] 19 23 5 D
## [5] 27 31 5 E
## [6] 28 32 5 F
## [7] 29 33 5 G
Size of shift doesn’t need to be constant
ir
## IRanges of length 7
## start end width names
## [1] 7 15 9 A
## [2] 9 11 3 B
## [3] 12 13 2 C
## [4] 14 18 5 D
## [5] 22 26 5 E
## [6] 23 27 5 F
## [7] 24 28 5 G
shift(ir, 7:1)
## IRanges of length 7
## start end width names
## [1] 14 22 9 A
## [2] 15 17 3 B
## [3] 17 18 2 C
## [4] 18 22 5 D
## [5] 25 29 5 E
## [6] 25 29 5 F
## [7] 25 29 5 G
e.g. trimming reads
ir
## IRanges of length 7
## start end width names
## [1] 7 15 9 A
## [2] 9 11 3 B
## [3] 12 13 2 C
## [4] 14 18 5 D
## [5] 22 26 5 E
## [6] 23 27 5 F
## [7] 24 28 5 G
resize(ir,3)
## IRanges of length 7
## start end width names
## [1] 7 9 3 A
## [2] 9 11 3 B
## [3] 12 14 3 C
## [4] 14 16 3 D
## [5] 22 24 3 E
## [6] 23 25 3 F
## [7] 24 26 3 G
coverage
returns a Run Length Encoding - an efficient representation of repeated values
cvg <- coverage(ir)
cvg
## integer-Rle of length 28 with 10 runs
## Lengths: 6 2 7 3 3 1 1 3 1 1
## Values : 0 1 2 1 0 1 2 3 2 1
as.vector(cvg)
## [1] 0 0 0 0 0 0 1 1 2 2 2 2 2 2 2 1 1 1 0 0 0 1 2 3 3 3 2 1
e.g. counting - The terminology of overlapping defines a query and a subject
ir3 <- IRanges(start = c(1, 14, 27), end = c(13,
18, 30),names=c("X","Y","Z"))
ir3
## IRanges of length 3
## start end width names
## [1] 1 13 13 X
## [2] 14 18 5 Y
## [3] 27 30 4 Z
findOverlaps
function is used for overlap
queryHits
and subjectHits
query <- ir
subject <- ir3
ov <- findOverlaps(query, subject)
ov
## Hits object with 7 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 1
## [2] 1 2
## [3] 2 1
## [4] 3 1
## [5] 4 2
## [6] 6 3
## [7] 7 3
## -------
## queryLength: 7
## subjectLength: 3
queryHits
returns indices from the query
queryHits(ov)
## [1] 1 1 2 3 4 6 7
subjectHits
returns indices from the subject
subjectHits(ov)
## [1] 1 2 1 1 2 3 3
query[queryHits(ov)[1]]
## IRanges of length 1
## start end width names
## [1] 7 15 9 A
subject[subjectHits(ov)[1]]
## IRanges of length 1
## start end width names
## [1] 1 13 13 X
query[queryHits(ov)[2]]
## IRanges of length 1
## start end width names
## [1] 7 15 9 A
subject[subjectHits(ov)[2]]
## IRanges of length 1
## start end width names
## [1] 14 18 5 Y
query[queryHits(ov)[3]]
## IRanges of length 1
## start end width names
## [1] 9 11 3 B
subject[subjectHits(ov)[3]]
## IRanges of length 1
## start end width names
## [1] 1 13 13 X
countOverlaps
countOverlaps(query,subject)
## A B C D E F G
## 2 1 1 1 0 1 1
countOverlaps(subject,query)
## X Y Z
## 3 2 2
findOverlaps(query,subject,type="within")
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 2 1
## [2] 3 1
## [3] 4 2
## -------
## queryLength: 7
## subjectLength: 3
findOverlaps(query,subject,type="within")
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 2 1
## [2] 3 1
## [3] 4 2
## -------
## queryLength: 7
## subjectLength: 3
findOverlaps(query,subject,type="within")
## Hits object with 3 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 2 1
## [2] 3 1
## [3] 4 2
## -------
## queryLength: 7
## subjectLength: 3
intersect(ir,ir3)
## IRanges of length 2
## start end width
## [1] 7 18 12
## [2] 27 28 2
setdiff(ir,ir3)
## IRanges of length 1
## start end width
## [1] 22 26 5
The Biostrings package is specifically-designed for biological sequences
DNAStringSet
for storing sequencesDNAStringSet
functionlibrary(Biostrings)
myseq <- DNAStringSet(randomStrings)
myseq
## A DNAStringSet instance of length 100
## width seq
## [1] 15 TACAGTTCAGCGCTC
## [2] 14 GATCATCATTGGAT
## [3] 20 ACTAGAGCTTAGGTTGGATT
## [4] 13 TTTTTTGAACCCC
## [5] 20 GGACTACTATGCGACATTCG
## ... ... ...
## [96] 19 TTTGCCCTTACCCAGGGTT
## [97] 20 TAGAGTACAGGGGTAACAAC
## [98] 14 GTGCCTGTACATTG
## [99] 19 TCGTCCCGTATAAGGCACG
## [100] 15 AAATCAGATTGAGCA
str(myseq)
## Formal class 'DNAStringSet' [package "Biostrings"] with 5 slots
## ..@ pool :Formal class 'SharedRaw_Pool' [package "XVector"] with 2 slots
## .. .. ..@ xp_list :List of 1
## .. .. .. ..$ :<externalptr>
## .. .. ..@ .link_to_cached_object_list:List of 1
## .. .. .. ..$ :<environment: 0x6dd0b10>
## ..@ ranges :Formal class 'GroupedIRanges' [package "XVector"] with 7 slots
## .. .. ..@ group : int [1:100] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. ..@ start : int [1:100] 1 16 30 50 63 83 99 114 134 151 ...
## .. .. ..@ width : int [1:100] 15 14 20 13 20 16 15 20 17 14 ...
## .. .. ..@ NAMES : NULL
## .. .. ..@ elementType : chr "integer"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ elementType : chr "DNAString"
## ..@ elementMetadata: NULL
## ..@ metadata : list()
Biostrings
object like a standard vectormyseq[1:5]
## A DNAStringSet instance of length 5
## width seq
## [1] 15 TACAGTTCAGCGCTC
## [2] 14 GATCATCATTGGAT
## [3] 20 ACTAGAGCTTAGGTTGGATT
## [4] 13 TTTTTTGAACCCC
## [5] 20 GGACTACTATGCGACATTCG
width
and as.character
width(myseq)
## [1] 15 14 20 13 20 16 15 20 17 14 18 10 12 14 12 18 20 17 20 15 13 13 20
## [24] 18 20 13 12 15 14 15 13 16 19 13 11 15 14 15 14 11 11 14 16 19 12 16
## [47] 17 18 18 10 14 20 19 19 10 12 12 11 12 13 19 12 19 19 13 20 11 18 19
## [70] 16 12 15 20 15 16 20 18 18 19 11 14 19 19 16 13 18 20 13 15 13 11 15
## [93] 11 13 10 19 20 14 19 15
head(as.character(myseq))
## [1] "TACAGTTCAGCGCTC" "GATCATCATTGGAT" "ACTAGAGCTTAGGTTGGATT"
## [4] "TTTTTTGAACCCC" "GGACTACTATGCGACATTCG" "CTGACGTGTACAATGT"
What does this do?
myseq[width(myseq)>19]
## A DNAStringSet instance of length 13
## width seq
## [1] 20 ACTAGAGCTTAGGTTGGATT
## [2] 20 GGACTACTATGCGACATTCG
## [3] 20 CCGACAACTTAGAGATCGCT
## [4] 20 TAACACATTCATCGCGCAGC
## [5] 20 CGACTCAGGTGTTAAATTGT
## ... ... ...
## [9] 20 CCTGCTCCCTTTTGGTGCCG
## [10] 20 CGTATAACAAACACGAAGCC
## [11] 20 GTCAAACGACGCAGCTTTCG
## [12] 20 GATACGAACTGACGGGATTA
## [13] 20 TAGAGTACAGGGGTAACAAC
myseq[subseq(myseq,1,3) == "TTC"]
## A DNAStringSet instance of length 3
## width seq
## [1] 14 TTCTTAAGTTGTAC
## [2] 13 TTCCACTTGTTTT
## [3] 15 TTCGTACAGTAATAC
We can also use the matchPattern
function + see practical for details
Some useful string operation functions are provided
af <- alphabetFrequency(myseq, baseOnly=TRUE)
head(af)
## A C G T other
## [1,] 3 5 3 4 0
## [2,] 4 2 3 5 0
## [3,] 5 2 6 7 0
## [4,] 2 4 1 6 0
## [5,] 5 5 5 5 0
## [6,] 4 3 4 5 0
myseq[af[,1] ==0,]
## A DNAStringSet instance of length 2
## width seq
## [1] 14 CCTCCCTTTTGTGG
## [2] 20 CCTGCTCCCTTTTGGTGCCG
boxplot(af)
reverse(myseq)
## A DNAStringSet instance of length 100
## width seq
## [1] 15 CTCGCGACTTGACAT
## [2] 14 TAGGTTACTACTAG
## [3] 20 TTAGGTTGGATTCGAGATCA
## [4] 13 CCCCAAGTTTTTT
## [5] 20 GCTTACAGCGTATCATCAGG
## ... ... ...
## [96] 19 TTGGGACCCATTCCCGTTT
## [97] 20 CAACAATGGGGACATGAGAT
## [98] 14 GTTACATGTCCGTG
## [99] 19 GCACGGAATATGCCCTGCT
## [100] 15 ACGAGTTAGACTAAA
reverseComplement(myseq)
## A DNAStringSet instance of length 100
## width seq
## [1] 15 GAGCGCTGAACTGTA
## [2] 14 ATCCAATGATGATC
## [3] 20 AATCCAACCTAAGCTCTAGT
## [4] 13 GGGGTTCAAAAAA
## [5] 20 CGAATGTCGCATAGTAGTCC
## ... ... ...
## [96] 19 AACCCTGGGTAAGGGCAAA
## [97] 20 GTTGTTACCCCTGTACTCTA
## [98] 14 CAATGTACAGGCAC
## [99] 19 CGTGCCTTATACGGGACGA
## [100] 15 TGCTCAATCTGATTT
translate(myseq)
## A AAStringSet instance of length 100
## width seq
## [1] 5 YSSAL
## [2] 4 DHHW
## [3] 6 TRA*VG
## [4] 4 FFEP
## [5] 6 GLLCDI
## ... ... ...
## [96] 6 FALTQG
## [97] 6 *STGVT
## [98] 4 VPVH
## [99] 6 SSRIRH
## [100] 5 KSD*A
Recall that sequence reads are represented in text format
readLines(path.to.my.fastq ,n=10)
It should be possible to represent these as Biostrings
objects
ShortRead
packageOne of the first NGS packages in Bioconductor
library(ShortRead)
fq <- readFastq(path.to.my.fastq)
fq
BSgenome
library(BSgenome)
head(available.genomes())
## [1] "BSgenome.Alyrata.JGI.v1"
## [2] "BSgenome.Amellifera.BeeBase.assembly4"
## [3] "BSgenome.Amellifera.UCSC.apiMel2"
## [4] "BSgenome.Amellifera.UCSC.apiMel2.masked"
## [5] "BSgenome.Athaliana.TAIR.04232008"
## [6] "BSgenome.Athaliana.TAIR.TAIR9"
Various versions of the human genome
ag <- available.genomes()
ag[grep("Hsapiens",ag)]
## [1] "BSgenome.Hsapiens.NCBI.GRCh38"
## [2] "BSgenome.Hsapiens.UCSC.hg17"
## [3] "BSgenome.Hsapiens.UCSC.hg17.masked"
## [4] "BSgenome.Hsapiens.UCSC.hg18"
## [5] "BSgenome.Hsapiens.UCSC.hg18.masked"
## [6] "BSgenome.Hsapiens.UCSC.hg19"
## [7] "BSgenome.Hsapiens.UCSC.hg19.masked"
## [8] "BSgenome.Hsapiens.UCSC.hg38"
## [9] "BSgenome.Hsapiens.UCSC.hg38.masked"
library(BSgenome.Hsapiens.UCSC.hg19)
hg19 <- BSgenome.Hsapiens.UCSC.hg19::Hsapiens
hg19
## Human genome:
## # organism: Homo sapiens (Human)
## # provider: UCSC
## # provider version: hg19
## # release date: Feb. 2009
## # release name: Genome Reference Consortium GRCh37
## # 93 sequences:
## # chr1 chr2 chr3
## # chr4 chr5 chr6
## # chr7 chr8 chr9
## # chr10 chr11 chr12
## # chr13 chr14 chr15
## # ... ... ...
## # chrUn_gl000235 chrUn_gl000236 chrUn_gl000237
## # chrUn_gl000238 chrUn_gl000239 chrUn_gl000240
## # chrUn_gl000241 chrUn_gl000242 chrUn_gl000243
## # chrUn_gl000244 chrUn_gl000245 chrUn_gl000246
## # chrUn_gl000247 chrUn_gl000248 chrUn_gl000249
## # (use 'seqnames()' to see all the sequence names, use the '$' or '[['
## # operator to access a given sequence)
names
of the object are chromosome names
[[]]
to get chromsome sequenceDNAString
head(names(hg19))
## [1] "chr1" "chr2" "chr3" "chr4" "chr5" "chr6"
chrX <- hg19[["chrX"]]
chrX
## 155270560-letter "DNAString" instance
## seq: NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
alphabetFrequency(chrX,baseOnly=TRUE)
## A C G T other
## 45648952 29813353 29865831 45772424 4170000
getSeq
to do thistp53 <- getSeq(hg19, "chr17", 7577851, 7590863)
tp53
## 13013-letter "DNAString" instance
## seq: TTGTATTTTTCAGTAGAGACGGGGTTTCACCGTT...GTCTTGAGCACATGGGAGGGGAAAACCCCAATC
as.character(tp53[1:10])
## [1] "TTGTATTTTT"
alphabetFrequency(tp53,baseOnly=TRUE)
## A C G T other
## 3102 3375 3025 3511 0
subseq(tp53, 1000,1010)
## 11-letter "DNAString" instance
## seq: TATAGGTGTGC
Don’t need to load the whole genome into memory, so reading a particular sequence is fast
system.time(tp53 <- getSeq(hg19, "chr17", 7577851, 7598063))
## user system elapsed
## 0.116 0.000 0.117
We can now use Biostrings
operations to manipulate the sequence
translate(subseq(tp53, 1000,1010))
## Warning in .Call2("DNAStringSet_translate", x, skip_code,
## dna_codes[codon_alphabet], : last 2 bases were ignored
## 3-letter "AAString" instance
## seq: YRC
reverseComplement(subseq(tp53, 1000,2000))
## 1001-letter "DNAString" instance
## seq: CCTATGGAAACTGTGAGTGGATCCATTGGAAGGG...AAAATTAGCCAGGCATGGTGGTGCACACCTATA
GRanges
are a special kind of IRanges
object used to manipulate genomic intervals in an efficient mannerlibrary(GenomicRanges)
gr <- GRanges(c("A","A","A","B","B","B","B"), ranges=ir)
gr
## GRanges object with 7 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## A A [ 7, 15] *
## B A [ 9, 11] *
## C A [12, 13] *
## D B [14, 18] *
## E B [22, 26] *
## F B [23, 27] *
## G B [24, 28] *
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
We can add extra metadata to these ranges
mcols
can be set to be a data frame with one row for each rangemcols(gr) <- data.frame(Count = runif(length(gr)), Gene =sample(LETTERS,length(gr)))
gr
## GRanges object with 7 ranges and 2 metadata columns:
## seqnames ranges strand | Count Gene
## <Rle> <IRanges> <Rle> | <numeric> <factor>
## A A [ 7, 15] * | 0.9324835 H
## B A [ 9, 11] * | 0.1731538 W
## C A [12, 13] * | 0.1333582 C
## D B [14, 18] * | 0.1834847 G
## E B [22, 26] * | 0.9159181 N
## F B [23, 27] * | 0.7105521 K
## G B [24, 28] * | 0.6650954 T
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
gr[mcols(gr)$Count > 0.5]
## GRanges object with 4 ranges and 2 metadata columns:
## seqnames ranges strand | Count Gene
## <Rle> <IRanges> <Rle> | <numeric> <factor>
## A A [ 7, 15] * | 0.9324835 H
## E B [22, 26] * | 0.9159181 N
## F B [23, 27] * | 0.7105521 K
## G B [24, 28] * | 0.6650954 T
## -------
## seqinfo: 2 sequences from an unspecified genome; no seqlengths
mygene <- GRanges("chr17", ranges=IRanges(7577851, 7598063))
myseq <- getSeq(hg19, mygene)
myseq
## A DNAStringSet instance of length 1
## width seq
## [1] 20213 TTGTATTTTTCAGTAGAGACGGGGTTTCACC...CTACTTGGGAGGCTGAGGTGGGAGGATCGCT
tp53
## 20213-letter "DNAString" instance
## seq: TTGTATTTTTCAGTAGAGACGGGGTTTCACCGTT...AGCTACTTGGGAGGCTGAGGTGGGAGGATCGCT
Work through section 1 of the practical
IRanges
and GRanges
objectsWe will assume that the sequencing reads have been aligned and that we are interested in processing the alignments.
Rsamtools
provides an interface for doing this.readGAlignments
tool in GenomicAlignments
which extracts the essential information from the bam file.
library(GenomicAlignments)
bam <- readGAlignments(mybam,use.names = TRUE)
str(bam)
## Formal class 'GAlignments' [package "GenomicAlignments"] with 8 slots
## ..@ NAMES : chr [1:175346] "SRR031715.1138209" "SRR031714.776678" "SRR031715.3258011" "SRR031715.4791418" ...
## ..@ seqnames :Formal class 'Rle' [package "S4Vectors"] with 4 slots
## .. .. ..@ values : Factor w/ 8 levels "chr2L","chr2R",..: 5
## .. .. ..@ lengths : int 175346
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ start : int [1:175346] 169 184 187 193 326 943 944 946 946 957 ...
## ..@ cigar : chr [1:175346] "37M" "37M" "37M" "37M" ...
## ..@ strand :Formal class 'Rle' [package "S4Vectors"] with 4 slots
## .. .. ..@ values : Factor w/ 3 levels "+","-","*": 1 2 1 2 1 2 1 2 1 2 ...
## .. .. ..@ lengths : int [1:37319] 1 2 1 1 3 2 3 10 3 1 ...
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ elementMetadata:Formal class 'DataFrame' [package "S4Vectors"] with 6 slots
## .. .. ..@ rownames : NULL
## .. .. ..@ nrows : int 175346
## .. .. ..@ listData : Named list()
## .. .. ..@ elementType : chr "ANY"
## .. .. ..@ elementMetadata: NULL
## .. .. ..@ metadata : list()
## ..@ seqinfo :Formal class 'Seqinfo' [package "GenomeInfoDb"] with 4 slots
## .. .. ..@ seqnames : chr [1:8] "chr2L" "chr2R" "chr3L" "chr3R" ...
## .. .. ..@ seqlengths : int [1:8] 23011544 21146708 24543557 27905053 1351857 19517 22422827 347038
## .. .. ..@ is_circular: logi [1:8] NA NA NA NA NA NA ...
## .. .. ..@ genome : chr [1:8] NA NA NA NA ...
## ..@ metadata : list()
The result looks a lot like a GRanges object. In fact, a lot of the same operations can be used
bam
## GAlignments object with 175346 alignments and 0 metadata columns:
## seqnames strand cigar qwidth
## <Rle> <Rle> <character> <integer>
## SRR031715.1138209 chr4 + 37M 37
## SRR031714.776678 chr4 - 37M 37
## SRR031715.3258011 chr4 - 37M 37
## SRR031715.4791418 chr4 + 37M 37
## SRR031715.1138209 chr4 - 37M 37
## ... ... ... ... ...
## SRR031714.1650928 chr4 + 37M 37
## SRR031714.1650928 chr4 - 37M 37
## SRR031714.5192891 chr4 + 37M 37
## SRR031715.2351056 chr4 + 37M 37
## SRR031714.864195 chr4 + 37M 37
## start end width njunc
## <integer> <integer> <integer> <integer>
## SRR031715.1138209 169 205 37 0
## SRR031714.776678 184 220 37 0
## SRR031715.3258011 187 223 37 0
## SRR031715.4791418 193 229 37 0
## SRR031715.1138209 326 362 37 0
## ... ... ... ... ...
## SRR031714.1650928 1349708 1349744 37 0
## SRR031714.1650928 1349838 1349874 37 0
## SRR031714.5192891 1351640 1351676 37 0
## SRR031715.2351056 1351640 1351676 37 0
## SRR031714.864195 1351760 1351796 37 0
## -------
## seqinfo: 8 sequences from an unspecified genome
length(bam)
## [1] 175346
bam[1:5]
## GAlignments object with 5 alignments and 0 metadata columns:
## seqnames strand cigar qwidth
## <Rle> <Rle> <character> <integer>
## SRR031715.1138209 chr4 + 37M 37
## SRR031714.776678 chr4 - 37M 37
## SRR031715.3258011 chr4 - 37M 37
## SRR031715.4791418 chr4 + 37M 37
## SRR031715.1138209 chr4 - 37M 37
## start end width njunc
## <integer> <integer> <integer> <integer>
## SRR031715.1138209 169 205 37 0
## SRR031714.776678 184 220 37 0
## SRR031715.3258011 187 223 37 0
## SRR031715.4791418 193 229 37 0
## SRR031715.1138209 326 362 37 0
## -------
## seqinfo: 8 sequences from an unspecified genome
bam[sample(1:length(bam),5)]
## GAlignments object with 5 alignments and 0 metadata columns:
## seqnames strand cigar qwidth
## <Rle> <Rle> <character> <integer>
## SRR031714.4384916 chr4 + 37M 37
## SRR031714.2524042 chr4 + 37M 37
## SRR031714.510378 chr4 + 37M 37
## SRR031714.989932 chr4 + 37M 37
## SRR031714.4398277 chr4 - 37M 37
## start end width njunc
## <integer> <integer> <integer> <integer>
## SRR031714.4384916 677587 677623 37 0
## SRR031714.2524042 566364 566400 37 0
## SRR031714.510378 676904 676940 37 0
## SRR031714.989932 12122 12158 37 0
## SRR031714.4398277 881765 881801 37 0
## -------
## seqinfo: 8 sequences from an unspecified genome
table(strand(bam))
##
## + - *
## 84871 90475 0
summary(width(bam))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37.00 37.00 37.00 58.72 37.00 19350.00
range(start(bam))
## [1] 169 1351760
head(cigar(bam))
## [1] "37M" "37M" "37M" "37M" "37M" "37M"
GAlignments
object can be used in findOverlaps
gr <- GRanges("chr4", IRanges(start = 20000, end = 20100))
gr
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr4 [20000, 20100] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
findOverlaps(gr,bam)
## Hits object with 12 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## [1] 1 6699
## [2] 1 6700
## [3] 1 6701
## [4] 1 6702
## [5] 1 6703
## ... ... ...
## [8] 1 6706
## [9] 1 6707
## [10] 1 6708
## [11] 1 6709
## [12] 1 6710
## -------
## queryLength: 1
## subjectLength: 175346
bam.sub <- bam[bam %over% gr]
bam.sub
## GAlignments object with 12 alignments and 0 metadata columns:
## seqnames strand cigar qwidth
## <Rle> <Rle> <character> <integer>
## SRR031714.4092638 chr4 - 37M 37
## SRR031714.4275537 chr4 - 37M 37
## SRR031715.1315719 chr4 - 37M 37
## SRR031715.1502533 chr4 - 37M 37
## SRR031714.336402 chr4 - 37M 37
## ... ... ... ... ...
## SRR031715.3358559 chr4 + 37M 37
## SRR031715.4831822 chr4 + 37M 37
## SRR031715.4459351 chr4 + 37M 37
## SRR031715.2716654 chr4 - 37M 37
## SRR031715.1552693 chr4 + 37M 37
## start end width njunc
## <integer> <integer> <integer> <integer>
## SRR031714.4092638 19968 20004 37 0
## SRR031714.4275537 19968 20004 37 0
## SRR031715.1315719 19968 20004 37 0
## SRR031715.1502533 19968 20004 37 0
## SRR031714.336402 19971 20007 37 0
## ... ... ... ... ...
## SRR031715.3358559 19974 20010 37 0
## SRR031715.4831822 19975 20011 37 0
## SRR031715.4459351 19981 20017 37 0
## SRR031715.2716654 19986 20022 37 0
## SRR031715.1552693 20046 20082 37 0
## -------
## seqinfo: 8 sequences from an unspecified genome
gr <- GRanges("4", IRanges(start = 20000, end = 20100))
gr
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 4 [20000, 20100] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
findOverlaps(gr,bam)
## Warning in .Seqinfo.mergexy(x, y): The 2 combined objects have no sequence levels in common. (Use
## suppressWarnings() to suppress this warning.)
## Hits object with 0 hits and 0 metadata columns:
## queryHits subjectHits
## <integer> <integer>
## -------
## queryLength: 1
## subjectLength: 175346
gr
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] 4 [20000, 20100] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
gr <- renameSeqlevels(gr, c("4"="chr4"))
gr
## GRanges object with 1 range and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## [1] chr4 [20000, 20100] *
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
readGAlignments
uses the Rsamtools
interface, which allows more control over how we import data
ScanBamParam
(!) function allows the user to customise what fields from the bam file are imported
?ScanBamParam
bam.extra <- readGAlignments(file=mybam,param=ScanBamParam(what=c("mapq","qual","flag")))
bam.extra[1:5]
## GAlignments object with 5 alignments and 3 metadata columns:
## seqnames strand cigar qwidth start
## <Rle> <Rle> <character> <integer> <integer>
## [1] chr4 + 37M 37 169
## [2] chr4 - 37M 37 184
## [3] chr4 - 37M 37 187
## [4] chr4 + 37M 37 193
## [5] chr4 - 37M 37 326
## end width njunc | mapq
## <integer> <integer> <integer> | <integer>
## [1] 205 37 0 | 255
## [2] 220 37 0 | 255
## [3] 223 37 0 | 255
## [4] 229 37 0 | 255
## [5] 362 37 0 | 255
## qual flag
## <PhredQuality> <integer>
## [1] IIIIIIIIIIIIIIIIIIIIIIIIII8IIIIIIIGII 99
## [2] IIIIIIIIEIIIIIIIIIIIIIIIIIIIIIIIIIIII 153
## [3] II6II7IIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 89
## [4] IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIFII3I 137
## [5] ++I+4-05>*I2GF/II6IIIIIIIIIIIIIIIII<I 147
## -------
## seqinfo: 8 sequences from an unspecified genome
table(mcols(bam.extra)$flag)
##
## 65 73 81 83 89 97 99 113 129 137
## 29 4891 14737 23037 7769 14781 22791 34 29 4576
## 145 147 153 161 163 177
## 14781 22791 7292 14737 23037 34
dupReads <- readGAlignments(file=mybam,param=ScanBamParam(scanBamFlag(isDuplicate = TRUE)))
nodupReads <- readGAlignments(file=mybam,param=ScanBamParam(scanBamFlag(isDuplicate = FALSE)))
allreads <- readGAlignments(file=mybam,param=ScanBamParam(scanBamFlag(isDuplicate = NA)))
length(dupReads)
## [1] 0
length(nodupReads)
## [1] 175346
length(allreads)
## [1] 175346
length(allreads) - length(dupReads)
## [1] 175346
bam.sub2 <-
readGAlignments(file=mybam,param=ScanBamParam(which=gr),use.names = TRUE)
length(bam.sub2)
## [1] 14
bam.sub2
## GAlignments object with 14 alignments and 0 metadata columns:
## seqnames strand cigar qwidth
## <Rle> <Rle> <character> <integer>
## SRR031714.4100693 chr4 + 31M7704N6M 37
## SRR031715.5248298 chr4 + 29M7704N8M 37
## SRR031714.4092638 chr4 - 37M 37
## SRR031714.4275537 chr4 - 37M 37
## SRR031715.1315719 chr4 - 37M 37
## ... ... ... ... ...
## SRR031715.3358559 chr4 + 37M 37
## SRR031715.4831822 chr4 + 37M 37
## SRR031715.4459351 chr4 + 37M 37
## SRR031715.2716654 chr4 - 37M 37
## SRR031715.1552693 chr4 + 37M 37
## start end width njunc
## <integer> <integer> <integer> <integer>
## SRR031714.4100693 13660 21400 7741 1
## SRR031715.5248298 13662 21402 7741 1
## SRR031714.4092638 19968 20004 37 0
## SRR031714.4275537 19968 20004 37 0
## SRR031715.1315719 19968 20004 37 0
## ... ... ... ... ...
## SRR031715.3358559 19974 20010 37 0
## SRR031715.4831822 19975 20011 37 0
## SRR031715.4459351 19981 20017 37 0
## SRR031715.2716654 19986 20022 37 0
## SRR031715.1552693 20046 20082 37 0
## -------
## seqinfo: 8 sequences from an unspecified genome
Now, work through Section 2 of the practical