# load data
load("../Robjects/DE.RData")
## set up connection to ensembl database
ensembl=useMart("ENSEMBL_MART_ENSEMBL")
# list the available datasets (species)
listDatasets(ensembl) %>%
filter(str_detect(description, "Mouse"))
# specify a data set to use
ensembl = useDataset("mmusculus_gene_ensembl", mart=ensembl)
# Set the filter type and values
filterType <- "ensembl_gene_id"
# annotated
load("../Robjects/Ensembl_annotations.RData")
annotLvV <- as.data.frame(resLvV) %>%
rownames_to_column("GeneID") %>%
left_join(ensemblAnnot, "GeneID") %>%
rename(logFC=log2FoldChange, FDR=padj)
# txdb object
txMm <- makeTxDbFromBiomart(dataset="mmusculus_gene_ensembl")
Challenge 1
That was just 1000 genes. We need annotations for the entire results table. Also, there may be some other interesting columns in BioMart that we wish to retrieve.
- Search the attributes and add the following to our list of attributes:
- The gene description
- The genomic position - chromosome, start, end, and strand (4 columns)
- The gene biotype
- Query BioMart using all of the genes in our results table (
resLvV
)
- How many Ensembl genes have multipe Entrez IDs associated with them?
- How many Ensembl genes in
resLvV
don’t have any annotation? Why is this?
filterValues <- rownames(resLvV)
# check the available "attributes" - things you can retreive
listAttributes(ensembl) %>%
head(20)
attributeNames <- c('ensembl_gene_id',
'entrezgene',
'external_gene_name',
'description',
'gene_biotype')
# run the query
annot <- getBM(attributes=attributeNames,
filters = ourFilterType,
values = filterValues,
mart = ensembl)
# dulicate ids
sum(duplicated(annot$ensembl_gene_id))
# missing gens
missingGenes <- !rownames(resLvV)%in%annot$ensembl_gene_id
rownames(resLvV)[missingGenes]
Challenge 2
Use the log2 fold change (logFC
) on the x-axis, and use -log10(FDR)
on the y-axis. (This >-log10
transformation is commonly used for p-values as it means that more significant genes have a >higher scale)
Create a column of -log10(FDR) values
Create a plot with points coloured by if FDR < 0.05
# first remove the filtered genes (FDR=NA) and create a -log10(FDR) column
filtTab <- shrinkLvV %>%
filter(!is.na(FDR)) %>%
mutate(`-log10(FDR)` = -log10(FDR))
ggplot(filtTab, aes(x = logFC, y=`-log10(FDR)`)) +
geom_point(aes(colour=FDR < 0.05), size=1)
Challenge 3 {.challenge} - In Supplementary Materials
Use the txMm to retrieve the exon coordinates for the genes: + ENSMUSG00000021604
+ ENSMUSG00000022146
+ ENSMUSG00000040118
LS0tCnRpdGxlOiAiUk5BLXNlcSBBbmFseXNpcyBpbiBSIgpzdWJ0aXRsZTogIkFubm90YXRpb24gYW5kIFZpc3VhbGlzYXRpb24gb2YgUk5BLXNlcSByZXN1bHRzIC0gU29sdXRpb25zIgphdXRob3I6ICJTdGVwaGFuZSBCYWxsZXJlYXUsIE1hcmsgRHVubmluZywgT3NjYXIgUnVlZGEsIEFzaGxleSBTYXdsZSIKZGF0ZTogJ2ByIGZvcm1hdChTeXMudGltZSgpLCAiTGFzdCBtb2RpZmllZDogJWQgJWIgJVkiKWAnCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogeWVzCiAgICB0b2NfZmxvYXQ6IHllcwptaW51dGVzOiAzMDAKbGF5b3V0OiBwYWdlCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoYmlvbWFSdCkKbGlicmFyeShERVNlcTIpCmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCmBgYHtyIHByZXBhcmVEYXRhfQojIGxvYWQgZGF0YQpsb2FkKCIuLi9Sb2JqZWN0cy9ERS5SRGF0YSIpCiMjIHNldCB1cCBjb25uZWN0aW9uIHRvIGVuc2VtYmwgZGF0YWJhc2UKZW5zZW1ibD11c2VNYXJ0KCJFTlNFTUJMX01BUlRfRU5TRU1CTCIpCiMgbGlzdCB0aGUgYXZhaWxhYmxlIGRhdGFzZXRzIChzcGVjaWVzKQpsaXN0RGF0YXNldHMoZW5zZW1ibCkgJT4lIAogICAgZmlsdGVyKHN0cl9kZXRlY3QoZGVzY3JpcHRpb24sICJNb3VzZSIpKQojIHNwZWNpZnkgYSBkYXRhIHNldCB0byB1c2UKZW5zZW1ibCA9IHVzZURhdGFzZXQoIm1tdXNjdWx1c19nZW5lX2Vuc2VtYmwiLCBtYXJ0PWVuc2VtYmwpCiMgU2V0IHRoZSBmaWx0ZXIgdHlwZSBhbmQgdmFsdWVzCmZpbHRlclR5cGUgPC0gImVuc2VtYmxfZ2VuZV9pZCIKIyBhbm5vdGF0ZWQKbG9hZCgiLi4vUm9iamVjdHMvRW5zZW1ibF9hbm5vdGF0aW9ucy5SRGF0YSIpCmFubm90THZWIDwtIGFzLmRhdGEuZnJhbWUocmVzTHZWKSAlPiUgCiAgICByb3duYW1lc190b19jb2x1bW4oIkdlbmVJRCIpICU+JSAKICAgIGxlZnRfam9pbihlbnNlbWJsQW5ub3QsICJHZW5lSUQiKSAlPiUgCiAgICByZW5hbWUobG9nRkM9bG9nMkZvbGRDaGFuZ2UsIEZEUj1wYWRqKQojIHR4ZGIgb2JqZWN0CnR4TW0gPC0gbWFrZVR4RGJGcm9tQmlvbWFydChkYXRhc2V0PSJtbXVzY3VsdXNfZ2VuZV9lbnNlbWJsIikKYGBgCgoKPiAjIyMgQ2hhbGxlbmdlIDEgey5jaGFsbGVuZ2V9Cj4gVGhhdCB3YXMganVzdCAxMDAwIGdlbmVzLiBXZSBuZWVkIGFubm90YXRpb25zIGZvciB0aGUgZW50aXJlIHJlc3VsdHMgdGFibGUuCj4gQWxzbywgdGhlcmUgbWF5IGJlIHNvbWUgb3RoZXIgaW50ZXJlc3RpbmcgY29sdW1ucyBpbiBCaW9NYXJ0IHRoYXQgd2Ugd2lzaCB0bwo+IHJldHJpZXZlLiAgCj4KPiAoYSkgU2VhcmNoIHRoZSBhdHRyaWJ1dGVzIGFuZCBhZGQgdGhlIGZvbGxvd2luZyB0byBvdXIgbGlzdCBvZiBhdHRyaWJ1dGVzOiAgCj4gICAgICAgKGkpIFRoZSBnZW5lIGRlc2NyaXB0aW9uICAgCj4gICAgICAgKGlpKSBUaGUgZ2Vub21pYyBwb3NpdGlvbiAtIGNocm9tb3NvbWUsIHN0YXJ0LCBlbmQsIGFuZCBzdHJhbmQgKDQgY29sdW1ucykgCj4gICAgICAgKGlpaSkgVGhlIGdlbmUgYmlvdHlwZSAgCj4gKGIpIFF1ZXJ5IEJpb01hcnQgdXNpbmcgYWxsIG9mIHRoZSBnZW5lcyBpbiBvdXIgcmVzdWx0cyB0YWJsZSAoYHJlc0x2VmApICAKPiAoYykgSG93IG1hbnkgRW5zZW1ibCBnZW5lcyBoYXZlIG11bHRpcGUgRW50cmV6IElEcyBhc3NvY2lhdGVkIHdpdGggdGhlbT8gIAo+IChkKSBIb3cgbWFueSBFbnNlbWJsIGdlbmVzIGluIGByZXNMdlZgIGRvbid0IGhhdmUgYW55IGFubm90YXRpb24/IFdoeSBpcyB0aGlzPwoKYGBge3Igc29sdXRpb25DaGFsbGVuZ2UxfQpmaWx0ZXJWYWx1ZXMgPC0gcm93bmFtZXMocmVzTHZWKQoKIyBjaGVjayB0aGUgYXZhaWxhYmxlICJhdHRyaWJ1dGVzIiAtIHRoaW5ncyB5b3UgY2FuIHJldHJlaXZlCmxpc3RBdHRyaWJ1dGVzKGVuc2VtYmwpICU+JQogICAgaGVhZCgyMCkKYXR0cmlidXRlTmFtZXMgPC0gYygnZW5zZW1ibF9nZW5lX2lkJywKICAgICAgICAgICAgICAgICAgICAnZW50cmV6Z2VuZScsCiAgICAgICAgICAgICAgICAgICAgJ2V4dGVybmFsX2dlbmVfbmFtZScsCiAgICAgICAgICAgICAgICAgICAgJ2Rlc2NyaXB0aW9uJywKICAgICAgICAgICAgICAgICAgICAnZ2VuZV9iaW90eXBlJykKCiMgcnVuIHRoZSBxdWVyeQphbm5vdCA8LSBnZXRCTShhdHRyaWJ1dGVzPWF0dHJpYnV0ZU5hbWVzLAogICAgICAgICAgICAgICBmaWx0ZXJzID0gb3VyRmlsdGVyVHlwZSwKICAgICAgICAgICAgICAgdmFsdWVzID0gZmlsdGVyVmFsdWVzLAogICAgICAgICAgICAgICBtYXJ0ID0gZW5zZW1ibCkKCiMgZHVsaWNhdGUgaWRzCnN1bShkdXBsaWNhdGVkKGFubm90JGVuc2VtYmxfZ2VuZV9pZCkpCgojIG1pc3NpbmcgZ2VucwptaXNzaW5nR2VuZXMgPC0gIXJvd25hbWVzKHJlc0x2ViklaW4lYW5ub3QkZW5zZW1ibF9nZW5lX2lkCnJvd25hbWVzKHJlc0x2VilbbWlzc2luZ0dlbmVzXQpgYGAKCgo+ICMjIyBDaGFsbGVuZ2UgMiB7LmNoYWxsZW5nZX0KCj4gVXNlIHRoZSBsb2cyIGZvbGQgY2hhbmdlIChgbG9nRkNgKSBvbiB0aGUgeC1heGlzLCBhbmQgdXNlIGAtbG9nMTAoRkRSKWAgb24gdGhlIHktYXhpcy4gKFRoaXMgPmAtbG9nMTBgIHRyYW5zZm9ybWF0aW9uIGlzIGNvbW1vbmx5IHVzZWQgZm9yIHAtdmFsdWVzIGFzIGl0IG1lYW5zIHRoYXQgbW9yZSBzaWduaWZpY2FudCBnZW5lcyBoYXZlIGEgPmhpZ2hlciBzY2FsZSkgCj4KPiAoYSkgQ3JlYXRlIGEgY29sdW1uIG9mIC1sb2cxMChGRFIpIHZhbHVlcwo+Cj4gKGIpIENyZWF0ZSBhIHBsb3Qgd2l0aCBwb2ludHMgY29sb3VyZWQgYnkgaWYgRkRSIDwgMC4wNQoKCmBgYHtyIHZvbGNhbm9QbG90LCBmaWcuaGVpZ2h0PTUsIGZpZy53aWR0aD03fQojIGZpcnN0IHJlbW92ZSB0aGUgZmlsdGVyZWQgZ2VuZXMgKEZEUj1OQSkgYW5kIGNyZWF0ZSBhIC1sb2cxMChGRFIpIGNvbHVtbgpmaWx0VGFiIDwtIHNocmlua0x2ViAlPiUgCiAgICBmaWx0ZXIoIWlzLm5hKEZEUikpICU+JSAKICAgIG11dGF0ZShgLWxvZzEwKEZEUilgID0gLWxvZzEwKEZEUikpCgpnZ3Bsb3QoZmlsdFRhYiwgYWVzKHggPSBsb2dGQywgeT1gLWxvZzEwKEZEUilgKSkgKyAKICAgIGdlb21fcG9pbnQoYWVzKGNvbG91cj1GRFIgPCAwLjA1KSwgc2l6ZT0xKQpgYGAKCgo+ICMjIyBDaGFsbGVuZ2UgMyB7LmNoYWxsZW5nZX0gLSBJbiBTdXBwbGVtZW50YXJ5IE1hdGVyaWFscwo+Cj4gVXNlIHRoZSB0eE1tIHRvIHJldHJpZXZlIHRoZSBleG9uIGNvb3JkaW5hdGVzIGZvciB0aGUgZ2VuZXM6IAogICAgKyBgRU5TTVVTRzAwMDAwMDIxNjA0YAogICAgKyBgRU5TTVVTRzAwMDAwMDIyMTQ2YAogICAgKyBgRU5TTVVTRzAwMDAwMDQwMTE4YCAKPgoKYGBge3Igc29sdXRpb25DaGFsbGVuZ2UzLCBlY2hvPUZBTFNFLCB3YXJuaW5nPUZBTFNFLCBtZXNzYWdlPUZBTFNFfQprZXlMaXN0IDwtIGMoIkVOU01VU0cwMDAwMDAyMTYwNCIsICJFTlNNVVNHMDAwMDAwMjIxNDYiLCAiRU5TTVVTRzAwMDAwMDQwMTE4IikKc2VsZWN0KHR4TW0sIAogICAgICAga2V5cz1rZXlMaXN0LAogICAgICAga2V5dHlwZSA9ICJHRU5FSUQiLAogICAgICAgY29sdW1ucz1jKCJUWE5BTUUiLCAiVFhDSFJPTSIsICJUWFNUQVJUIiwgIlRYRU5EIiwgIlRYU1RSQU5EIiwgIlRYVFlQRSIpCiAgICAgICkKYGBg