# load data
load("../Robjects/DE.RData")
## set up connection to ensembl database
ensembl=useMart("ENSEMBL_MART_ENSEMBL")
# list the available datasets (species)
listDatasets(ensembl) %>% 
    filter(str_detect(description, "Mouse"))
# specify a data set to use
ensembl = useDataset("mmusculus_gene_ensembl", mart=ensembl)
# Set the filter type and values
filterType <- "ensembl_gene_id"
# annotated
load("../Robjects/Ensembl_annotations.RData")
annotLvV <- as.data.frame(resLvV) %>% 
    rownames_to_column("GeneID") %>% 
    left_join(ensemblAnnot, "GeneID") %>% 
    rename(logFC=log2FoldChange, FDR=padj)
# txdb object
txMm <- makeTxDbFromBiomart(dataset="mmusculus_gene_ensembl")

Challenge 1

That was just 1000 genes. We need annotations for the entire results table. Also, there may be some other interesting columns in BioMart that we wish to retrieve.

  1. Search the attributes and add the following to our list of attributes:
    1. The gene description
    2. The genomic position - chromosome, start, end, and strand (4 columns)
    3. The gene biotype
  2. Query BioMart using all of the genes in our results table (resLvV)
  3. How many Ensembl genes have multipe Entrez IDs associated with them?
  4. How many Ensembl genes in resLvV don’t have any annotation? Why is this?
filterValues <- rownames(resLvV)

# check the available "attributes" - things you can retreive
listAttributes(ensembl) %>%
    head(20)
attributeNames <- c('ensembl_gene_id',
                    'entrezgene',
                    'external_gene_name',
                    'description',
                    'gene_biotype')

# run the query
annot <- getBM(attributes=attributeNames,
               filters = ourFilterType,
               values = filterValues,
               mart = ensembl)

# dulicate ids
sum(duplicated(annot$ensembl_gene_id))

# missing gens
missingGenes <- !rownames(resLvV)%in%annot$ensembl_gene_id
rownames(resLvV)[missingGenes]

Challenge 2

Use the log2 fold change (logFC) on the x-axis, and use -log10(FDR) on the y-axis. (This >-log10 transformation is commonly used for p-values as it means that more significant genes have a >higher scale)

  1. Create a column of -log10(FDR) values

  2. Create a plot with points coloured by if FDR < 0.05

# first remove the filtered genes (FDR=NA) and create a -log10(FDR) column
filtTab <- shrinkLvV %>% 
    filter(!is.na(FDR)) %>% 
    mutate(`-log10(FDR)` = -log10(FDR))
ggplot(filtTab, aes(x = logFC, y=`-log10(FDR)`)) + 
    geom_point(aes(colour=FDR < 0.05), size=1)

Challenge 3 {.challenge} - In Supplementary Materials

Use the txMm to retrieve the exon coordinates for the genes: + ENSMUSG00000021604 + ENSMUSG00000022146 + ENSMUSG00000040118

LS0tCnRpdGxlOiAiUk5BLXNlcSBBbmFseXNpcyBpbiBSIgpzdWJ0aXRsZTogIkFubm90YXRpb24gYW5kIFZpc3VhbGlzYXRpb24gb2YgUk5BLXNlcSByZXN1bHRzIC0gU29sdXRpb25zIgphdXRob3I6ICJTdGVwaGFuZSBCYWxsZXJlYXUsIE1hcmsgRHVubmluZywgT3NjYXIgUnVlZGEsIEFzaGxleSBTYXdsZSIKZGF0ZTogJ2ByIGZvcm1hdChTeXMudGltZSgpLCAiTGFzdCBtb2RpZmllZDogJWQgJWIgJVkiKWAnCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgdG9jOiB5ZXMKICAgIHRvY19mbG9hdDogeWVzCiAgaHRtbF9kb2N1bWVudDoKICAgIHRvYzogeWVzCiAgICB0b2NfZmxvYXQ6IHllcwptaW51dGVzOiAzMDAKbGF5b3V0OiBwYWdlCi0tLQoKYGBge3Igc2V0dXAsIGluY2x1ZGU9RkFMU0V9CmxpYnJhcnkoYmlvbWFSdCkKbGlicmFyeShERVNlcTIpCmxpYnJhcnkodGlkeXZlcnNlKQpgYGAKCmBgYHtyIHByZXBhcmVEYXRhfQojIGxvYWQgZGF0YQpsb2FkKCIuLi9Sb2JqZWN0cy9ERS5SRGF0YSIpCiMjIHNldCB1cCBjb25uZWN0aW9uIHRvIGVuc2VtYmwgZGF0YWJhc2UKZW5zZW1ibD11c2VNYXJ0KCJFTlNFTUJMX01BUlRfRU5TRU1CTCIpCiMgbGlzdCB0aGUgYXZhaWxhYmxlIGRhdGFzZXRzIChzcGVjaWVzKQpsaXN0RGF0YXNldHMoZW5zZW1ibCkgJT4lIAogICAgZmlsdGVyKHN0cl9kZXRlY3QoZGVzY3JpcHRpb24sICJNb3VzZSIpKQojIHNwZWNpZnkgYSBkYXRhIHNldCB0byB1c2UKZW5zZW1ibCA9IHVzZURhdGFzZXQoIm1tdXNjdWx1c19nZW5lX2Vuc2VtYmwiLCBtYXJ0PWVuc2VtYmwpCiMgU2V0IHRoZSBmaWx0ZXIgdHlwZSBhbmQgdmFsdWVzCmZpbHRlclR5cGUgPC0gImVuc2VtYmxfZ2VuZV9pZCIKIyBhbm5vdGF0ZWQKbG9hZCgiLi4vUm9iamVjdHMvRW5zZW1ibF9hbm5vdGF0aW9ucy5SRGF0YSIpCmFubm90THZWIDwtIGFzLmRhdGEuZnJhbWUocmVzTHZWKSAlPiUgCiAgICByb3duYW1lc190b19jb2x1bW4oIkdlbmVJRCIpICU+JSAKICAgIGxlZnRfam9pbihlbnNlbWJsQW5ub3QsICJHZW5lSUQiKSAlPiUgCiAgICByZW5hbWUobG9nRkM9bG9nMkZvbGRDaGFuZ2UsIEZEUj1wYWRqKQojIHR4ZGIgb2JqZWN0CnR4TW0gPC0gbWFrZVR4RGJGcm9tQmlvbWFydChkYXRhc2V0PSJtbXVzY3VsdXNfZ2VuZV9lbnNlbWJsIikKYGBgCgoKPiAjIyMgQ2hhbGxlbmdlIDEgey5jaGFsbGVuZ2V9Cj4gVGhhdCB3YXMganVzdCAxMDAwIGdlbmVzLiBXZSBuZWVkIGFubm90YXRpb25zIGZvciB0aGUgZW50aXJlIHJlc3VsdHMgdGFibGUuCj4gQWxzbywgdGhlcmUgbWF5IGJlIHNvbWUgb3RoZXIgaW50ZXJlc3RpbmcgY29sdW1ucyBpbiBCaW9NYXJ0IHRoYXQgd2Ugd2lzaCB0bwo+IHJldHJpZXZlLiAgCj4KPiAoYSkgU2VhcmNoIHRoZSBhdHRyaWJ1dGVzIGFuZCBhZGQgdGhlIGZvbGxvd2luZyB0byBvdXIgbGlzdCBvZiBhdHRyaWJ1dGVzOiAgCj4gICAgICAgKGkpIFRoZSBnZW5lIGRlc2NyaXB0aW9uICAgCj4gICAgICAgKGlpKSBUaGUgZ2Vub21pYyBwb3NpdGlvbiAtIGNocm9tb3NvbWUsIHN0YXJ0LCBlbmQsIGFuZCBzdHJhbmQgKDQgY29sdW1ucykgCj4gICAgICAgKGlpaSkgVGhlIGdlbmUgYmlvdHlwZSAgCj4gKGIpIFF1ZXJ5IEJpb01hcnQgdXNpbmcgYWxsIG9mIHRoZSBnZW5lcyBpbiBvdXIgcmVzdWx0cyB0YWJsZSAoYHJlc0x2VmApICAKPiAoYykgSG93IG1hbnkgRW5zZW1ibCBnZW5lcyBoYXZlIG11bHRpcGUgRW50cmV6IElEcyBhc3NvY2lhdGVkIHdpdGggdGhlbT8gIAo+IChkKSBIb3cgbWFueSBFbnNlbWJsIGdlbmVzIGluIGByZXNMdlZgIGRvbid0IGhhdmUgYW55IGFubm90YXRpb24/IFdoeSBpcyB0aGlzPwoKYGBge3Igc29sdXRpb25DaGFsbGVuZ2UxfQpmaWx0ZXJWYWx1ZXMgPC0gcm93bmFtZXMocmVzTHZWKQoKIyBjaGVjayB0aGUgYXZhaWxhYmxlICJhdHRyaWJ1dGVzIiAtIHRoaW5ncyB5b3UgY2FuIHJldHJlaXZlCmxpc3RBdHRyaWJ1dGVzKGVuc2VtYmwpICU+JQogICAgaGVhZCgyMCkKYXR0cmlidXRlTmFtZXMgPC0gYygnZW5zZW1ibF9nZW5lX2lkJywKICAgICAgICAgICAgICAgICAgICAnZW50cmV6Z2VuZScsCiAgICAgICAgICAgICAgICAgICAgJ2V4dGVybmFsX2dlbmVfbmFtZScsCiAgICAgICAgICAgICAgICAgICAgJ2Rlc2NyaXB0aW9uJywKICAgICAgICAgICAgICAgICAgICAnZ2VuZV9iaW90eXBlJykKCiMgcnVuIHRoZSBxdWVyeQphbm5vdCA8LSBnZXRCTShhdHRyaWJ1dGVzPWF0dHJpYnV0ZU5hbWVzLAogICAgICAgICAgICAgICBmaWx0ZXJzID0gb3VyRmlsdGVyVHlwZSwKICAgICAgICAgICAgICAgdmFsdWVzID0gZmlsdGVyVmFsdWVzLAogICAgICAgICAgICAgICBtYXJ0ID0gZW5zZW1ibCkKCiMgZHVsaWNhdGUgaWRzCnN1bShkdXBsaWNhdGVkKGFubm90JGVuc2VtYmxfZ2VuZV9pZCkpCgojIG1pc3NpbmcgZ2VucwptaXNzaW5nR2VuZXMgPC0gIXJvd25hbWVzKHJlc0x2ViklaW4lYW5ub3QkZW5zZW1ibF9nZW5lX2lkCnJvd25hbWVzKHJlc0x2VilbbWlzc2luZ0dlbmVzXQpgYGAKCgo+ICMjIyBDaGFsbGVuZ2UgMiB7LmNoYWxsZW5nZX0KCj4gVXNlIHRoZSBsb2cyIGZvbGQgY2hhbmdlIChgbG9nRkNgKSBvbiB0aGUgeC1heGlzLCBhbmQgdXNlIGAtbG9nMTAoRkRSKWAgb24gdGhlIHktYXhpcy4gKFRoaXMgPmAtbG9nMTBgIHRyYW5zZm9ybWF0aW9uIGlzIGNvbW1vbmx5IHVzZWQgZm9yIHAtdmFsdWVzIGFzIGl0IG1lYW5zIHRoYXQgbW9yZSBzaWduaWZpY2FudCBnZW5lcyBoYXZlIGEgPmhpZ2hlciBzY2FsZSkgCj4KPiAoYSkgQ3JlYXRlIGEgY29sdW1uIG9mIC1sb2cxMChGRFIpIHZhbHVlcwo+Cj4gKGIpIENyZWF0ZSBhIHBsb3Qgd2l0aCBwb2ludHMgY29sb3VyZWQgYnkgaWYgRkRSIDwgMC4wNQoKCmBgYHtyIHZvbGNhbm9QbG90LCBmaWcuaGVpZ2h0PTUsIGZpZy53aWR0aD03fQojIGZpcnN0IHJlbW92ZSB0aGUgZmlsdGVyZWQgZ2VuZXMgKEZEUj1OQSkgYW5kIGNyZWF0ZSBhIC1sb2cxMChGRFIpIGNvbHVtbgpmaWx0VGFiIDwtIHNocmlua0x2ViAlPiUgCiAgICBmaWx0ZXIoIWlzLm5hKEZEUikpICU+JSAKICAgIG11dGF0ZShgLWxvZzEwKEZEUilgID0gLWxvZzEwKEZEUikpCgpnZ3Bsb3QoZmlsdFRhYiwgYWVzKHggPSBsb2dGQywgeT1gLWxvZzEwKEZEUilgKSkgKyAKICAgIGdlb21fcG9pbnQoYWVzKGNvbG91cj1GRFIgPCAwLjA1KSwgc2l6ZT0xKQpgYGAKCgo+ICMjIyBDaGFsbGVuZ2UgMyB7LmNoYWxsZW5nZX0gLSBJbiBTdXBwbGVtZW50YXJ5IE1hdGVyaWFscwo+Cj4gVXNlIHRoZSB0eE1tIHRvIHJldHJpZXZlIHRoZSBleG9uIGNvb3JkaW5hdGVzIGZvciB0aGUgZ2VuZXM6IAogICAgKyBgRU5TTVVTRzAwMDAwMDIxNjA0YAogICAgKyBgRU5TTVVTRzAwMDAwMDIyMTQ2YAogICAgKyBgRU5TTVVTRzAwMDAwMDQwMTE4YCAKPgoKYGBge3Igc29sdXRpb25DaGFsbGVuZ2UzLCBlY2hvPUZBTFNFLCB3YXJuaW5nPUZBTFNFLCBtZXNzYWdlPUZBTFNFfQprZXlMaXN0IDwtIGMoIkVOU01VU0cwMDAwMDAyMTYwNCIsICJFTlNNVVNHMDAwMDAwMjIxNDYiLCAiRU5TTVVTRzAwMDAwMDQwMTE4IikKc2VsZWN0KHR4TW0sIAogICAgICAga2V5cz1rZXlMaXN0LAogICAgICAga2V5dHlwZSA9ICJHRU5FSUQiLAogICAgICAgY29sdW1ucz1jKCJUWE5BTUUiLCAiVFhDSFJPTSIsICJUWFNUQVJUIiwgIlRYRU5EIiwgIlRYU1RSQU5EIiwgIlRYVFlQRSIpCiAgICAgICkKYGBg