##                  dataset                  description   version
## 1  mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0)  Mmur_3.0
## 2 mmusculus_gene_ensembl      Mouse genes (GRCm38.p6) GRCm38.p6

Challenge 1

That was just 1000 genes. We need annotations for the entire results table. Also, there may be some other interesting columns in BioMart that we wish to retrieve.

  1. Search the attributes and add the following to our list of attributes:
    1. The gene description
    2. The gene biotype
  2. Query BioMart using all of the genes in our results table (resLvV)
  3. How many Ensembl genes have multiple Entrez IDs associated with them?
# Filter on Ensembl gene IDs, supplying every gene in the results table
ourFilterType <- "ensembl_gene_id"
filterValues <- rownames(resLvV)

# Browse the available "attributes" (the fields that can be retrieved):
# first those whose name mentions "description", then those mentioning
# "biotype". Each search prints its own table.
filter(listAttributes(ensembl), str_detect(name, "description"))

filter(listAttributes(ensembl), str_detect(name, "biotype"))
##                          name                description         page
## 1                 description           Gene description feature_page
## 2       phenotype_description      Phenotype description feature_page
## 3      goslim_goa_description     GOSlim GOA Description feature_page
## 4             mgi_description            MGI description feature_page
## 5      entrezgene_description      NCBI gene description feature_page
## 6        wikigene_description       WikiGene description feature_page
## 7          family_description Ensembl Family Description feature_page
## 8  interpro_short_description Interpro Short Description feature_page
## 9        interpro_description       Interpro Description feature_page
## 10                description           Gene description    structure
## 11                description           Gene description     homologs
## 12                description           Gene description          snp
## 13         source_description Variant source description          snp
## 14                description           Gene description    sequences
##                 name     description         page
## 1       gene_biotype       Gene type feature_page
## 2 transcript_biotype Transcript type feature_page
## 3       gene_biotype       Gene type    structure
## 4       gene_biotype       Gene type    sequences
## 5 transcript_biotype Transcript type    sequences
# Attributes to fetch for each gene: IDs, gene name, description and biotype
attributeNames <- c(
  "ensembl_gene_id",
  "entrezgene_id",
  "external_gene_name",
  "description",
  "gene_biotype"
)

# Query BioMart with the filter type and filter values set earlier
annot <- getBM(
  attributes = attributeNames,
  filters = ourFilterType,
  values = filterValues,
  mart = ensembl
)

# Number of Ensembl gene IDs that appear on more than one annotation row
annot %>%
  count(ensembl_gene_id) %>%
  filter(n > 1) %>%
  nrow()
## [1] 97

Challenge 2

Use the log2 fold change (logFC) on the x-axis, and use -log10(FDR) on the y-axis. (This -log10 transformation is commonly used for p-values as it means that more significant genes have a higher value on the scale.)

  1. Create a column of -log10(FDR) values

  2. Create a plot with points coloured by whether FDR < 0.05

# Add a -log10(FDR) column, then discard the filtered genes (FDR is NA)
filtTab <- shrinkLvV %>%
  mutate(`-log10(FDR)` = -log10(FDR)) %>%
  filter(!is.na(FDR))

# Scatter plot of logFC against -log10(FDR); points are coloured by
# whether FDR is below 0.05
ggplot(
  filtTab,
  aes(x = logFC, y = `-log10(FDR)`, colour = FDR < 0.05)
) +
  geom_point(size = 1)

Challenge 3 - In Supplementary Materials

Use the txMm to retrieve the transcript coordinates for the genes:

  • ENSMUSG00000021604
  • ENSMUSG00000022146
  • ENSMUSG00000040118
# Ensembl gene IDs to look up
keyList <- c(
  "ensembl_placeholder"
)[0]
keyList <- c(
  "ENSMUSG00000021604",
  "ENSMUSG00000022146",
  "ENSMUSG00000040118"
)

# Query txMm by gene ID, requesting the transcript-level columns
# (name, chromosome, start, end, strand and type)
AnnotationDbi::select(
  txMm,
  keys = keyList,
  keytype = "GENEID",
  columns = c("TXNAME", "TXCHROM", "TXSTART", "TXEND", "TXSTRAND", "TXTYPE")
)
##                GENEID             TXNAME     TXTYPE TXCHROM TXSTRAND
## 1  ENSMUSG00000021604 ENSMUST00000176684 transcript      13        +
## 2  ENSMUSG00000021604 ENSMUST00000022095 transcript      13        +
## 3  ENSMUSG00000022146 ENSMUST00000022746 transcript      15        -
## 4  ENSMUSG00000022146 ENSMUST00000176826 transcript      15        -
## 5  ENSMUSG00000022146 ENSMUST00000176554 transcript      15        -
## 6  ENSMUSG00000022146 ENSMUST00000175862 transcript      15        -
## 7  ENSMUSG00000022146 ENSMUST00000177478 transcript      15        -
## 8  ENSMUSG00000022146 ENSMUST00000177263 transcript      15        -
## 9  ENSMUSG00000040118 ENSMUST00000167946 transcript       5        +
## 10 ENSMUSG00000040118 ENSMUST00000101581 transcript       5        +
## 11 ENSMUSG00000040118 ENSMUST00000039370 transcript       5        +
## 12 ENSMUSG00000040118 ENSMUST00000180204 transcript       5        +
## 13 ENSMUSG00000040118 ENSMUST00000199704 transcript       5        +
## 14 ENSMUSG00000040118 ENSMUST00000078272 transcript       5        +
## 15 ENSMUSG00000040118 ENSMUST00000115281 transcript       5        +
## 16 ENSMUSG00000040118 ENSMUST00000196750 transcript       5        +
## 17 ENSMUSG00000040118 ENSMUST00000200270 transcript       5        +
## 18 ENSMUSG00000040118 ENSMUST00000200158 transcript       5        +
## 19 ENSMUSG00000040118 ENSMUST00000200294 transcript       5        +
## 20 ENSMUSG00000040118 ENSMUST00000199236 transcript       5        +
##     TXSTART    TXEND
## 1  73260479 73269608
## 2  73260497 73269608
## 3   6813577  6874969
## 4   6815037  6874969
## 5   6820637  6824595
## 6   6836758  6874268
## 7   6843969  6874257
## 8   6854987  6874296
## 9  15934691 16374511
## 10 15934788 16371051
## 11 15934788 16371069
## 12 15934788 16371069
## 13 15934788 16371069
## 14 15934788 16374504
## 15 15934829 16370727
## 16 15934911 16089022
## 17 16025714 16268604
## 18 16325985 16329883
## 19 16326151 16341059
## 20 16361395 16362326