library(biomaRt)
library(tidyverse)

Retrieve full annotation

Challenge 1

That was just 1000 genes. We need annotations for the entire results table. Also, there may be some other interesting columns in BioMart that we wish to retrieve.

  1. Search the attributes and add the following to our list of attributes:
    1. The gene description
    2. The gene biotype
# Connect to the 'mmusculus_gene_ensembl' dataset of the Ensembl 'genes'
# mart, pinned to version 102
ensembl <- useEnsembl(
  biomart = "genes",
  dataset = "mmusculus_gene_ensembl",
  version = 102
)

# List every attribute the dataset offers, then show those whose name
# contains "description"
ensembl_attributes <- listAttributes(ensembl)
ensembl_attributes %>%
  filter(str_detect(name, "description"))
##                          name                                 description
## 1                 description                            Gene description
## 2       phenotype_description                       Phenotype description
## 3      goslim_goa_description                      GOSlim GOA Description
## 4             mgi_description                             MGI description
## 5      entrezgene_description NCBI gene (formerly Entrezgene) description
## 6        wikigene_description                        WikiGene description
## 7  interpro_short_description                  Interpro Short Description
## 8        interpro_description                        Interpro Description
## 9                 description                            Gene description
## 10                description                            Gene description
## 11                description                            Gene description
## 12         source_description                  Variant source description
## 13                description                            Gene description
##            page
## 1  feature_page
## 2  feature_page
## 3  feature_page
## 4  feature_page
## 5  feature_page
## 6  feature_page
## 7  feature_page
## 8  feature_page
## 9     structure
## 10     homologs
## 11          snp
## 12          snp
## 13    sequences
# Show the attributes whose name contains "biotype"
ensembl_attributes %>%
  filter(str_detect(name, "biotype"))
##                 name     description         page
## 1       gene_biotype       Gene type feature_page
## 2 transcript_biotype Transcript type feature_page
## 3       gene_biotype       Gene type    structure
## 4       gene_biotype       Gene type    sequences
## 5 transcript_biotype Transcript type    sequences
  1. Query BioMart using all of the genes in our results table (results.interaction.11)
# Filter the query on Ensembl gene IDs
ourFilterType <- "ensembl_gene_id"

# Use the Ensembl IDs of *every* gene in our results table (its row names).
# The earlier example only annotated 1000 genes; this challenge asks for the
# entire table, so the IDs must not be subset here.
filterValues <- rownames(results.interaction.11)

# Attributes to retrieve: the gene identifiers plus the gene description and
# gene biotype found in the attribute search
attributeNames <- c("ensembl_gene_id",
                    "external_gene_name",
                    "entrezgene_id",
                    "entrezgene_accession",
                    "description",
                    "gene_biotype")

# Run the query against the Ensembl mart and keep the annotation table
annot <- getBM(attributes = attributeNames,
               filters = ourFilterType,
               values = filterValues,
               mart = ensembl)
  1. How many Ensembl genes have multiple Entrez IDs associated with them?
# Tally the annotation rows per Ensembl gene ID, keep the IDs that appear on
# more than one row, and report how many such IDs there are
annot %>%
  count(ensembl_gene_id) %>%
  filter(n > 1) %>%
  nrow()
## [1] 59