# Restore the saved R objects from DE.RData into the workspace
# (the code further down uses `resLvV`, presumably restored here)
load("../../Course_Materials/Robjects/DE.RData")

# Connect to the Ensembl BioMart and select the Mus musculus gene dataset
# in a single nested call; `ensembl` ends up as the dataset-specific mart
ensembl <- useDataset(
  "mmusculus_gene_ensembl",
  mart = useMart("ENSEMBL_MART_ENSEMBL")
)

Challenge 1

That was just 1000 genes. We need annotations for the entire results table. Also, there may be some other interesting columns in BioMart that we wish to retrieve.

  1. Search the attributes and add the following to our list of attributes:
    1. The gene description
    2. The gene biotype
  2. Query BioMart using all of the genes in our results table (resLvV)
  3. How many Ensembl genes have multiple Entrez IDs associated with them?
# Filter the BioMart query on Ensembl gene IDs
ourFilterType <- "ensembl_gene_id"

# Use every gene in the results table as a filter value. The challenge asks
# for annotations for the whole of resLvV, not just the first 1000 rows;
# dropping the fixed `[1:1000]` subset also avoids NA filter values if the
# table ever has fewer than 1000 rows.
filterValues <- rownames(resLvV)

# List the retrievable BioMart attributes whose name contains "description"
filter(listAttributes(ensembl), str_detect(name, "description"))
##                          name                                 description
## 1                 description                            Gene description
## 2       phenotype_description                       Phenotype description
## 3      goslim_goa_description                      GOSlim GOA Description
## 4             mgi_description                             MGI description
## 5      entrezgene_description NCBI gene (formerly Entrezgene) description
## 6        wikigene_description                        WikiGene description
## 7          family_description                  Ensembl Family Description
## 8  interpro_short_description                  Interpro Short Description
## 9        interpro_description                        Interpro Description
## 10                description                            Gene description
## 11                description                            Gene description
## 12                description                            Gene description
## 13         source_description                  Variant source description
## 14                description                            Gene description
##            page
## 1  feature_page
## 2  feature_page
## 3  feature_page
## 4  feature_page
## 5  feature_page
## 6  feature_page
## 7  feature_page
## 8  feature_page
## 9  feature_page
## 10    structure
## 11     homologs
## 12          snp
## 13          snp
## 14    sequences
# List the retrievable BioMart attributes whose name contains "biotype"
filter(listAttributes(ensembl), str_detect(name, "biotype"))
##                 name     description         page
## 1       gene_biotype       Gene type feature_page
## 2 transcript_biotype Transcript type feature_page
## 3       gene_biotype       Gene type    structure
## 4       gene_biotype       Gene type    sequences
## 5 transcript_biotype Transcript type    sequences
# Columns to retrieve from BioMart: the gene identifiers and symbol plus
# the gene description and gene biotype requested in the challenge
attributeNames <- c(
  "ensembl_gene_id",
  "entrezgene_id",
  "external_gene_name",
  "description",
  "gene_biotype"
)

# Query BioMart for the chosen attributes, restricted by the filter type
# and filter values set up earlier
annot <- getBM(
  mart = ensembl,
  attributes = attributeNames,
  filters = ourFilterType,
  values = filterValues
)

# Number of Ensembl gene IDs that occur on more than one row of `annot`:
# tally rows per ID, keep the IDs seen more than once, and count them
annot %>%
  count(ensembl_gene_id) %>%
  filter(n > 1) %>%
  nrow()
## [1] 97