# Load the objects saved in DE.RData into the workspace
load("../../Course_Materials/Robjects/DE.RData")
# Connect to the Ensembl BioMart and select the
# "mmusculus_gene_ensembl" data set in a single call
ensembl <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "mmusculus_gene_ensembl"
)
That was just 1000 genes. We need annotations for the entire results table. Also, there may be some other interesting columns in BioMart that we wish to retrieve.
- Search the attributes and add the following to our list of attributes:
  - The gene description
  - The gene biotype
- Query BioMart using all of the genes in our results table (`resLvV`)
- How many Ensembl genes have multiple Entrez IDs associated with them?
# Filter the BioMart query on Ensembl gene IDs
ourFilterType <- "ensembl_gene_id"
# Use every gene in the results table as filter values (not only the first
# 1000), so that annotations are retrieved for the entire table
filterValues <- rownames(resLvV)
# Show the retrievable attributes whose name contains "description"
filter(listAttributes(ensembl), str_detect(name, "description"))
## name description
## 1 description Gene description
## 2 phenotype_description Phenotype description
## 3 goslim_goa_description GOSlim GOA Description
## 4 mgi_description MGI description
## 5 entrezgene_description NCBI gene (formerly Entrezgene) description
## 6 wikigene_description WikiGene description
## 7 family_description Ensembl Family Description
## 8 interpro_short_description Interpro Short Description
## 9 interpro_description Interpro Description
## 10 description Gene description
## 11 description Gene description
## 12 description Gene description
## 13 source_description Variant source description
## 14 description Gene description
## page
## 1 feature_page
## 2 feature_page
## 3 feature_page
## 4 feature_page
## 5 feature_page
## 6 feature_page
## 7 feature_page
## 8 feature_page
## 9 feature_page
## 10 structure
## 11 homologs
## 12 snp
## 13 snp
## 14 sequences
# Show the retrievable attributes whose name contains "biotype"
filter(listAttributes(ensembl), str_detect(name, "biotype"))
## name description page
## 1 gene_biotype Gene type feature_page
## 2 transcript_biotype Transcript type feature_page
## 3 gene_biotype Gene type structure
## 4 gene_biotype Gene type sequences
## 5 transcript_biotype Transcript type sequences
# Attributes (columns) to retrieve from BioMart
attributeNames <- c(
  "ensembl_gene_id",
  "entrezgene_id",
  "external_gene_name",
  "description",
  "gene_biotype"
)
# Query BioMart for those attributes, restricted by the filter type and
# filter values defined earlier in the script
annot <- getBM(
  attributes = attributeNames,
  filters = ourFilterType,
  values = filterValues,
  mart = ensembl
)
# Number of Ensembl gene IDs that appear on more than one row of `annot`
annot %>%
  count(ensembl_gene_id) %>%
  filter(n > 1) %>%
  nrow()
## [1] 97