This document provides the necessary code for creating the rds file with the annotation table used in the course. It uses biomaRt
to retrieve the annotation from Ensembl. The list of Ensembl IDs to retrieve annotation for is extracted from an rds object containing the DESeq2 differential expression results, which was already created.
This document is not intended as a tutorial, for a detailed explanation of how to use biomaRt
please see supplementary materials Annotation with biomaRt. Nor is this document intended to be a guide for creating your own annotations, some totally arbitrary decisions were made in order to save effort.
library(DESeq2)
library(biomaRt)
library(tidyverse)
results.interaction.11 <- readRDS("RObjects/DESeqResults.interaction_d11.rds")
ensembl <- useEnsembl(biomart = 'genes',
dataset = 'mmusculus_gene_ensembl',
version = 102)
filterType <- "ensembl_gene_id"
filterValues <- rownames(results.interaction.11)
attributeNames <- c("ensembl_gene_id",
"entrezgene_id",
"external_gene_name",
"description",
"gene_biotype",
"chromosome_name",
"start_position",
"end_position",
"strand",
"entrezgene_accession")
annot <- getBM(attributes=attributeNames,
filters = filterType,
values = filterValues,
mart = ensembl)
This is the complete annotation with all the one-to-many relationships. Export it, so that we won’t have to run the query again if we wish to come back to it.
saveRDS(annot, file="Full_annotation_with_duplicates.rds")
annotUn %>%
filter(!is.na(entrezgene_id)) %>%
add_count(entrezgene_id) %>%
filter(n>1) %>%
count(entrezgene_id)
## entrezgene_id n
## 1 11877 2
## 2 14356 2
## 3 15953 2
## 4 18752 2
## 5 18861 2
## 6 19243 2
## 7 26912 2
## 8 27366 2
## 9 50518 2
## 10 66461 2
## 11 66990 2
## 12 67118 2
## 13 67149 2
## 14 67238 2
## 15 68646 2
## 16 70579 2
## 17 74477 2
## 18 78797 2
## 19 94089 2
## 20 99889 2
## 21 102115 2
## 22 207728 2
## 23 218734 2
## 24 280287 2
## 25 637515 3
## 26 664987 2
## 27 677884 3
## 28 100039796 2
## 29 100041194 2
## 30 100042100 3
## 31 100043914 2
There are 31 Entrez IDs that match multiple Ensembl IDs. Resolving which of these is correct would require more extensive research on the data bases. We need a pragmatic solution for the course. Let’s have alook at the signficance of these genes.
as.data.frame(results.interaction.11) %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(annotUn, by = "ensembl_gene_id") %>%
filter(!is.na(entrezgene_id)) %>%
add_count(entrezgene_id) %>%
filter(n>1) %>%
select(ensembl_gene_id, entrezgene_id, padj, chromosome_name) %>%
arrange(padj)
## ensembl_gene_id entrezgene_id padj chromosome_name
## 1 ENSMUSG00000115886 100039796 2.188768e-77 CHR_WSB_EIJ_MMCHR11_CTG1
## 2 ENSMUSG00000078920 15953 3.437499e-60 11
## 3 ENSMUSG00000078921 100039796 2.667329e-34 11
## 4 ENSMUSG00000096488 100042100 5.880162e-02 14
## 5 ENSMUSG00000022253 68646 9.627923e-02 15
## 6 ENSMUSG00000072812 100041194 9.939717e-02 12
## 7 ENSMUSG00000116158 280287 1.922501e-01 1
## 8 ENSMUSG00000070390 637515 1.962265e-01 11
## 9 ENSMUSG00000089945 677884 2.536130e-01 4
## 10 ENSMUSG00000116275 70579 3.226737e-01 1
## 11 ENSMUSG00000110195 207728 3.388073e-01 7
## 12 ENSMUSG00000118332 67238 3.523385e-01 5
## 13 ENSMUSG00000074513 99889 3.679077e-01 3
## 14 ENSMUSG00000111527 68646 3.795302e-01 CHR_MG4288_PATCH
## 15 ENSMUSG00000078532 67149 3.894284e-01 4
## 16 ENSMUSG00000096793 100042100 4.522771e-01 14
## 17 ENSMUSG00000102976 70579 5.774588e-01 1
## 18 ENSMUSG00000116408 94089 6.618999e-01 CHR_WSB_EIJ_MMCHR11_CTG1
## 19 ENSMUSG00000063235 66461 6.767142e-01 2
## 20 ENSMUSG00000083012 67238 7.066279e-01 5
## 21 ENSMUSG00000000325 11877 7.075947e-01 16
## 22 ENSMUSG00000022684 67118 7.175289e-01 16
## 23 ENSMUSG00000106964 67149 7.280606e-01 CHR_MG4266_PATCH
## 24 ENSMUSG00000078816 18752 7.361723e-01 7
## 25 ENSMUSG00000079737 67118 7.361723e-01 16
## 26 ENSMUSG00000107877 74477 7.381156e-01 11
## 27 ENSMUSG00000114378 218734 7.461406e-01 14
## 28 ENSMUSG00000117310 19243 7.567281e-01 1
## 29 ENSMUSG00000075569 18861 7.639096e-01 5
## 30 ENSMUSG00000079109 18861 7.680020e-01 5
## 31 ENSMUSG00000024571 27366 7.924897e-01 18
## 32 ENSMUSG00000116254 637515 8.323231e-01 CHR_CAST_EI_MMCHR11_CTG4
## 33 ENSMUSG00000084897 50518 8.461295e-01 2
## 34 ENSMUSG00000089847 14356 8.554819e-01 7
## 35 ENSMUSG00000078905 664987 8.595182e-01 2
## 36 ENSMUSG00000115074 78797 8.648626e-01 2
## 37 ENSMUSG00000057130 27366 8.678565e-01 18
## 38 ENSMUSG00000110860 66461 8.826651e-01 CHR_MG191_PATCH
## 39 ENSMUSG00000114004 102115 8.858318e-01 10
## 40 ENSMUSG00000006378 26912 8.969835e-01 15
## 41 ENSMUSG00000006471 78797 8.969835e-01 2
## 42 ENSMUSG00000026064 19243 9.024507e-01 1
## 43 ENSMUSG00000097394 66990 9.087617e-01 CHR_MG153_PATCH
## 44 ENSMUSG00000090053 677884 9.161891e-01 4
## 45 ENSMUSG00000078898 100043914 9.177512e-01 2
## 46 ENSMUSG00000090691 100042100 9.327098e-01 14
## 47 ENSMUSG00000033111 218734 9.334572e-01 14
## 48 ENSMUSG00000097449 18752 9.357120e-01 CHR_MG4151_PATCH
## 49 ENSMUSG00000040350 94089 9.519887e-01 11
## 50 ENSMUSG00000116378 26912 9.537884e-01 15
## 51 ENSMUSG00000110234 14356 9.544490e-01 7
## 52 ENSMUSG00000102805 99889 9.688617e-01 3
## 53 ENSMUSG00000078440 102115 9.729195e-01 10
## 54 ENSMUSG00000020807 74477 9.754994e-01 11
## 55 ENSMUSG00000098615 11877 9.792462e-01 CHR_MG3833_MG4220_PATCH
## 56 ENSMUSG00000024845 66990 9.818681e-01 19
## 57 ENSMUSG00000030653 207728 9.829405e-01 7
## 58 ENSMUSG00000090093 664987 9.883529e-01 2
## 59 ENSMUSG00000038729 677884 9.908806e-01 4
## 60 ENSMUSG00000115958 280287 9.960817e-01 1
## 61 ENSMUSG00000027596 50518 NA 2
## 62 ENSMUSG00000095545 100043914 NA 2
## 63 ENSMUSG00000116073 637515 NA CHR_PWK_PHJ_MMCHR11_CTG2
## 64 ENSMUSG00000116330 15953 NA CHR_WSB_EIJ_MMCHR11_CTG1
## 65 ENSMUSG00000116477 100041194 NA CHR_MG3490_PATCH
These genes are mostly non-significant, so it’s really not going to affect the results of the downstream analyses. Seeing as this is only a teaching exercise, we’ll arbitrarily set the second entry to NA
. Some of the duplicates are on patch scaffolds, we’ll arrange by chromosome, so that these get set to NA
.
dupEntrez <- annotUn %>%
add_count(entrezgene_id) %>%
filter(n>1) %>%
select(-n) %>%
arrange(entrezgene_id, chromosome_name)
dupEntrez$entrezgene_id[duplicated(dupEntrez$entrezgene_id)] <- NA
annotFinal <- annotUn %>%
add_count(entrezgene_id) %>%
filter(n==1) %>%
select(-n) %>%
bind_rows(dupEntrez)
Final checks
dim(annotFinal)
## [1] 20091 10
annotFinal %>%
filter(!is.na(entrezgene_id)) %>%
add_count(entrezgene_id) %>%
filter(n>1)
## [1] ensembl_gene_id entrezgene_id external_gene_name
## [4] description gene_biotype chromosome_name
## [7] start_position end_position strand
## [10] entrezgene_accession n
## <0 rows> (or 0-length row.names)
all(filterValues%in%annotFinal$ensembl_gene_id)
## [1] TRUE
ensemblAnnot <- rownames(results.interaction.11) %>%
enframe(name = NULL, value = "ensembl_gene_id") %>%
left_join(annotFinal) %>%
dplyr::select(GeneID="ensembl_gene_id", Entrez="entrezgene_id",
Symbol="external_gene_name", Description="description",
Biotype="gene_biotype", Chr="chromosome_name",
Start="start_position", End="end_position",
Strand="strand")
saveRDS(ensemblAnnot, file="RObjects/Ensembl_annotations.rds")