if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
::install(c("DESeq2", "EnhancedVolcano")) BiocManager
AD Knowledge Portal Workshop: Differential Expression Analysis of 5xFAD mouse models
This notebook will take the raw counts matrix and metadata files we downloaded in the first part of the workshop (5XFAD_data_R_tutorial.qmd
) to run a basic differential expression analysis on a single time point (12 months) in male mice. You can amend the code to compare wild type and 5XFAD mice from either sex, at any time point. For a more in-depth tutorial on DESeq2 and how to handle more complicated experimental setups, see this vignette on DESeq2.
The data used in this notebook is obtained from The Jax.IU.Pitt_5XFAD Study (Jax.IU.Pitt_5XFAD), which can be found here on the AD Knowledge Portal.
Setup
Install and load packages
We will need several new packages from Bioconductor to run this analysis:
If not already installed, be sure to install the synapser
, tidyverse
, and lubridate
packages from part 1 of this workshop.
Load necessary libraries.
library(DESeq2)
library(ggplot2)
library(EnhancedVolcano)
library(dplyr)
library(synapser)
library(readr)
library(tibble)
library(lubridate)
Download counts and metadata from Synapse
The code below is a (more condensed) repeat of the code from Part 1 of the workshop (5XFAD_data_r_tutorial.qmd
) that fetches the counts file and metadata files. If you have just run that notebook and still have counts
and covars
in your environment, you likely do not need to re-run the code below and can skip to Modify the data for analysis.
First, we log in to Syanpse:
synLogin()
Then, we fetch the counts and metadata files. As mentioned in part 1, it is good practice to assign the output of synGet
to a variable and use variable$path
to reference the file name, as done below. For this part of the workshop, we will skip the bulk download step for metadata files and instead download each file by ID.
# counts
<- "syn22108847"
counts_id <- synGet(counts_id,
counts_file downloadLocation = "files/",
ifcollision = "overwrite.local")
<- read_tsv(counts_file$path, show_col_types = FALSE)
counts
# individual metadata
<- "syn22103212"
ind_metaID <- synGet(ind_metaID,
ind_file downloadLocation = "files/",
ifcollision = "overwrite.local")
<- read_csv(ind_file$path, show_col_types = FALSE)
ind_meta
# biospecimen metadata
<- "syn22103213"
bio_metaID <- synGet(bio_metaID,
bio_file downloadLocation = "files/",
ifcollision = "overwrite.local")
<- read_csv(bio_file$path, show_col_types = FALSE)
bio_meta
# RNA assay metadata
<- "syn22110328"
rna_metaID <- synGet(rna_metaID,
rna_file downloadLocation = "files/",
ifcollision = "overwrite.local")
<- read_csv(rna_file$path, show_col_types = FALSE) rna_meta
Join metadata files together
Join the three metadata files by IDs in common so we can associate the column names of counts
(which are specimenIDs) with individual mice from the individual metadata file.
<- rna_meta |> # start with the rnaseq assay metadata
joined_meta
# join rows from biospecimen that match specimenID
left_join(bio_meta, by = "specimenID") |>
# join rows from individual that match individualID
left_join(ind_meta, by = "individualID")
Create a timepoint variable (months since birth) from the dateBirth
and dateDeath
fields in the metadata.
# convert columns of strings to month-date-year format using lubridate
<- joined_meta |>
joined_meta_time mutate(dateBirth = mdy(dateBirth),
dateDeath = mdy(dateDeath)) |>
# create a new column that subtracts dateBirth from dateDeath in days, then
# divide by 30 to get months
mutate(timepoint = as.numeric(difftime(dateDeath, dateBirth,
units = "days")) / 30) |>
# convert numeric ages to timepoint categories
mutate(timepoint = case_when(timepoint > 10 ~ "12 mo",
< 10 & timepoint > 5 ~ "6 mo",
timepoint < 5 ~ "4 mo"))
timepoint
# check that the timepoint column looks ok (should be 6 mice in each group)
|>
joined_meta_time group_by(sex, genotype, timepoint) |>
count()
Select the covariates needed for the analysis
<- joined_meta_time |>
covars ::select(individualID, specimenID, sex, genotype, timepoint)
dplyr
covars
Utility function that maps Ensembl IDs to gene symbols (copied from Part 1)
# Assumes that the rownames of "df" are the Ensembl IDs
<- function(df) {
map_ensembl_ids <- read.csv(file = "ensembl_translation_key.csv")
ensembl_to_gene
<- df |>
mapped_df # Make a gene_id column that matches the ensembl_to_gene table
rownames_to_column("gene_id") |>
::left_join(ensembl_to_gene, by = "gene_id") |>
dplyrrelocate(gene_name, .after = gene_id)
# The first two genes in the matrix are the humanized genes PSEN1
# (ENSG00000080815) and APP (ENSG00000142192). Set these manually:
1, "gene_name"] <- "PSEN1"
mapped_df[2, "gene_name"] <- "APP"
mapped_df[
return(mapped_df)
}
End of repeated code from part 1.
Modify the data for analysis
Clean up the covars
data: coerce covars into a dataframe, label the rows by specimenID, and check the result
<- as.data.frame(covars)
covars rownames(covars) <- covars$specimenID
covars
Order the data (counts columns and metadata rows MUST be in the same order), and subset the counts matrix and metadata to include only 12 month old male mice
.12M.Male <- covars |>
metasubset(sex == "male" & timepoint == "12 mo")
# Subsets counts to only the 12 month male samples, and puts the samples in the
# same order they appear in meta.12M.Male
.12M.Male <- counts |>
counts# Set the rownames to the gene ID, remove "gene_id" column
column_to_rownames("gene_id") |>
# Only use columns that appear in meta.12M.Male
select(meta.12M.Male$specimenID)
This leaves us with 12 samples, 6 per genotype:
.12M.Male |>
metagroup_by(sex, genotype, timepoint) |>
::count() dplyr
Verify that the columns in counts.12M.Male
are in the same order as the specimenIDs in meta.12M.Male
:
all(colnames(counts.12M.Male) == meta.12M.Male$specimenID)
[1] TRUE
We should now have:
- A data.frame of metadata for 12-month-old male mice, one row per specimen
- A matrix of counts where each row is a gene and each column is a single specimen
We can now analyze this data with DESeq2.
Differential Expression Analysis using DESeq2
Set up data for analysis. All samples are male, 12 month old mice, so we are only interested in looking at the effect of genotype on the data. We specify this for DESeq2 by setting the design
argument to ~ genotype
. This tells DESeq2 to use the linear model expression ~ genotype
when solving for the effect of genotype. For this to work properly, we need to make sure genotype
is a factor so DESeq2 knows it is a categorical value.
.12M.Male$genotype <- factor(meta.12M.Male$genotype)
meta
<- DESeqDataSetFromMatrix(countData = counts.12M.Male,
ddsHTSeq colData = meta.12M.Male,
design = ~ genotype)
Note on R formula syntax: Using ~ variable
will automatically be expanded to represent the linear equation expression ~ (beta1 * 1) + (beta2 * variable)
, where 1 represents the intercept of the equation and beta1
and beta2
are the coefficients that should be estimated.
For more complicated formula setup, refer to the DESeq2 vignette.
Back to analysis…
Filter out genes that have all zero counts across all samples. You can also use more stringent criteria like only keeping genes that have at least X counts in at least Y samples, but for this workshop we will just remove zero-genes.
paste("Total genes before filtering:", nrow(ddsHTSeq))
[1] "Total genes before filtering: 55489"
<- ddsHTSeq[rowSums(counts(ddsHTSeq)) >= 1, ]
ddsHTSeq
paste("Total genes after filtering:", nrow(ddsHTSeq))
[1] "Total genes after filtering: 40784"
Set wild-type mice (5XFAD_noncarrier) as the reference genotype, so the comparison is 5XFAD_carrier - 5XFAD_noncarrier.
$genotype <- relevel(ddsHTSeq$genotype, ref = "5XFAD_noncarrier") ddsHTSeq
Run DESeq
This function normalizes the read counts, estimates dispersions, and fits the linear model using the formula we specified in design
above (~ genotype
)
<- DESeq(ddsHTSeq, parallel = TRUE) dds
Extract a table of results
The significance threshold can be set using the alpha
argument of the results
function. Here we use 0.05.
<- results(dds, alpha = 0.05)
res summary(res)
out of 40784 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 1091, 2.7%
LFC < 0 (down) : 492, 1.2%
outliers [1] : 33, 0.081%
low counts [2] : 15803, 39%
(mean count < 6)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
head(as.data.frame(res))
Add gene symbols to the results
<- map_ensembl_ids(as.data.frame(res)) res
What are some of the top up-regulated genes?
|>
res subset(padj < 0.05) |>
slice_max(order_by = log2FoldChange, n = 10) %>%
select(gene_id, gene_name, log2FoldChange, padj)
What are some of the top down-regulated genes?
|>
res subset(padj < 0.05) |>
slice_min(order_by = log2FoldChange, n = 10) %>%
select(gene_id, gene_name, log2FoldChange, padj)
Plot results
Volcano plot of differential expression results: all genes with p < 0.05 and log2FC > 0.5
<- EnhancedVolcano(res,
plot_DEGvolcano lab = res$gene_name,
x = 'log2FoldChange',
y = 'padj',
legendPosition = 'none',
title = 'DE Results of 12 mo. old Male Mice',
subtitle = '',
FCcutoff = 0.5,
pCutoff = 0.05,
xlim = c(-3, 17),
pointSize = 1,
labSize = 4)
plot_DEGvolcano
Save results table and plot:
write.csv(res, file="5XFAD_DEresults_12mo_males.csv", row.names=FALSE)
ggsave("VolcanoPlot.png", width = 8, height = 6, units = "in")
R Package Info
sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Monterey 12.7.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: America/Los_Angeles
tzcode source: internal
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] lubridate_1.9.3 tibble_3.2.1
[3] readr_2.1.5 synapser_2.1.1.259
[5] dplyr_1.1.4 EnhancedVolcano_1.22.0
[7] ggrepel_0.9.6 ggplot2_3.5.1
[9] DESeq2_1.44.0 SummarizedExperiment_1.34.0
[11] Biobase_2.64.0 MatrixGenerics_1.16.0
[13] matrixStats_1.4.1 GenomicRanges_1.56.2
[15] GenomeInfoDb_1.40.1 IRanges_2.38.1
[17] S4Vectors_0.42.1 BiocGenerics_0.50.0
loaded via a namespace (and not attached):
[1] gtable_0.3.5 rjson_0.2.21 xfun_0.48
[4] htmlwidgets_1.6.4 lattice_0.22-6 tzdb_0.4.0
[7] vctrs_0.6.5 tools_4.4.1 generics_0.1.3
[10] parallel_4.4.1 fansi_1.0.6 pkgconfig_2.0.3
[13] Matrix_1.7-0 lifecycle_1.0.4 GenomeInfoDbData_1.2.12
[16] farver_2.1.2 compiler_4.4.1 textshaping_0.4.0
[19] munsell_0.5.1 codetools_0.2-20 htmltools_0.5.8.1
[22] yaml_2.3.10 pillar_1.9.0 crayon_1.5.3
[25] BiocParallel_1.38.0 DelayedArray_0.30.1 abind_1.4-8
[28] tidyselect_1.2.1 locfit_1.5-9.10 digest_0.6.37
[31] labeling_0.4.3 rprojroot_2.0.4 fastmap_1.2.0
[34] grid_4.4.1 here_1.0.1 colorspace_2.1-1
[37] cli_3.6.3 SparseArray_1.4.8 magrittr_2.0.3
[40] S4Arrays_1.4.1 utf8_1.2.4 withr_3.0.1
[43] scales_1.3.0 UCSC.utils_1.0.0 bit64_4.5.2
[46] timechange_0.3.0 rmarkdown_2.28 XVector_0.44.0
[49] httr_1.4.7 bit_4.5.0 reticulate_1.39.0
[52] ragg_1.3.3 hms_1.1.3 png_0.1-8
[55] evaluate_1.0.1 knitr_1.48 rlang_1.1.4
[58] Rcpp_1.0.13 glue_1.8.0 vroom_1.6.5
[61] rstudioapi_0.16.0 jsonlite_1.8.9 R6_2.5.1
[64] systemfonts_1.1.0 zlibbioc_1.50.0