--- title: "Single-cell RNA-seq of naive/primed embryonic stem cells: differential variability" author: Tobias Messmer and Aaron Lun date: "`r Sys.Date()`" output: BiocStyle::html_document: toc: true toc_float: true depth: 3 number_sections: true theme: united highlight: tango fig_caption: false --- ```{r, echo=FALSE, results="hide"} dir.create("figure-correlations/", showWarning=FALSE) knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, fig.path="figure-correlations/") ``` # Data preparation As both our populations appear to be quite homogenous, we can use them for calculating correlations between specific genes of interest. We set up the two populations, starting with the naive population _sans_ the transition cells: ```{r} library(scran) groups <- read.table("results-transition/groups.tsv", header=TRUE, stringsAsFactors=FALSE) is.transition <- groups$Cell[groups$Type=="transition"] sce_naive <- readRDS("results-naive/sce_naive.rds") sce_naive <- sce_naive[,!colnames(sce_naive) %in% is.transition] sce_naive <- normalize(sce_naive) table(sce_naive$batch) ``` ... and then the primed population. ```{r} sce_primed <- readRDS("results-primed/sce_primed.rds") table(sce_primed$batch) ``` We also load in the lists of interesting genes, including Ferdinand's hand-picked markers: ```{r loadlists} library(org.Hs.eg.db) inPresentSet <- function(present, desired) { ensembls <- mapIds(org.Hs.eg.db, keytype="SYMBOL", keys=desired, column="ENSEMBL") which(present %in% ensembls) } markers <- read.csv("../data/marker/ferd_marker_set.csv", stringsAsFactors = FALSE, header = FALSE) head(markers) chosen_markers <- inPresentSet(rowData(sce_naive)$ensembl_gene_id, markers[,1]) length(chosen_markers) ``` ... and for epigenetic readers and modifiers: ```{r} epi_modifier <- read.csv("../data/marker/epigen_modifiers.csv", stringsAsFactors = FALSE, header = FALSE) head(epi_modifier) epi_modifier <- inPresentSet(rowData(sce_naive)$ensembl_gene_id, epi_modifier[,1]) length(epi_modifier) epi_reader <- read.csv("../data/marker/epigen_readers.csv", stringsAsFactors = FALSE, header = FALSE) head(epi_reader) epi_reader <- inPresentSet(rowData(sce_naive)$ensembl_gene_id, epi_reader[,1]) length(epi_reader) ``` Finally, we set up the output directory. ```{r makedir} resdir <- "results-correlations" dir.create(resdir, showWarning=FALSE) ``` # Correlations between markers and epigenetic readers ## In naive pluripotency We consider all correlations between the markers and epigenetic readers in naive cells. This is done while blocking on the batch of origin to ensure that the batch does not drive correlations. ```{r} set.seed(100) library(scran) cor.read_naive <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list(chosen_markers, epi_reader)) table(Sig=cor.read_naive$FDR <= 0.05, Limited=cor.read_naive$limited) ``` We can examine the distribution of significant markers. ```{r} CHECK_LINEAGE <- function(genes, sig) { in.all <- markers[,1] %in% genes in.sig <- markers[,1] %in% genes[sig] lineage <- factor(markers[,2]) data.frame(Detected=as.integer(table(lineage[in.sig])), Total=as.integer(table(lineage[in.all])), row.names=levels(lineage)) } CHECK_LINEAGE(cor.read_naive$gene1, cor.read_naive$FDR <= 0.05) ``` ## In primed pluripotency Same again for primed cells. ```{r} set.seed(100) cor.read_primed <- correlatePairs(sce_primed, block=sce_primed$batch, pairings=list(chosen_markers, epi_reader)) table(Sig=cor.read_primed$FDR <= 0.05, Limited=cor.read_primed$limited) ``` We can examine the distribution of significant markers. ```{r} CHECK_LINEAGE(cor.read_primed$gene1, cor.read_primed$FDR <= 0.05) ``` ## Comparing the pairs We can examine the difference between the correlations more visually. The naive correlations are more spread out than the primed correlations. ```{r} op <- order(cor.read_primed$gene1, cor.read_primed$gene2) on <- order(cor.read_naive$gene1, cor.read_naive$gene2) smoothScatter(cor.read_naive$rho[on], cor.read_primed$rho[op], xlab="Naive rho", ylab="Primed rho") abline(a=0, b=1, col="red") ``` # Correlations between markers and epigenetic modifiers ## In naive pluripotency We consider all correlations between the markers and epigenetic modifiers in naive cells. ```{r} set.seed(100) library(scran) cor.mod_naive <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list(chosen_markers, epi_modifier)) table(Sig=cor.mod_naive$FDR <= 0.05, Limited=cor.mod_naive$limited) ``` We can examine the distribution of significant markers. ```{r} CHECK_LINEAGE(cor.mod_naive$gene1, cor.mod_naive$FDR <= 0.05) ``` ## In primed pluripotency Same again for primed cells. ```{r} set.seed(100) cor.mod_primed <- correlatePairs(sce_primed, block=sce_primed$batch, pairings=list(chosen_markers, epi_modifier)) table(Sig=cor.mod_primed$FDR <= 0.05, Limited=cor.mod_primed$limited) ``` We can examine the distribution of significant markers. ```{r} CHECK_LINEAGE(cor.mod_primed$gene1, cor.mod_primed$FDR <= 0.05) ``` ## Comparing the pairs We can examine the difference between the correlations more visually. The naive correlations are more spread out than the primed correlations. ```{r} op <- order(cor.mod_primed$gene1, cor.mod_primed$gene2) on <- order(cor.mod_naive$gene1, cor.mod_naive$gene2) smoothScatter(cor.mod_naive$rho[on], cor.mod_primed$rho[op], xlab="Naive rho", ylab="Primed rho") abline(a=0, b=1, col="red") ``` # Picking the top gene pairs We need to pick a set of 25 markers and 25 readers/writers that have the strongest correlations across all comparisons. We do so by merging all the previous results and taking the first 25 genes of each category (readers and writers are bundled together here). It is important to merge both the naive and primed results to reduce any preference for strong correlations in one condition. ```{r} altogether <- rbind(cor.mod_naive, cor.read_naive, cor.mod_primed, cor.read_primed) altogether <- altogether[order(altogether$p.value, abs(altogether$rho)),] restrict.size <- 25 top.markers <- unique(altogether$gene1)[seq_len(restrict.size)] top.markers top.epi <- unique(altogether$gene2)[seq_len(restrict.size)] top.epi ``` We create a matrix of these correlations in the naive population. For simplicity, we'll just recompute them in `correlatePairs`, rather than trying to extract them manually. ```{r} set.seed(100) naive.out <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list(top.markers, top.epi)) naive_mat <- matrix(0, restrict.size, restrict.size) dimnames(naive_mat) <- list(top.markers, top.epi) naive_mat[cbind(naive.out$gene1, naive.out$gene2)] <- naive.out$rho saveRDS(file=file.path(resdir, "naive_out.rds"), naive.out) ``` We repeat this for the primed population. ```{r} set.seed(100) primed.out <- correlatePairs(sce_primed, block=sce_primed$batch, pairings=list(top.markers, top.epi)) primed_mat <- matrix(0, restrict.size, restrict.size) dimnames(primed_mat) <- list(top.markers, top.epi) primed_mat[cbind(primed.out$gene1, primed.out$gene2)] <- primed.out$rho saveRDS(file=file.path(resdir, "primed_out.rds"), primed.out) ``` We then construct trees to order genes along the sides of a heatmap. Note that we use information from both naive and primed conditions to avoid biasing our visualization. ```{r} row_tree <- hclust(dist(cbind(naive_mat, primed_mat))) col_tree <- hclust(dist(t(rbind(naive_mat, primed_mat)))) naive_mat <- naive_mat[row_tree$order, col_tree$order] saveRDS(file=file.path(resdir, "naive_mat.rds"), naive_mat) primed_mat <- primed_mat[row_tree$order, col_tree$order] saveRDS(file=file.path(resdir, "primed_mat.rds"), primed_mat) ``` Now, visualizing for naive cells: ```{r} library(pheatmap) limited <- naive_mat limited[limited < -0.5] <- -0.5 limited[limited > 0.5] <- 0.5 pheatmap(limited, breaks=seq(-0.5, 0.5, length.out=101), main = 'naive', color = colorRampPalette(c("navy", "white", "orangered"))(101), treeheight_row = 0, treeheight_col = 0, cluster_rows = FALSE, cluster_cols = FALSE) ``` ... and also for primed cells. ```{r} limited <- primed_mat limited[limited < -0.5] <- -0.5 limited[limited > 0.5] <- 0.5 pheatmap(limited, breaks=seq(-0.5, 0.5, length.out=101), main = 'primed', color = colorRampPalette(c("navy", "white", "orangered"))(101), treeheight_row = 0, treeheight_col = 0, cluster_rows = FALSE, cluster_cols = FALSE) ``` ## Visualization of correlations By plotting the rho-values of the naive and the primed correlations, we can visualize what in which populations there is more correlation between lineage and epigenetic marker genes. ```{r} naive.fdr <- primed.fdr <- colour.code <- naive_mat naive.fdr[cbind(naive.out$gene1, naive.out$gene2)] <- naive.out$FDR primed.fdr[cbind(primed.out$gene1, primed.out$gene2)] <- primed.out$FDR fdr <- 0.05 colour.code[,] <- 1 #insignificant colour.code[naive.fdr < fdr & primed.fdr < fdr] <- 2 #significant in both colour.code[naive.fdr < fdr & primed.fdr >= fdr] <- 3 #significant in naive colour.code[naive.fdr >= fdr & primed.fdr < fdr] <- 4 #signifcant in primed colour.scheme <- c('grey', 'black', 'blue', 'red') colour.code <- colour.scheme[colour.code] plot(naive_mat, primed_mat, xlim = c(-0.5, 0.5) , ylim = c(-0.5, 0.5), xlab = 'Naive correlations [rho]', ylab='Primed correlations [rho]', pch=16, col = colour.code) legend('topleft', legend = c(paste0('FDR > ', fdr,': ', sum(colour.code=='grey')), paste0('Sig in both: ', sum(colour.code== 'black')), paste0('Sig in naive: ', sum(colour.code== 'blue')), paste0('Sig in primed: ', sum(colour.code== 'red'))), pch = 16, col = c('grey', 'black', 'blue', 'red'), bty='n', cex=1.5) ``` # Functional relations ```{r} library(gdata) pastor <- read.xls('~/Downloads/41556_2018_89_MOESM7_ESM.xlsx', fill = TRUE, stringsAsFactors = FALSE, header = FALSE, sheet = 'Differentially regulated sets') qin <- read.xls('~/Downloads/1-s2.0-S2211124716301395-mmc2.xlsx', fill = TRUE, stringsAsFactors = FALSE, header = TRUE) cur.epi <- which(top.epi %in% qin$SYMBOL) cur.mark <- which(top.markers %in% qin$SYMBOL) naive.fdr[cur.mark, cur.epi] primed.fdr[cur.mark, cur.epi] points(naive_mat[cur.mark, cur.epi], primed_mat[cur.mark, cur.epi], pch = 16, cex = 1.1, col = 'green') cur.epi <- which(top.epi %in% pastor$V5 | top.epi %in% pastor$V7) cur.epi <- which(top.markers %in% pastor$V5 | top.markers %in% pastor$V7) naive.fdr[cur.mark, cur.epi] primed.fdr[cur.mark, cur.epi] points(naive_mat[cur.mark, cur.epi], primed_mat[cur.mark, cur.epi], pch = 16, cex = 1.1, col = 'pink') legend('bottomright', legend = c('DE in TFAP2C-/-' , 'DE in YAP/LAP'), col = c('pink', 'green'), pch = 16) length(unique(altogether[altogether$FDR < 0.05,]$gene1)) length(unique(altogether[altogether$FDR < 0.05,]$gene2)) ``` ```{r} pastor.genes <- c(pastor$V5, pastor$V7) pastor.genes <- pastor.genes[-which(pastor.genes == '')] corrs <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list('TFAP2C', rownames(sce_naive))) table(corrs$FDR < 0.05) corr.genes <- corrs[corrs$FDR < 0.05,]$gene2 table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% pastor.genes) fisher.test(table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% pastor.genes)) ``` ```{r} qin.genes <- qin$SYMBOL corrs <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list('YAP1', rownames(sce_naive))) table(corrs$FDR < 0.1) corr.genes <- corrs[corrs$FDR < 0.1,]$gene2 table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% qin.genes) fisher.test(table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% qin.genes)) ``` # Session information ```{r} sessionInfo() ```