---
title: "Single-cell RNA-seq of naive/primed embryonic stem cells: differential variability" 
author: Tobias Messmer and Aaron Lun
date: "`r Sys.Date()`"
output: 
  BiocStyle::html_document:
    toc: true
    toc_float: true
    depth: 3
    number_sections: true
    theme: united 
    highlight: tango 
    fig_caption: false
---

```{r, echo=FALSE, results="hide"}
dir.create("figure-correlations/", showWarning=FALSE)
knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, fig.path="figure-correlations/")
```

# Data preparation

As both our populations appear to be quite homogenous, we can use them for calculating correlations between specific genes of interest.
We set up the two populations, starting with the naive population _sans_ the transition cells:

```{r}
library(scran)
groups <- read.table("results-transition/groups.tsv",
            header=TRUE, stringsAsFactors=FALSE)
is.transition <- groups$Cell[groups$Type=="transition"]

sce_naive <- readRDS("results-naive/sce_naive.rds")
sce_naive <- sce_naive[,!colnames(sce_naive) %in% is.transition]
sce_naive <- normalize(sce_naive)
table(sce_naive$batch)
```

... and then the primed population.

```{r}
sce_primed <- readRDS("results-primed/sce_primed.rds")
table(sce_primed$batch)
```

We also load in the lists of interesting genes, including Ferdinand's hand-picked markers:

```{r loadlists}
library(org.Hs.eg.db)
inPresentSet <- function(present, desired) {
    ensembls <- mapIds(org.Hs.eg.db, keytype="SYMBOL", keys=desired, column="ENSEMBL")
    which(present %in% ensembls)    
}

markers <- read.csv("../data/marker/ferd_marker_set.csv", stringsAsFactors = FALSE, header = FALSE)
head(markers)
chosen_markers <- inPresentSet(rowData(sce_naive)$ensembl_gene_id, markers[,1])
length(chosen_markers)
```

... and for epigenetic readers and modifiers:

```{r}
epi_modifier <- read.csv("../data/marker/epigen_modifiers.csv", stringsAsFactors = FALSE, header = FALSE)
head(epi_modifier)
epi_modifier <- inPresentSet(rowData(sce_naive)$ensembl_gene_id, epi_modifier[,1])
length(epi_modifier)

epi_reader <- read.csv("../data/marker/epigen_readers.csv", stringsAsFactors = FALSE, header = FALSE)
head(epi_reader)
epi_reader <- inPresentSet(rowData(sce_naive)$ensembl_gene_id, epi_reader[,1])
length(epi_reader)
```

Finally, we set up the output directory.

```{r makedir}
resdir <- "results-correlations"
dir.create(resdir, showWarning=FALSE)
```

# Correlations between markers and epigenetic readers 

## In naive pluripotency

We consider all correlations between the markers and epigenetic readers in naive cells.
This is done while blocking on the batch of origin to ensure that the batch does not drive correlations.

```{r}
set.seed(100)
library(scran)
cor.read_naive <- correlatePairs(sce_naive, block=sce_naive$batch, 
    pairings=list(chosen_markers, epi_reader))
table(Sig=cor.read_naive$FDR <= 0.05, Limited=cor.read_naive$limited)
```

We can examine the distribution of significant markers.

```{r}
CHECK_LINEAGE <- function(genes, sig) {
    in.all <- markers[,1] %in% genes
    in.sig <- markers[,1] %in% genes[sig]
    lineage <- factor(markers[,2])
    data.frame(Detected=as.integer(table(lineage[in.sig])), 
        Total=as.integer(table(lineage[in.all])),
        row.names=levels(lineage))
}

CHECK_LINEAGE(cor.read_naive$gene1, cor.read_naive$FDR <= 0.05)
```

## In primed pluripotency

Same again for primed cells.

```{r}
set.seed(100)
cor.read_primed <- correlatePairs(sce_primed, block=sce_primed$batch, 
    pairings=list(chosen_markers, epi_reader))
table(Sig=cor.read_primed$FDR <= 0.05, Limited=cor.read_primed$limited)
```

We can examine the distribution of significant markers.

```{r}
CHECK_LINEAGE(cor.read_primed$gene1, cor.read_primed$FDR <= 0.05)
```

## Comparing the pairs

We can examine the difference between the correlations more visually.
The naive correlations are more spread out than the primed correlations.

```{r}
op <- order(cor.read_primed$gene1, cor.read_primed$gene2)
on <- order(cor.read_naive$gene1, cor.read_naive$gene2)
smoothScatter(cor.read_naive$rho[on], cor.read_primed$rho[op],
    xlab="Naive rho", ylab="Primed rho")
abline(a=0, b=1, col="red")
```

# Correlations between markers and epigenetic modifiers 

## In naive pluripotency

We consider all correlations between the markers and epigenetic modifiers in naive cells.

```{r}
set.seed(100)
library(scran)
cor.mod_naive <- correlatePairs(sce_naive, block=sce_naive$batch, 
    pairings=list(chosen_markers, epi_modifier))
table(Sig=cor.mod_naive$FDR <= 0.05, Limited=cor.mod_naive$limited)
```

We can examine the distribution of significant markers.

```{r}
CHECK_LINEAGE(cor.mod_naive$gene1, cor.mod_naive$FDR <= 0.05)
```

## In primed pluripotency

Same again for primed cells.

```{r}
set.seed(100)
cor.mod_primed <- correlatePairs(sce_primed, block=sce_primed$batch, 
    pairings=list(chosen_markers, epi_modifier))
table(Sig=cor.mod_primed$FDR <= 0.05, Limited=cor.mod_primed$limited)
```

We can examine the distribution of significant markers.

```{r}
CHECK_LINEAGE(cor.mod_primed$gene1, cor.mod_primed$FDR <= 0.05)
```

## Comparing the pairs

We can examine the difference between the correlations more visually.
The naive correlations are more spread out than the primed correlations.

```{r}
op <- order(cor.mod_primed$gene1, cor.mod_primed$gene2)
on <- order(cor.mod_naive$gene1, cor.mod_naive$gene2)
smoothScatter(cor.mod_naive$rho[on], cor.mod_primed$rho[op],
    xlab="Naive rho", ylab="Primed rho")
abline(a=0, b=1, col="red")
```

# Picking the top gene pairs 

We need to pick a set of 25 markers and 25 readers/writers that have the strongest correlations across all comparisons.
We do so by merging all the previous results and taking the first 25 genes of each category (readers and writers are bundled together here).
It is important to merge both the naive and primed results to reduce any preference for strong correlations in one condition.

```{r}
altogether <- rbind(cor.mod_naive, cor.read_naive, cor.mod_primed, cor.read_primed)
altogether <- altogether[order(altogether$p.value, abs(altogether$rho)),]   
restrict.size <- 25
top.markers <- unique(altogether$gene1)[seq_len(restrict.size)]
top.markers
top.epi <- unique(altogether$gene2)[seq_len(restrict.size)]
top.epi
```

We create a matrix of these correlations in the naive population.
For simplicity, we'll just recompute them in `correlatePairs`, rather than trying to extract them manually.

```{r}
set.seed(100)
naive.out <- correlatePairs(sce_naive, block=sce_naive$batch,
    pairings=list(top.markers, top.epi))
naive_mat <- matrix(0, restrict.size, restrict.size)
dimnames(naive_mat) <- list(top.markers, top.epi)
naive_mat[cbind(naive.out$gene1, naive.out$gene2)] <- naive.out$rho
saveRDS(file=file.path(resdir, "naive_out.rds"), naive.out)
```

We repeat this for the primed population.

```{r}
set.seed(100)
primed.out <- correlatePairs(sce_primed, block=sce_primed$batch,
    pairings=list(top.markers, top.epi))
primed_mat <- matrix(0, restrict.size, restrict.size)
dimnames(primed_mat) <- list(top.markers, top.epi)
primed_mat[cbind(primed.out$gene1, primed.out$gene2)] <- primed.out$rho
saveRDS(file=file.path(resdir, "primed_out.rds"), primed.out)
```

We then construct trees to order genes along the sides of a heatmap.
Note that we use information from both naive and primed conditions to avoid biasing our visualization.

```{r}
row_tree <- hclust(dist(cbind(naive_mat, primed_mat)))
col_tree <- hclust(dist(t(rbind(naive_mat, primed_mat))))

naive_mat <- naive_mat[row_tree$order, col_tree$order]
saveRDS(file=file.path(resdir, "naive_mat.rds"), naive_mat)
primed_mat <- primed_mat[row_tree$order, col_tree$order]
saveRDS(file=file.path(resdir, "primed_mat.rds"), primed_mat)
```

Now, visualizing for naive cells:

```{r}
library(pheatmap)
limited <- naive_mat
limited[limited < -0.5] <- -0.5
limited[limited > 0.5] <- 0.5
pheatmap(limited, breaks=seq(-0.5, 0.5, length.out=101), 
    main = 'naive', color = colorRampPalette(c("navy", "white", "orangered"))(101),
    treeheight_row = 0, treeheight_col = 0, cluster_rows = FALSE, cluster_cols = FALSE)
```

... and also for primed cells.

```{r}
limited <- primed_mat
limited[limited < -0.5] <- -0.5
limited[limited > 0.5] <- 0.5
pheatmap(limited, breaks=seq(-0.5, 0.5, length.out=101), 
    main = 'primed', color = colorRampPalette(c("navy", "white", "orangered"))(101),
    treeheight_row = 0, treeheight_col = 0, cluster_rows = FALSE, cluster_cols = FALSE)
```

## Visualization of correlations

By plotting the rho-values of the naive and the primed correlations, we can visualize what in which populations there is more correlation between lineage and epigenetic marker genes.

```{r}
naive.fdr <- primed.fdr <- colour.code <- naive_mat
naive.fdr[cbind(naive.out$gene1, naive.out$gene2)] <- naive.out$FDR
primed.fdr[cbind(primed.out$gene1, primed.out$gene2)] <- primed.out$FDR

fdr <- 0.05
colour.code[,] <- 1 #insignificant
colour.code[naive.fdr < fdr & primed.fdr < fdr] <- 2 #significant in both
colour.code[naive.fdr < fdr & primed.fdr >= fdr] <- 3 #significant in naive
colour.code[naive.fdr >= fdr & primed.fdr < fdr] <- 4 #signifcant in primed

colour.scheme <- c('grey', 'black', 'blue', 'red')
colour.code <- colour.scheme[colour.code]

plot(naive_mat, primed_mat, xlim = c(-0.5, 0.5) , ylim = c(-0.5, 0.5), xlab = 'Naive correlations [rho]', ylab='Primed correlations [rho]', pch=16, col = colour.code)
legend('topleft', legend = c(paste0('FDR > ', fdr,': ', sum(colour.code=='grey')), 
                             paste0('Sig in both: ', sum(colour.code== 'black')), 
                             paste0('Sig in naive: ', sum(colour.code== 'blue')),
                             paste0('Sig in primed: ', sum(colour.code== 'red'))), 
       pch = 16, col = c('grey', 'black', 'blue', 'red'), bty='n', cex=1.5)

```

# Functional relations

```{r}
library(gdata)
pastor <- read.xls('~/Downloads/41556_2018_89_MOESM7_ESM.xlsx', fill = TRUE, stringsAsFactors = FALSE, header = FALSE, sheet = 'Differentially regulated sets')
qin <- read.xls('~/Downloads/1-s2.0-S2211124716301395-mmc2.xlsx', fill = TRUE, stringsAsFactors = FALSE, header = TRUE)


cur.epi <- which(top.epi %in% qin$SYMBOL)
cur.mark <- which(top.markers %in% qin$SYMBOL)
naive.fdr[cur.mark, cur.epi]
primed.fdr[cur.mark, cur.epi]
points(naive_mat[cur.mark, cur.epi], primed_mat[cur.mark, cur.epi], pch = 16, cex = 1.1, col = 'green')

cur.epi <- which(top.epi %in% pastor$V5 | top.epi %in% pastor$V7)
cur.epi <- which(top.markers %in% pastor$V5 | top.markers %in% pastor$V7)
naive.fdr[cur.mark, cur.epi]
primed.fdr[cur.mark, cur.epi]
points(naive_mat[cur.mark, cur.epi], primed_mat[cur.mark, cur.epi], pch = 16, cex = 1.1, col = 'pink')
legend('bottomright', legend = c('DE in TFAP2C-/-' , 'DE in YAP/LAP'), col = c('pink', 'green'), pch = 16)


length(unique(altogether[altogether$FDR < 0.05,]$gene1))
length(unique(altogether[altogether$FDR < 0.05,]$gene2))
```


```{r}
pastor.genes <- c(pastor$V5, pastor$V7)
pastor.genes <- pastor.genes[-which(pastor.genes == '')]

corrs <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list('TFAP2C', rownames(sce_naive)))
table(corrs$FDR < 0.05)
corr.genes <- corrs[corrs$FDR < 0.05,]$gene2
table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% pastor.genes)
fisher.test(table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% pastor.genes))
```

```{r}
qin.genes <- qin$SYMBOL

corrs <- correlatePairs(sce_naive, block=sce_naive$batch, pairings=list('YAP1', rownames(sce_naive)))
table(corrs$FDR < 0.1)
corr.genes <- corrs[corrs$FDR < 0.1,]$gene2
table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% qin.genes)
fisher.test(table(isCor = rownames(sce_naive) %in% corr.genes, isDE = rownames(sce_naive) %in% qin.genes))
```

# Session information

```{r}
sessionInfo()
```