---
title: "Single-cell RNA-seq Analysis of naive/primed embryonic stem cells: marker validation in public bulk RNA data"
author: Tobias Messmer and Aaron Lun
date: "`r Sys.Date()`"
output: 
  BiocStyle::html_document:
    toc: true
    toc_float: true
    depth: 3
    number_sections: true
    theme: united 
    highlight: tango 
    fig_caption: false
---

```{r, echo=FALSE, results="hide"}
dir.create("figure-bulk/", showWarning=FALSE)
knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, fig.path="figure-bulk/")
options(bitmapType="cairo", width=100)
```

# Validation of naive and primed markers in RNA-seq data from other cell lines

We want to use other cell lines to validate the markers that we identified by DE analysis between our naive and primed cells.
For this, we use the following published bulk RNA-seq data sets of naive and primed hESCs:

- Pastor et al. (2016), _Cell Stem Cell_. Naive Human Pluripotent Cells Feature a Methylation Landscape Devoid of Blastocyst or Germline Memory. 
- Theunissen et al. (2016), _Cell Stem Cell_. Molecular Criteria for Defining the Naive Human Pluripotent State.
- Guo et al. (2017), _Development_. Epigenetic resetting of human pluripotency. 

The aim is to identify DE genes in those data sets and to assess how many of them overlap with our previously identified naive and primed markers.

```{r}
resdir <- "results-bulk"
dir.create(resdir, showWarning=FALSE)
```

# Pastor et al., 2016

First, we read in the data:

```{r pastor.in}
pastor16 <- read.table(file.path("../data/other_counts", 
    "Pastor2016_rawReadcountRNAseq.txt.gz"), 
     sep = "\t", header = TRUE)
gene.names <- pastor16$Probe

# Selecting only the columns containing counts
pastor16 <- as.matrix(pastor16[,14:19]) 
rownames(pastor16) <- gene.names

# Remove duplicated gene names
pastor16 <- pastor16[!duplicated(rownames(pastor16)),] 
dim(pastor16)
```

Next, we test for differences in expression between the naive and primed conditions using `r Biocpkg("edgeR")`.
We first do some filtering:

```{r}
library(edgeR)
y <- DGEList(pastor16)
keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size))
y <- y[keep,]
table(keep)
```

We compute TMM normalization factors:

```{r}
y <- calcNormFactors(y)
y$samples
```

We create a design model based on the phenotype of each sample to estimate the NB dispersion:

```{r pastor.model}
ptype <- factor(rep(c('Primed', 'Naive'), each=3))
design_pheno <- model.matrix(~0 + ptype)
y <- estimateDisp(y, design_pheno)
plotBCV(y)
```

... and also the QL dispersion:

```{r}
fit <- glmQLFit(y, design_pheno, robust=TRUE)
plotQLDisp(fit)
```

We finally test for DE genes between conditions using a log-fold change threshold of 0.5.

```{r pastor.de}
con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno)
res <- glmTreat(fit, contrast=con, lfc = 0.5)
de.tab.pastor <- topTags(res, n=Inf)$table
saveRDS(de.tab.pastor, file = file.path(resdir, 'de_pastor.rds'))
head(de.tab.pastor)
```

# Theunissen et al., 2016

Again, reading in data:

```{r theu.in}
theunissen16 <- read.table(file.path("../data/other_counts", 
    "Theunissen2016_rawReadcountRNAseq.txt.gz"),
    sep = "\t", header = TRUE)
gene.names <- theunissen16$Probe

# Selecting only the columns containing counts
theunissen16 <- as.matrix(theunissen16[,14:23]) 
rownames(theunissen16) <- gene.names

#Remove duplicated gene names
theunissen16 <- theunissen16[!duplicated(rownames(theunissen16)),] 
dim(theunissen16)
```

We do some filtering:

```{r}
library(edgeR)
y <- DGEList(theunissen16)
keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size))
y <- y[keep,]
table(keep)
```

We compute TMM normalization factors:

```{r}
y <- calcNormFactors(y)
y$samples
```

We create a design model based on the phenotype of each sample to estimate the NB dispersion:

```{r}
ptype <- factor(c('Primed', 
    'Naive', 'Naive', 'Naive', 'Naive', 'Naive', 'Naive', 
    'Primed', 'Primed', 'Primed'))
design_pheno <- model.matrix(~0 + ptype)
y <- estimateDisp(y, design_pheno)
plotBCV(y)
```

... and also the QL dispersion:

```{r}
fit <- glmQLFit(y, design_pheno, robust=TRUE)
plotQLDisp(fit)
```

We finally test for DE genes between conditions using a log-fold change threshold of 0.5.

```{r}
con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno)
res <- glmTreat(fit, contrast=con, lfc = 0.5)
de.tab.theunissen <- topTags(res, n=Inf)$table
saveRDS(de.tab.theunissen, file = file.path(resdir, 'de_theunissen.rds'))
head(de.tab.theunissen)
```

# Guo et al., 2017

## Overview 

we read in the Guo data set.

```{r guo.in}
guo17 <- read.table(file.path("../data/other_counts", 
    "Guo2017_rawReadcountRNAseq.txt"), sep = "\t", header = TRUE)
guo17 <- guo17[!duplicated(guo17$Probe),] #Remove duplicated gene names
gene.names <- guo17$Probe
rownames(guo17) <- gene.names
```

There's two cell lines in this set: HNES and Shef7.
For consistency with the other analyses (and to minimize differences in power due to differences in residual d.f.), we will analyse them separately from each other.

## HNES1

Extracting out the HNES1 samples:

```{r}
hnes <- as.matrix(guo17[,c(32:34, 26:28)]) 
dim(hnes)
```

We do some filtering:

```{r}
library(edgeR)
y <- DGEList(hnes)
keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size))
y <- y[keep,]
table(keep)
```

We compute TMM normalization factors:

```{r}
y <- calcNormFactors(y)
y$samples
```

We create a design model based on the phenotype of each sample to estimate the NB dispersion:

```{r}
ptype <- factor(rep(c('Naive', 'Primed'), each=3))
design_pheno <- model.matrix(~0 + ptype)
y <- estimateDisp(y, design_pheno)
plotBCV(y)
```

... and also the QL dispersion:

```{r}
fit <- glmQLFit(y, design_pheno, robust=TRUE)
plotQLDisp(fit)
```

We finally test for DE genes between conditions using a log-fold change threshold of 0.5.

```{r}
con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno)
res <- glmTreat(fit, contrast=con, lfc = 0.5)
de.tab.guo_hnes1 <- topTags(res, n=Inf)$table
saveRDS(de.tab.guo_hnes1, file = file.path(resdir, 'de_guo_hnes1.rds'))
head(de.tab.guo_hnes1)
```

## Shef7

Extracting out the Shef7 samples:

```{r}
shef <- as.matrix(guo17[,c(23:25, 35:37)]) 
dim(hnes)
```

We do some filtering:

```{r}
library(edgeR)
y <- DGEList(shef)
keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size))
y <- y[keep,]
table(keep)
```

We compute TMM normalization factors:

```{r}
y <- calcNormFactors(y)
y$samples
```

We create a design model based on the phenotype of each sample to estimate the NB dispersion:

```{r}
ptype <- factor(rep(c('Naive', 'Primed'), each=3))
design_pheno <- model.matrix(~0 + ptype)
y <- estimateDisp(y, design_pheno)
plotBCV(y)
```

... and also the QL dispersion:

```{r}
fit <- glmQLFit(y, design_pheno, robust=TRUE)
plotQLDisp(fit)
```

We finally test for DE genes between conditions using a log-fold change threshold of 0.5.

```{r}
con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno)
res <- glmTreat(fit, contrast=con, lfc = 0.5)
de.tab.guo_shef7 <- topTags(res, n=Inf)$table
saveRDS(de.tab.guo_shef7, file = file.path(resdir, 'de_guo_shef7.rds'))
head(de.tab.guo_shef7)
```

# Single-cell marker validation

## Loading in our markers

We load in our top 200 markers for each population:

```{r markers.in}
de.genes <- read.table(file = file.path('results-overall', 'de.tsv'),
    sep = "\t", header = TRUE, row.names = 1)
naive.set <- de.genes[de.genes$logFC > 0 & de.genes$FDR <= 0.05,]
primed.set <- de.genes[de.genes$logFC < 0 & de.genes$FDR <= 0.05,] 
naive.markers <- rownames(naive.set)[1:200]
primed.markers <- rownames(primed.set)[1:200]
```

We filter them for only the genes that also exist in all other data sets.

```{r}
other.names <- list(
    rownames(de.tab.pastor),
    rownames(de.tab.theunissen),
    rownames(de.tab.guo_hnes1),
    rownames(de.tab.guo_shef7)
)
shared.naive.markers <- Reduce(intersect, c(list(naive.markers), other.names))
length(shared.naive.markers)   
shared.primed.markers <- Reduce(intersect, c(list(primed.markers), other.names))
length(shared.primed.markers) 
```

## Visualizing the log-fold changes

We can visualize the logFC of our markers between the naive and primed conditions of the other data sets in a heatmap.
This gives a first estimate whether there's a similar expression trend.

```{r}
bound <- 10
naive_mat <- data.frame(
    Pastor = de.tab.pastor[shared.naive.markers,]$logFC, 
    Theunissen = de.tab.theunissen[shared.naive.markers,]$logFC, 
    Guo.HNES = de.tab.guo_hnes1[shared.naive.markers,]$logFC, 
    Guo.SHEF = de.tab.guo_shef7[shared.naive.markers,]$logFC
)
rownames(naive_mat) <- shared.naive.markers
naive_mat[naive_mat >= bound] <- bound
naive_mat[naive_mat < -bound] <- -bound

primed_mat <- data.frame(
    Pastor = de.tab.pastor[shared.primed.markers,]$logFC, 
    Theunissen = de.tab.theunissen[shared.primed.markers,]$logFC, 
    Guo.HNES = de.tab.guo_hnes1[shared.primed.markers,]$logFC, 
    Guo.SHEF = de.tab.guo_shef7[shared.primed.markers,]$logFC
)
rownames(primed_mat) <- shared.primed.markers 
primed_mat[primed_mat >= bound] <- bound
primed_mat[primed_mat < -bound] <- -bound

library(pheatmap)
anno <- data.frame(Marker = rep(c('naive', 'primed'), 
    c(length(shared.naive.markers), length(shared.primed.markers))))
rownames(anno) <- c(shared.naive.markers, shared.primed.markers)
pheatmap(rbind(naive_mat, primed_mat), annotation_row = anno, 
    cluster_rows=FALSE, cluster_cols = FALSE)
```

## Volcano plots

We use volcano plots to visualize the p-values of our markers in each of the three data sets.
We also add a line representing the FDR threshold in each data set.
First Pastor:

```{r pastor.volcano}
plot(de.tab.pastor$logFC, -log10(de.tab.pastor$PValue), pch = 16, xlab = 'Log FC', 
    ylab = '- Log10(Pval)', bty = 'n', main = 'Pastor, 2016', xlim = c(-11, 11))
points(de.tab.pastor[naive.markers,]$logFC, -log10(de.tab.pastor[naive.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'orange')
points(de.tab.pastor[primed.markers,]$logFC, -log10(de.tab.pastor[primed.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'royalblue1')

max.pval <- min(-log10(de.tab.pastor[de.tab.pastor$FDR < 0.05,]$PValue))
lines(x = c(-11, 11), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3)
text(x = -9, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red')
```

Then Theunissen:

```{r theu.volcano}
plot(de.tab.theunissen$logFC, -log10(de.tab.theunissen$PValue), pch = 16, xlab = 'Log FC', 
    ylab = '- Log10(Pval)', bty = 'n', main = 'Theunissen, 2016', xlim = c(-15, 15))
points(de.tab.theunissen[naive.markers,]$logFC, -log10(de.tab.theunissen[naive.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'orange')
points(de.tab.theunissen[primed.markers,]$logFC, -log10(de.tab.theunissen[primed.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'royalblue1')

max.pval <- min(-log10(de.tab.theunissen[de.tab.theunissen$FDR < 0.05,]$PValue))
lines(x = c(-15, 15), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3)
text(x = -12, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red')
```

Guo HNES1:

```{r hnes.volcano}
plot(de.tab.guo_hnes1$logFC, -log10(de.tab.guo_hnes1$PValue), pch = 16, xlab = 'Log FC', 
    ylab = '- Log10(Pval)', bty = 'n', main = 'HNES1 (Guo, 2017)', xlim = c(-15, 15))
points(de.tab.guo_hnes1[naive.markers,]$logFC, -log10(de.tab.guo_hnes1[naive.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'orange')
points(de.tab.guo_hnes1[primed.markers,]$logFC, -log10(de.tab.guo_hnes1[primed.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'royalblue1')

max.pval <- min(-log10(de.tab.guo_hnes1[de.tab.guo_hnes1$FDR < 0.05,]$PValue))
lines(x = c(-15, 15), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3)
text(x = -12, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red')
```

And Guo Shef7:

```{r shef.volcano}
plot(de.tab.guo_shef7$logFC, -log10(de.tab.guo_shef7$PValue), pch = 16, xlab = 'Log FC', 
    ylab = '- Log10(Pval)', bty = 'n', main = 'SHEF7 (Guo, 2017)', xlim = c(-15, 15))
points(de.tab.guo_shef7[naive.markers,]$logFC, -log10(de.tab.guo_shef7[naive.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'orange')
points(de.tab.guo_shef7[primed.markers,]$logFC, -log10(de.tab.guo_shef7[primed.markers,]$PValue), 
    pch = 16, cex = 1.5, col = 'royalblue1')

max.pval <- min(-log10(de.tab.guo_shef7[de.tab.guo_shef7$FDR < 0.05,]$PValue))
lines(x = c(-15, 15), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3)
text(x = -12, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red')
```

# Identifying intersecting markers

We want to create a list with all interesecting markers genes by performing an intersection-union test.
For each gene, we take the largest p-value across all four comparisons.
(We'll ignore the HNES1 results as it seems to be an outlier with respect to the number of naive/primed differences.)
If the signs of the log-fold changes are not the same, the P-values is set to 1.

```{r intersect.fc}
common.genes <- Reduce(intersect, list(
    rownames(de.tab.pastor), 
    rownames(de.tab.theunissen), 
    rownames(de.tab.guo_shef7), 
    rownames(de.genes))
)

all.stats <- list(
    de.tab.pastor[common.genes,],
    de.tab.theunissen[common.genes,],
    de.tab.guo_shef7[common.genes,],
    de.genes[common.genes,]
)

max.pval <- do.call(pmax, lapply(all.stats, "[[", "PValue"))
lfc.sign <- Reduce("+", lapply(all.stats, FUN=function(x) x$logFC > 0))
max.pval[lfc.sign > 0L & lfc.sign < length(all.stats)] <- 1 # inconsistent directions.
```
We create a table that contains the log-fold changes from each comparison and add the p-value that was just computed.

```{r intersect.pval}
de.list <- data.frame(
    H9NK2 = de.genes[common.genes,]$logFC, 
    UCLA1 = de.tab.pastor[common.genes,]$logFC, 
    WIBR3 = de.tab.theunissen[common.genes,]$logFC, 
    SHEF7 = de.tab.guo_shef7[common.genes,]$logFC,
    PValue = max.pvals,
    row.names = common.genes)
```

Finally, we correct the p-values for multiple testing using Benjamini-Hochberg, order the table for increasing FDR and export the table.

```{r intersect.exp}
de.list$FDR <- p.adjust(de.list$PVal, method = 'BH')
de.list <- de.list[order(de.list$PValue),]
write.table(de.list, file = file.path(resdir, 'overlapping.markers.tsv'), 
    sep="\t", quote=FALSE, col.names=NA)
head(de.list)
```

# Wrapping up

To wrap this story up, we report the session information.

```{r end}
sessionInfo()
```