---
title: "Single-cell RNA-seq of naive/primed embryonic stem cells: overall analysis"
author: Tobias Messmer and Aaron Lun
date: "`r Sys.Date()`"
output: 
  BiocStyle::html_document:
    toc: true
    toc_float: true
    depth: 3
    number_sections: true
    theme: united 
    highlight: tango
    fig_caption: false
---

```{r, echo=FALSE, results="hide"}
dir.create("figure-overall/", showWarning=FALSE)
knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, fig.path="figure-overall/")
```

# Reading in the data 

First, we read in the processed data set from the previous script.

```{r readin}
library(scran)
sce <- readRDS("results-preprocess/sce_all.rds")
sce
```

To get a first impression of the data, expression profiles of typical ESC markers are plotted. 
This is done after shuffling the cell order to avoid all naive cells being plotted after primed cells.

```{r explot}
library(scater)
shuffled.sce <- sce[, sample(ncol(sce))]
fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16))
plotExpression(shuffled.sce, c("KLF4", "KLF17", "DPPA3", "TFCP2L1", "NANOG", "ZFP42", "POU5F1"), 
    colour_by="phenotype", shape_by = "batch", show_violin = FALSE) + fontsize
```

Based on the variabilty of gene expression over the whole population, we can identify subpopulations within the data set. 
As we already know that the cells are either in a naive or primed state, this clustering serves mainly as confirmation. 

Finally, we set up a directory to save various results into.

```{r}
resdir <- "results-overall"
dir.create(resdir, showWarning=FALSE)
```

# Detecting highly variable genes across all cells

We fit a trend to the variances of the spike-ins.
This represents the technical variance at any given mean count.
For each endogenous gene, the difference between the technical and total variance is the biological component of variability.
We do this separately for each batch, as there is at least an order of magnitude difference in the amount of spike-in added to each batch.

```{r trendvarfit, fig.height=6, fig.width=10}
sce <- multiBlockNorm(sce, sce$batch)
var.out <- multiBlockVar(sce, block=sce$batch,
    trend.args=list(parametric=TRUE, loess.args=list(span=0.4)))

par(mfrow=c(1,2))
is.spike <- isSpike(sce)
for (batch in levels(sce$batch)) {
    cur.out <- var.out$per.block[[batch]]
    plot(cur.out$mean, cur.out$total, pch=16, cex=0.6, xlab="Mean log-expression", 
        ylab="Variance of log-expression", main=batch)
    curve(metadata(cur.out)$trend(x), col="dodgerblue", lwd=2, add=TRUE)
    points(cur.out$mean[is.spike], cur.out$total[is.spike], col="red", pch=16)
}
```

The trend passes through the bulk of the spike-ins, which is reassuring.
We save the combined statistics across batches to file.

```{r combinevar}
write.table(file=file.path(resdir, "var.tsv"), 
    var.out[,setdiff(colnames(var.out), "per.block")], # don't save the DFs.
    sep="\t", quote=FALSE, col.names = NA) 
```

The computed variance is used to identify highly variable genes, which are subsequently saved to file. 

```{r highvar}
hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5),-7]
nrow(hvg.out)
hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),]
write.table(file = file.path(resdir, "hvg.tsv"), hvg.out, sep="\t", quote=FALSE, col.names = NA) 
head(hvg.out)
```

The most highly variable genes across the whole population are shown to ensure that they are not driven by one or two outliers.

```{r vioplot}
plotExpression(shuffled.sce, rownames(hvg.out)[1:10], point_alpha=0.5, 
    colour_by="phenotype", jitter="jitter") + fontsize 
```

# Dimensionality reduction based on the technical noise

We remove the batch effect prior to performing dimensionality reduction.

```{r}
library(limma)
norm_exprs(sce) <- removeBatchEffect(exprs(sce), batch=sce$batch) 
```

We then estimate the number of PCs to retain based on our estimates of the technical noise.

```{r}
sce <- denoisePCA(sce, var.out, assay.type="norm_exprs")
ncol(reducedDim(sce))
```

We also examine the variance explained by the first few PCs.

```{r}
pc.out <- attr(reducedDim(sce), "percentVar")
plot(pc.out)
```

We save the PCs to file for future use.

```{r}
saveRDS(reducedDim(sce), file=file.path(resdir, "pcs.rds"))
```

# Visualization in low-dimensional space

We perform dimensionality reduction using _t_-SNE, colouring by various attributes.

```{r tsne, fig.height=10, fig.width=10}
set.seed(100)
sce <- runTSNE(sce, use_dimred="PCA", perplexity=30)
tsne1 <- plotTSNE(sce, colour_by=rownames(hvg.out)[1]) +  fontsize
tsne2 <- plotTSNE(sce, colour_by="KLF4") + fontsize
tsne3 <- plotTSNE(sce, colour_by="batch") + fontsize
tsne4 <- plotTSNE(sce, colour_by="phenotype") + fontsize
multiplot(tsne1, tsne3, tsne2, tsne4, cols=2)
```

We repeat this with PCA, which is simpler but provides more stable results.

```{r pca, fig.height=10, fig.width=10}
pca1 <- plotPCA(sce, colour_by=rownames(hvg.out)[1]) + fontsize
pca2 <- plotPCA(sce, colour_by="KLF4") + fontsize
pca3 <- plotPCA(sce, colour_by="batch") + fontsize 
pca4 <- plotPCA(sce, colour_by="phenotype") + fontsize 
multiplot(pca1, pca3, pca2, pca4, cols=2)
```

# Differential expression analysis

## Testing for differences between naive and primed

Finally, we test for differences in expression between the naive and primed conditions.
This is done using _edgeR_, after summing all cells within each batch to create a pseudo-bulk sample.

```{r desetup}
library(edgeR)
de.counts <- sumTechReps(counts(sce), paste(sce$phenotype, sce$batch, sep="."))
de.counts <- de.counts[!isSpike(sce),] # removing spikes
y <- DGEList(de.counts)
```

We construct a new design matrix for the summed samples.

```{r}
ptype <- factor(sub("\\..", "\\1", colnames(de.counts)))
batch <- factor(sub("^.*\\.", "\\1", colnames(de.counts)))
design_pheno <- model.matrix(~0 + ptype + batch) 
```

We run through _edgeR_'s analysis pipeline. 
We first filter out low-abundance genes, as we did not perform any filtering on `sce` during preprocessing.

```{r}
keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size))
y <- y[keep,]
summary(keep)
```

First, we normalize the samples using the TMM method.

```{r}
y <- calcNormFactors(y)
y$samples
```

We estimate the negative binomial dispersion, which represents the variability between replicates.
For true bulk RNA-seq experiments, we'd expect BCV values around 0.1 or lower, though the values here will be higher as they are derived from noisier single-cell values.

```{r nbdisp}
y <- estimateDisp(y, design_pheno)  
plotBCV(y)
```

We fit a generalized linear model, with estimates of the quasi-likelihood dispersion to account for gene-specific variability.

```{r qldisp}
fit <- glmQLFit(y, design_pheno, robust=TRUE)
plotQLDisp(fit)
```

Finally, we set up contrasts between the primed and naive population.
We use the QL F-test to test for significant differences in expression.

```{r ftrest}
con <- makeContrasts(ptypenaive - ptypeprimed, levels = design_pheno)
res <- glmQLFTest(fit, contrast=con)
summary(decideTestsDGE(res))
DEmarkers <- topTags(res, n=Inf)$table
write.table(DEmarkers, file=file.path(resdir, "de.tsv"), sep="\t", quote=FALSE, col.names=NA) 
head(DEmarkers)
```

This is shown in more detail using the smear plot below.

```{r smearplot}
is.sig <- DEmarkers$FDR <= 0.05
plotSmear(res, de.tags=rownames(DEmarkers)[is.sig], cex=0.1)
```

To ensure that the top DE genes are not driven by outliers, we can examine them explicitly.

```{r viode}
plotExpression(shuffled.sce, features = rownames(DEmarkers)[1:10], col = "phenotype")
```

## Interpreting the set of DE genes

We use the DE genes for gene set analyses.
First, we take the set of genes that were upregulated in naive over primed.

```{r upgo}
library(GO.db)
library(org.Hs.eg.db)
entreznames <- as.character(rowData(sce)$entrezgene)
has.entrez <- !is.na(entreznames)
in.universe <- entreznames[has.entrez & rownames(sce) %in% rownames(DEmarkers)]

up.genes <- rownames(DEmarkers)[is.sig & DEmarkers$logFC > 0]
is.up <- rownames(sce) %in% up.genes
in.up <- entreznames[has.entrez & is.up]

go <- goana(in.up, universe=in.universe, species = "Hs")
topgo <- topGO(go, n=Inf, ontology = "BP") 
write.table(file=file.path(resdir, "go_up.tsv"), topgo, sep="\t", quote=FALSE, col.names = NA) 
head(topgo)
```

We repeat the dose for KEGG analyses.

```{r upkegg}
kegg <- kegga(in.up, universe=in.universe, species = "Hs")
topkegg <- topKEGG(kegg, n=Inf)
write.table(file=file.path(resdir, "kegg_up.tsv"), topkegg, sep="\t", quote=FALSE, col.names = NA) 
head(topkegg)
```

We now turn our attention to the downregulated genes, i.e., those expressed in primed and not naive.

```{r downgo}
down.genes <- rownames(DEmarkers)[is.sig & DEmarkers$logFC < 0]
is.down <- rownames(sce) %in% down.genes
in.down <- entreznames[has.entrez & is.down]

go <- goana(in.down, universe=in.universe, species = "Hs")
topgo <- topGO(go, n=Inf, ontology = "BP") 
write.table(file=file.path(resdir, "go_down.tsv"), topgo, sep="\t", quote=FALSE, col.names = NA) 
head(topgo)
```

We repeat for KEGG analyses.

```{r downkegg}
kegg <- kegga(in.down, universe=in.universe, species = "Hs")
topkegg <- topKEGG(kegg, n=Inf)
write.table(file=file.path(resdir, "kegg_down.tsv"), topkegg, sep="\t", quote=FALSE, col.names = NA) 
head(topkegg)
```

# Wrapping up

There's nothing else to save, so we just report the session information here.

```{r}
sessionInfo()
```