---
title: Examining the variability of spike-in addition
author: Aaron Lun
date: 26 January 2016
output:
  html_document:
      fig_caption: no
---

```{r, echo=FALSE, results='hide'}
knitr::opts_chunk$set(fig.path="figure-tech/", error=FALSE, message=FALSE, warning=FALSE)
options(width=100)
if (!exists(".spike.local")) { .spike.local <- TRUE }
```

# Introduction

Here, we examine the variability of spike-in addition using the mixture experiment.
First, we load the counts for all genes and define their categories, i.e., mouse cellular, ERCC or SIRV spike-ins.

```{r}
all.counts <- read.table("genic_counts.tsv", header=TRUE, row.names=1, check.names=FALSE)
gene.lengths <- all.counts[,1]
all.counts <- all.counts[,-1]
is.ercc <- grepl("ERCC", rownames(all.counts))
is.sirv <- grepl("SIRV", rownames(all.counts))
is.mouse <- !is.ercc & !is.sirv
colSums(data.frame(ERCC=sum(is.ercc), SIRV=sum(is.sirv), Mouse=sum(is.mouse)))
```

We also define the grouping for each sample. 
Here we have two factors making up six groups in total.
The first factor is whether an oncogene has been induced in the cell line.
The second factor is how the spike-ins have been added -- ERCCs followed by SIRVs, SIRVs followed by ERCCs, or as a premixed set.

<!--
Only this block changes depending on whether it is run locally (i.e., on the author's machine)
or using public data, via "run_me.sh"
-->

```{r, eval=.spike.local, echo=.spike.local, message=FALSE}
library(simpaler)
is.induced <- grepl("S50[5-8]", colnames(all.counts))
ercc.first <- grepl("N70[1-4]", colnames(all.counts))
sirv.first <- grepl("N70[5-8]", colnames(all.counts))
premixed <- grepl("N7(09|10|11|12)", colnames(all.counts))
colSums(data.frame(Induced=is.induced, ERCC=ercc.first, SIRV=sirv.first, Premixed=premixed))
```

```{r, eval=!.spike.local, echo=!.spike.local, message=FALSE}
library(simpaler)
anno <- read.table("../../../ArrayExpress/E-MTAB-5522.sdrf.txt", header=TRUE, 
    sep="\t", fill=TRUE, check.names=FALSE, stringsAsFactors=FALSE, comment="")
anno <- anno[match(colnames(all.counts), anno[,"Source Name"]),]
is.induced <- anno[,"Factor Value[phenotype]"]=="induced CBFB-MYH11 oncogene expression"
ercc.first <- anno[,"Factor Value[spike-in addition]"]=="ERCC+SIRV"
sirv.first <- anno[,"Factor Value[spike-in addition]"]=="SIRV+ERCC"
premixed <- anno[,"Factor Value[spike-in addition]"]=="Premixed"
colSums(data.frame(Induced=is.induced, ERCC=ercc.first, SIRV=sirv.first, Premixed=premixed))
```

We proceed to make an object containing all of this information.

```{r}
spike.data <- setupSpikes(all.counts, spike1=is.ercc, spike2=is.sirv, 
    separate=ercc.first|sirv.first, premixed=premixed,
    ercc.first=ercc.first, sirv.first=sirv.first, induced=is.induced)
```

We also add some annotation describing which genes are endogenous, and whether they are mitochondrial.
This may be useful in downstream analyses.

```{r}
spike.data$genes$mouse <- is.mouse
library(TxDb.Mmusculus.UCSC.mm10.ensGene)
chr.loc <- select(TxDb.Mmusculus.UCSC.mm10.ensGene, keys=rownames(spike.data), 
    keytype="GENEID", column="CDSCHROM")
chr.loc <- chr.loc$CDSCHROM[match(rownames(spike.data), chr.loc$GENEID)]
spike.data$genes$is.mito <- chr.loc=="chrM" & !is.na(chr.loc)
```

# Quality control

We examine the distribution of the sum of counts across all samples, in order to identify samples for which cell capture or library preparation have failed.
First, we compute the count sums for each category of genes.

```{r}
mouse.sums <- colSums(spike.data$counts[spike.data$genes$mouse,])
spike.data$samples$mouse <- mouse.sums
ercc.sums <- spike.data$samples$sum1
sirv.sums <- spike.data$samples$sum2
summary(data.frame(Mouse=mouse.sums, ERCC=ercc.sums, SIRV=sirv.sums))
```

We then have a look at the distributions of the count sums.

```{r counthist, fig.height=5, fig.width=10}
par(mfrow=c(1,3))
hist(mouse.sums, breaks=20, main="Mouse sums", col="grey")
hist(ercc.sums, breaks=20, main="ERCC sums", col="grey")
hist(sirv.sums, breaks=20, main="SIRV sums", col="grey")
```

We remove libraries with outliers in any of the count sums.

```{r, message=FALSE}
library(scater)
keep <- !(isOutlier(mouse.sums, nmad=3, log=TRUE, type="lower") | 
          isOutlier(ercc.sums, nmad=3, log=TRUE, type="lower") | 
          isOutlier(sirv.sums, nmad=3, log=TRUE, type="lower"))
all.data <- spike.data
spike.data <- spike.data[,keep]
sum(keep)
```

We check that the remaining libraries are distributed as expected along the induced/control groups for CBFB-MYH11-mcherry expression.

```{r}
oncogene.cpm <- edgeR::cpm(spike.data)["CBFB-MYH11-mcherry",]
by.cherry <- split(oncogene.cpm, ifelse(spike.data$samples$induced, "Induced", "Control"))
boxplot(by.cherry, log="y", ylab="Oncogene CPM")
```

Note that we'll be working with count sums, so we don't bother removing low-abundance genes.

# Exploring the variability of the spike-in ratios

We examine the distribution of the log-ratios for the ERCC to SIRV counts for each combination of factors.
First, we set up the groups in the following manner.

```{r}
spike.type <- 1*spike.data$samples$ercc.first+2*spike.data$samples$sirv.first+3*spike.data$samples$premixed
stopifnot(all(spike.type >= 1 & spike.type <= 3))
spike.in <- c("ERCC+SIRV", "SIRV+ERCC", "Premixed")[spike.type]
induced <- ifelse(spike.data$samples$induced, "Induced", "Control")
grouping <- paste0(induced, ".", spike.in)
spike.data$samples$group <- grouping
log.spikes <- spike.data$samples$ratio
```

We then construct histograms for each combination of factors.
Nothing too irregular here -- perhaps only an outlier for the induced premixed set.

```{r ratiohist, fig.width=6, fig.height=12}
par(mfrow=c(3, 2))
by.group <- split(log.spikes, grouping)
ref <- hist(log.spikes, plot=FALSE, breaks=10)
for (g in names(by.group)) {
    hist(by.group[[g]], main=g, breaks=ref$breaks, ylim=c(0, 10), col="grey")
}
```

We estimate the variance of each group, to check that the order of addition and the biological condition doesn't affect the estimates.
This means that we can pool them together for more precision in the actual analysis.
There doesn't seem to be any significant difference, though the number of samples is probably too low to get a reliable gauge.

```{r}
keep <- spike.data$samples$separate
diagnoseVariance(spike.data[,keep], grouping[keep])
keep <- spike.data$samples$premixed
diagnoseVariance(spike.data[,keep], grouping[keep])
```

We repeat this after merging the separate addition groups by the order of addition or by the biological group.
This provides some more power to detect differences (if there are any) by increasing the number of samples.

```{r}
keep <- spike.data$samples$separate
design <- model.matrix(~0+grouping)
by.order <- diagnoseVariance(spike.data[,keep], spike.data$samples$ercc.first[keep], design[keep,])
by.order
by.induced <- diagnoseVariance(spike.data[,keep], spike.data$samples$induced[keep], design[keep,])
by.induced
```

We fit separate linear models to the samples with separate and premixed spike-in additions.
We then have a look at the variance estimates (or specifically, the standard deviation estimates in `sigma`).
As it turns out, they're pretty similar, which suggests that there is minimal variability from spike-in addition.

```{r}
diagnoseVariance(spike.data, spike.data$samples$premixed, design)
```

This can be examined more visually by looking at the distribution of the residuals.
We check whether they're reasonably normal.
Some minor deviation at the tails, perhaps -- but all in all, normality is probably a good approximation.

```{r, fig.width=10, fig.height=5}
keep <- spike.data$samples$separate
sep.fit <- lm.fit(spike.data$samples$ratio[keep], x=design[keep,])
sep.resids <- residuals(sep.fit)
keep <- spike.data$samples$premixed
premix.fit <- lm.fit(spike.data$samples$ratio[keep], x=design[keep,])
premix.resids <- residuals(premix.fit)
par(mfrow=c(1,2))
qqnorm(sep.resids)
qqline(sep.resids)
qqnorm(premix.resids)
qqline(premix.resids)
```

<!--
# Checking the MADs

We can also do this for the median absolute deviation, computed by treating the residual effects as observations.
This provides some protection against any outliers that might be present.
There's no increase here (at all, let alone one that is significant).

```{r}
#sep.mad <- mad(tail(effects(sep.fit), df.residual(sep.fit)))
#premix.mad <- mad(tail(effects(premix.fit), df.residual(premix.fit)))
#sep.mad
#premix.mad
```
-->

# Decomposing the variance components

The decomposition can be done simply using the `decomposeVariances` function.
We can examine the variance of the added (log-)volume, by subtracting the variance of the log-ratios in the premixed experiment from the variance of separate additions.
We also test this difference for significance using a F-test for the equality of variances.

```{r}
out <- decomposeVariance(spike.data, design)
out
```

Note that this only applies when the count sizes are comparable between the separate and premixed additions.
Otherwise, the technical component of the variance will be different between the two estimates.
Fortunately, this seems to be the case here, so we can proceed safely.

```{r}
summary(spike.data$samples$sum1[spike.data$samples$premixed]) # ERCC, premixed
summary(spike.data$samples$sum1[!spike.data$samples$premixed]) # ERCC, separate
summary(spike.data$samples$sum2[spike.data$samples$premixed]) # SIRV, premixed
summary(spike.data$samples$sum2[!spike.data$samples$premixed]) # SIRV, separate
```

<!--
We also calculate the variability of spike-in behaviour between wells.
Most spike-ins are artificial and will behave differently to well-specific conditions.
This results in variable differences in capture efficiency, etc. that has a similar effect to variable addition.

To estimate this variability, we break up the spike-ins into two halves and estimate the ratio between them.
Behaviour should be consistent between transcripts in the same spike-in set, so this variability should only represent technical noise.
With some work, we can figure out the technical variance of each spike-in set (i.e., from sequencing and stochastic capture variability),
and separate this from the behavioural variability, i.e., the variance in the relative capture efficiency between the spike-in sets.

```{r}
out$split
```
-->

# Computing the variability in cellular RNA

For reference, we examine the variability in the spike-ins to the cellular counts.
This easily exceeds the variability due to spike-in addition or behaviour, which suggests that those two factors don't really matter for practical purposes.

```{r}
cellular <- log2(colSums(spike.data$counts[spike.data$genes$mouse,])/spike.data$samples$sum1)
cell.var <- estimateVariance(ratios=cellular, design=design)
cell.var
```

We also compute the variability of each of the spike-in totals across all cells, blocking on the various conditions.
Again, this provides a contrast to each of the individual variance components, as it implies their effect on the normalization across cells is small.

```{r}
ercc.var <- estimateVariance(ratios=log2(spike.data$samples$sum1), design=design)
ercc.var
sirv.var <- estimateVariance(ratios=log2(spike.data$samples$sum2), design=design)
sirv.var
```

# Session information

```{r}
sessionInfo()
```

We collect the results of interest and save them into an object.

```{r}
saveRDS(list(sfERCC.var=ercc.var, sfSIRV.var=sirv.var, cell.var=cell.var,
             ratioERCCfirst.var=by.order$var[["TRUE"]], 
             ratioERCCsecond.var=by.order$var[["FALSE"]], 
             ratioOrder.sig=by.order$pval[1,2],
             ratioSep.var=out$total, ratioPre.var=out$premixed,
             ratioVol.var=out$volume, ratioVol.sig=out$pval),
        file="results.rds")             
```

We also save the object to file for downstream uses (e.g., plotting).

```{r}
saveRDS(spike.data, file="object.rds")
saveRDS(all.data, file="full.rds")
```