---
title: Examining the variability of spike-in addition
author: Aaron Lun
date: 6 September 2016
output:
  html_document:
      fig_caption: no
---

```{r, echo=FALSE, results='hide'}
knitr::opts_chunk$set(fig.path="figure-tech/", error=FALSE, message=FALSE, warning=FALSE)
options(width=100)
if (!exists(".spike.local")) { 
    .spike.local <- TRUE
}
```

# Introduction

Here, we examine the variability of spike-in addition using the mixture experiment.
First, we load the counts for all genes and define their categories, i.e., mouse cellular, ERCC or SIRV spike-ins.

```{r}
all.counts <- read.table("genic_counts.tsv", header=TRUE, row.names=1, comment="", check.names=FALSE)
gene.lengths <- all.counts[,1]
all.counts <- all.counts[,-1]
is.ercc <- grepl("ERCC", rownames(all.counts))
is.sirv <- grepl("SIRV", rownames(all.counts))
is.mouse <- !is.ercc & !is.sirv
colSums(data.frame(ERCC=sum(is.ercc), SIRV=sum(is.sirv), Mouse=sum(is.mouse)))
```

<!--
The evaluation of some code blocks depends on whether
they are executed locally (i.e., on the author's machine)
or using the public data. Results are, fortunately, the same.
-->

`r if (.spike.local) { "
We define the well identities for each sample, using information in the metadata and manifest files.
(The former stores the Sanger ID:well number, and the latter stores the file name:Sanger ID. 
Hence the need for the contortions below.)
" } else { "
We load in the annotation.
"}
`

```{r, echo=.spike.local, eval=.spike.local}
manifest <- read.csv("manifest.csv", header=TRUE, stringsAsFactors=FALSE)
metadata <- read.csv("metadata.csv", header=TRUE, stringsAsFactors=FALSE)
metadata <-  metadata[match(colnames(all.counts), metadata[,4]),]
manifest <- manifest[match(metadata$Sanger.sample.ID, manifest$SANGER.SAMPLE.ID),]
well.ids <- sub(".*_", "", manifest$SUPPLIER.SAMPLE.NAME)
stopifnot(all(!is.na(well.ids)))
```

```{r, echo=!.spike.local, eval=!.spike.local} 
anno <- read.table("../../../ArrayExpress/E-MTAB-5522.sdrf.txt", header=TRUE, 
    sep="\t", fill=TRUE, check.names=FALSE, stringsAsFactors=FALSE, comment="")
anno <- anno[match(colnames(all.counts), anno[,"Source Name"]),]
```


`r if (.spike.local) { "
The positive control is at A1 (50 cells), while the negative control is at H12 (no cells).
" } else { "
Pulling out the control type for each well.
"}
`

```{r, echo=.spike.local, eval=.spike.local}
control.type <- rep(NA, length(well.ids))
control.type[well.ids=="A01"] <- "+"
control.type[well.ids=="H12"] <- "-"
```

```{r, echo=!.spike.local, eval=!.spike.local}
conanno <- anno[,"Characteristics[single cell well quality]"]
control.type <- character(nrow(anno))
control.type[conanno=="50 cells"] <- "+"
control.type[conanno=="no cell"] <- "-"
```

We also define the grouping for each sample. 
Here we have two factors making up six groups in total.
The first factor is whether an oncogene has been induced in the cell line.
The second factor is how the spike-ins have been added -- ERCCs followed by SIRVs, SIRVs followed by ERCCs, or as a premixed set.

```{r, echo=.spike.local, eval=.spike.local, message=FALSE}
library(simpaler)
ercc.first <- grepl("[A-Z]0[1234]", well.ids) 
sirv.first <- grepl("[A-Z]0[5678]", well.ids)
premixed <- grepl("[A-Z](09|10|11|12)", well.ids) 
colSums(data.frame(ERCC=ercc.first, SIRV=sirv.first, Premixed=premixed))
```

```{r, echo=!.spike.local, eval=!.spike.local, message=FALSE}
library(simpaler)
ercc.first <- anno[,"Factor Value[spike-in addition]"]=="ERCC+SIRV"
sirv.first <- anno[,"Factor Value[spike-in addition]"]=="SIRV+ERCC"
premixed <- anno[,"Factor Value[spike-in addition]"]=="Premixed"
colSums(data.frame(ERCC=ercc.first, SIRV=sirv.first, Premixed=premixed))
```

Now we add all this information together.

```{r}
spike.data <- setupSpikes(all.counts, spike1=is.ercc, spike2=is.sirv, 
    separate=ercc.first|sirv.first, premixed=premixed,
    ercc.first=ercc.first, sirv.first=sirv.first, 
    control.well=control.type)
```

We also add some annotation describing which genes are endogenous, and whether they are mitochondrial.
This may be useful in downstream analyses.

```{r}
spike.data$genes$mouse <- is.mouse
library(TxDb.Mmusculus.UCSC.mm10.ensGene)
chr.loc <- select(TxDb.Mmusculus.UCSC.mm10.ensGene, keys=rownames(spike.data), 
    keytype="GENEID", column="CDSCHROM")
chr.loc <- chr.loc$CDSCHROM[match(rownames(spike.data), chr.loc$GENEID)]
spike.data$genes$is.mito <- chr.loc=="chrM" & !is.na(chr.loc)
```

# Quality control

First we save the full data set for future use.

```{r}
all.data <- spike.data
```

We check the positive and negative controls.
The positive control should be sequenced at the least, and the abundance of endogenous RNA should suppress spike-in coverage (assuming that library quantification was performed).
In contrast, the negative control at H12 uses no cells, which means that the spike-ins will dominate the sequencing output for that library.

```{r}
poscon <- which(spike.data$samples$control.well=="+")
data.frame(Mouse=sum(spike.data$counts[is.mouse,poscon]), 
    ERCC=sum(spike.data$counts[is.ercc,poscon]), 
    SIRV=sum(spike.data$counts[is.sirv,poscon]))
negcon <- which(spike.data$samples$control.well=="-")
data.frame(Mouse=sum(spike.data$counts[is.mouse,negcon]), 
    ERCC=sum(spike.data$counts[is.ercc,negcon]), 
    SIRV=sum(spike.data$counts[is.sirv,negcon]))
spike.data <- spike.data[,-c(poscon, negcon)]
```

We examine the distribution of the sum of counts across all samples, in order to identify samples for which cell capture or library preparation have failed.
First, we compute the count sums for each category of genes.

```{r}
mouse.sums <- colSums(spike.data$counts[spike.data$genes$mouse,])
spike.data$samples$mouse <- mouse.sums
ercc.sums <- spike.data$samples$sum1
sirv.sums <- spike.data$samples$sum2
summary(data.frame(Mouse=mouse.sums, ERCC=ercc.sums, SIRV=sirv.sums))
```

We then have a look at the distributions of the count sums.

```{r counthist, fig.height=5, fig.width=10}
par(mfrow=c(1,3))
hist(mouse.sums, breaks=20, main="Mouse sums", col="grey")
hist(ercc.sums, breaks=20, main="ERCC sums", col="grey")
hist(sirv.sums, breaks=20, main="SIRV sums", col="grey")
```

We remove libraries with outliers in any of the count sums.

```{r, message=FALSE}
library(scater)
keep <- !(isOutlier(mouse.sums, nmad=3, log=TRUE, type="lower") | 
          isOutlier(ercc.sums, nmad=3, log=TRUE, type="lower") | 
          isOutlier(sirv.sums, nmad=3, log=TRUE, type="lower")) 
spike.data <- spike.data[,keep ]
sum(keep)
```

Note that we'll be working with count sums, so we don't bother removing low-abundance genes.

# Exploring the variability of the spike-in ratios

We examine the distribution of the log-ratios for the ERCC to SIRV counts for each combination of factors.
First, we set up the groups in the following manner.

```{r}
spike.type <- 1*spike.data$samples$ercc.first+2*spike.data$samples$sirv.first+3*spike.data$samples$premixed
stopifnot(all(spike.type >= 1 & spike.type <= 3))
grouping <- c("ERCC+SIRV", "SIRV+ERCC", "Premixed")[spike.type]
spike.data$samples$group <- grouping
log.spikes <- spike.data$samples$ratio
```

We then construct histograms for each combination of factors.
Nothing too irregular here, though it seems that the ERCC-first has a couple of outliers.
We'll leave them in just in case they represent genuine variability in addition.

```{r ratiohist, fig.width=12, fig.height=6}
par(mfrow=c(1, 3))
by.group <- split(log.spikes, grouping)
ref <- hist(log.spikes, plot=FALSE, breaks=10)
for (g in names(by.group)) {
    hist(by.group[[g]], main=g, breaks=ref$breaks, ylim=c(0, 10), col="grey")
}
```

We estimate the variance of the separate addition groups.
There's actually a moderate difference due to order of addition.
All in all, though, both values are fairly small, so it's probably safe to pool them together for more precision in the actual analysis.

```{r}
keep <- spike.data$samples$separate
by.order <- diagnoseVariance(spike.data[,keep], grouping[keep])
by.order
```

We fit separate linear models to the samples with separate and premixed spike-in additions.
We then have a look at the variance estimates (or specifically, the standard deviation estimates in `sigma`).
As it turns out, they're pretty similar, which suggests that there is minimal variability from spike-in addition.

```{r}
design <- model.matrix(~0+grouping)
diagnoseVariance(spike.data, spike.data$samples$premixed, design)
```

This can be examined more visually by looking at the distribution of the residuals.
We check whether they're reasonably normal.
Some minor deviation at the tails, perhaps -- but all in all, normality is probably a good approximation.

```{r, fig.width=10, fig.height=5}
keep <- spike.data$samples$separate
sep.fit <- lm.fit(spike.data$samples$ratio[keep], x=design[keep,])
sep.resids <- residuals(sep.fit)
keep <- spike.data$samples$premixed
premix.fit <- lm.fit(spike.data$samples$ratio[keep], x=design[keep,])
premix.resids <- residuals(premix.fit)
par(mfrow=c(1,2))
qqnorm(sep.resids)
qqline(sep.resids)
qqnorm(premix.resids)
qqline(premix.resids)
```

<!--
# Checking the MADs

We can also do this for the median absolute deviation, computed by treating the residual effects as observations.
This provides some protection against any outliers that might be present.
There's no increase here (at all, let alone one that is significant).

```{r}
#sep.mad <- mad(tail(effects(sep.fit), df.residual(sep.fit)))
#premix.mad <- mad(tail(effects(premix.fit), df.residual(premix.fit)))
#sep.mad
#premix.mad
```
-->

# Decomposing the variance components

The decomposition can be done simply using the `decomposeVariances` function.
We can examine the variance of the added (log-)volume, by subtracting the variance of the log-ratios in the premixed experiment from the variance of separate additions.
We also test this difference for significance using a F-test for the equality of variances.

```{r}
out <- decomposeVariance(spike.data, design)
out
```

Note that this only applies when the count sizes are comparable between the separate and premixed additions.
Otherwise, the technical component of the variance will be different between the two estimates.
Fortunately, this seems to be the case here, so we can proceed safely.

```{r}
summary(spike.data$samples$sum1[spike.data$samples$premixed]) # ERCC, premixed
summary(spike.data$samples$sum1[!spike.data$samples$premixed]) # ERCC, separate
summary(spike.data$samples$sum2[spike.data$samples$premixed]) # SIRV, premixed
summary(spike.data$samples$sum2[!spike.data$samples$premixed]) # SIRV, separate
```

<!--
We also calculate the variability of spike-in behaviour between wells.
Most spike-ins are artificial and will behave differently to well-specific conditions.
This results in variable differences in capture efficiency, etc. that has a similar effect to variable addition.

To estimate this variability, we break up the spike-ins into two halves and estimate the ratio between them.
Behaviour should be consistent between transcripts in the same spike-in set, so this variability should only represent technical noise.
With some work, we can figure out the technical variance of each spike-in set (i.e., from sequencing and stochastic capture variability),
and separate this from the behavioural variability, i.e., the variance in the relative capture efficiency between the spike-in sets.

```{r}
out$split
```
-->

# Computing the variability in cellular RNA

For reference, we examine the variability in the spike-ins to the cellular counts.
This easily exceeds the variability due to spike-in addition or behaviour, which suggests that those two factors don't really matter for practical purposes.

```{r}
cellular <- log2(colSums(spike.data$counts[spike.data$genes$mouse,])/spike.data$samples$sum1)
cell.var <- estimateVariance(ratios=cellular, design=design)
cell.var
```

We also compute the variability of each of the spike-in totals across all cells, blocking on the various conditions.
Again, this provides a contrast to each of the individual variance components, as it implies their effect on the normalization across cells is small.

```{r}
ercc.var <- estimateVariance(ratios=log2(spike.data$samples$sum1), design=design)
ercc.var
sirv.var <- estimateVariance(ratios=log2(spike.data$samples$sum2), design=design)
sirv.var
```

# Session information

```{r}
sessionInfo()
```

We collect the results of interest and save them into an object.

```{r}
saveRDS(list(sfERCC.var=ercc.var, sfSIRV.var=sirv.var, cell.var=cell.var,
             ratioERCCfirst.var=by.order$var[["ERCC+SIRV"]], 
             ratioERCCsecond.var=by.order$var[["SIRV+ERCC"]], 
             ratioOrder.sig=by.order$pval[1,2],
             ratioSep.var=out$total, ratioPre.var=out$premixed,
             ratioVol.var=out$volume, ratioVol.sig=out$pval),
        file="results.rds")             
```

We also save the object to file for downstream uses (e.g., plotting).

```{r}
saveRDS(spike.data, file="object.rds")
saveRDS(all.data, file="full.rds")
```