--- title: Examining the variability of spike-in addition author: Aaron Lun date: 6 September 2016 output: html_document: fig_caption: no --- ```{r, echo=FALSE, results='hide'} knitr::opts_chunk$set(fig.path="figure-tech/", error=FALSE, message=FALSE, warning=FALSE) options(width=100) if (!exists(".spike.local")) { .spike.local <- TRUE } ``` # Introduction Here, we examine the variability of spike-in addition using the mixture experiment. First, we load the counts for all genes and define their categories, i.e., mouse cellular, ERCC or SIRV spike-ins. ```{r} all.counts <- read.table("genic_counts.tsv", header=TRUE, row.names=1, comment="", check.names=FALSE) gene.lengths <- all.counts[,1] all.counts <- all.counts[,-1] is.ercc <- grepl("ERCC", rownames(all.counts)) is.sirv <- grepl("SIRV", rownames(all.counts)) is.mouse <- !is.ercc & !is.sirv colSums(data.frame(ERCC=sum(is.ercc), SIRV=sum(is.sirv), Mouse=sum(is.mouse))) ``` `r if (.spike.local) { " We define the well identities for each sample, using information in the metadata and manifest files. (The former stores the Sanger ID:well number, and the latter stores the file name:Sanger ID. Hence the need for the contortions below.) " } else { " We load in the annotation. "} ` ```{r, echo=.spike.local, eval=.spike.local} manifest <- read.csv("manifest.csv", header=TRUE, stringsAsFactors=FALSE) metadata <- read.csv("metadata.csv", header=TRUE, stringsAsFactors=FALSE) metadata <- metadata[match(colnames(all.counts), metadata[,4]),] manifest <- manifest[match(metadata$Sanger.sample.ID, manifest$SANGER.SAMPLE.ID),] well.ids <- sub(".*_", "", manifest$SUPPLIER.SAMPLE.NAME) stopifnot(all(!is.na(well.ids))) ``` ```{r, echo=!.spike.local, eval=!.spike.local} anno <- read.table("../../../ArrayExpress/E-MTAB-5522.sdrf.txt", header=TRUE, sep="\t", fill=TRUE, check.names=FALSE, stringsAsFactors=FALSE, comment="") anno <- anno[match(colnames(all.counts), anno[,"Source Name"]),] ``` `r if (.spike.local) { " The positive control is at A1 (50 cells), while the negative control is at H12 (no cells). " } else { " Pulling out the control type for each well. "} ` ```{r, echo=.spike.local, eval=.spike.local} control.type <- rep(NA, length(well.ids)) control.type[well.ids=="A01"] <- "+" control.type[well.ids=="H12"] <- "-" ``` ```{r, echo=!.spike.local, eval=!.spike.local} conanno <- anno[,"Characteristics[single cell well quality]"] control.type <- character(nrow(anno)) control.type[conanno=="50 cells"] <- "+" control.type[conanno=="no cell"] <- "-" ``` We also define the grouping for each sample. Here we have two factors making up six groups in total. The first factor is whether an oncogene has been induced in the cell line. The second factor is how the spike-ins have been added -- ERCCs followed by SIRVs, SIRVs followed by ERCCs, or as a premixed set. ```{r, echo=.spike.local, eval=.spike.local, message=FALSE} library(simpaler) ercc.first <- grepl("[A-Z]0[1234]", well.ids) sirv.first <- grepl("[A-Z]0[5678]", well.ids) premixed <- grepl("[A-Z](09|10|11|12)", well.ids) colSums(data.frame(ERCC=ercc.first, SIRV=sirv.first, Premixed=premixed)) ``` ```{r, echo=!.spike.local, eval=!.spike.local, message=FALSE} library(simpaler) ercc.first <- anno[,"Factor Value[spike-in addition]"]=="ERCC+SIRV" sirv.first <- anno[,"Factor Value[spike-in addition]"]=="SIRV+ERCC" premixed <- anno[,"Factor Value[spike-in addition]"]=="Premixed" colSums(data.frame(ERCC=ercc.first, SIRV=sirv.first, Premixed=premixed)) ``` Now we add all this information together. ```{r} spike.data <- setupSpikes(all.counts, spike1=is.ercc, spike2=is.sirv, separate=ercc.first|sirv.first, premixed=premixed, ercc.first=ercc.first, sirv.first=sirv.first, control.well=control.type) ``` We also add some annotation describing which genes are endogenous, and whether they are mitochondrial. This may be useful in downstream analyses. ```{r} spike.data$genes$mouse <- is.mouse library(TxDb.Mmusculus.UCSC.mm10.ensGene) chr.loc <- select(TxDb.Mmusculus.UCSC.mm10.ensGene, keys=rownames(spike.data), keytype="GENEID", column="CDSCHROM") chr.loc <- chr.loc$CDSCHROM[match(rownames(spike.data), chr.loc$GENEID)] spike.data$genes$is.mito <- chr.loc=="chrM" & !is.na(chr.loc) ``` # Quality control First we save the full data set for future use. ```{r} all.data <- spike.data ``` We check the positive and negative controls. The positive control should be sequenced at the least, and the abundance of endogenous RNA should suppress spike-in coverage (assuming that library quantification was performed). In contrast, the negative control at H12 uses no cells, which means that the spike-ins will dominate the sequencing output for that library. ```{r} poscon <- which(spike.data$samples$control.well=="+") data.frame(Mouse=sum(spike.data$counts[is.mouse,poscon]), ERCC=sum(spike.data$counts[is.ercc,poscon]), SIRV=sum(spike.data$counts[is.sirv,poscon])) negcon <- which(spike.data$samples$control.well=="-") data.frame(Mouse=sum(spike.data$counts[is.mouse,negcon]), ERCC=sum(spike.data$counts[is.ercc,negcon]), SIRV=sum(spike.data$counts[is.sirv,negcon])) spike.data <- spike.data[,-c(poscon, negcon)] ``` We examine the distribution of the sum of counts across all samples, in order to identify samples for which cell capture or library preparation have failed. First, we compute the count sums for each category of genes. ```{r} mouse.sums <- colSums(spike.data$counts[spike.data$genes$mouse,]) spike.data$samples$mouse <- mouse.sums ercc.sums <- spike.data$samples$sum1 sirv.sums <- spike.data$samples$sum2 summary(data.frame(Mouse=mouse.sums, ERCC=ercc.sums, SIRV=sirv.sums)) ``` We then have a look at the distributions of the count sums. ```{r counthist, fig.height=5, fig.width=10} par(mfrow=c(1,3)) hist(mouse.sums, breaks=20, main="Mouse sums", col="grey") hist(ercc.sums, breaks=20, main="ERCC sums", col="grey") hist(sirv.sums, breaks=20, main="SIRV sums", col="grey") ``` We remove libraries with outliers in any of the count sums. ```{r, message=FALSE} library(scater) keep <- !(isOutlier(mouse.sums, nmad=3, log=TRUE, type="lower") | isOutlier(ercc.sums, nmad=3, log=TRUE, type="lower") | isOutlier(sirv.sums, nmad=3, log=TRUE, type="lower")) spike.data <- spike.data[,keep ] sum(keep) ``` Note that we'll be working with count sums, so we don't bother removing low-abundance genes. # Exploring the variability of the spike-in ratios We examine the distribution of the log-ratios for the ERCC to SIRV counts for each combination of factors. First, we set up the groups in the following manner. ```{r} spike.type <- 1*spike.data$samples$ercc.first+2*spike.data$samples$sirv.first+3*spike.data$samples$premixed stopifnot(all(spike.type >= 1 & spike.type <= 3)) grouping <- c("ERCC+SIRV", "SIRV+ERCC", "Premixed")[spike.type] spike.data$samples$group <- grouping log.spikes <- spike.data$samples$ratio ``` We then construct histograms for each combination of factors. Nothing too irregular here, though it seems that the ERCC-first has a couple of outliers. We'll leave them in just in case they represent genuine variability in addition. ```{r ratiohist, fig.width=12, fig.height=6} par(mfrow=c(1, 3)) by.group <- split(log.spikes, grouping) ref <- hist(log.spikes, plot=FALSE, breaks=10) for (g in names(by.group)) { hist(by.group[[g]], main=g, breaks=ref$breaks, ylim=c(0, 10), col="grey") } ``` We estimate the variance of the separate addition groups. There's actually a moderate difference due to order of addition. All in all, though, both values are fairly small, so it's probably safe to pool them together for more precision in the actual analysis. ```{r} keep <- spike.data$samples$separate by.order <- diagnoseVariance(spike.data[,keep], grouping[keep]) by.order ``` We fit separate linear models to the samples with separate and premixed spike-in additions. We then have a look at the variance estimates (or specifically, the standard deviation estimates in `sigma`). As it turns out, they're pretty similar, which suggests that there is minimal variability from spike-in addition. ```{r} design <- model.matrix(~0+grouping) diagnoseVariance(spike.data, spike.data$samples$premixed, design) ``` This can be examined more visually by looking at the distribution of the residuals. We check whether they're reasonably normal. Some minor deviation at the tails, perhaps -- but all in all, normality is probably a good approximation. ```{r, fig.width=10, fig.height=5} keep <- spike.data$samples$separate sep.fit <- lm.fit(spike.data$samples$ratio[keep], x=design[keep,]) sep.resids <- residuals(sep.fit) keep <- spike.data$samples$premixed premix.fit <- lm.fit(spike.data$samples$ratio[keep], x=design[keep,]) premix.resids <- residuals(premix.fit) par(mfrow=c(1,2)) qqnorm(sep.resids) qqline(sep.resids) qqnorm(premix.resids) qqline(premix.resids) ``` # Decomposing the variance components The decomposition can be done simply using the `decomposeVariances` function. We can examine the variance of the added (log-)volume, by subtracting the variance of the log-ratios in the premixed experiment from the variance of separate additions. We also test this difference for significance using a F-test for the equality of variances. ```{r} out <- decomposeVariance(spike.data, design) out ``` Note that this only applies when the count sizes are comparable between the separate and premixed additions. Otherwise, the technical component of the variance will be different between the two estimates. Fortunately, this seems to be the case here, so we can proceed safely. ```{r} summary(spike.data$samples$sum1[spike.data$samples$premixed]) # ERCC, premixed summary(spike.data$samples$sum1[!spike.data$samples$premixed]) # ERCC, separate summary(spike.data$samples$sum2[spike.data$samples$premixed]) # SIRV, premixed summary(spike.data$samples$sum2[!spike.data$samples$premixed]) # SIRV, separate ``` # Computing the variability in cellular RNA For reference, we examine the variability in the spike-ins to the cellular counts. This easily exceeds the variability due to spike-in addition or behaviour, which suggests that those two factors don't really matter for practical purposes. ```{r} cellular <- log2(colSums(spike.data$counts[spike.data$genes$mouse,])/spike.data$samples$sum1) cell.var <- estimateVariance(ratios=cellular, design=design) cell.var ``` We also compute the variability of each of the spike-in totals across all cells, blocking on the various conditions. Again, this provides a contrast to each of the individual variance components, as it implies their effect on the normalization across cells is small. ```{r} ercc.var <- estimateVariance(ratios=log2(spike.data$samples$sum1), design=design) ercc.var sirv.var <- estimateVariance(ratios=log2(spike.data$samples$sum2), design=design) sirv.var ``` # Session information ```{r} sessionInfo() ``` We collect the results of interest and save them into an object. ```{r} saveRDS(list(sfERCC.var=ercc.var, sfSIRV.var=sirv.var, cell.var=cell.var, ratioERCCfirst.var=by.order$var[["ERCC+SIRV"]], ratioERCCsecond.var=by.order$var[["SIRV+ERCC"]], ratioOrder.sig=by.order$pval[1,2], ratioSep.var=out$total, ratioPre.var=out$premixed, ratioVol.var=out$volume, ratioVol.sig=out$pval), file="results.rds") ``` We also save the object to file for downstream uses (e.g., plotting). ```{r} saveRDS(spike.data, file="object.rds") saveRDS(all.data, file="full.rds") ```