# This examines the technical variability of spike-ins in other data sets.

library(scater)
library(edgeR)
library(simpaler)
is.first <- TRUE
set.seed(9999)

all.values <- list()
for (dataset in c("Islam2",
                  "Buettner", 
                  "Grun", 
                  "Kolodziejczyk", 
#                  "Islam", 
                  "Hashimshony", 
                  "Liora",
#                  "Wilson", 
                  "Calero", 
                  "Scialdone", 
                  "Zeisel")) {
    val <- readRDS(file.path("../datasets", paste0(dataset, ".rds")))
    incoming <- val$counts
    spike.in <- val$spikes
    design <- val$design
    name <- val$name

    # Quality control on the sums of the spike in counts and endogenous transcripts.
    spike.counts <- as.matrix(incoming[spike.in,])
    other.sums <- colSums(incoming) - colSums(spike.counts)
    f <- designAsFactor(design)
    discard <- isOutlier(colSums(spike.counts), log=TRUE, nmads=3, type="lower", batch=f) |
               isOutlier(other.sums, log=TRUE, nmads=3, type="lower", batch=f)

    # Estimating the mean and dispersion.
    d <- DGEList(spike.counts[,!discard])
    redesign <- design[!discard,,drop=FALSE]
    d <- estimateDisp(d, redesign, prior.df=0, trend.method="none")
    fit <- glmFit(d, redesign, prior.count=0)
    fitted <- fit$fitted.values

    # Simulating data and calculating the variance.
    collected <- vector("list", 20)
    for (it in seq_len(20)) { 
        sim.spikes <- matrix(rnbinom(length(fitted), mu=fitted, size=1/d$tagwise.dispersion),
                             nrow=nrow(fitted), ncol=ncol(fitted))
        spike.totals <- colSums(sim.spikes)
        spike.totals <- spike.totals/mean(spike.totals)
        collected[[it]] <- spike.totals
    }

    spike.totals <- colSums(d$counts)
    spike.totals <- spike.totals/mean(spike.totals)
    collected <- do.call(cbind, collected)/spike.totals
    all.values[[name]] <- apply(collected, 1, sd) * 100
}

pdf("collated_sampling.pdf", width=10, height=6)
par(mar=c(9.1, 4.1, 4.1, 2.1))
boxplot(all.values, ylab="Size factor estimation error (%)", cex.axis=1.2, cex.names=1.2, 
        cex.lab=1.4, col="grey80", las=2)
dev.off()