# This examines the technical variability of spike-ins in other data sets.

library(scater)
library(edgeR)
library(simpaler)
is.first <- TRUE
set.seed(9999)

all.values <- list()
for (dataset in c("Islam2",
                  "Buettner", 
                  "Grun", 
                  "Kolodziejczyk", 
#                  "Islam", 
                  "Hashimshony", 
                  "Liora",
#                  "Wilson", 
                  "Calero", 
                  "Scialdone", 
                  "Zeisel")) {
    val <- readRDS(file.path("../datasets", paste0(dataset, ".rds")))
    incoming <- val$counts
    spike.in <- val$spikes
    design <- val$design
    name <- val$name

    # Quality control on the sums of the spike in counts and endogenous transcripts.
    spike.counts <- as.matrix(incoming[spike.in,])
    other.sums <- colSums(incoming) - colSums(spike.counts)
    f <- designAsFactor(design)
    discard <- isOutlier(colSums(spike.counts), log=TRUE, nmads=3, type="lower", batch=f) |
               isOutlier(other.sums, log=TRUE, nmads=3, type="lower", batch=f)

    # Regressing out any blocking effects.
    spike.totals <- log2(colSums(spike.counts[,!discard]))
    fit <- lm.fit(y=spike.totals, x=design[!discard,,drop=FALSE])
    all.values[[name]] <- (2^abs(fit$residuals) - 1)*100
}

pdf("collated_sizefac.pdf", width=10, height=6)
par(mar=c(9.1, 4.1, 4.1, 2.1))
boxplot(all.values, ylab="Absolute deviation from mean (%)", cex.axis=1.2, cex.names=1.2, 
        cex.lab=1.4, col="grey80", las=2, ylim=c(0, 100))
dev.off()