###############################################################

dir.create("pics")
names <- c("416B (I)", "416B (II)", "TSC (I)", "TSC (II)")
exp.cols <- c("orange", "purple", "grey80")
spike.cols <- c("grey30", "grey70")
batch.cols <- c("blue", "lightblue", "red", "pink")

###############################################################
# Making a barplot of the variance estimates.

require(simpaler)
total <- premixed <- ERCC.first <- SIRV.first <- volume <- ERCC.sf <- SIRV.sf <- cell <- list()
total.err <- premixed.err <- ERCC.first.err <- SIRV.first.err <- volume.err <- ERCC.sf.err <- SIRV.sf.err <- cell.err <- list()
total.df <- premixed.df <- ERCC.first.df <- SIRV.first.df <- volume.df <- ERCC.sf.df <- SIRV.sf.df <- cell.df <- list()
index <- 1L

for (operator in c("Calero", "Liora")) {
    if (operator=="Calero") {
        datasets <- c("trial_20160113", "trial_20160325")
    } else if (operator=="Liora") {
        datasets <- c("test_20160906", "test_20170201")
    }
    
    for (dataset in datasets) {
        out <- readRDS(file.path(operator, dataset, "analysis", "results.rds"))
        total[[index]] <- out$ratioSep.var
        total.err[[index]] <- attributes(out$ratioSep.var)$standard.error
        total.df[[index]] <- attributes(out$ratioSep.var)$df
        
        premixed[[index]] <- out$ratioPre.var
        premixed.err[[index]] <- attributes(out$ratioPre.var)$standard.error
        premixed.df[[index]] <- attributes(out$ratioPre.var)$df
        
        volume[[index]] <- out$ratioVol.var
        volume.err[[index]] <- attributes(out$ratioVol.var)$standard.error
        volume.df[[index]] <- attributes(out$ratioVol.var)$df
        
        ERCC.sf[[index]] <- out$sfERCC.var
        ERCC.sf.err[[index]] <- attributes(out$sfERCC.var)$standard.error
        ERCC.sf.df[[index]] <- attributes(out$sfERCC.var)$df
        
        SIRV.sf[[index]] <- out$sfSIRV.var
        SIRV.sf.err[[index]] <- attributes(out$sfSIRV.var)$standard.error
        SIRV.sf.df[[index]] <- attributes(out$sfSIRV.var)$df

        cell[[index]] <- out$cell.var
        cell.err[[index]] <- attributes(out$cell.var)$standard.error
        cell.df[[index]] <- attributes(out$cell.var)$df
        
        ERCC.first[[index]] <- out$ratioERCCfirst.var 
        ERCC.first.err[[index]] <- attributes(out$ratioERCCfirst.var)$standard.error
        ERCC.first.df[[index]] <- attributes(out$ratioERCCfirst.var)$df
        
        SIRV.first[[index]] <- out$ratioERCCsecond.var
        SIRV.first.err[[index]] <- attributes(out$ratioERCCsecond.var)$standard.error
        SIRV.first.df[[index]] <- attributes(out$ratioERCCsecond.var)$df
        index <- index + 1L
    }
}

pdf("pics/variance_exp.pdf", width=14, height=7)
par(mar=c(5.1, 5.1, 2.1, 2.1), mfrow=c(1,2), xpd=TRUE)

# First making a plot of separate vs premixed.

final <- rbind(Separate=unlist(total), Premixed=unlist(premixed), Volume=unlist(volume))
colnames(final) <- names
final.err <- rbind(Separate=unlist(total.err), Premixed=unlist(premixed.err), Volume=unlist(volume.err))
upper.limit <- final  + final.err
final.df <- rbind(Separate=unlist(total.df), Premixed=unlist(premixed.df), Volume=unlist(volume.df))

ylim <- 0.03
out <- barplot(final, beside=TRUE, ylab=expression("Variance of"~log[2]~"[ERCC/SIRV]"), 
               cex.axis=1.2, cex.lab=1.4, cex.names=1.4, col=exp.cols, ylim=c(0, ylim))
segments(out, final, y1=upper.limit)
segments(out-0.1, upper.limit, out+0.1)
text(out[-3,], upper.limit[-3,], final.df[-3,], pos=3, cex=0.9, offset=0.2)
legend(out[1]-0.5, ylim, fill=exp.cols, rownames(final), cex=1.2)
curcoords <- par()$usr
mtext("a", line=0, cex=1.5, at=curcoords[1] - 0.14*(curcoords[2] - curcoords[1]), font=2)

final <- rbind(ERCC=unlist(ERCC.sf), SIRV=unlist(SIRV.sf))
colnames(final) <- names
final.err <- rbind(ERCC=unlist(ERCC.sf.err), SIRV=unlist(SIRV.sf.err))
upper.limit <- final  + final.err
final.df <- rbind(ERCC=unlist(ERCC.sf.df), SIRV=unlist(SIRV.sf.df))

# Making a plot of the size factors.

out <- barplot(final, beside=TRUE, ylab=expression("Variance of"~log[2]~"size factors"), 
        cex.axis=1.2, cex.lab=1.4, cex.names=1.4, col=spike.cols, ylim=c(0, 0.3))
segments(out, final, y1=upper.limit)
segments(out-0.1, upper.limit, out+0.1)
text(colMeans(out), apply(upper.limit, 2, max), final.df[1,], pos=3, cex=0.9, offset=0.2)
legend("topright", fill=spike.cols, rownames(final), cex=1.2)
curcoords <- par()$usr
mtext("b", line=0, cex=1.5, at=curcoords[1] - 0.14*(curcoords[2] - curcoords[1]), font=2)
dev.off()

# Ordering the elements.

pdf("pics/variance_order.pdf", width=9, height=7)
par(mar=c(5.1, 5.1, 2.1, 10.1), xpd=TRUE)

final <- rbind("ERCC+SIRV"=unlist(ERCC.first), "SIRV+ERCC"=unlist(SIRV.first))
colnames(final) <- names
final.err <- rbind("ERCC+SIRV"=unlist(ERCC.first.err), "SIRV+ERCC"=unlist(SIRV.first.err))
upper.limit <- final  + final.err
final.df <- rbind("ERCC+SIRV"=unlist(ERCC.first.df), "SIRV+ERCC"=unlist(SIRV.first.df))

out <- barplot(final, beside=TRUE, ylab=expression("Variance of"~log[2]~"[ERCC/SIRV]"), 
               cex.axis=1.2, cex.lab=1.4, cex.names=1.4, col=spike.cols, ylim=c(0, max(upper.limit)))
segments(out, final, y1=upper.limit)
segments(out-0.1, upper.limit, out+0.1)
text(out, upper.limit, final.df, pos=3, cex=0.9, offset=0.2)
legend(max(out)+0.5, max(final), fill=spike.cols, rownames(final), cex=1.2)
dev.off()

# Making a plot of the ratios of mouse totals to the size factors.
pdf("pics/variance_cell.pdf")
par(mar=c(5.1, 5.1, 2.1, 2.1), xpd=TRUE)
final <- unlist(cell)
names(final) <- names
final.err <- unlist(cell.err)
upper.limit <- final  + final.err
final.df <- unlist(cell.df)

out <- barplot(final, beside=TRUE, ylab=expression("Variance of"~log[2]~"[mouse/ERCC]"), 
               cex.axis=1.2, cex.lab=1.4, cex.names=1.4, col="grey80", 
               ylim=c(0, max(upper.limit)))
segments(out, final, y1=upper.limit)
segments(out-0.1, upper.limit, out+0.1)
text(out, upper.limit, final.df, pos=3, cex=0.9, offset=0.2)
dev.off()

###############################################################
# Checking normality of the ratios.

separate <- premixed <- list()
index <- 1L
for (operator in c("Calero", "Liora")) {
    if (operator=="Calero") {
        datasets <- c("trial_20160113", "trial_20160325")
    } else if (operator=="Liora") {
        datasets <- c("test_20160906", "test_20170201")
    }

    for (dataset in datasets) {
        y <- readRDS(file.path(operator, dataset, "analysis", "object.rds"))

        sep <- y$samples$separate
        sep.groups <- factor(y$samples$group[sep])
        if (nlevels(sep.groups)==1L) { 
            design <- cbind(rep(1, length(sep.groups)))
        } else {
            design <- model.matrix(~sep.groups)
        }
        fit <- lm.fit(y$samples$ratio[sep], x=design)
        separate[[index]] <- sort(fit$residuals)

        pre <- y$samples$premixed
        pre.groups <- factor(y$samples$group[pre])
        if (nlevels(pre.groups)==1L) { 
            design <- cbind(rep(1, length(pre.groups)))
        } else {
            design <- model.matrix(~pre.groups)
        }
        fit <- lm.fit(y$samples$ratio[pre], x=design)
        premixed[[index]] <- sort(fit$residuals)
        
        index <- index + 1L
    }
}

pdf("pics/qq_separate.pdf")
collected <- list()
xlim <- ylim <- c(0, 0)
for (index in seq_along(separate)) {
    out <- qqnorm(separate[[index]], plot.it=FALSE)
    out$y <- out$y - mean(out$y)
    out$y <- out$y/sd(out$y)
    collected[[index]] <- out
    xlim <- pmax(xlim, abs(range(out$x)))
    ylim <- pmax(ylim, abs(range(out$y)))
}

plot(0,0,type="n", xlim=c(-xlim[1], xlim[2]), ylim=c(-ylim[1], ylim[2]), main="Separate addition", cex.main=1.4, 
     xlab="Theoretical quantiles", ylab="Sample quantiles", cex.axis=1.2, cex.lab=1.4)
abline(a=0, b=1, col="black", lwd=2, lty=2)
for (index in seq_along(collected)) {
    out <- collected[[index]]
    points(out$x, out$y, col=batch.cols[index], pch=16)
    lines(out$x, out$y, col=batch.cols[index], lwd=2)
}
legend("bottomright", col=batch.cols, lwd=2, pch=16, legend=names, cex=1.2)
dev.off()

pdf("pics/qq_premixed.pdf")
collected <- list()
xlim <- ylim <- c(0, 0)
for (index in seq_along(premixed)) {
    out <- qqnorm(premixed[[index]], plot.it=FALSE)
    out$y <- out$y - mean(out$y)
    out$y <- out$y/sd(out$y)
    collected[[index]] <- out
    xlim <- pmax(xlim, abs(range(out$x)))
    ylim <- pmax(ylim, abs(range(out$y)))
}

plot(0,0,type="n", xlim=c(-xlim[1], xlim[2]), ylim=c(-ylim[1], ylim[2]), main="Premixed addition", cex.main=1.4, 
     xlab="Theoretical quantiles", ylab="Sample quantiles", cex.axis=1.2, cex.lab=1.4)
abline(a=0, b=1, col="black", lwd=2, lty=2)
for (index in seq_along(collected)) {
    out <- collected[[index]]
    points(out$x, out$y, col=batch.cols[index], pch=16)
    lines(out$x, out$y, col=batch.cols[index], lwd=2)
}
dev.off()

pdf("pics/dist_separate.pdf")
xrange <- range(sapply(separate, range)) * 1.5
plot(0,0,type="n", xlim=xrange, ylim=c(0, 6), main="Separate addition", cex.main=1.4, 
     xlab=expression("Residual log"[2]*"-ratio [ERCC/SIRV]"), ylab="Density", cex.axis=1.2, cex.lab=1.4)
for (index in seq_along(separate)) {
    d <- density(separate[[index]], adjust=2)
    lines(d$x, d$y, col=batch.cols[index], lwd=2)
}
dev.off()

pdf("pics/dist_premixed.pdf")
xrange <- range(sapply(premixed, range)) * 1.5
plot(0,0,type="n", xlim=xrange, ylim=c(0, 6), main="Premixed addition", cex.main=1.4, 
     xlab=expression("Residual log"[2]*"-ratio [ERCC/SIRV]"), ylab="Density", cex.axis=1.2, cex.lab=1.4)
for (index in seq_along(premixed)) {
    d <- density(premixed[[index]], adjust=2)
    lines(d$x, d$y, col=batch.cols[index], lwd=2)
}
dev.off()

###############################################################
# Checking that the sums for the spike-ins are the same between premixed and separate additions.

erccs <- sirvs <- list()
erccs.prop <- sirvs.prop <- list()
index <- 1L
for (operator in c("Calero", "Liora")) {
    if (operator=="Calero") {
        datasets <- c("trial_20160113", "trial_20160325")
    } else if (operator=="Liora") {
        datasets <- c("test_20160906", "test_20170201")
    }

    for (dataset in datasets) {
        y <- readRDS(file.path(operator, dataset, "analysis", "object.rds"))
        erccs[[index]] <- y$samples$sum1[y$samples$separate]/1e3
        sirvs[[index]] <- y$samples$sum2[y$samples$separate]/1e3
        erccs.prop[[index]] <- y$samples$sum1[y$samples$separate]/y$samples$lib.size[y$samples$separate]*100
        sirvs.prop[[index]] <- y$samples$sum2[y$samples$separate]/y$samples$lib.size[y$samples$separate]*100
        index <- index + 1L
        erccs[[index]] <- y$samples$sum1[y$samples$premixed]/1e3
        sirvs[[index]] <- y$samples$sum2[y$samples$premixed]/1e3
        erccs.prop[[index]] <- y$samples$sum1[y$samples$premixed]/y$samples$lib.size[y$samples$premixed]*100
        sirvs.prop[[index]] <- y$samples$sum2[y$samples$premixed]/y$samples$lib.size[y$samples$premixed]*100
        index <- index + 1L
    }
}

pdf("pics/total_ercc.pdf")
par(mar=c(5.1, 5.1, 4.1, 2.1))
my.cols <- exp.cols[1:2]
boxplot(erccs, at=rep(c(-0.5, 0.5), 4) + rep(1:4*3, each=2), xaxt="n", log="y", 
        ylab=expression("Total ERCC read count ("*10^3*")"),
        col=rep(my.cols, 4), cex.axis=1.2, cex.lab=1.4)
axis(1, at=1:4*3, names, cex.axis=1.1)
legend("topright", fill=my.cols, legend=c("Separate", "Premixed"), cex=1.2)
dev.off()

pdf("pics/prop_ercc.pdf")
par(mar=c(5.1, 5.1, 4.1, 2.1))
boxplot(erccs.prop, at=rep(c(-0.5, 0.5), 4) + rep(1:4*3, each=2), xaxt="n", log="y", 
        ylab="ERCC proportion (%)", col=rep(my.cols, 4), cex.axis=1.2, cex.lab=1.4)
axis(1, at=1:4*3, names, cex.axis=1.1)
legend("topright", fill=my.cols, legend=c("Separate", "Premixed"), cex=1.2)
dev.off()

pdf("pics/total_sirv.pdf")
par(mar=c(5.1, 5.1, 4.1, 2.1))
boxplot(sirvs, at=rep(c(-0.5, 0.5), 4) + rep(1:4*3, each=2), xaxt="n", log="y", 
        ylab=expression("Total SIRV read count ("*10^3*")"),
        col=rep(my.cols, 4), cex.axis=1.2, cex.lab=1.4)
axis(1, at=1:4*3, names, cex.axis=1.1)
dev.off()

pdf("pics/prop_sirv.pdf")
par(mar=c(5.1, 5.1, 4.1, 2.1))
boxplot(sirvs.prop, at=rep(c(-0.5, 0.5), 4) + rep(1:4*3, each=2), xaxt="n", log="y", 
        ylab="SIRV proportion (%)", col=rep(my.cols, 4), cex.axis=1.2, cex.lab=1.4)
axis(1, at=1:4*3, names, cex.axis=1.1)
dev.off()

#############################################################
# Also plotting the abundances of the various transcripts.

library(edgeR)
index <- 1L
collected.ercc <- list()
collected.sirv <- list()
collected.mouse <- list()
for (operator in c("Calero", "Liora")) {
    if (operator=="Calero") {
        datasets <- c("trial_20160113", "trial_20160325")
    } else if (operator=="Liora") {
        datasets <- c("test_20160906", "test_20170201")
    }
    
    for (dataset in datasets) {
        y <- readRDS(file.path(operator, dataset, "analysis", "object.rds"))
        collected.ercc[[index]] <- setNames(rowMeans(y$counts[y$genes$spike1,]), rownames(y$counts)[y$genes$spike1]) #keeping the name.
        collected.sirv[[index]] <- rowMeans(y$counts[y$genes$spike2,])
        discard <- y$genes$spike1 | y$genes$spike2 | rowSums(y$counts)==0 # Removing spike-ins and non-expressed features.
        collected.mouse[[index]] <- rowMeans(y$counts[!discard,])
        index <- index+1L
    }
}

lFUN <- function(x) { log2(x + 1) }
lcollected.ercc <- lapply(collected.ercc, lFUN)
lcollected.sirv <- lapply(collected.sirv, lFUN)
lcollected.mouse <- lapply(collected.mouse, lFUN)

pdf("pics/feature_abundances.pdf")
my.cols <- c(spike.cols, "grey90")
par(mar=c(5.1, 5.1, 4.1, 2.1))
boxplot(c(lcollected.ercc, lcollected.sirv, lcollected.mouse), col=rep(my.cols, each=4),
        at=cumsum(1 + rep(c(1,0,0,0), 3)), ylab=expression("Log"[2]~"average count"),
        cex.axis=1.2, cex.lab=1.4, names=rep(names, 3), las=2, range=0)
legend("topleft", fill=my.cols, legend=c("ERCC", "SIRV", "Mouse"))
dev.off()

# Plotting against the theoretical concentrations of ERCCs, obtained from
# https://tools.thermofisher.com/content/sfs/manuals/cms_095046.txt

spike.conc <- read.table("https://tools.thermofisher.com/content/sfs/manuals/cms_095046.txt", header=TRUE, row.names=1, check.names=FALSE, sep="\t")
spike.conc <- setNames(spike.conc[,"concentration in Mix 1 (attomoles/ul)"], spike.conc[,"ERCC ID"])

all.yranges <- range(sapply(collected.ercc, FUN=function(x) { range(x[x>0]) }))
all.xranges <- range(spike.conc[spike.conc>0])

pdf("pics/against_theoretical.pdf")
for (x in seq_along(collected.ercc)) {
    observed <- collected.ercc[[x]]
    expected <- spike.conc[names(observed)]
    get.x <- x > 2
    get.y <- x%%2==1
    plot(expected, observed, log="xy", 
         xlab=ifelse(get.x, expression("Theoretical concentration (attomoles/"*mu*"l)"), expression("")),
         ylab=ifelse(get.y, "Observed average read count", ""),
         pch=16, cex.axis=1.2, cex.lab=1.4, main=names[x], cex.main=1.4, ylim=all.yranges, xlim=all.xranges,
         yaxt=ifelse(get.y, "s", "n"), xaxt=ifelse(get.x, "s", "n"))

    # Plotting a line of best fit.
    ly <- log10(observed/expected)
    keep <- is.finite(ly)
    fit <- lm(ly[keep]~1)
    grad <- 10^coef(fit)
    pt.x <- seq(from=min(expected)/10, to=max(expected)*10, length.out=100)
    lines(x=pt.x, y=pt.x*grad, col="red")
}

dev.off()