# This code estimates the increase in technical variance with decreasing depth. library(simpaler) getRescalingFunction <- function(d, design) { # Estimating the NB dispersion, fitting a GLM. d <- estimateDisp(d, design, prior.df=0, trend.method="none") fit <- glmFit(d, design) # Calculating new fitted values with a common offset for all cells. fitted <- exp(fit$unshrunk.coefficients %*% t(design) + mean(fit$offset)) fitted[is.na(fitted)] <- 0 # Obtaining non-NA log-means and log-dispersions for trend fitting. means <- rowMeans(fitted) disp <- fit$dispersion lmean <- log(means) ldisp <- log(disp) keep <- is.finite(lmean) lmean <- lmean[keep] ldisp <- ldisp[keep] trend <- loess(ldisp ~ lmean, degree=1) # Returning a function to compute statistics given a downscaling factor. function(scale) { new.fitted <- fitted*scale new.means <- rowMeans(new.fitted) lnew.means <- log(new.means)[keep] new.disp <- rep(NA_real_, length(disp)) new.disp[keep] <- exp(predict(trend, data.frame(lmean = lnew.means)) + residuals(trend)) # Defining the upper bound as the largest observed variance # (or the left edge of the trend, whichever is largest). upper.bound <- exp(max(c(ldisp, fitted(trend)))) new.disp <- pmin(upper.bound, new.disp) # predict.loess doesn't extrapolate, so we set NA values to the upper bound. new.disp[is.na(new.disp)] <- upper.bound return(list(fitted=new.fitted, disp=new.disp)) } } # pdf("diagnostic.pdf") # plot(lmean, ldisp, xlab="Log-mean", ylab="Log-Dispersion") # o <- order(lmean) # lines(lmean[o], fitted(trend)[o], col="red") # out <- FUN(0.1) # points(log(rowMeans(out$fitted)), log(out$disp), col="blue", pch=16, cex=0.5) # dev.off() pdf("depth_effect.pdf") par(mar=c(5.1, 5.1, 4.1, 2.1)) for (operator in c("Calero", "Liora")) { if (operator=="Calero") { datasets <- c("trial_20160113", "trial_20160325") } else if (operator=="Liora") { datasets <- c("test_20160906", "test_20170201") } collected.cov <- collected.est <- collected.se <- list() all.props <- c(1, 0.8, 0.6, 0.4, 0.2, 0.1, 0.05, 0.02, 0.01) for (dataset in datasets) { loadin <- readRDS(file.path("../", operator, dataset, "analysis/object.rds")) keep <- loadin$samples$premixed my.counts <- loadin$counts[,keep] if (operator=="Liora") { design <- matrix(1, nrow=ncol(my.counts)) top <- "TSC" } else { induction <- factor(loadin$samples$induced[keep]) design <- model.matrix(~induction) top <- "416B" } library(edgeR) d <- DGEList(my.counts[grepl("ERCC|SIRV", rownames(my.counts)),]) # Setting offsets for each transcript, based on the set of origin. is.ercc <- grepl("ERCC", rownames(d)) is.sirv <- !is.ercc totals <- list(colSums(d$counts[is.ercc,,drop=FALSE]), colSums(d$counts[is.sirv,,drop=FALSE])) offsets <- do.call(rbind, totals[1 + is.sirv]) d <- scaleOffset(d, log(offsets)) FUN <- getRescalingFunction(d, design) # Simulating to estimate the effect of depth on the variability of ratios. # Note that we assume the offsets are true when estimating the dispersions. # This means dispersions may be underestimated for high-abundance genes (and overestimated for low-abundance genes). set.seed(1000) collected <- list() for (p in seq_along(all.props)) { prop <- all.props[p] to.collect <- list() for (it in seq_len(20)) { new.spikes <- FUN(prop) ercc.fitted <- new.spikes$fitted[is.ercc,,drop=FALSE] ercc.disp <- new.spikes$disp[is.ercc] sim.ercc <- matrix(rnbinom(length(ercc.fitted), mu=ercc.fitted, size=1/ercc.disp), nrow=nrow(ercc.fitted), ncol=ncol(ercc.fitted)) sirv.fitted <- new.spikes$fitted[is.sirv,,drop=FALSE] sirv.disp <- new.spikes$disp[is.sirv] sim.sirv <- matrix(rnbinom(length(sirv.fitted), mu=sirv.fitted, size=1/sirv.disp), nrow=nrow(sirv.fitted), ncol=ncol(sirv.fitted)) ratios <- log2(colSums(sim.ercc)/colSums(sim.sirv)) if (any(!is.finite(ratios))) { warning("skipping iteration with non-finite ratios") next } to.collect[[it]] <- estimateVariance(ratios=ratios, design=design) } to.collect <- unlist(to.collect) collected[[p]] <- to.collect } collected.cov[[dataset]] <- mean(colSums(d$counts)) collected.est[[dataset]] <- sapply(collected, mean) collected.se[[dataset]] <- sapply(collected, sd)/sqrt(lengths(collected)) } upper <- mapply("+", collected.est, collected.se, SIMPLIFY=FALSE) ylow <- min(sapply(collected.est, min)) yhigh <- max(sapply(upper, max)) xlow <- min(sapply(collected.cov, min)) xhigh <- max(sapply(collected.cov, max)) collected.cov <- lapply(collected.cov, FUN=function(x) { x * all.props/1000 }) # Making a plot. cols <- c("black", "grey70") plot(collected.cov[[1]], collected.est[[1]], xlab=expression("Total spike-in counts, ERCC + SIRV ("*10^3*")"), ylab=expression("Variance of"~log[2]~"[ERCC/SIRV]"), ylim=c(ylow, yhigh), pch=16, log="y", col=cols[1], cex.axis=1.2, cex.lab=1.4) points(collected.cov[[2]], collected.est[[2]], pch=17, col=cols[2]) for (x in seq_along(collected.est)) { lines(collected.cov[[x]], collected.est[[x]], col=cols[x]) segments(collected.cov[[x]], collected.est[[x]], y1=upper[[x]], col=cols[x]) segments(collected.cov[[x]]-1, upper[[x]], collected.cov[[x]]+1, upper[[x]], col=cols[x]) } legend("topright", col=cols, lwd=2, pch=16:17, sprintf("%s (%s)", top, c("I", "II")), cex=1.2) # Saving results somewhere. for (x in seq_along(datasets)) { write.table(file=paste0(operator, "_", datasets[x], ".txt"), data.frame(Coverage=collected.cov[[x]], Variance=collected.est[[x]], SE=collected.se[[x]]), sep="\t", quote=FALSE, row.names=FALSE) } } dev.off()