################################################################################# # This performs a differential expression analysis of various data sets, # exploiting the presence of spike-ins for normalization. suppressPackageStartupMessages(require(simpaler)) suppressPackageStartupMessages(require(scran)) suppressPackageStartupMessages(require(edgeR)) suppressPackageStartupMessages(require(scater)) suppressPackageStartupMessages(library(MAST)) # Note that Kharchenko's SCDE package doesn't seem to support spike-in normalization. fdr.threshold <- 0.05 top.hits <- 200 temp <- "temp.txt" if (file.exists(temp)) { unlink(temp) } filled <- FALSE ################################################################################# for (datatype in c("Calero", # "Islam", "Buettner", "Grun", "Kolodziejczyk")) { val <- readRDS(file.path("../datasets", paste0(datatype, ".rds"))) incoming <- val$counts spike.in <- val$spikes design <- val$design if (datatype=="Islam") { test.coef <- "groupingMEF" } else if (datatype=="Calero") { test.coef <- "GroupInduced" } else if (datatype=="Buettner") { test.coef <- "groupingG2M" } else if (datatype=="Grun") { test.coef <- "groupingserum" } else if (datatype=="Kolodziejczyk") { test.coef <- "conditiona2i" } # Quality control on cells. totals <- colSums(incoming) f <- designAsFactor(design) okay.libs <- !isOutlier(totals, nmad=3, log=TRUE, type="lower", batch=f) & !isOutlier(colSums(incoming!=0), nmad=3, log=TRUE, type="lower", batch=f) & !isOutlier(colSums(incoming[spike.in,])/totals, nmad=3, type="higher", batch=f) incoming <- incoming[,okay.libs] design <- design[okay.libs,,drop=FALSE] filter.keep <- calcAverage(incoming) >= 0.1 spike.param <- spikeParam(incoming[spike.in & filter.keep,], design=design) cell.counts <- incoming[!spike.in & filter.keep,] ################################################################################# # Running edgeR first. y.ref <- DGEList(cell.counts) for (my.var in c(0.015)) { set.seed(231234) results <- top.set <- list() for (i in seq(0,20)) { # Running original data, then resampled versions. if (i) { spike.data <- resampleSpikes(spike.param, var.log=my.var) } else { spike.data <- spike.param$counts } # Normalizing on the spike-in counts. y <- y.ref y$samples$norm.factors <- colSums(spike.data)/y$samples$lib.size # Running edgeR. y <- estimateDisp(y, design, robust=TRUE) fit <- glmFit(y, design) res <- glmLRT(fit, coef=test.coef) # Check if diagnostics look okay: if (i==0) { pdf(paste0("diagnostics_", datatype, ".pdf")) plotBCV(y) # Nice downward trend limma::plotMDS(edgeR::cpm(y, log=TRUE), pch=16, col=c("red", "blue")[(as.integer(design[,test.coef])==1)+1L]) # Mostly separated dev.off() } # Comparing. chosen <- p.adjust(res$table$PValue, method="BH") <= fdr.threshold top.ranked <- rank(res$table$PValue) <= top.hits if (i) { results[[i]] <- sum(original!=chosen)/sum(original) top.set[[i]] <- sum(best!=top.ranked)/top.hits } else { original <- chosen best <- top.ranked } if (i<=1) { # Top genes also look like the ones listed in Islam's paper, which is good, e.g. Sparc, S100a6, Vim, Fn1. de.out <- topTags(res, n=Inf) write.table(file=paste0("edgeR_", datatype, "_", i, ".tsv"), de.out, row.names=TRUE, quote=FALSE, sep="\t", col.names=NA) diag.done <- TRUE } } final <- do.call(rbind, results) colnames(final) <- "AllDE" top.lost <- do.call(rbind, top.set) colnames(top.lost) <- paste0("Top", top.hits) write.table(file=temp, data.frame(Dataset=datatype, Variance=my.var, Method="edgeR", Total=length(original), Detected=sum(original), final, top.lost), row.names=FALSE, col.names=!filled, quote=FALSE, sep="\t", append=filled) filled <- TRUE } ################################################################################# # Also, MAST. redesign <- design colnames(redesign) <- gsub("[()]", "", colnames(design)) # Killing the braces around the intercept. new.form <- paste(c("~0 + cngeneson", colnames(redesign)), collapse="+") form <- eval(parse(text=new.form)) for (my.var in c(0.015)) { set.seed(3742) results <- top.set <- list() for (i in seq(0,20)) { # Running original data, then resampled versions. if (i) { spike.data <- resampleSpikes(spike.param, var.log=my.var) } else { spike.data <- spike.param$counts } # Computing effective library sizes that reflect the relative spike-in coverage. spike.totals <- colSums(spike.data) eff.lib.size <- spike.totals/mean(spike.totals) * mean(colSums(cell.counts)) # Computing log2-(CPM+1), as suggested in the vignette. lcpms <- log2(t(t(cell.counts)/eff.lib.size)*1e6 + 1) suppressMessages({ sca <- FromMatrix(lcpms, cData=data.frame(wellKey=colnames(lcpms)), fData=data.frame(primerid=rownames(lcpms))) colData(sca) <- cbind(colData(sca), DataFrame(redesign)) colData(sca)$cngeneson <- scale(colMeans(cell.counts>0)) mfit <- zlm(form, sca) mlrt <- lrTest(mfit, test.coef) MAST.p <- mlrt[, "hurdle", "Pr(>Chisq)"] }) # Choosing significant genes. fdr <- p.adjust(MAST.p, method="BH") chosen <- fdr <= fdr.threshold top.ranked <- rank(MAST.p) <= top.hits if (i) { results[[i]] <- sum(original!=chosen)/sum(original) top.set[[i]] <- sum(best!=top.ranked)/top.hits } else { original <- chosen best <- top.ranked } # Storing diagnostics. if (i<=1) { write.table(file=paste0("MAST_", datatype, "_", i, ".tsv"), cbind(mlrt, FDR=fdr), row.names=TRUE, quote=FALSE, sep="\t", col.names=NA) } } final <- do.call(rbind, results) colnames(final) <- "AllDE" top.lost <- do.call(rbind, top.set) colnames(top.lost) <- paste0("Top", top.hits) write.table(file=temp, data.frame(Dataset=datatype, Variance=my.var, Method="MAST", Total=length(original), Detected=sum(original), final, top.lost), row.names=FALSE, col.names=!filled, quote=FALSE, sep="\t", append=filled) filled <- TRUE } } ################################################################################# file.rename(temp, "results.txt") sessionInfo() ###########################################################################