R version 3.2.2 (2015-08-14) -- "Fire Safety" Copyright (C) 2015 The R Foundation for Statistical Computing Platform: x86_64-pc-linux-gnu (64-bit) R is free software and comes with ABSOLUTELY NO WARRANTY. You are welcome to redistribute it under certain conditions. Type 'license()' or 'licence()' for distribution details. Natural language support but running in an English locale R is a collaborative project with many contributors. Type 'contributors()' for more information and 'citation()' on how to cite R or R packages in publications. Type 'demo()' for some demos, 'help()' for on-line help, or 'help.start()' for an HTML browser interface to help. Type 'q()' to quit R. > ########################################################################### > # Identifying highly variable genes using the QL strategy. > > suppressPackageStartupMessages(require(simpaler)) > suppressPackageStartupMessages(require(edgeR)) > suppressPackageStartupMessages(require(scran)) > suppressPackageStartupMessages(require(matrixStats)) > suppressPackageStartupMessages(require(statmod)) > > temp <- "temp.txt" > if (file.exists(temp)) { unlink(temp) } > filled <- FALSE > > top.hits <- c(20, 200, 2000) > > ########################################################################### > # Running across all data types (saving diagnostics along the way) > > for (datatype in c("wilson", "islam", "brennecke")) { + + if (datatype=="wilson") { + incoming <- read.table('GSE61533_HTSEQ_count_results.tsv.gz', header=TRUE, row.names=1, colClasses=c('character', rep('integer', 96))) + spike.in <- grepl('^ERCC', rownames(incoming)) + + # Quality control on cells. + totals <- colSums(incoming[!spike.in,]) + is.mito <- grepl("^mt-", rownames(incoming)) & !spike.in + okay.libs <- totals > 1e5 & colSums(incoming[is.mito,])/totals < 0.1 + incoming <- incoming[,okay.libs] + + } else if (datatype=="islam") { + incoming <- read.table('GSE46980_CombinedMoleculeCounts.tab.gz', skip=7, row.names=1, sep="\t", fill=TRUE, + colClasses=c(list('character', NULL, NULL, NULL, NULL, NULL, NULL), rep('integer', 96))) + spike.in <- grepl('SPIKE', rownames(incoming)) + + # Quality control on cells (skipped mitochondrial check; all counts low, and all mitoprops are high - UMI?). + totals <- colSums(incoming[!spike.in,]) + is.mito <- grepl("^mt-", rownames(incoming)) & !spike.in + okay.libs <- totals > 1e4 # & colSums(incoming[is.mito,])/totals < 0.1 + incoming <- incoming[,okay.libs] + + } else if (datatype=="brennecke") { + # Note that cell-level quality control has already been performed. + incoming <- read.table('nmeth.2645-S7.csv.gz', header=TRUE, row.names=1, sep=',', colClasses=c('character', rep('integer', 92))) + gene.length <- as.vector(incoming[,1]) + incoming <- incoming[,-1] + spike.in <- grepl('^ERCC', rownames(incoming)) + + } + + countsCell <- incoming[!spike.in,] + countsCell <- countsCell[rowSums(countsCell) >= ncol(countsCell),] + spike.param <- spikeParam(incoming[spike.in,]) + diag.done <- FALSE + + ######################################################################### + # Using our custom method. + + for (my.var in c(0.001, 0.01, 0.1)) { + set.seed(34271) + results <- top.res <- list() + + for (i in seq(0,20)) { + # Running original data (once for each dataset), then resampled versions. + if (i) { countsSpike <- resampleSpikes(spike.param, var.log=my.var) } + else { countsSpike <- spike.param$counts } + + # Fitting the trend to the spike-in variances. + out <- fitTechTrend(countsSpike, trend="loess") + if (!diag.done) { + pdf(paste0("diagnostics_", datatype, ".pdf")) + plot(out$mean, out$var) + sort.ab <- sort(out$mean) + lines(sort.ab, out$trend(sort.ab), col="red", lwd=2) + dev.off() + } + + # Computing the biological component. + out2 <- getBioVar(countsCell, out) + my.rank <- rank(-out2$bio) + top.ranked <- lapply(top.hits, function(x) { my.rank <= x }) + if (i) { + top.res[[i]] <- sapply(seq_along(top.hits), function(j) { sum(best[[j]] & !top.ranked[[j]]) }) + } else { + best <- top.ranked + } + + if (!diag.done) { + output <- data.frame(GeneID=rownames(countsCell), Mean=out2$mean, Total=out2$total, Bio=out2$bio, Tech=out2$tech) + write.table(file=paste0("out_custom_", datatype, ".tsv"), output[order(output$Bio, decreasing=TRUE),], + row.names=FALSE, quote=FALSE, sep="\t", col.names=TRUE) + diag.done <- TRUE + } + } + + top.lost <- do.call(rbind, top.res) + colnames(top.lost) <- paste0("Top", top.hits) + write.table(file=temp, data.frame(Dataset=datatype, Variance=my.var, Method="custom", top.lost), + row.names=FALSE, col.names=!filled, quote=FALSE, sep="\t", append=filled) + filled <- TRUE + } + + ######################################################################### + # Using Brennecke's method on the original counts. + + for (my.var in c(0.001, 0.01, 0.1)) { + set.seed(3427) + results <- top.res <- list() + + for (i in seq(0,20)) { + # Running original data (once for each dataset), then resampled versions. + if (i) { countsSpike <- resampleSpikes(spike.param, var.log=my.var) } + else { countsSpike <- spike.param$counts } + + spike.sums <- colSums(countsSpike) + spike.sums <- spike.sums/exp(mean(log(spike.sums))) + sfSpike <- sfCell <- spike.sums + + # # Equivalent to original normalization strategy + # sfCell <- estimateSizeFactorsForMatrix(countsCell) + # sfSpike <- estimateSizeFactorsForMatrix(countsSpike) + + nCountsSpike <- t(t(countsSpike)/sfSpike) + nCountsCell <- t(t(countsCell)/sfCell) + + # Copied straight from the supplementary materials. + meansSpike <- rowMeans( nCountsSpike ) + varsSpike <- rowVars( nCountsSpike ) + cv2Spike <- varsSpike / meansSpike^2 + + meansCell <- rowMeans( nCountsCell ) + varsCell <- rowVars( nCountsCell ) + cv2Cell <- varsCell / meansCell^2 + + minMeanForFitA <- unname( quantile( meansSpike[ which( cv2Spike > .3 ) ], .8 ) ) + useForFitA <- meansSpike >= minMeanForFitA + fitA <- glmgam.fit( cbind( a0 = 1, a1tilde = 1/meansSpike[useForFitA] ), cv2Spike[useForFitA] ) + + minBiolDisp <- .5^2 + xi <- mean( 1 / sfSpike ) + m <- ncol(countsCell) + psia1thetaA <- mean( 1 / sfSpike ) + ( coefficients(fitA)["a1tilde"] - xi ) * mean( sfSpike / sfCell ) + cv2thA <- coefficients(fitA)["a0"] + minBiolDisp + coefficients(fitA)["a0"] * minBiolDisp + testDenomA <- ( meansCell * psia1thetaA + meansCell^2 * cv2thA ) / ( 1 + cv2thA/m ) + # pA <- 1 - pchisq( varsCell * (m-1) / testDenomA, m-1 ) + # padjA <- p.adjust( pA, "BH" ) + lpA <- pchisq( varsCell * (m-1) / testDenomA, m-1, lower=FALSE, log=TRUE) + + # Getting the top-ranked hits. + my.rank <- rank(lpA) + top.ranked <- lapply(top.hits, function(x) { my.rank <= x }) + if (i) { + top.res[[i]] <- sapply(seq_along(top.hits), function(j) { sum(best[[j]] & !top.ranked[[j]]) }) + } else { + best <- top.ranked + } + + if (!diag.done) { + output <- data.frame(GeneID=rownames(countsCell), CV2=cv2Cell, logPValue=lpA) + write.table(file=paste0("out_brennecke_", datatype, ".tsv"), output[order(output$Bio, decreasing=TRUE),], + row.names=FALSE, quote=FALSE, sep="\t", col.names=TRUE) + diag.done <- TRUE + } + } + + top.lost <- do.call(rbind, top.res) + colnames(top.lost) <- paste0("Top", top.hits) + write.table(file=temp, data.frame(Dataset=datatype, Variance=my.var, Method="brennecke", top.lost), + row.names=FALSE, col.names=!filled, quote=FALSE, sep="\t", append=filled) + filled <- TRUE + } + } Warning messages: 1: In glmgam.fit(cbind(a0 = 1, a1tilde = 1/meansSpike[useForFitA]), : Too much damping - convergence tolerance not achievable 2: In glmgam.fit(cbind(a0 = 1, a1tilde = 1/meansSpike[useForFitA]), : Too much damping - convergence tolerance not achievable 3: In glmgam.fit(cbind(a0 = 1, a1tilde = 1/meansSpike[useForFitA]), : Too much damping - convergence tolerance not achievable > > ########################################################################### > > file.rename(temp, "results.txt") [1] TRUE > sessionInfo() R version 3.2.2 (2015-08-14) Platform: x86_64-pc-linux-gnu (64-bit) Running under: Ubuntu 14.04.3 LTS locale: [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8 [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8 [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C [9] LC_ADDRESS=C LC_TELEPHONE=C [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] statmod_1.4.23 matrixStats_0.50.1 scran_0.1.1 simpaler_0.99.0 [5] edgeR_3.12.0 limma_3.26.6 loaded via a namespace (and not attached): [1] locfit_1.5-9.1 lattice_0.20-33 IRanges_2.4.6 [4] zoo_1.7-12 grid_3.2.2 DBI_0.3.1 [7] stats4_3.2.2 dynamicTreeCut_1.62 RSQLite_1.0.0 [10] S4Vectors_0.8.7 Biobase_2.30.0 parallel_3.2.2 [13] BiocGenerics_0.16.1 AnnotationDbi_1.32.3 > > ########################################################################### > # End. > > proc.time() user system elapsed 99.039 0.332 99.396