# This tests that the C++ code for emptyDrops does what it says it does. # library(DropletUtils); library(testthat); source("test-empty.R") test_that("multinomial calculations are correct without alpha", { set.seed(1000) prop <- runif(100) X <- matrix(rpois(100000, lambda=prop*2), nrow=length(prop)) # DropletUtils assumes that proportions sum to unity! prop <- prop/sum(prop) # Checking it's the same regardless of the input matrix. dense.out <- DropletUtils:::.compute_multinom_prob_data(X, prop) sparse.out1 <- DropletUtils:::.compute_multinom_prob_data(as(X, "dgCMatrix"), prop) expect_equal(dense.out, sparse.out1) sparse.out2 <- DropletUtils:::.compute_multinom_prob_data(as(X, "dgTMatrix"), prop) expect_equal(dense.out, sparse.out2) # Checking it's the same compared to a reference. ref <- apply(X, 2, dmultinom, prob=prop, log=TRUE) combined <- dense.out + DropletUtils:::.compute_multinom_prob_rest(colSums(X)) expect_equal(ref, combined) }) ddirmultinom <- function(x, prob, alpha) # A helper function for computing the Dirichlet-multionial PDF. { size <- sum(x) lfactorial(size) + lgamma(alpha) - lgamma(size + alpha) + sum(lgamma(x + prob * alpha) - lfactorial(x) - lgamma(prob * alpha)) } test_that("multinomial calculations are correct with alpha", { set.seed(2000) prop <- runif(100) X <- matrix(rpois(100000, lambda=prop*2), nrow=length(prop)) prop <- prop/sum(prop) # Checking it's the same regardless of the input matrix. for (alpha in c(1, 10, 100)) { dense.out <- DropletUtils:::.compute_multinom_prob_data(X, prop, alpha=alpha) sparse.out1 <- DropletUtils:::.compute_multinom_prob_data(as(X, "dgCMatrix"), prop, alpha=alpha) expect_equal(dense.out, sparse.out1) sparse.out2 <- DropletUtils:::.compute_multinom_prob_data(as(X, "dgTMatrix"), prop, alpha=alpha) expect_equal(dense.out, sparse.out2) # Checking it's the same compared to the reference. ref <- apply(X, 2, FUN=ddirmultinom, prob=prop, alpha=alpha) combined <- dense.out + DropletUtils:::.compute_multinom_prob_rest(colSums(X), alpha=alpha) expect_equal(ref, combined) } }) test_that("p-value calculations are correct without alpha", { # Simulating some multinomial probabilities. SIMSTUFF <- function(nbarcodes, ngenes) { totals <- sample(1000, nbarcodes, replace=TRUE) ambient <- runif(ngenes) probs <- numeric(length(totals)) for (i in seq_along(totals)) { sim <- rmultinom(1, totals[i], prob=ambient) probs[i] <- sum(sim * log(ambient) - lfactorial(sim)) } return(list(totals=totals, probs=probs, ambient=ambient)) } # Setting up the reference function. REFFUN <- function(totals, probs, ambient, iter) { cumamb <- cumsum(ambient) logamb <- log(ambient) sumamb <- sum(ambient) n.above <- integer(length(totals)) for (it in seq_len(iter)) { vec <- runif(max(totals), 0, sumamb) chosen <- findInterval(vec, cumamb)+1L for (i in seq_along(totals)) { curchosen <- chosen[seq_len(totals[i])] curprof <- tabulate(curchosen, nbins=length(ambient)) curP <- sum(logamb * curprof - lfactorial(curprof)) if (curP <= probs[i]) { n.above[i] <- n.above[i] + 1L } } } return(n.above) } # Checking the equivalence of our two implementations. set.seed(0) sim <- SIMSTUFF(100, 20) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=100) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=100) expect_identical(stats, ref) # More barcodes (especially checking what happens with two barcodes of the same total). set.seed(0) sim <- SIMSTUFF(1001, 20) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=100) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=100) expect_identical(stats, ref) # More genes. set.seed(0) sim <- SIMSTUFF(100, 200) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=100) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=100) expect_identical(stats, ref) # More iterations. set.seed(0) sim <- SIMSTUFF(100, 20) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=500) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=500) expect_identical(stats, ref) # Checking for uniformity: # set.seed(0) # sim <- SIMSTUFF(10000, 1000) # totals <- sim$totals # probs <- sim$probs # ambient.prof <- sim$ambient # set.seed(100) # stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=1000) }) test_that("p-value calculations are correct with alpha", { # Simulating some Dirichlet-multinomial probabilities. SIMSTUFF <- function(nbarcodes, ngenes, alpha) { totals <- sample(1000, nbarcodes, replace=TRUE) ambient <- runif(ngenes) true.prob <- ambient * alpha probs <- numeric(length(totals)) for (i in seq_along(totals)) { prob <- rgamma(ngenes, true.prob) # pre-simulation of the Dirichlet sim <- rmultinom(1, totals[i], prob=prob) probs[i] <- sum(lgamma(sim + true.prob) - lfactorial(sim) - lgamma(true.prob)) } return(list(totals=totals, probs=probs, ambient=ambient)) } # Setting up the reference function. REFFUN <- function(totals, probs, ambient, iter, alpha) { n.above <- integer(length(totals)) alpha.ambient <- ambient*alpha for (it in seq_len(iter)) { test.ambient <- rgamma(length(ambient), alpha.ambient) test.ambient <- test.ambient/sum(test.ambient) cumamb <- cumsum(test.ambient) logamb <- log(test.ambient) sumamb <- sum(test.ambient) vec <- runif(max(totals), 0, sumamb) chosen <- findInterval(vec, cumamb)+1L for (i in seq_along(totals)) { curchosen <- chosen[seq_len(totals[i])] curprof <- tabulate(curchosen, nbins=length(test.ambient)) curP <- sum(lgamma(alpha.ambient + curprof) - lfactorial(curprof) - lgamma(alpha.ambient)) if (curP <= probs[i]) { n.above[i] <- n.above[i] + 1L } } } return(n.above) } # Checking the equivalence of our two implementations. set.seed(0) sim <- SIMSTUFF(100, 20, alpha=10) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=100, alpha=10) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=100, alpha=10) expect_identical(stats, ref) # More barcodes (especially checking what happens with two barcodes of the same total). set.seed(0) sim <- SIMSTUFF(1001, 20, alpha=5) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=100, alpha=5) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=100, alpha=5) expect_identical(stats, ref) # More genes. set.seed(0) sim <- SIMSTUFF(100, 200, alpha=2) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=100, alpha=2) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=100, alpha=2) expect_identical(stats, ref) # More iterations. set.seed(0) sim <- SIMSTUFF(100, 20, alpha=20) totals <- sim$totals probs <- sim$probs ambient.prof <- sim$ambient set.seed(100) ref <- REFFUN(totals, probs, ambient.prof, iter=500, alpha=20) set.seed(100) stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=500, alpha=20) expect_identical(stats, ref) # Checking for uniformity: # set.seed(0) # sim <- SIMSTUFF(10000, 1000, alpha=5) # totals <- sim$totals # probs <- sim$probs # ambient.prof <- sim$ambient # set.seed(100) # stats <- DropletUtils:::.permute_counter(totals, probs, ambient.prof, iter=1000, alpha=5) }) test_that("emptyDrops runs to completion", { # Mocking up some counts. set.seed(1000) my.counts <- DropletUtils:::simCounts() limit <- 100 e.out <- emptyDrops(my.counts, lower=limit) totals <- Matrix::colSums(my.counts) expect_identical(as.integer(totals), e.out$Total) expect_true(all(is.na(e.out$LogProb[totals<=limit]))) expect_true(all(!is.na(e.out$LogProb[totals>limit]))) # Checking that the log-probability calculation is correct. ambient.prop <- edgeR::goodTuringProportions(Matrix::rowSums(my.counts[,totals<= limit])) valid <- which(totals > limit) collected <- numeric(nrow(e.out)) for (v in valid) { collected[v] <- dmultinom(my.counts[,v], size=totals[v], prob=ambient.prop, log=TRUE) } expect_equal(e.out$LogProb[valid], collected[valid]) # Checking ambient tests. e.out2 <- emptyDrops(my.counts, lower=limit, test.ambient=TRUE) expect_identical(e.out$Total, e.out2$Total) expect_identical(e.out$LogProb[totals>limit], e.out2$LogProb[totals>limit]) expect_true(all(!is.na(e.out2$LogProb[totals>0]))) # Checking the ignore argument works correctly. set.seed(1001) e.out3a <- emptyDrops(my.counts, lower=limit, ignore=200) e.out3a$FDR <- NULL set.seed(1001) survivors <- totals <= 100 | totals > 200 new.counts <- my.counts[,survivors] e.out3b <- emptyDrops(new.counts, lower=limit, ignore=NULL) e.out3b$FDR <- NULL expect_equal(e.out3a[survivors,], e.out3b) # Checking retention options. K <- barcodeRanks(my.counts, lower=limit)$knee expect_true(all(e.out$FDR[totals >= K]==0)) expect_true(!all(e.out$FDR[totals < K]==0)) e.outK <- emptyDrops(my.counts, retain=K*0.6) expect_true(all(e.outK$FDR[totals >= K*0.6]==0)) expect_true(!all(e.outK$FDR[totals < K*0.6]==0)) }) test_that("emptyDrops works correctly with alpha estimation", { # Mocking up some counts. set.seed(7000) my.counts <- DropletUtils:::simCounts() * 2 limit <- 100 e.out <- emptyDrops(my.counts, lower=limit, alpha=NULL) # Pulling out the alpha. totals <- Matrix::colSums(my.counts) discarded <- totals <= limit lostmat <- my.counts[,discarded,drop=FALSE] ambient.prop <- edgeR::goodTuringProportions(Matrix::rowSums(lostmat)) alpha0 <- DropletUtils:::.estimate_alpha(lostmat, ambient.prop, totals[discarded]) alphaT <- DropletUtils:::.estimate_alpha(as(lostmat, "dgTMatrix"), ambient.prop, totals[discarded]) expect_identical(alpha0, alphaT) # Checking that the total probability calculations are correct. valid <- which(totals > limit) collected <- numeric(nrow(e.out)) for (v in valid) { collected[v] <- ddirmultinom(my.counts[,v], prob=ambient.prop, alpha=alpha0) } expect_equal(e.out$LogProb[valid], collected[valid]) # Checking that it responds to the specified value of alpha. e.out2 <- emptyDrops(my.counts, lower=limit, alpha=5) expect_false(isTRUE(all.equal(e.out2$PValue, e.out$PValue))) })