--- title: "Single-cell RNA-seq Analysis of naive/primed embryonic stem cells: marker validation in public bulk RNA data" author: Tobias Messmer and Aaron Lun date: "`r Sys.Date()`" output: BiocStyle::html_document: toc: true toc_float: true depth: 3 number_sections: true theme: united highlight: tango fig_caption: false --- ```{r, echo=FALSE, results="hide"} dir.create("figure-bulk/", showWarning=FALSE) knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, fig.path="figure-bulk/") options(bitmapType="cairo", width=100) ``` # Validation of naive and primed markers in RNA-seq data from other cell lines We want to use other cell lines to validate the markers that we identified by DE analysis between our naive and primed cells. For this, we use the following published bulk RNA-seq data sets of naive and primed hESCs: - Pastor et al. (2016), _Cell Stem Cell_. Naive Human Pluripotent Cells Feature a Methylation Landscape Devoid of Blastocyst or Germline Memory. - Theunissen et al. (2016), _Cell Stem Cell_. Molecular Criteria for Defining the Naive Human Pluripotent State. - Guo et al. (2017), _Development_. Epigenetic resetting of human pluripotency. The aim is to identify DE genes in those data sets and to assess how many of them overlap with our previously identified naive and primed markers. ```{r} resdir <- "results-bulk" dir.create(resdir, showWarning=FALSE) ``` # Pastor et al., 2016 First, we read in the data: ```{r pastor.in} pastor16 <- read.table(file.path("../data/other_counts", "Pastor2016_rawReadcountRNAseq.txt.gz"), sep = "\t", header = TRUE) gene.names <- pastor16$Probe # Selecting only the columns containing counts pastor16 <- as.matrix(pastor16[,14:19]) rownames(pastor16) <- gene.names # Remove duplicated gene names pastor16 <- pastor16[!duplicated(rownames(pastor16)),] dim(pastor16) ``` Next, we test for differences in expression between the naive and primed conditions using `r Biocpkg("edgeR")`. We first do some filtering: ```{r} library(edgeR) y <- DGEList(pastor16) keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size)) y <- y[keep,] table(keep) ``` We compute TMM normalization factors: ```{r} y <- calcNormFactors(y) y$samples ``` We create a design model based on the phenotype of each sample to estimate the NB dispersion: ```{r pastor.model} ptype <- factor(rep(c('Primed', 'Naive'), each=3)) design_pheno <- model.matrix(~0 + ptype) y <- estimateDisp(y, design_pheno) plotBCV(y) ``` ... and also the QL dispersion: ```{r} fit <- glmQLFit(y, design_pheno, robust=TRUE) plotQLDisp(fit) ``` We finally test for DE genes between conditions using a log-fold change threshold of 0.5. ```{r pastor.de} con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno) res <- glmTreat(fit, contrast=con, lfc = 0.5) de.tab.pastor <- topTags(res, n=Inf)$table saveRDS(de.tab.pastor, file = file.path(resdir, 'de_pastor.rds')) head(de.tab.pastor) ``` # Theunissen et al., 2016 Again, reading in data: ```{r theu.in} theunissen16 <- read.table(file.path("../data/other_counts", "Theunissen2016_rawReadcountRNAseq.txt.gz"), sep = "\t", header = TRUE) gene.names <- theunissen16$Probe # Selecting only the columns containing counts theunissen16 <- as.matrix(theunissen16[,14:23]) rownames(theunissen16) <- gene.names #Remove duplicated gene names theunissen16 <- theunissen16[!duplicated(rownames(theunissen16)),] dim(theunissen16) ``` We do some filtering: ```{r} library(edgeR) y <- DGEList(theunissen16) keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size)) y <- y[keep,] table(keep) ``` We compute TMM normalization factors: ```{r} y <- calcNormFactors(y) y$samples ``` We create a design model based on the phenotype of each sample to estimate the NB dispersion: ```{r} ptype <- factor(c('Primed', 'Naive', 'Naive', 'Naive', 'Naive', 'Naive', 'Naive', 'Primed', 'Primed', 'Primed')) design_pheno <- model.matrix(~0 + ptype) y <- estimateDisp(y, design_pheno) plotBCV(y) ``` ... and also the QL dispersion: ```{r} fit <- glmQLFit(y, design_pheno, robust=TRUE) plotQLDisp(fit) ``` We finally test for DE genes between conditions using a log-fold change threshold of 0.5. ```{r} con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno) res <- glmTreat(fit, contrast=con, lfc = 0.5) de.tab.theunissen <- topTags(res, n=Inf)$table saveRDS(de.tab.theunissen, file = file.path(resdir, 'de_theunissen.rds')) head(de.tab.theunissen) ``` # Guo et al., 2017 ## Overview we read in the Guo data set. ```{r guo.in} guo17 <- read.table(file.path("../data/other_counts", "Guo2017_rawReadcountRNAseq.txt"), sep = "\t", header = TRUE) guo17 <- guo17[!duplicated(guo17$Probe),] #Remove duplicated gene names gene.names <- guo17$Probe rownames(guo17) <- gene.names ``` There's two cell lines in this set: HNES and Shef7. For consistency with the other analyses (and to minimize differences in power due to differences in residual d.f.), we will analyse them separately from each other. ## HNES1 Extracting out the HNES1 samples: ```{r} hnes <- as.matrix(guo17[,c(32:34, 26:28)]) dim(hnes) ``` We do some filtering: ```{r} library(edgeR) y <- DGEList(hnes) keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size)) y <- y[keep,] table(keep) ``` We compute TMM normalization factors: ```{r} y <- calcNormFactors(y) y$samples ``` We create a design model based on the phenotype of each sample to estimate the NB dispersion: ```{r} ptype <- factor(rep(c('Naive', 'Primed'), each=3)) design_pheno <- model.matrix(~0 + ptype) y <- estimateDisp(y, design_pheno) plotBCV(y) ``` ... and also the QL dispersion: ```{r} fit <- glmQLFit(y, design_pheno, robust=TRUE) plotQLDisp(fit) ``` We finally test for DE genes between conditions using a log-fold change threshold of 0.5. ```{r} con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno) res <- glmTreat(fit, contrast=con, lfc = 0.5) de.tab.guo_hnes1 <- topTags(res, n=Inf)$table saveRDS(de.tab.guo_hnes1, file = file.path(resdir, 'de_guo_hnes1.rds')) head(de.tab.guo_hnes1) ``` ## Shef7 Extracting out the Shef7 samples: ```{r} shef <- as.matrix(guo17[,c(23:25, 35:37)]) dim(hnes) ``` We do some filtering: ```{r} library(edgeR) y <- DGEList(shef) keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size)) y <- y[keep,] table(keep) ``` We compute TMM normalization factors: ```{r} y <- calcNormFactors(y) y$samples ``` We create a design model based on the phenotype of each sample to estimate the NB dispersion: ```{r} ptype <- factor(rep(c('Naive', 'Primed'), each=3)) design_pheno <- model.matrix(~0 + ptype) y <- estimateDisp(y, design_pheno) plotBCV(y) ``` ... and also the QL dispersion: ```{r} fit <- glmQLFit(y, design_pheno, robust=TRUE) plotQLDisp(fit) ``` We finally test for DE genes between conditions using a log-fold change threshold of 0.5. ```{r} con <- makeContrasts(ptypeNaive - ptypePrimed, levels = design_pheno) res <- glmTreat(fit, contrast=con, lfc = 0.5) de.tab.guo_shef7 <- topTags(res, n=Inf)$table saveRDS(de.tab.guo_shef7, file = file.path(resdir, 'de_guo_shef7.rds')) head(de.tab.guo_shef7) ``` # Single-cell marker validation ## Loading in our markers We load in our top 200 markers for each population: ```{r markers.in} de.genes <- read.table(file = file.path('results-overall', 'de.tsv'), sep = "\t", header = TRUE, row.names = 1) naive.set <- de.genes[de.genes$logFC > 0 & de.genes$FDR <= 0.05,] primed.set <- de.genes[de.genes$logFC < 0 & de.genes$FDR <= 0.05,] naive.markers <- rownames(naive.set)[1:200] primed.markers <- rownames(primed.set)[1:200] ``` We filter them for only the genes that also exist in all other data sets. ```{r} other.names <- list( rownames(de.tab.pastor), rownames(de.tab.theunissen), rownames(de.tab.guo_hnes1), rownames(de.tab.guo_shef7) ) shared.naive.markers <- Reduce(intersect, c(list(naive.markers), other.names)) length(shared.naive.markers) shared.primed.markers <- Reduce(intersect, c(list(primed.markers), other.names)) length(shared.primed.markers) ``` ## Visualizing the log-fold changes We can visualize the logFC of our markers between the naive and primed conditions of the other data sets in a heatmap. This gives a first estimate whether there's a similar expression trend. ```{r} bound <- 10 naive_mat <- data.frame( Pastor = de.tab.pastor[shared.naive.markers,]$logFC, Theunissen = de.tab.theunissen[shared.naive.markers,]$logFC, Guo.HNES = de.tab.guo_hnes1[shared.naive.markers,]$logFC, Guo.SHEF = de.tab.guo_shef7[shared.naive.markers,]$logFC ) rownames(naive_mat) <- shared.naive.markers naive_mat[naive_mat >= bound] <- bound naive_mat[naive_mat < -bound] <- -bound primed_mat <- data.frame( Pastor = de.tab.pastor[shared.primed.markers,]$logFC, Theunissen = de.tab.theunissen[shared.primed.markers,]$logFC, Guo.HNES = de.tab.guo_hnes1[shared.primed.markers,]$logFC, Guo.SHEF = de.tab.guo_shef7[shared.primed.markers,]$logFC ) rownames(primed_mat) <- shared.primed.markers primed_mat[primed_mat >= bound] <- bound primed_mat[primed_mat < -bound] <- -bound library(pheatmap) anno <- data.frame(Marker = rep(c('naive', 'primed'), c(length(shared.naive.markers), length(shared.primed.markers)))) rownames(anno) <- c(shared.naive.markers, shared.primed.markers) pheatmap(rbind(naive_mat, primed_mat), annotation_row = anno, cluster_rows=FALSE, cluster_cols = FALSE) ``` ## Volcano plots We use volcano plots to visualize the p-values of our markers in each of the three data sets. We also add a line representing the FDR threshold in each data set. First Pastor: ```{r pastor.volcano} plot(de.tab.pastor$logFC, -log10(de.tab.pastor$PValue), pch = 16, xlab = 'Log FC', ylab = '- Log10(Pval)', bty = 'n', main = 'Pastor, 2016', xlim = c(-11, 11)) points(de.tab.pastor[naive.markers,]$logFC, -log10(de.tab.pastor[naive.markers,]$PValue), pch = 16, cex = 1.5, col = 'orange') points(de.tab.pastor[primed.markers,]$logFC, -log10(de.tab.pastor[primed.markers,]$PValue), pch = 16, cex = 1.5, col = 'royalblue1') max.pval <- min(-log10(de.tab.pastor[de.tab.pastor$FDR < 0.05,]$PValue)) lines(x = c(-11, 11), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3) text(x = -9, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red') ``` Then Theunissen: ```{r theu.volcano} plot(de.tab.theunissen$logFC, -log10(de.tab.theunissen$PValue), pch = 16, xlab = 'Log FC', ylab = '- Log10(Pval)', bty = 'n', main = 'Theunissen, 2016', xlim = c(-15, 15)) points(de.tab.theunissen[naive.markers,]$logFC, -log10(de.tab.theunissen[naive.markers,]$PValue), pch = 16, cex = 1.5, col = 'orange') points(de.tab.theunissen[primed.markers,]$logFC, -log10(de.tab.theunissen[primed.markers,]$PValue), pch = 16, cex = 1.5, col = 'royalblue1') max.pval <- min(-log10(de.tab.theunissen[de.tab.theunissen$FDR < 0.05,]$PValue)) lines(x = c(-15, 15), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3) text(x = -12, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red') ``` Guo HNES1: ```{r hnes.volcano} plot(de.tab.guo_hnes1$logFC, -log10(de.tab.guo_hnes1$PValue), pch = 16, xlab = 'Log FC', ylab = '- Log10(Pval)', bty = 'n', main = 'HNES1 (Guo, 2017)', xlim = c(-15, 15)) points(de.tab.guo_hnes1[naive.markers,]$logFC, -log10(de.tab.guo_hnes1[naive.markers,]$PValue), pch = 16, cex = 1.5, col = 'orange') points(de.tab.guo_hnes1[primed.markers,]$logFC, -log10(de.tab.guo_hnes1[primed.markers,]$PValue), pch = 16, cex = 1.5, col = 'royalblue1') max.pval <- min(-log10(de.tab.guo_hnes1[de.tab.guo_hnes1$FDR < 0.05,]$PValue)) lines(x = c(-15, 15), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3) text(x = -12, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red') ``` And Guo Shef7: ```{r shef.volcano} plot(de.tab.guo_shef7$logFC, -log10(de.tab.guo_shef7$PValue), pch = 16, xlab = 'Log FC', ylab = '- Log10(Pval)', bty = 'n', main = 'SHEF7 (Guo, 2017)', xlim = c(-15, 15)) points(de.tab.guo_shef7[naive.markers,]$logFC, -log10(de.tab.guo_shef7[naive.markers,]$PValue), pch = 16, cex = 1.5, col = 'orange') points(de.tab.guo_shef7[primed.markers,]$logFC, -log10(de.tab.guo_shef7[primed.markers,]$PValue), pch = 16, cex = 1.5, col = 'royalblue1') max.pval <- min(-log10(de.tab.guo_shef7[de.tab.guo_shef7$FDR < 0.05,]$PValue)) lines(x = c(-15, 15), y = c(max.pval, max.pval), col = 'red', lty = 2, lwd = 3) text(x = -12, y = max.pval, labels = 'FDR < 0.05', pos = 1, col = 'red') ``` # Identifying intersecting markers We want to create a list with all interesecting markers genes by performing an intersection-union test. For each gene, we take the largest p-value across all four comparisons. (We'll ignore the HNES1 results as it seems to be an outlier with respect to the number of naive/primed differences.) If the signs of the log-fold changes are not the same, the P-values is set to 1. ```{r intersect.fc} common.genes <- Reduce(intersect, list( rownames(de.tab.pastor), rownames(de.tab.theunissen), rownames(de.tab.guo_shef7), rownames(de.genes)) ) all.stats <- list( de.tab.pastor[common.genes,], de.tab.theunissen[common.genes,], de.tab.guo_shef7[common.genes,], de.genes[common.genes,] ) max.pval <- do.call(pmax, lapply(all.stats, "[[", "PValue")) lfc.sign <- Reduce("+", lapply(all.stats, FUN=function(x) x$logFC > 0)) max.pval[lfc.sign > 0L & lfc.sign < length(all.stats)] <- 1 # inconsistent directions. ``` We create a table that contains the log-fold changes from each comparison and add the p-value that was just computed. ```{r intersect.pval} de.list <- data.frame( H9NK2 = de.genes[common.genes,]$logFC, UCLA1 = de.tab.pastor[common.genes,]$logFC, WIBR3 = de.tab.theunissen[common.genes,]$logFC, SHEF7 = de.tab.guo_shef7[common.genes,]$logFC, PValue = max.pvals, row.names = common.genes) ``` Finally, we correct the p-values for multiple testing using Benjamini-Hochberg, order the table for increasing FDR and export the table. ```{r intersect.exp} de.list$FDR <- p.adjust(de.list$PVal, method = 'BH') de.list <- de.list[order(de.list$PValue),] write.table(de.list, file = file.path(resdir, 'overlapping.markers.tsv'), sep="\t", quote=FALSE, col.names=NA) head(de.list) ``` # Wrapping up To wrap this story up, we report the session information. ```{r end} sessionInfo() ```