--- title: "Single-cell RNA-seq of naive/primed embryonic stem cells: overall analysis" author: Tobias Messmer and Aaron Lun date: "`r Sys.Date()`" output: BiocStyle::html_document: toc: true toc_float: true depth: 3 number_sections: true theme: united highlight: tango fig_caption: false --- ```{r, echo=FALSE, results="hide"} dir.create("figure-overall/", showWarning=FALSE) knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE, fig.path="figure-overall/") ``` # Reading in the data First, we read in the processed data set from the previous script. ```{r readin} library(scran) sce <- readRDS("results-preprocess/sce_all.rds") sce ``` To get a first impression of the data, expression profiles of typical ESC markers are plotted. This is done after shuffling the cell order to avoid all naive cells being plotted after primed cells. ```{r explot} library(scater) shuffled.sce <- sce[, sample(ncol(sce))] fontsize <- theme(axis.text=element_text(size=12), axis.title=element_text(size=16)) plotExpression(shuffled.sce, c("KLF4", "KLF17", "DPPA3", "TFCP2L1", "NANOG", "ZFP42", "POU5F1"), colour_by="phenotype", shape_by = "batch", show_violin = FALSE) + fontsize ``` Based on the variabilty of gene expression over the whole population, we can identify subpopulations within the data set. As we already know that the cells are either in a naive or primed state, this clustering serves mainly as confirmation. Finally, we set up a directory to save various results into. ```{r} resdir <- "results-overall" dir.create(resdir, showWarning=FALSE) ``` # Detecting highly variable genes across all cells We fit a trend to the variances of the spike-ins. This represents the technical variance at any given mean count. For each endogenous gene, the difference between the technical and total variance is the biological component of variability. We do this separately for each batch, as there is at least an order of magnitude difference in the amount of spike-in added to each batch. ```{r trendvarfit, fig.height=6, fig.width=10} sce <- multiBlockNorm(sce, sce$batch) var.out <- multiBlockVar(sce, block=sce$batch, trend.args=list(parametric=TRUE, loess.args=list(span=0.4))) par(mfrow=c(1,2)) is.spike <- isSpike(sce) for (batch in levels(sce$batch)) { cur.out <- var.out$per.block[[batch]] plot(cur.out$mean, cur.out$total, pch=16, cex=0.6, xlab="Mean log-expression", ylab="Variance of log-expression", main=batch) curve(metadata(cur.out)$trend(x), col="dodgerblue", lwd=2, add=TRUE) points(cur.out$mean[is.spike], cur.out$total[is.spike], col="red", pch=16) } ``` The trend passes through the bulk of the spike-ins, which is reassuring. We save the combined statistics across batches to file. ```{r combinevar} write.table(file=file.path(resdir, "var.tsv"), var.out[,setdiff(colnames(var.out), "per.block")], # don't save the DFs. sep="\t", quote=FALSE, col.names = NA) ``` The computed variance is used to identify highly variable genes, which are subsequently saved to file. ```{r highvar} hvg.out <- var.out[which(var.out$FDR <= 0.05 & var.out$bio >= 0.5),-7] nrow(hvg.out) hvg.out <- hvg.out[order(hvg.out$bio, decreasing=TRUE),] write.table(file = file.path(resdir, "hvg.tsv"), hvg.out, sep="\t", quote=FALSE, col.names = NA) head(hvg.out) ``` The most highly variable genes across the whole population are shown to ensure that they are not driven by one or two outliers. ```{r vioplot} plotExpression(shuffled.sce, rownames(hvg.out)[1:10], point_alpha=0.5, colour_by="phenotype", jitter="jitter") + fontsize ``` # Dimensionality reduction based on the technical noise We remove the batch effect prior to performing dimensionality reduction. ```{r} library(limma) norm_exprs(sce) <- removeBatchEffect(exprs(sce), batch=sce$batch) ``` We then estimate the number of PCs to retain based on our estimates of the technical noise. ```{r} sce <- denoisePCA(sce, var.out, assay.type="norm_exprs") ncol(reducedDim(sce)) ``` We also examine the variance explained by the first few PCs. ```{r} pc.out <- attr(reducedDim(sce), "percentVar") plot(pc.out) ``` We save the PCs to file for future use. ```{r} saveRDS(reducedDim(sce), file=file.path(resdir, "pcs.rds")) ``` # Visualization in low-dimensional space We perform dimensionality reduction using _t_-SNE, colouring by various attributes. ```{r tsne, fig.height=10, fig.width=10} set.seed(100) sce <- runTSNE(sce, use_dimred="PCA", perplexity=30) tsne1 <- plotTSNE(sce, colour_by=rownames(hvg.out)[1]) + fontsize tsne2 <- plotTSNE(sce, colour_by="KLF4") + fontsize tsne3 <- plotTSNE(sce, colour_by="batch") + fontsize tsne4 <- plotTSNE(sce, colour_by="phenotype") + fontsize multiplot(tsne1, tsne3, tsne2, tsne4, cols=2) ``` We repeat this with PCA, which is simpler but provides more stable results. ```{r pca, fig.height=10, fig.width=10} pca1 <- plotPCA(sce, colour_by=rownames(hvg.out)[1]) + fontsize pca2 <- plotPCA(sce, colour_by="KLF4") + fontsize pca3 <- plotPCA(sce, colour_by="batch") + fontsize pca4 <- plotPCA(sce, colour_by="phenotype") + fontsize multiplot(pca1, pca3, pca2, pca4, cols=2) ``` # Differential expression analysis ## Testing for differences between naive and primed Finally, we test for differences in expression between the naive and primed conditions. This is done using _edgeR_, after summing all cells within each batch to create a pseudo-bulk sample. ```{r desetup} library(edgeR) de.counts <- sumTechReps(counts(sce), paste(sce$phenotype, sce$batch, sep=".")) de.counts <- de.counts[!isSpike(sce),] # removing spikes y <- DGEList(de.counts) ``` We construct a new design matrix for the summed samples. ```{r} ptype <- factor(sub("\\..", "\\1", colnames(de.counts))) batch <- factor(sub("^.*\\.", "\\1", colnames(de.counts))) design_pheno <- model.matrix(~0 + ptype + batch) ``` We run through _edgeR_'s analysis pipeline. We first filter out low-abundance genes, as we did not perform any filtering on `sce` during preprocessing. ```{r} keep <- aveLogCPM(y) > aveLogCPM(5, mean(y$samples$lib.size)) y <- y[keep,] summary(keep) ``` First, we normalize the samples using the TMM method. ```{r} y <- calcNormFactors(y) y$samples ``` We estimate the negative binomial dispersion, which represents the variability between replicates. For true bulk RNA-seq experiments, we'd expect BCV values around 0.1 or lower, though the values here will be higher as they are derived from noisier single-cell values. ```{r nbdisp} y <- estimateDisp(y, design_pheno) plotBCV(y) ``` We fit a generalized linear model, with estimates of the quasi-likelihood dispersion to account for gene-specific variability. ```{r qldisp} fit <- glmQLFit(y, design_pheno, robust=TRUE) plotQLDisp(fit) ``` Finally, we set up contrasts between the primed and naive population. We use the QL F-test to test for significant differences in expression. ```{r ftrest} con <- makeContrasts(ptypenaive - ptypeprimed, levels = design_pheno) res <- glmQLFTest(fit, contrast=con) summary(decideTestsDGE(res)) DEmarkers <- topTags(res, n=Inf)$table write.table(DEmarkers, file=file.path(resdir, "de.tsv"), sep="\t", quote=FALSE, col.names=NA) head(DEmarkers) ``` This is shown in more detail using the smear plot below. ```{r smearplot} is.sig <- DEmarkers$FDR <= 0.05 plotSmear(res, de.tags=rownames(DEmarkers)[is.sig], cex=0.1) ``` To ensure that the top DE genes are not driven by outliers, we can examine them explicitly. ```{r viode} plotExpression(shuffled.sce, features = rownames(DEmarkers)[1:10], col = "phenotype") ``` ## Interpreting the set of DE genes We use the DE genes for gene set analyses. First, we take the set of genes that were upregulated in naive over primed. ```{r upgo} library(GO.db) library(org.Hs.eg.db) entreznames <- as.character(rowData(sce)$entrezgene) has.entrez <- !is.na(entreznames) in.universe <- entreznames[has.entrez & rownames(sce) %in% rownames(DEmarkers)] up.genes <- rownames(DEmarkers)[is.sig & DEmarkers$logFC > 0] is.up <- rownames(sce) %in% up.genes in.up <- entreznames[has.entrez & is.up] go <- goana(in.up, universe=in.universe, species = "Hs") topgo <- topGO(go, n=Inf, ontology = "BP") write.table(file=file.path(resdir, "go_up.tsv"), topgo, sep="\t", quote=FALSE, col.names = NA) head(topgo) ``` We repeat the dose for KEGG analyses. ```{r upkegg} kegg <- kegga(in.up, universe=in.universe, species = "Hs") topkegg <- topKEGG(kegg, n=Inf) write.table(file=file.path(resdir, "kegg_up.tsv"), topkegg, sep="\t", quote=FALSE, col.names = NA) head(topkegg) ``` We now turn our attention to the downregulated genes, i.e., those expressed in primed and not naive. ```{r downgo} down.genes <- rownames(DEmarkers)[is.sig & DEmarkers$logFC < 0] is.down <- rownames(sce) %in% down.genes in.down <- entreznames[has.entrez & is.down] go <- goana(in.down, universe=in.universe, species = "Hs") topgo <- topGO(go, n=Inf, ontology = "BP") write.table(file=file.path(resdir, "go_down.tsv"), topgo, sep="\t", quote=FALSE, col.names = NA) head(topgo) ``` We repeat for KEGG analyses. ```{r downkegg} kegg <- kegga(in.down, universe=in.universe, species = "Hs") topkegg <- topKEGG(kegg, n=Inf) write.table(file=file.path(resdir, "kegg_down.tsv"), topkegg, sep="\t", quote=FALSE, col.names = NA) head(topkegg) ``` # Wrapping up There's nothing else to save, so we just report the session information here. ```{r} sessionInfo() ```