--- title: Performing a differential analysis of Zika binding author: Aaron Lun date: 21 August 2017 output: html_document: fig_caption: false toc: true toc_float: true depth: 3 number_sections: true theme: united highlight: tango --- ```{r, echo=FALSE, results="hide"} knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) knitr::opts_chunk$set(dev='png', fig.path="figures-diff/") options(width=90) ``` # Overview Here we perform a differential analysis of Zika-Human interactions between the livefire samples and the reverse control samples. The first job is to set up the parameters for counting. ```{r} library(diffHic) load("../processed/1-Livefire1_S1.Rda") all.chrs <- seqlevels(param$fragments) all.chrs <- all.chrs[grep("^chr[^_]+$", all.chrs)] restricted <- cbind("ZIKV", all.chrs) param <- reform(param, restrict=restricted) param ``` Now to count into 1 kbp bins. ```{r} fnames <- c("1-Livefire1_S1.h5", "2-Livefire2_S2.h5", "3-Livefire3_S3.h5", "4-Rcont1_S4.h5", "5-Rcont2_S5.h5", "6-Rcont3_S6.h5") obj <- squareCounts(file.path("../processed", fnames), param, width=1000) obj ``` We apply some filtering to remove bin pairs that have low counts. In this case, we remove bin pairs that have average counts below 3. The coverage is so sparse that there is little point in estimating the non-specific ligation rate from inter-chromosomal contacts. ```{r} library(edgeR) ab <- aveLogCPM(asDGEList(obj)) keep <- ab > aveLogCPM(3, mean(obj$totals)) subobj <- obj[keep,] summary(keep) ``` # Normalization for composition biases We apply the TMM normalization on the counts, assuming that most contacts between ZIKV and other transcripts are non-specific. This corrects for composition biases when more genuine ZIKV-related interactions are captured in the livefire samples. (In such cases, the intensity of non-specific contacts should drop as there are fewer free ends to ligate.) ```{r} subobj <- normOffsets(subobj, se.out=TRUE) subobj$norm.factors ``` We can examine the suitability of these normalization factors with some MA plots. Note that we only care about normalization of each livefire sample to its corresponding control. We don't care about normalization between livefire samples as we have a blocking factor anyway. ```{r, fig.width=12, fig.height=6} adjc <- cpm(assay(subobj), lib.size=subobj$totals, log=TRUE) par(mfrow=c(1,3)) for (x in 1:3) { plot(rowMeans(adjc[,c(0,3)+x]), adjc[,x+3] - adjc[,x], ylab="M", xlab="A", main=sprintf("%s vs %s", basename(fnames[x+3]), basename(fnames[x]))) abline(h=log2(subobj$norm.factors[x+3]/subobj$norm.factors[x]), col="red") } ``` We do not have the ability to remove biases in ligation or cross-linking efficiency; we will have to assume that these are mostly the same. # Modelling the biological variability We first convert this into a `DGEList` object. ```{r} y <- asDGEList(subobj) ``` We also set up the design matrix. Here we block on the sample of origin, as livefire and control samples were prepared from the same pool of RNA. ```{r} cond <- factor(rep(c("Livefire", "Rcont"), each=3)) batch <- factor(rep(1:3, 2)) design <- model.matrix(~0 + batch + cond) design ``` We estimate the NB dispersion. ```{r} y <- estimateDisp(y, design) plotBCV(y) ``` Same for the QL dispersion. ```{r} fit <- glmQLFit(y, design, robust=TRUE) plotQLDisp(fit) ``` # Performing the contrasts We test for differences between the livefire and reverse control samples. ```{r} res <- glmQLFTest(fit) topTags(res) ``` We can have a look at the CPMs for some of the top features, to confirm that everything is correct. ```{r} library(pheatmap) top <- order(res$table$PValue)[1:50] vals <- cpm(y, log=TRUE, prior.count=3)[top,] vals <- vals - rowMeans(vals) colnames(vals) <- fnames pheatmap(vals) ``` We summarize the p-values to gene-level statistics, based on overlap with exons. We use the Ensembl annotation from BioMaRt as this preserves mitochondrial genes (amongst other things). ```{r, results="hide"} library(GenomicFeatures) library(BiocFileCache) url <- paste("ftp://ftp.ensembl.org/pub/release-89/gtf", "homo_sapiens/Homo_sapiens.GRCh38.89.gtf.gz", sep="/") bfc <- BiocFileCache() path <- bfcrpath(bfc, url) txdb <- makeTxDbFromGFF(path) ``` Identifying the exons and performing overlaps. ```{r} all.genes <- exonsBy(txdb, "gene") seqlevels(all.genes) <- paste0("chr", sub("MT", "M", seqlevels(all.genes))) olap <- findOverlaps(all.genes, subobj) ``` We now combine the p-values for all bin pairs overlapping exonic regions of the same gene. ```{r} library(csaw) comstats <- combineOverlaps(olap, res$table) beststats <- getBestOverlaps(olap, res$table) comstats$logFC <- beststats$logFC comstats$Best <- beststats$best rownames(comstats) <- names(all.genes) comstats <- comstats[!is.na(comstats$nWindows),] ``` Adding some more information, including the name of the gene and the location of the most differential site. ```{r} library(org.Hs.eg.db) anno <- mapIds(org.Hs.eg.db, keytype="ENSEMBL", column="SYMBOL", keys=rownames(comstats), multiVals="first") best.reg <- interactions(subobj)[comstats$Best,] comstats$Best <- as.character(second(best.reg)) comstats <- cbind(Symbol=anno, comstats) comstats <- comstats[order(comstats$PValue),] head(comstats) ``` Writing to file. ```{r} write.table(comstats, file="results_diff.txt", col.names=NA, sep="\t", quote=FALSE) ``` # Visualizing the top hit We can examine the top hit in some more detail. ```{r, fig.width=10, fig.height=6} top <- which.min(res$table$PValue) best.int <- interactions(subobj)[top] par(mfrow=c(2,3)) for (fpath in fnames) { suppressWarnings(plotPlaid(file.path("../processed", fpath), resize(first(best.int), fix="center", 1e4), resize(second(best.int), fix="center", 1e4), param=param, width=100)) } ``` # Wrapping up ```{r} sessionInfo() ```