---
title: Quality control checks on the Zika-Psolaren data
author: Aaron Lun
date: 7 November 2017
output:
  html_document:
    fig_caption: false
    toc: true
    toc_float: true
    depth: 3
    number_sections: true
    theme: united 
    highlight: tango 
---

```{r, echo=FALSE, results="hide"}
#dir.create("figures-process", showWarning=FALSE)
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
knitr::opts_chunk$set(dev='png', fig.path="figures-process/", 
                      dpi=300, dev.args=list(pointsize=15))
options(width=90)
```

# Setting up the data structures

Specifying the libraries to examine.

```{r}
dir.out <- "../processed"
all.h5 <- list.files(dir.out, full=TRUE, pattern="h5$")
basename(all.h5)
```

Getting the `pairParam` object out.

```{r}
library(diffHic)
cur.env <- new.env()
load(sub("h5", "Rda", all.h5[1]), envir=cur.env)
param <- cur.env$param
param
```    

# Examining the mapping statistics

We have a look at the number of read pairs that were successfully mapped.
(We apply a MAPQ filter of 10, and UMIs have been removed.
We get 100% mapping proportion because unmapped reads are no longer present in the BAM file!)

```{r}
stats <- list()
for (x in list.files(dir.out, full=TRUE, pattern="Rda$")) {
    cur.env <- new.env()
    load(x, envir=cur.env)
    stats[[sub(".Rda", "", basename(x))]] <- cur.env$out$pairs[c(1,4)]
}
stats <- data.frame(do.call(rbind, stats))
stats
```

# Examining the strand orientations 

## Human-only

We construct strand orientation plots, and we collect the number of reads in each orientation in each library.
We do this first for the human chromosomes.

```{r}
all.but.zika <- setdiff(seqlevels(param$fragments), "ZIKV")
alt.param <- reform(param, restrict=all.but.zika)
alt.param
```

Running the stats for the human-only reads. 

```{r strandori, fig.width=15, fig.height=6}
stats <- list()
par(mfrow=c(1,3))
for (x in all.h5) { 
    diags <- getPairData(x, alt.param)

    # Constructing the histograms for each orientation.
    llinsert <- log2(diags$insert + 1L)
    intra <- !is.na(llinsert)
    breaks <- seq(min(llinsert[intra]), max(llinsert[intra]), length.out=30)
    hist(llinsert[diags$orientation==1L], breaks=breaks, col="darkgreen", 
         main=paste0(basename(x), "(inward)"))
    hist(llinsert[diags$orientation==2L] ,breaks=breaks, col="red", 
         main=paste0(basename(x), "(outward)"))
    hist(llinsert[diags$orientation==0L | diags$orientation==3L], breaks=breaks, 
         col="blue", main=paste0(basename(x), "(same strand)"))

    # Recording stats.
    is.inward <- intra & diags$orientation==1L
    cur.stats <- list(Number=nrow(diags), 
        Interchr=1-sum(intra)/nrow(diags),
        Outward=sum(diags$orientation==2L & intra)/sum(intra),
        LongInward=sum(diags$insert > 500 & is.inward)/sum(is.inward))

    stats[[sub(".Rda", "", basename(x))]] <- unlist(cur.stats)
}
```

We examine the breakdown of read pairs.

```{r}
data.frame(do.call(rbind, stats))
```

```{r, echo=FALSE, results="hide"}
gc()
```

## ZIKA-only

We do this again for ZIKA-ZIKA interactions.

```{r}
alt.param <- reform(param, restrict="ZIKV")
alt.param
```

Running the stats.

```{r, ref.label="strandori", fig.width=15, fig.height=6}
```

We examine the breakdown of read pairs.

```{r}
data.frame(do.call(rbind, stats))
```

```{r, echo=FALSE, results="hide"}
gc()
```

```{r, results="asis", echo=FALSE}
if (file.exists("supp_qc.md")) { 
    cat("We also have a look at the absolute orientation of these files.\n")
    cat(readLines("supp_qc.md"), sep="\n")
}   
```

# Checking the read pair distributions

We want to count the number of read pairs between every pair of chromosomes.

```{r}
iset <- squareCounts(all.h5, param=param, width=1e9)
iset
```

We ask for the number of read pairs between ZIKA and all other chromosomes.

```{r}
zikv <- regions(iset)[seqnames(regions(iset))=="ZIKV"]
zikv.set <- subsetByOverlaps(iset, zikv)
stats <- data.frame(ZikaHuman=colSums(assay(zikv.set)))
stats$Prop <- stats$ZikaHuman/zikv.set$totals
stats
```

We ask for the number of read pairs between the mitochondrial genome and all other chromosomes.
This presumes that mitochondrial and nuclear transcripts do not interact.

```{r}
mito <- regions(iset)[seqnames(regions(iset))=="chrM"]
mito.set <- subsetByOverlaps(iset, mito)
stats <- data.frame(MitoOther=colSums(assay(mito.set)))
stats$Prop <- stats$MitoOther/mito.set$totals
stats
```

# Session information

Reporting the session information.

```{r}
sessionInfo()
```