--- title: Initial inspection of the Zika virus data author: Aaron Lun date: 8 May 2017 output: html_document: fig_caption: false toc: true toc_float: true depth: 3 number_sections: true theme: united highlight: tango --- ```{r, echo=FALSE, results="hide"} #dir.create("figures-process", showWarning=FALSE) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) knitr::opts_chunk$set(dev='pdf')#fig.path="figures-process/") options(width=90) ``` # Diagnostics for Library 1: low ATP, low ligase, interactions ## Based on gene assignments Setting up the file path. Also loading human transcript annotation. ```{r} library(rtracklayer) anno <- import("../../scripts/combined.gtf") mito.genes <- anno$gene_id[as.logical(seqnames(anno)=="chrM")] lib <- "../processed/OZ-TL6-01_S1.raw.gz" ``` Computing statistics for reads. ```{r compstats} # Only using the first 10 million entries for diagnostic purposes, ignoring read names. incoming <- read.table(lib, colClasses= c(list(NULL), as.list(rep("character", 4))), stringsAsFactors=FALSE, nrows=1e7) N <- nrow(incoming) status1 <- incoming[,1] status2 <- incoming[,3] # Determining the proportion of read pairs that are both mapped. is.mapped <- c("Assigned", "Unassigned_NoFeatures", "Unassigned_Duplicate") sum(status1 %in% is.mapped & status2 %in% is.mapped)/N ## Determining the proportion of read pairs that are not duplicates. #is.nondup <- c("Assigned", "Unassigned_NoFeatures") #sum(status1 %in% is.nondup & status2 %in% is.nondup)/N # Determining the proportion of read pairs that are both assigned (mapped & non-duplicate). both.good <- status1 == "Assigned" & status2 == "Assigned" sum(both.good)/N # Stripping out the assigned read pairs. gene1 <- incoming[both.good,2] gene2 <- incoming[both.good,4] # Determining the proportion of assigned read pairs both mapped to Mt. sum(gene1 %in% mito.genes & gene2 %in% mito.genes)/sum(both.good) # Determining the proportion of assigned read pairs both NOT mapped to Mt. sum(!gene1 %in% mito.genes & !gene2 %in% mito.genes)/sum(both.good) # Determining the proportion of assigned read pairs mapped between Mt and nuclear genes. sum((gene1 %in% mito.genes)!=(gene2 %in% mito.genes))/sum(both.good) # Determining the proportion of read pairs mapped to different genes. sum(gene1!=gene2)/sum(both.good) ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} library(Rsamtools) incoming <- "../bam/OZ-TL6-01_S1.bam" ``` Having a look at the paired-end statistics. Again, only looking at the first 10 million read pairs. ```{r bamstats} bf <- BamFile(incoming, yieldSize=1e7) out1 <- scanBam(bf, param=ScanBamParam(what=c("flag", "mapq", "isize", "rname"), flag=scanBamFlag(isFirstMateRead=TRUE)))[[1]] out2 <- scanBam(bf, param=ScanBamParam(what=c("flag", "mapq", "isize"), flag=scanBamFlag(isSecondMateRead=TRUE)))[[1]] summary.stats <- list() # Keeping things that are not duplicates, supplementary, or unmapped/mate unmapped. # (Need to check mappedness here, as unmapped reads don't get marked as duplicates.) out1 <- lapply(out1, "[", !bitwAnd(out1$flag, 0x800 + 0x400 + 0x4 + 0x8)) out2 <- lapply(out2, "[", !bitwAnd(out2$flag, 0x800 + 0x400 + 0x4 + 0x8)) # Capping yield results in some asynchronicity due to supplementary alignments. smaller <- seq_len(min(length(out1$isize), length(out2$isize))) out1 <- lapply(out1, "[", smaller) out2 <- lapply(out2, "[", smaller) stopifnot(all(out1$isize + out2$isize==0L, na.rm=TRUE)) # Computing the proportion of high-quality alignments. keep <- out1$mapq==255L & out2$mapq==255L keep[is.na(keep)] <- FALSE out1 <- lapply(out1, "[", keep) out2 <- lapply(out2, "[", keep) table(keep)/length(keep) # Computing the proportion of the read pairs on the same chromosome. intra <- out1$isize!=0L out1 <- lapply(out1, "[", intra) out2 <- lapply(out2, "[", intra) table(intra)/length(intra) # Computing the distribution of strand orientation for Mt reads. in.mito <- out1$rname=="chrM" rev1 <- bitwAnd(out1$flag, 0x10)!=0L rev2 <- bitwAnd(out2$flag, 0x10)!=0L table(ifelse(rev1[in.mito], "R1", "F1"), ifelse(rev2[in.mito], "R2", "F2"))/sum(in.mito) # Computing the proportion of outward-facing read pairs. outward <- (!rev1 & out1$isize < 0L & rev2) | (!rev2 & out2$isize < 0L & rev1) (summary.stats$outward.Mt <- table(outward[in.mito])/sum(in.mito)) # ... in Mt (summary.stats$outward.Nc <- table(outward[!in.mito])/sum(!in.mito)) # ... in Nuc # Computing the proportion of inward-facing read pairs. inward <- (!rev1 & out1$isize > 0L & rev2) | (!rev2 & out2$isize > 0L & rev1) out1 <- lapply(out1, "[", inward) out2 <- lapply(out2, "[", inward) table(inward[in.mito])/sum(in.mito) # ... in Mt table(inward[!in.mito])/sum(!in.mito) # ... in Nuc # Examining the distribution of sizes, in and outside of Mt. sizes <- abs(out1$isize) in.mito <- out1$rname=="chrM" mito.sizes <- quantile(sizes[in.mito], 1:100/100) summary.stats$sizes <- mito.sizes[mito.sizes >= 300] mito.sizes # ... in Mt quantile(sizes[!in.mito], 1:100/100) # ... in Nuc ``` Saving the summary statistics. ```{r} all.stats <- list() all.stats[["OZ-TL6-01_S1"]] <- summary.stats ``` # Diagnostics for Library 2: low ATP, low ligase, reverse crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-02_S2.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-02_S2.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-02_S2"]] <- summary.stats ``` # Diagnostics for Library 3: low ATP, low ligase, no crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-03_S3.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-03_S3.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-03_S3"]] <- summary.stats ``` # Diagnostics for Library 4: low ATP, high ligase, interactions ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-04_S4.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-04_S4.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-04_S4"]] <- summary.stats ``` # Diagnostics for Library 5: low ATP, high ligase, reverse crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-05_S5.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-05_S5.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-05_S5"]] <- summary.stats ``` # Diagnostics for Library 6: low ATP, high ligase, no crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-06_S6.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-06_S6.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-06_S6"]] <- summary.stats ``` # Diagnostics for Library 7: low ATP, low ligase, DMSO, interactions ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-07_S7.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-07_S7.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-07_S7"]] <- summary.stats ``` # Diagnostics for Library 8: low ATP, low ligase, DMSO, reverse crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-08_S8.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-08_S8.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-08_S8"]] <- summary.stats ``` # Diagnostics for Library 9: low ATP, low ligase, DMSO, no crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-09_S9.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-09_S9.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-09_S9"]] <- summary.stats ``` # Diagnostics for Library 10: circligase I, interactions ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-10_S10.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-10_S10.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-10_S10"]] <- summary.stats ``` # Diagnostics for Library 11: circligase I, reverse crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-11_S11.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-11_S11.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-11_S11"]] <- summary.stats ``` # Diagnostics for Library 12: circligase I, no crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-12_S12.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-12_S12.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-12_S12"]] <- summary.stats ``` # Diagnostics for Library 13: no ligation, interactions ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-13_S13.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-13_S13.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-13_S13"]] <- summary.stats ``` # Diagnostics for Library 14: no ligation, no crosslink ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/OZ-TL6-14_S14.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/OZ-TL6-14_S14.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["OZ-TL6-14_S14"]] <- summary.stats ``` # Printing out the summary statistics Printing out the proportions of outward-facing mitochondrial read pairs. ```{r} lapply(all.stats, "[[", i="outward.Mt") ``` Same for the outward-facing nuclear read pairs. ```{r} lapply(all.stats, "[[", i="outward.Nc") ``` Printing out the proportions of inward-facing read pairs above 300. ```{r} lapply(all.stats, "[[", i="sizes") ``` # Wrapping up ```{r} sessionInfo() ```