--- title: Initial inspection of the Zika virus data author: Aaron Lun date: 13 April 2017 output: html_document: fig_caption: false toc: true toc_float: true depth: 3 number_sections: true theme: united highlight: tango --- ```{r, echo=FALSE, results="hide"} #dir.create("figures-process", showWarning=FALSE) knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE) knitr::opts_chunk$set(dev='pdf')#fig.path="figures-process/") options(width=90) ``` # Diagnostics for RNA-interactions, 100-300nt ## Based on gene assignments Setting up the file path. Also loading human transcript annotation. ```{r} lib <- "../processed/Lib3-1.raw.gz" library(org.Hs.eg.db) ``` Computing statistics for reads. ```{r compstats} # Only using the first 10 million entries for diagnostic purposes, ignoring read names. incoming <- read.table(lib, colClasses= c(list(NULL), as.list(rep("character", 4))), stringsAsFactors=FALSE, nrows=1e7) N <- nrow(incoming) status1 <- incoming[,1] status2 <- incoming[,3] # Determining the proportion of read pairs that are both mapped. is.mapped <- c("Assigned", "Unassigned_NoFeatures", "Unassigned_Duplicate") sum(status1 %in% is.mapped & status2 %in% is.mapped)/N ## Determining the proportion of read pairs that are not duplicates. #is.nondup <- c("Assigned", "Unassigned_NoFeatures") #sum(status1 %in% is.nondup & status2 %in% is.nondup)/N # Determining the proportion of read pairs that are both assigned (mapped & non-duplicate). both.good <- status1 == "Assigned" & status2 == "Assigned" sum(both.good)/N # Stripping out the assigned read pairs. gene1 <- incoming[both.good,2] gene2 <- incoming[both.good,4] # Determining the proportion of assigned read pairs both mapped to Zika. sum(gene1=="Zika" & gene2=="Zika")/sum(both.good) # Determining the proportion of assigned read pairs both NOT mapped to Zika. sum(gene1!="Zika" & gene2!="Zika")/sum(both.good) # Determining the proportion of assigned read pairs mapped between Zika and other genes. zika.other <- (gene1!="Zika")!=(gene2!="Zika") sum(zika.other)/sum(both.good) # Calculating the top counts for these 'other' genes. first.zika <- gene1=="Zika" other.gene <- ifelse(first.zika, gene2, gene1)[zika.other] other.counts <- table(other.gene) output <- select(org.Hs.eg.db, keys=names(other.counts), keytype="ENSEMBL", column="SYMBOL") output <- output[match(names(other.counts), output$ENSEMBL),] output$count <- other.counts output <- output[order(output$count, decreasing=TRUE),] head(output, 10) ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} library(Rsamtools) incoming <- "../bam/Lib3-1.bam" ``` Having a look at the paired-end statistics. Again, only looking at the first 10 million read pairs. ```{r bamstats} bf <- BamFile(incoming, yieldSize=1e7) out1 <- scanBam(bf, param=ScanBamParam(what=c("flag", "mapq", "isize", "rname"), flag=scanBamFlag(isFirstMateRead=TRUE)))[[1]] out2 <- scanBam(bf, param=ScanBamParam(what=c("flag", "mapq", "isize"), flag=scanBamFlag(isSecondMateRead=TRUE)))[[1]] summary.stats <- list() # Keeping things that are not duplicates, supplementary, or unmapped/mate unmapped. # (Need to check mappedness here, as unmapped reads don't get marked as duplicates.) out1 <- lapply(out1, "[", !bitwAnd(out1$flag, 0x800 + 0x400 + 0x4 + 0x8)) out2 <- lapply(out2, "[", !bitwAnd(out2$flag, 0x800 + 0x400 + 0x4 + 0x8)) # Capping yield results in some asynchronicity due to supplementary alignments. smaller <- seq_len(min(length(out1$isize), length(out2$isize))) out1 <- lapply(out1, "[", smaller) out2 <- lapply(out2, "[", smaller) stopifnot(all(out1$isize + out2$isize==0L, na.rm=TRUE)) # Computing the proportion of high-quality alignments. keep <- out1$mapq==255L & out2$mapq==255L keep[is.na(keep)] <- FALSE out1 <- lapply(out1, "[", keep) out2 <- lapply(out2, "[", keep) table(keep)/length(keep) # Computing the proportion of the read pairs on the same chromosome. intra <- out1$isize!=0L out1 <- lapply(out1, "[", intra) out2 <- lapply(out2, "[", intra) table(intra)/length(intra) # Computing the distribution of strand orientation for Zika reads. in.zika <- out1$rname=="ZIKV" rev1 <- bitwAnd(out1$flag, 0x10)!=0L rev2 <- bitwAnd(out2$flag, 0x10)!=0L table(ifelse(rev1[in.zika], "R1", "F1"), ifelse(rev2[in.zika], "R2", "F2"))/sum(in.zika) # Computing the proportion of outward-facing read pairs. outward <- (!rev1 & out1$isize < 0L & rev2) | (!rev2 & out2$isize < 0L & rev1) out.stats <- table(outward[in.zika])/sum(in.zika) (summary.stats$outward <- out.stats) # ... in Zika table(outward[!in.zika])/sum(!in.zika) # ... in human # Computing the proportion of inward-facing read pairs. inward <- (!rev1 & out1$isize > 0L & rev2) | (!rev2 & out2$isize > 0L & rev1) out1 <- lapply(out1, "[", inward) out2 <- lapply(out2, "[", inward) table(inward[in.zika])/sum(in.zika) # ... in Zika table(inward[!in.zika])/sum(!in.zika) # ... in human # Examining the distribution of sizes, in and outside of Zika. sizes <- abs(out1$isize) in.zika <- out1$rname=="ZIKV" zika.sizes <- quantile(sizes[in.zika], 1:100/100) summary.stats$sizes <- zika.sizes[zika.sizes >= 300] zika.sizes # ... in Zika quantile(sizes[!in.zika], 1:100/100) # ... in human ``` Saving the summary statistics. ```{r} all.stats <- list() all.stats[["Lib3-1"]] <- summary.stats ``` # Diagnostics for RNA interactions, 300-800nt ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/Lib3-2.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/Lib3-2.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["Lib3-2"]] <- summary.stats ``` # Diagnostics for Reverse-crosslink control 100-300nt ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/Lib3-3.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/Lib3-3.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["Lib3-3"]] <- summary.stats ``` # Diagnostics for Reverse-crosslink control 300-800nt ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/Lib3-4.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` Saving the counts to file. ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/Lib3-4.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["Lib3-4"]] <- summary.stats ``` # Diagnostics for non-crosslinked control 300-800nt ## Based on gene assignments Setting up the file path. ```{r} lib <- "../processed/Lib3-6.raw.gz" ``` Computing statistics for reads. ```{r, ref.label="compstats"} ``` Saving the counts to file. ## Looking at the fragment sizes Setting up the file path to the BAM file. ```{r} incoming <- "../bam/Lib3-6.bam" ``` Having a look at the paired-end statistics. ```{r, ref.label="bamstats"} ``` Saving the summary statistics. ```{r} all.stats[["Lib3-6"]] <- summary.stats ``` # Printing out the summary statistics Printing out the proportions of outward-facing Zika read pairs. ```{r} lapply(all.stats, "[[", i="outward") ``` Printing out the proportions of inward-facing read pairs above 300. ```{r} lapply(all.stats, "[[", i="sizes") ``` # Wrapping up ```{r} sessionInfo() ```