---
title: PCM1 knockdown in RPE1 cells
author: Aaron Lun
date: 6th January 2017
output:
  html_document:
    toc: true
    toc_float: true
    depth: 3
    number_sections: true
    theme: united 
    highlight: tango 
    fig_caption: false
---

```{r, echo=FALSE, results="hide"}
knitr::opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
options(width=100)
```

# Introduction

Paired-end RNA sequencing data was generated by Lovorka Stojic and obtained from the CI Genomics servers on 4th January 2017.
Reads were aligned to the hg38 build of the human genome using `subread`, along with the CRISPR sequence.
Read counts were obtained by assigning reads into genes annotated in Ensembl version 83.
Here, we have a look at some mapping statistics.

```{r}
qual.data <- read.table("my_qual.tsv", sep="\t", header=TRUE, row.names=1)
total <- unname(rowSums(qual.data))
reads.mapped <- total - qual.data$Unassigned_Unmapped - qual.data$Unassigned_MappingQuality
reads.assigned <- qual.data$Assigned
data.frame(Sample=rownames(qual.data), Total=total, 
    Mapped=reads.mapped/total*100, Counted=reads.assigned/total*100)
```

We load in the counts, discarding the gene length information.

```{r}
all.counts <- read.table("genic_counts.tsv", header=TRUE, row.names=1)
gene.len <- all.counts[,1]
all.counts <- all.counts[,-1]
colnames(all.counts) <- sub(".*(D[0-9]+_D[0-9]+).*", "\\1", colnames(all.counts))
head(all.counts)
```

We also describe the allocation of samples to the various conditions.
Here we have RNAi and CRISPR knockdowns of PCM1.
The RNAi experiments were also performed at two time points, with matched siRNA and cell-only controls.

```{r}
metadata <- read.table("metadata.tsv")
metadata[,2] <- sub("-", "_", metadata[,2])
metadata <- metadata[match(colnames(all.counts), metadata[,2]),]
groups <- sub("Rpe_", "", sub("_exp[0-9]+", "", metadata[,1]))
groups <- sub("negguide_clone[0-9]+", "negguide", groups)
table(groups)
expnum <- sub(".*_exp([0-9]+)", "\\1", metadata[,1])
expnum[grep("clone", expnum)] <- "0"
data.frame(sample=metadata[,2], groups, expnum)
```

```{r, echo=FALSE, results="hide"}
gc()
```

# Quality control, filtering and normalization

We use the assembled information to construct a `DGEList` object in preparation for analysis with _edgeR_.
We also add some symbols for easier interpretation.

```{r}
library(edgeR)
y <- DGEList(all.counts, group=groups)
y$samples$exp <- expnum
library(org.Hs.eg.db)
anno <- select(org.Hs.eg.db, keytype="ENSEMBL", key=rownames(y), column="SYMBOL")
y$genes <- anno[match(rownames(y), anno$ENSEMBL), "SYMBOL", drop=FALSE]
```

One of the samples has much fewer reads than the others.
Clearly this corresponds to a failure of sequencing, so it is removed prior to further analysis.

```{r}
y <- y[,colnames(y)!="D712_D503"]
```

We filter out low-abundance genes as they do not contain enough evidence for rejection of the null,
and their discreteness can interfere with downstream analyses.

```{r}
keep <- aveLogCPM(y) > 0
y <- y[keep,]
summary(keep)
```

We perform normalization using the trimmed mean of M-values (TMM) method to remove composition biases.

```{r}
y <- calcNormFactors(y)
y$samples
```

The normalization outcome is visualized with some MA plots, where the cloud of points should be centered at zero.

```{r, fig.width=10, fig.height=10}
par(mfrow=c(5,5))
for (i in seq_len(ncol(y))) {
    plotMD(y, column=i)
    abline(h=0, col="red", lty=2)
}
```

Using the normalized values, we construct a MDS plot to examine the similarities between libraries.

```{r}
adjc <- cpm(y, prior=3, log=TRUE)
plotMDS(adjc, labels=paste0(y$samples$group, y$samples$exp))
```

# Modelling biological variability

We set up a design matrix that describes the experimental design.
Here, we use an additive model, blocking on the experiment number.
This is because the MDS plot suggests a weak experiment effect (replicate #2 is always to the left of #1).

```{r}
design <- model.matrix(~ 0 + group + exp, y$samples)
design <- design[,colnames(design)!="exp1"] # Get to full rank
```

We apply the `voom` method to log-transform the counts and compute precision weights.
This accounts for changes in the variability of the log-counts with increasing mean.

```{r}
v.all <- voom(y, design, plot=TRUE)
```

Finally, we fit a linear model to the log-CPMs.

```{r}
fit <- lmFit(v.all, design)
```

We also estimate the dispersion under a negative binomial model.
Generally, values under 0.1 are expected (note that the BCV is the root-dispersion).

```{r}
y <- estimateDisp(y, design)
plotBCV(y)
```

# Testing for differential expression

```{r decomp, echo=FALSE, eval=FALSE}
# Testing for the differential effect between time points.
res <- topTable(fit2, sort.by="none", n=Inf)
res$F <- NULL
subres <- topTable(fit2, sort.by="none", n=Inf, coef="Overall")
res$P.Value <- subres$P.Value
res$adj.P.Val <- subres$adj.P.Val
res2 <- res[order(res$P.Value),]
write.table(res2, file=sprintf("de_%s_diff.tsv", fname), col.names=NA, quote=FALSE, sep="\t")
head(res2)
# Testing for DE at T1.
res <- topTable(fit2, sort.by="p", n=Inf, coef="T1")
write.table(res, file=sprintf("de_%s_T1.tsv", fname), col.names=NA, quote=FALSE, sep="\t")
head(res)
# Testing for DE at T0.
res <- topTable(fit2, sort.by="p", n=Inf, coef="T0")
write.table(res, file=sprintf("de_%s_T0.tsv", fname), col.names=NA, quote=FALSE, sep="\t")
head(res)
```

```{r lfccomp, echo=FALSE, eval=FALSE}
# Testing for the differential effect between time points.
fit3 <- treat(fit2, lfc=0.5, robust=TRUE)
summary(decideTests(fit3))
res <- topTable(fit2, sort.by="none", n=Inf)
res$F <- NULL
subres <- topTreat(fit3, sort.by="none", n=Inf, coef="Overall")
res$P.Value <- subres$P.Value
res$adj.P.Val <- subres$adj.P.Val
res2 <- res[order(res$P.Value),]
write.table(res2, file=sprintf("lfc_%s_diff.tsv", fname), col.names=NA, quote=FALSE, sep="\t")
head(res2)
# Testing for DE at T1.
res <- topTreat(fit3, sort.by="p", n=Inf, coef="T1")
write.table(res, file=sprintf("lfc_%s_T1.tsv", fname), col.names=NA, quote=FALSE, sep="\t")
head(res)
# Testing for DE at T0.
res <- topTreat(fit3, sort.by="p", n=Inf, coef="T0")
write.table(res, file=sprintf("lfc_%s_T0.tsv", fname), col.names=NA, quote=FALSE, sep="\t")
head(res)
```

## For knockdown with siRNA 27

### Time-dependent effect of knockdown against control siRNAs

Setting up appropriate contrasts specifying the log-fold change due to knockdown at both time points.
The actual test will be that of differences between the log-fold changes across time points.

```{r}
fname <- "siRNA27vsConsi"
con <- makeContrasts(Overall=(groupSiPCM1_27_T1 - groupSiPCM1_27_T0) 
                             - (groupconsi_T1 - groupconsi_T0),
                     T1=groupSiPCM1_27_T1 - groupconsi_T1,
                     T0=groupSiPCM1_27_T0 - groupconsi_T0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes for the overall log-fold change (i.e., differential knockdown effect at T1 versus that at T0).
We also test for DE genes in each of the individual time points.

```{r, ref.label="decomp"}
```

Checking that the target is indeed knocked down at both time points:

```{r}
res["ENSG00000078674",]
```

Repeating with a log-fold change threshold via `treat`:

```{r, ref.label="lfccomp"}
```

### Time-dependent effect of knockdown versus transfection control 

Setting up appropriate contrasts:

```{r}
fname <- "siRNA27vsMax" 
con <- makeContrasts(Overall=(groupSiPCM1_27_T1 - groupSiPCM1_27_T0) 
                             - (groupMax_T1 - groupMax_T0),
                     T1=groupSiPCM1_27_T1 - groupMax_T1,
                     T0=groupSiPCM1_27_T0 - groupMax_T0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes.

```{r, ref.label="decomp"}
```

Checking that the target is indeed knocked down at both time points:

```{r}
res["ENSG00000078674",]
```

Repeating with a log-fold change threshold via `treat`:

```{r, ref.label="lfccomp"}
```

### Time-dependent effect of knockdown versus cells alone

Setting up appropriate contrasts:

```{r}
fname <- "siRNA27vsCells"
con <- makeContrasts(Overall=(groupSiPCM1_27_T1 - groupSiPCM1_27_T0) 
                             - (groupT1 - groupT0),
                     T1=groupSiPCM1_27_T1 - groupT1,
                     T0=groupSiPCM1_27_T0 - groupT0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes:

```{r, ref.label="decomp"}
```

Checking that the target is indeed knocked down at both time points:

```{r}
res["ENSG00000078674",]
```

Repeating with a log-fold change threshold via `treat`:

```{r, ref.label="lfccomp"}
```

## For knockdown with siRNA 28

### Time-dependent effect of knockdown against control siRNAs

Setting up appropriate contrasts specifying the log-fold change due to knockdown at both time points.
The actual test will be that of differences between the log-fold changes across time points.

```{r}
fname <- "siRNA28vsConsi"
con <- makeContrasts(Overall=(groupSiPCM1_28_T1 - groupSiPCM1_28_T0) 
                             - (groupconsi_T1 - groupconsi_T0),
                     T1=groupSiPCM1_28_T1 - groupconsi_T1,
                     T0=groupSiPCM1_28_T0 - groupconsi_T0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes for the overall log-fold change (i.e., differential knockdown effect at T1 versus that at T0).
We also test for DE genes in each of the individual time points.

```{r, ref.label="decomp"}
```

Checking that the target is indeed knocked down at both time points:

```{r}
res["ENSG00000078674",]
```

Repeating with a log-fold change threshold via `treat`:

```{r, ref.label="lfccomp"}
```

### Time-dependent effect of knockdown versus transfection control 

Setting up appropriate contrasts:

```{r}
fname <- "siRNA28vsMax"
con <- makeContrasts(Overall=(groupSiPCM1_28_T1 - groupSiPCM1_28_T0) 
                             - (groupMax_T1 - groupMax_T0),
                     T1=groupSiPCM1_28_T1 - groupMax_T1,
                     T0=groupSiPCM1_28_T0 - groupMax_T0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes:

```{r, ref.label="decomp"}
```

Checking that the target is indeed knocked down at both time points:

```{r}
res["ENSG00000078674",]
```

Repeating with a log-fold change threshold via `treat`:

```{r, ref.label="lfccomp"}
```

### Time-dependent effect of knockdown versus cells alone

Setting up appropriate contrasts:

```{r}
fname <- "siRNA28vsCells"
con <- makeContrasts(Overall=(groupSiPCM1_28_T1 - groupSiPCM1_28_T0) 
                             - (groupT1 - groupT0),
                     T1=groupSiPCM1_28_T1 - groupT1,
                     T0=groupSiPCM1_28_T0 - groupT0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes:

```{r, ref.label="decomp"}
```

Checking that the target is indeed knocked down at both time points:

```{r}
res["ENSG00000078674",]
```

Repeating with a log-fold change threshold via `treat`:

```{r, ref.label="lfccomp"}
```

## Effect of knockdown with CRISPR

Setting up appropriate contrasts:

```{r}
con <- makeContrasts(Clone14=groupPCM1KO_clone14 - groupnegguide, 
                     Clone18=groupPCM1KO_clone18 - groupnegguide, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes for clone 14.

```{r}
res <- topTable(fit2, sort.by="p", n=Inf, coef="Clone14")
res["ENSG00000078674",]
write.table(res, file="de_CRISPR_14.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res)
```

Repeating for clone 18:

```{r}
res <- topTable(fit2, sort.by="p", n=Inf, coef="Clone18")
res["ENSG00000078674",]
write.table(res, file="de_CRISPR_18.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res)
```

Repeating with a log-fold change threshold via `treat`:

```{r}
fit2 <- treat(fit2, lfc=0.5, robust=TRUE)
summary(decideTests(fit2))
res <- topTreat(fit2, sort.by="p", n=Inf, coef="Clone14")
write.table(res, file="lfc_CRISPR_14.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res)
res <- topTreat(fit2, sort.by="p", n=Inf, coef="Clone18")
write.table(res, file="lfc_CRISPR_18.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res)
```

## Effect of control siRNAs 

### At time zero

Setting up appropriate contrasts:

```{r}
con <- makeContrasts(logFC=groupconsi_T0 - groupT0, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes:

```{r}
res <- topTable(fit2, sort.by="p", n=Inf)
write.table(res, file="de_consi_T0.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res)
```

Repeating with a log-fold change threshold via `treat`:

```{r}
fit2 <- treat(fit2, lfc=0.5, robust=TRUE)
summary(decideTests(fit2))
res2 <- res[order(res$P.Value),]
write.table(res2, file="lfc_consi_T0.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res2)
```

### At the later time point

Setting up appropriate contrasts:

```{r}
con <- makeContrasts(logFC=groupconsi_T1 - groupT1, levels=design)
fit2 <- contrasts.fit(fit, con)
fit2 <- eBayes(fit2, robust=TRUE)
summary(decideTests(fit2))
```

Having a look at the top DE genes:

```{r}
res <- topTable(fit2, sort.by="p", n=Inf)
write.table(res, file="de_consi_T1.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res)
```

Repeating with a log-fold change threshold via `treat`:

```{r}
fit2 <- treat(fit2, lfc=0.5, robust=TRUE)
summary(decideTests(fit2))
res2 <- res[order(res$P.Value),]
write.table(res2, file="lfc_consi_T1.tsv", col.names=NA, quote=FALSE, sep="\t")
head(res2)
```


# Wrapping up

Saving objects for future use, and showing the session information.

```{r}
saveRDS(file="objects.rds", list(y=y, fit=fit))
sessionInfo()
```