---
title: Effect of MeCP2 dosage on gene expression variability in LUHMES cells
author: Aaron Lun
date: 19 June 2016
output:
  html_document:
    fig_caption: false
---

```{r, echo=FALSE, results='hide'}
knitr::opts_chunk$set(error=FALSE, warning=FALSE, message=FALSE)
options(width=100, bitmapType="cairo")
```

# Introduction

The aim of this expriment is to identify genes that change in expression variability across cells upon varying the dosage of MeCP2 (by overexpression, knockdown or mutation). 
RNA sequencing data for single LUHMES cells were generated by Justyna Cholewa-Waclaw.
Paired-end 125 bp libraries were obtained on the 18th June 2016 from the Sanger servers.
Reads were mapped to the hg38 build of the human genome (along with the ERCC sequences) using the `subread` aligner.
Mapped read pairs were counted into exonic regions of annotated genes or spike-in transcripts using the `featureCounts` function.

```{r}
counts <- read.table("genic_counts.tsv", header=TRUE, row.names=1, comment="")
gene.length <- counts[,1]
counts <- counts[,-1]
dim(counts)
```

We load in the sample metadata, and match it up to the columns of our sample data.

```{r}
metadata <- read.csv("sample_metadata.csv", header=TRUE, row.names=1)
m <- match(rownames(metadata), sub("\\.", "#", sub("X", "", colnames(counts))))
counts <- counts[,m] 
stopifnot(all(!is.na(m)))
table(metadata$Treatment)
```

We can also have a look at the mapping and counting proportions.
The mapping percentages are quite low -- contamination, perhaps? -- which is typical of scRNA-seq data sets.
They don't seem to be adaptors or primers, based on the FASTQC reports.

```{r}
stats <- read.table("my_qual.tsv", header=TRUE, row.names=1, sep="\t", comment="")
stats <- stats[rownames(stats) %in% rownames(metadata),]
mapped <- rowSums(stats[,-c(5:6)])
summary(mapped/rowSums(stats)) # Proportion mapped
summary(stats$Assigned/mapped) # Proportion counted into genes.
```

Finally, we set up a `SCESet` object for further analysis.

```{r}
library(scran)
sce <- newSCESet(countData=counts)
fData(sce)$Length <- gene.length
sce$Treatment <- metadata$Treatment
sce$Differentiation <- metadata$Differentiation
sce$Plate <- sub(".*_([0-9]+)_.*", "\\1", metadata$Sample)
```

We add some annotation for later use.

```{r}
library(org.Hs.eg.db)
anno <- select(org.Hs.eg.db, keytype="ENSEMBL", keys=rownames(sce), columns="SYMBOL")
fData(sce)$Symbol <- anno$SYMBOL[match(rownames(sce), anno$ENSEMBL)]
```

```{r, echo=FALSE, results="hide"}
gc()
```

# Quality control on the cells

We perform some basic quality control on the cells.
To do so, we need to know which rows correspond to mitochondrial genes and spike-in transcripts.

```{r}
library(GenomicFeatures)
#txdb <- makeTxDbFromUCSC(genome='hg19',tablename='ensGene') 
#saveDb(txdb, file="hg19.ensGene")
txdb <- loadDb(file="hg19.ensGene")
location <- select(txdb, keys=rownames(sce), column="TXCHROM", keytype="GENEID")
is.mito <- (location$TXCHROM=="chrM" & !is.na(location$TXCHROM))[match(rownames(sce), location$GENEID)]
is.spike <- grepl("^ERCC", rownames(sce))
sce <- calculateQCMetrics(sce, feature_controls=list(Mt=is.mito, ERCC=is.spike))
isSpike(sce) <- "ERCC"
```

We examine the distribution of library sizes and the number of expressed genes.
Low-quality libraries will have small library sizes and few expressed genes, as the capture efficiency for the corresponding cells is lower.

```{r, fig.height=6, fig.width=10}
par(mfrow=c(1,2))
hist(sce$total_counts/1e3, xlab="Library size (thousands)", col="grey50")
hist(sce$total_features, xlab="No. of expressed genes", col="grey50")
```

We also examine the distribution of spike-in and mitochondrial reads.
Low-quality libraries will have high proportions of such reads, due to the loss of cytoplasmic RNA from lysed cells.

```{r, fig.height=6, fig.width=10}
par(mfrow=c(1,2))
hist(sce$pct_counts_feature_controls_Mt, xlab="Mitochondrial percentage", col="grey50")
hist(sce$pct_counts_feature_controls_ERCC, xlab="ERCC percentage", col="grey50")
```

Going through and removing outlier libraries for any of the metrics above.
We also remove the pools, no point using these.

```{r}
small.lib <- isOutlier(sce$total_counts, log=TRUE, nmads=3, type="lower")
few.genes <- isOutlier(sce$total_features, log=TRUE, nmads=3, type="lower")
mito.high <- isOutlier(sce$pct_counts_feature_controls_Mt, nmads=3, type="higher", na.rm=TRUE)
spike.high <- isOutlier(sce$pct_counts_feature_controls_Mt, nmads=3, type="higher", na.rm=TRUE)
keep <- !(small.lib | few.genes | mito.high | spike.high)
keep <- keep & !grepl("50 cells", sce$Treatment)
sce <- sce[,keep]
data.frame(ByLib=sum(small.lib), ByGenes=sum(few.genes), ByMito=sum(mito.high, na.rm=TRUE),
    BySpike=sum(spike.high, na.rm=TRUE), TotalLost=sum(!keep))
```

```{r, echo=FALSE, results="hide"}
gc()
```

We also check that the MeCP2 levels are expressed at expected levels across the groups.
There doesn't seem to be much difference between the WT and KO, though the over-expressors show a clear increase in the median.

```{r}
par(mar=c(7.1, 4.1, 4.1, 0.1))
boxplot(split(exprs(sce)["ENSG00000169057",], sce$Treatment), las=2, ylab="log-(Library size-adjusted counts)")
```

# Filtering and normalization

We remove low-abundance genes as these are likely to contain more noise than information.
This also reduces computational work later on.
All genes with mean counts below 1 are removed in this manner.

```{r}
ave.count <- rowMeans(counts(sce))
sce <- sce[ave.count > 1,]
dim(sce)
```

We use the deconvolution method to normalize out cell-specific biases in capture efficiency, sequencing depth, etc.
The size factors are quite consistent with the library sizes, indicating that there's not too much DE happening.

```{r}
clust <- quickCluster(sce)
sce <- computeSumFactors(sce, cluster=clust)
plot(sce$total_counts/1e3, sizeFactors(sce), xlab="Library size (thousands)", ylab="Size factors", log="xy")
```

We would usually normalize the spike-ins separately, as these are not subject to cell content biases.
However, it seems that the spike-in coverage is very low here, such that some cells have no spike-in counts at all.
This will compromise our ability to use these counts in downstream analyses, so we'll just remove them instead.

```{r}
sum(colSums(spikes(sce))==0)
sce <- sce[!isSpike(sce),]
```

We proceed to compute normalized expression values based on the size factors above.

```{r}
sce <- normalize(sce)
```

# Data exploration

We perform some basic data exploration to check for biological and technical effects.
First, we examine whether the treatment is able to separate the samples.
It seems that MeCP2 KO shifts the cells to a different part of the plot, while overexpression also has a slight shift from the control.
(For some reason, many of the control E10 cells are located near the KO cells as well.)

```{r, fig.width=10, fig.height=6}
type <- character(ncol(sce))
type[grep("KO|KI", sce$Treatment)] <- "KO"
type[grep("ctr", sce$Treatment)] <- "ctr"
type[grep("OE", sce$Treatment)] <- "OE"
sce2 <- sce
sce2$Type <- type
combined <- plotPCA(sce2, colour_by="Type")
separate <- plotPCA(sce, colour_by="Treatment")
multiplot(combined, separate, cols=2)
```

We then check for batch effects from the plate or differentiation.
It seems like there's a fairly strong differentiation effect, as one might expect.
There might also be a plate effect between plates in the same differentiation (i.e., 130 and 131, or 132 and 133).
That said, this is confounded by the fact that there's different treatment groups on each plate, so who knows?

```{r, fig.width=10, fig.height=6}
byplate <- plotPCA(sce, colour_by="Plate")
bydiff <- plotPCA(sce, colour_by="Differentiation")
multiplot(byplate, bydiff, cols=2)
```

We also do this after colouring by the MeCP2 expression.

```{r}
bymecp <- plotPCA(sce, colour_by="ENSG00000169057")
plot(bymecp)
```

We make a series of plots where we only show points from each sample type, to simplify the interpretation.

```{r, fig.width=15, fig.height=6, res=300}
separate2 <- ggplot2::ggplot_build(separate)$data[[2]]
par(mfrow=c(1,4))
plot(separate2$x, separate2$y, col=separate2$fill, cex=ifelse(type=="KO", 1, 0), 
    pch=16, main="KO and KI", xlab="PC1", ylab="PC2")
plot(separate2$x, separate2$y, col=separate2$fill, cex=ifelse(type=="ctr", 1, 0), 
    pch=16, main="Control", xlab="PC1", ylab="PC2")
plot(separate2$x, separate2$y, col=separate2$fill, cex=ifelse(type=="OE", 1, 0), 
    pch=16, main="OE", xlab="PC1", ylab="PC2")
available <- sort(as.character(unique(sce$Treatment)))
plot(0,0, type="n", axes=FALSE, xlab="", ylab="")
legend("topleft", legend=available, pch=16, col=separate2$fill[match(available, sce$Treatment)])
```

We also perform this after removing the differentiation effect and selecting a subset of biologically variable genes.

```{r, fig.width=15, fig.height=6, res=300}
fit <- trendVar(sce, trend="loess", span=0.2, use.spikes=NA)
dec <- decomposeVar(sce, fit)
top.hvgs <- order(dec$bio, decreasing=TRUE)[1:200]
ex <- limma::removeBatchEffect(exprs(sce)[top.hvgs,], sce$Differentiation)
coords <- prcomp(t(ex), scale=TRUE)
par(mfrow=c(1,4))
plot(coords$x[,1], coords$x[,2], col=separate2$fill, cex=ifelse(type=="KO", 1, 0), 
    pch=16, main="KO and KI", xlab="PC1", ylab="PC2")
plot(coords$x[,1], coords$x[,2], col=separate2$fill, cex=ifelse(type=="ctr", 1, 0), 
    pch=16, main="Control", xlab="PC1", ylab="PC2")
plot(coords$x[,1], coords$x[,2], col=separate2$fill, cex=ifelse(type=="OE", 1, 0), 
    pch=16, main="OE", xlab="PC1", ylab="PC2") 
available <- sort(as.character(unique(sce$Treatment)))
plot(0,0, type="n", axes=FALSE, xlab="", ylab="")
legend("topleft", legend=available, pch=16, col=separate2$fill[match(available, sce$Treatment)])
```

We repeat this process, colouring by MeCP2 expression levels within each set of samples.

```{r, fig.width=15, fig.height=6, res=300}
par(mfrow=c(1,3))
bymecp2 <- ggplot2::ggplot_build(bymecp)$data[[2]]
plot(coords$x[,1], coords$x[,2], col=bymecp2$fill, cex=ifelse(type=="KO", 1, 0), 
    pch=16, main="KO and KI", xlab="PC1", ylab="PC2")
plot(coords$x[,1], coords$x[,2], col=bymecp2$fill, cex=ifelse(type=="ctr", 1, 0), 
    pch=16, main="Control", xlab="PC1", ylab="PC2")
plot(coords$x[,1], coords$x[,2], col=bymecp2$fill, cex=ifelse(type=="OE", 1, 0), 
    pch=16, main="OE", xlab="PC1", ylab="PC2") 
```

# Testing for differentially expressed genes

Not technically the aim of the experiment, but let's do it anyway and see what we find.
We sum all the cells together within each group on each plate, and we proceed to a standard DE analysis.
We toss out the some of the E10 control and E6 overexpressing cells as there are too few cells per plate for summation.

```{r}
library(edgeR)
groupings <- paste0(sce$Treatment, "|", sce$Plate)
table(groupings)
counts <- sumTechReps(counts(sce), groupings)
counts <- counts[,!colnames(counts) %in% c("OE CMV E6|131", "ctr E10|131")]
```

We set up a design matrix for these pools.

```{r}
tr <- factor(gsub(" ", "_", sub("\\|.*", "", colnames(counts))))
pl <- factor(sub(".*\\|", "", colnames(counts)))
design <- model.matrix(~0 + tr + pl)
design <- design[,!grepl("pl131", colnames(design))]
colnames(design)[seq_len(nlevels(tr))] <- levels(tr)
```

We then proceed to normalization, to remove biases in the pools.
We also have a look at how the samples are distributed.
It looks like the control E10 sample is in the middle of the knock-outs, while overexpressing E6 is in the middle of the controls.

```{r, fig.width=8, fig.height=7}
y <- DGEList(counts, genes=cbind(Ensembl=rownames(counts), Symbol=fData(sce)$Symbol))
y <- calcNormFactors(y)
y$samples$norm.factors
all.pch <- c(15, 16, 17, 18)
all.cols <- c("green", "red", "blue", "black", "grey60", "orange", "purple", "pink")
layout(cbind(1, 2), widths=c(5, 2))
plotMDS(cpm(y, log=TRUE, prior=3), pch=all.pch[as.integer(pl)], col=all.cols[as.integer(tr)])
par(mar=c(5.1,0.5,4.1,0.5))
plot(0,0,type="n",axes=FALSE,ylab="",xlab="")
legend("topleft", pch=all.pch, col="black", levels(pl))
legend("bottomleft", pch=16, col=all.cols, levels(tr))
```

We perform dispersion estimation to model biological variability, and we use the trended dispersion to fit a generalized linear model.
We estimate the quasi-likelihood dispersion to account for uncertainty in the dispersion estimates.

```{r, fig.width=10, fig.height=5}
y <- estimateDisp(y, design)
fit <- glmQLFit(y, design, robust=TRUE)
par(mfrow=c(1,2))
plotBCV(y)
plotQLDisp(fit)
```

We then identify DE genes between KO and control.
We use the E10 control as it exhibits KO-like behaviour after cloning, which protects us against DE due to clonality.
(An unfortunate consequence of this is that we'll be conservative if clonality effects are correlated with MeCP2 levels.)

```{r}
con <- makeContrasts(D10.E10=KO_D10 - ctr_E10, H4.E10=KO_H4-ctr_E10, levels=design)
res.ko <- glmQLFTest(fit, contrast=con)
out <- topTags(res.ko, n=Inf)
sum(out$table$FDR <=0.05)
sum(sign(out$table$logFC.D10)==sign(out$table$logFC.H4) & out$table$FDR <= 0.05)
write.table(file="de_KOvCTR.tsv", out$table, sep="\t", quote=FALSE, col.names=NA)
head(out$table)
```

We similarly identify DE genes between control and overexpressing cells.
In this case, we compare C10 and E6 to G5 and C8 controls, as the latter exhibit OE-like behaviour.
There's quite a few disagreements in the direction of DE between clones; not surprising, perhaps.

```{r}
con <- makeContrasts(C10.G5=OE_CMV_C10 - ctr_G5, C10.C8=OE_CMV_C10 - ctr_C8, 
        E6.G5=OE_CMV_E6 - ctr_G5, levels=design)
res.oe <- glmQLFTest(fit, contrast=con)
out <- topTags(res.oe, n=Inf)
out$table$logFC.E6.C8 <- out$table$logFC.E6.G5 - out$table$logFC.C10.G5 + out$table$logFC.C10.C8
out$table <- out$table[,c(1:5, 10, 6:9)] # Reordering for aesthetics.
sum(out$table$FDR <= 0.05)
sum(sign(out$table$logFC.C10.G5)==sign(out$table$logFC.C10.C8) & out$table$FDR <= 0.05)
sum(sign(out$table$logFC.E6.G5)==sign(out$table$logFC.E6.C8) & out$table$FDR <= 0.05)
sum(sign(out$table$logFC.C10.G5)==sign(out$table$logFC.E6.G5) & out$table$FDR <= 0.05)
sum(sign(out$table$logFC.C10.G5)==sign(out$table$logFC.E6.C8) & out$table$FDR <= 0.05) 
write.table(file="de_OEvCTR.tsv", out$table, sep="\t", quote=FALSE, col.names=NA)
head(out$table)
```

Out of interest, we also look at DE between the control samples, given that there might be some clonal effect there.

```{r}
con <- makeContrasts(E10.C8=ctr_E10 - ctr_C8, E10.G5=ctr_E10-ctr_G5, levels=design)
res.ko <- glmQLFTest(fit, contrast=con)
out <- topTags(res.ko, n=Inf)
sum(out$table$FDR <=0.05)
write.table(file="de_CTRvCTR.tsv", out$table, sep="\t", quote=FALSE, col.names=NA)
head(out$table)
```

# Looking for subpopulation structures within each group

## Computing the variances within each group

We compute the biological component of the variance for each group of cells.
As we don't have working spikes, we assume that the variability of most genes is dominated by technical effects.
Thus, an abundance-dependent trend fitted to the variances of the majority of genes can be used to represent the technical component.
We use an experiment-wide trend to avoid loss of power with group-specific trends when one group is systematically more variable than the others.

```{r}
design <- model.matrix(~0 + groupings)
fit <- trendVar(exprs(sce), design=design, trend="loess")
dec <- decomposeVar(exprs(sce), fit)
plot(dec$mean, dec$total, pch=16, col="grey")
points(dec$mean, dec$tech, col="dodgerblue", pch=16)
getDec <- function(group) {
    i <- grep(group, groupings)
    design <- design[i,]
    QR <- qr(design)
    design <- design[,QR$pivot[seq_len(QR$rank)]]
    dec <- decomposeVar(exprs(sce)[,i], fit, design=design)
    top.hvgs <- order(dec$bio, decreasing=TRUE)[1:200]
    return(list(var=dec, cor=correlatePairs(sce, subset.row=top.hvgs),
                df=nrow(design)-ncol(design)))
#    y <- convertTo(sce[,i], type="edgeR")
#    y <- estimateDisp(y, design, prior.df=0)
#    data.frame(bio=y$tagwise.dispersion-y$trended.dispersion)
}
```

We run through all groups and get their biological components.

```{r}
c8.out <- getDec("ctr C8")
c8.var <- c8.out$var
c8.cor <- c8.out$cor
e10.out <- getDec("ctr E10")
e10.var <- e10.out$var
e10.cor <- e10.out$cor
g5.out <- getDec("ctr G5")
g5.var <- g5.out$var
g5.cor <- g5.out$cor
ki.out <- getDec("KI R306C")
ki.var <- ki.out$var
ki.cor <- ki.out$cor
d10.out <- getDec("KO D10")
d10.var <- d10.out$var
d10.cor <- d10.out$cor
h4.out <- getDec("KO H4")
h4.var <- h4.out$var
h4.cor <- h4.out$cor
c10.out <- getDec("OE CMV C10")
c10.var <- c10.out$var
c10.cor <- c10.out$cor
e6.out <- getDec("OE CMV E6")
e6.var <- e6.out$var
e6.cor <- e6.out$cor
```

<!--
We plot all the trends together to see how they compare.
It doesn't seem like KO increases or decreases the variance consistently relative to its control (E10).
Similarly, OE doesn't have a consistent effect relative to their controls.

```{r, eval=FALSE}
plot(0, 0, xlab="Mean", ylab="Variance", xlim=c(0, 10), ylim=c(0, 10), type="n")
points(c8.var$mean, c8.var$tech, col=all.cols[1], pch=16, cex=0.5)
points(e10.var$mean, e10.var$tech, col=all.cols[2], pch=16, cex=0.5)
points(g5.var$mean, g5.var$tech, col=all.cols[3], pch=16, cex=0.5)
points(d10.var$mean, d10.var$tech, col=all.cols[5], pch=16, cex=0.5)
points(h4.var$mean, h4.var$tech, col=all.cols[6], pch=16, cex=0.5)
points(c10.var$mean, c10.var$tech, col=all.cols[7], pch=16, cex=0.5)
points(e6.var$mean, e6.var$tech, col=all.cols[8], pch=16, cex=0.5)
all.names <- c("ctr C8", "ctr E10", "ctr G5", "KO D10", "KO H4", "OE CMV C10", "OE CMV E6")
legend("topright", col=all.cols[-4], pch=16, legend=all.names)
```
-->

We have a look at a boxplot to see whether the groups exhibit any change in their variance.
We look at all genes with their total variance, as well as the top 200 most highly variable genes in each group.
It doesn't seem like KO samples exhibit increases or decreases in the variance that are consistent relative to its control (E10).
Similarly, the OE variances aren't consistently larger or smaller than their controls.

```{r, fig.width=10, fig.height=5}
par(mfrow=c(1,2))
toplot <- list(c8.var$total, e10.var$total, g5.var$total, d10.var$total,
    h4.var$total, ki.var$total, c10.var$total, e6.var$total)
all.names <- c("ctr C8", "ctr E10", "ctr G5", "KO D10", "KO H4", "KI", "OE CMV C10", "OE CMV E6")
names(toplot) <- all.names    
boxplot(toplot, ylab="Variance", las=2)
toplot <- lapply(toplot, function(x) { x[order(x, decreasing=TRUE)[1:200]] })
boxplot(toplot, ylab="Variance (top 200)", las=2)
```

We repeat the dose using the biological components.
Similar results.

```{r, fig.width=10, fig.height=5}
par(mfrow=c(1,2))
toplot <- list(c8.var$bio, e10.var$bio, g5.var$bio, d10.var$bio,
    h4.var$bio, ki.var$bio, c10.var$bio, e6.var$bio)
names(toplot) <- all.names    
boxplot(toplot, ylab="Biological component", las=2)
toplot <- lapply(toplot, function(x) { x[order(x, decreasing=TRUE)[1:200]] })
boxplot(toplot, ylab="Biological component (top 200)", las=2)
```

We also compute a batch-corrected set of expression values for plotting purposes.
Note that, as a result, all of the different treatments will pile on top of each other.
This is fine if we're interested in looking at variability within the treatment, but it means that it makes little sense to plot across treatments.

```{r}
library(limma)
norm_exprs(sce) <- removeBatchEffect(exprs(sce), groupings)
```

## Repeating with the coefficient of variation

We can also do this with the CV^2^, for what it's worth.
We compute a central mean-CV trend using all groups to account for the mean dependency of the variance due to technical effects.
For each group, we then compute the log-fold change of CV^2^ from the trend, using only on group-specific expression values.

```{r}
ex <- t(t(counts(sce))/sizeFactors(sce))
ex <- removeBatchEffect(ex, groupings)
mu <- rowMeans(ex)
sigma2 <- apply(ex, 1, var)
cv2 <- sigma2/mu^2
plot(mu, cv2, log="xy", xlab="Mean", ylab="CV2")
fit <- lowess(x=log(mu), y=log(cv2))
lines(exp(fit$x), exp(fit$y), col="dodgerblue")
apfun <- approxfun(x=exp(fit$x), y=exp(fit$y), rule=2)
getLRCV <- function(group) {
    i <- grep(group, groupings)
    ex <- ex[,i]
    mu <- rowMeans(ex)
    sigma2 <- apply(ex, 1, var)
    cv2 <- sigma2/mu^2
    log(cv2/apfun(mu))
}
```

We compute the LRCV-like statistic for all genes in each group.

```{r}
c8.lrcv <- getLRCV("ctr C8")
e10.lrcv <- getLRCV("ctr E10")
g5.lrcv <- getLRCV("ctr G5")
ki.lrcv <- getLRCV("KI R306C")
d10.lrcv <- getLRCV("KO D10")
h4.lrcv <- getLRCV("KO H4")
c10.lrcv <- getLRCV("OE CMV C10")
e6.lrcv <- getLRCV("OE CMV E6")
```

Finally, we make a boxplot of these values to compare variability across groups.

```{r, fig.width=10, fig.height=5}
par(mfrow=c(1,2))
toplot <- list(c8.lrcv, e10.lrcv, g5.lrcv, d10.lrcv, h4.lrcv, ki.lrcv, c10.lrcv, e6.lrcv)
names(toplot) <- all.names    
boxplot(toplot, ylab="Log-ratio of CV2", las=2, ylim=median(toplot[[1]]) + c(-6,6)*mad(toplot[[1]]))
toplot <- lapply(toplot, function(x) { x[order(x, decreasing=TRUE)[1:200]] })
boxplot(toplot, ylab="Log-ratio of CV2 (top 200)", las=2)
```

## Examining subpopulations in the control samples

We use genes that are highly variable and significantly correlated in each group to construct a PCA plot for all control cells.
This will allow us to highlight any substructure in the plot.
There seems to be around three obvious subpopulations here, supported by all three cell lines.

```{r}
is.sig <- c8.cor$FDR <= 0.05
to.use.c8 <- union(c8.cor$gene1[is.sig], c8.cor$gene2[is.sig])
is.sig <- e10.cor$FDR <= 0.05
to.use.e10 <- union(e10.cor$gene1[is.sig], e10.cor$gene2[is.sig])
is.sig <- g5.cor$FDR <= 0.05
to.use.g5 <- union(g5.cor$gene1[is.sig], g5.cor$gene2[is.sig])
to.use <- unique(c(to.use.c8, to.use.e10, to.use.g5))
plotPCA(sce[,grep("ctr (C8|E10|G5)", groupings)], feature_set=to.use, colour_by="Treatment", exprs="norm_exprs")
```

## Examining subpopulations in the KO samples

We repeat this procedure for the KO cell lines (as well as the KI cell line).
This is less consistent between cell lines -- D10 has some clear substructure, whereas H4 does not.

```{r}
is.sig <- d10.cor$FDR <= 0.05
to.use.d10 <- union(d10.cor$gene1[is.sig], d10.cor$gene2[is.sig])
is.sig <- h4.cor$FDR <= 0.05
to.use.h4 <- union(h4.cor$gene1[is.sig], h4.cor$gene2[is.sig])
is.sig <- ki.cor$FDR <= 0.05
to.use.ki <- union(ki.cor$gene1[is.sig], ki.cor$gene2[is.sig])
to.use <- unique(c(to.use.d10, to.use.h4, to.use.ki))
use.libs <- grepl("KO (D10|H4)\\|", groupings) | grepl("KI", groupings) 
plotPCA(sce[,use.libs], feature_set=to.use, colour_by="Treatment", exprs="norm_exprs")
```

## Examining subpopulations in the overexpressing samples

We repeat this for the overexpressing cell lines.
Again, this isn't very consistent across cell lines.

```{r}
is.sig <- c10.cor$FDR <= 0.05
to.use.c10 <- union(c10.cor$gene1[is.sig], c10.cor$gene2[is.sig])
is.sig <- e6.cor$FDR <= 0.05
to.use.e6 <- union(e6.cor$gene1[is.sig], e6.cor$gene2[is.sig])
to.use <- unique(c(to.use.c10, to.use.e6))
plotPCA(sce[,grep("OE CMV (C10|E6)", groupings)], feature_set=to.use, colour_by="Treatment", exprs="norm_exprs")
```

# Identifying differentially variable genes

## Between KO and control

We limit ourselves to the genes that don't show evidence of DE between control and KO.
Such genes are more likely to mediate their effects via a change in average expression than variable transcription.
(There is a subtlety here, in that the genes with no evidence for DE are not necessarily non-DE genes.
A strict requirement for non-DE would likely remove all interesting genes as it is rare to get a change in variability without some change in expression.)

```{r}
de.results <- read.table("de_KOvCTR.tsv", header=TRUE)
not.de <- rownames(sce) %in% de.results$Ensembl[de.results$FDR > 0.05]
sum(not.de)
```

We identify the genes that increase in variability upon KO as those where the variance in both KO groups is greater than any variance in the control groups.
The opposite is applied to identify those genes that decrease in variability upon KO.
No big difference in numbers, though.

```{r}
go.up <- pmin(h4.var$total, d10.var$total) > e10.var$total + 1 & not.de
sum(go.up)
go.down <- pmax(h4.var$total, d10.var$total) + 1 < e10.var$total & not.de
sum(go.down)
```

We go through and construct a list of DVGs in both directions.
We order by the average difference in the totallogical components, to prioritize genes with strong increases/decreases in the variability.

```{r}
combined <- data.frame(row.names=rownames(sce), Symbol=fData(sce)$Symbol,
    E10=e10.var$total, D10=d10.var$total, H4=h4.var$total,
    Diff=(d10.var$total + h4.var$total)/2 - e10.var$total)
combined.up <- combined[go.up,]
combined.up <- combined.up[order(combined.up$Diff, decreasing=TRUE),]
write.table(file="dv_KOvCTR_up.tsv", combined.up, sep="\t", quote=FALSE, col.names=NA) 
head(combined.up)
```

We have a look at the top set of genes that increase in variability.

```{r}
library(vioplot)
chosen <- grepl("(E10|D10|H4)", sce$Treatment) & !grepl("cells", sce$Treatment)
for (g in rownames(combined.up)[1:5]) { 
    current <- split(exprs(sce)[g,chosen], factor(sce$Treatment[chosen]))
    do.call(vioplot, c(list(x=current[[1]]), current[-1], list(names=names(current))))
    title(main=g)
}
library(vioplot)
chosen <- grepl("(E10|D10|H4)", sce$Treatment) & !grepl("cells", sce$Treatment)
for (g in rownames(combined.up)[1:5]) { 
    current <- split(exprs(sce)[g,chosen], factor(sce$Treatment[chosen]))
    do.call(vioplot, c(list(x=current[[1]]), current[-1], list(names=names(current))))
    title(main=g)
}
```

Repeating for the opposite direction, i.e., decrease in variability upon KO.

```{r}
combined.down <- combined[go.down,]
combined.down <- combined.down[order(combined.down$Diff, decreasing=TRUE),] 
write.table(file="dv_KOvCTR_down.tsv", combined.down, sep="\t", quote=FALSE, col.names=NA) 
head(combined.down)
```

Looking at the genes.

```{r}
for (g in rownames(combined.down)[1:5]) { 
    current <- split(exprs(sce)[g,chosen], factor(sce$Treatment[chosen]))
    do.call(vioplot, c(list(x=current[[1]]), current[-1], list(names=names(current))))
    title(main=g)
}
```

## Between control and overexpressing cells

We use a similar approach to identify genes that increase or decrease in expression upon overexpression.
Again, we remove genes that are DE between overexpressing cells and the controls.

```{r}
de.results <- read.table("de_OEvCTR.tsv", header=TRUE)
not.de <- rownames(sce) %in% de.results$Ensembl[de.results$FDR > 0.05]
sum(not.de)
```

We identify genes with altered variability if the variance in all of the OE groups is greater than that of all of the control groups by at least 1, or vice versa.

```{r}
go.up <- pmax(c8.var$total, g5.var$total) +1 < pmin(E6=e6.var$total, C10=c10.var$total) & not.de
sum(go.up)
go.down <- pmin(c8.var$total, g5.var$total) > pmax(E6=e6.var$total, C10=c10.var$total) + 1 & not.de
sum(go.down)
```

Constructing and saving the list of DVGs in both directions.
First, going up:

```{r}
combined <- data.frame(row.names=rownames(sce), Symbol=fData(sce)$Symbol,
    C8=c8.var$total, G5=g5.var$total, E6=e6.var$total, C10=c10.var$total, 
    Diff=(c8.var$total + g5.var$total)/2 - (c10.var$total + e6.var$total)/2)
combined.up <- combined[go.up,]
combined.up <- combined.up[order(combined.up$Diff),]
write.table(file="dv_OEvCTR_up.tsv", combined.up, sep="\t", quote=FALSE, col.names=NA) 
head(combined.up)
```

Looking at the genes.

```{r}
chosen <- grepl("(C8|G5|E6|C10)", sce$Treatment) & !grepl("cells", sce$Treatment)
for (g in rownames(combined.up)[1:5]) {
    current <- split(exprs(sce)[g,chosen], factor(sce$Treatment[chosen]))
    do.call(vioplot, c(list(x=current[[1]]), current[-1], list(names=names(current))))
    title(main=g)
}
```

Now going down.

```{r}
combined.down <- combined[go.down,]
combined.down <- combined.down[order(combined.down$Diff, decreasing=TRUE),]
write.table(file="dv_OEvCTR_down.tsv", combined.down, sep="\t", quote=FALSE, col.names=NA) 
head(combined.down)
```

Again, looking at the genes.

```{r}
for (g in rownames(combined.down)[1:5]) { 
    current <- split(exprs(sce)[g,chosen], factor(sce$Treatment[chosen]))
    do.call(vioplot, c(list(x=current[[1]]), current[-1], list(names=names(current))))
    title(main=g)
}
```

<!--
## Intersecting the KO and OE lists

Finally, we intersect the KO up/OE down and KO down/OE up lists.
The idea is that any effects of knock-down should be reversed by overexpression.

```{r, eval=FALSE}
total.combined <- data.frame(row.names=rownames(sce), Symbol=fData(sce)$Symbol,
    C8.Bio=c8.out$bio, G5.Bio=g5.out$bio,
    D10.Bio=d10.out$bio, H4.Bio=h4.out$bio, C10.Bio=c10.out$bio, 
    Diff=(d10.out$bio+h4.out$bio)/2-(c10.out$bio)/2)
ko.up <- intersect(rownames(read.table("KO_up.tsv")), rownames(read.table("OE_down.tsv")))
length(ko.up)
total.up <- total.combined[ko.up,]
total.up <- total.up[order(total.up$Diff, decreasing=TRUE),]
write.table(file="dv_all_up.tsv", total.up, sep="\t", quote=FALSE, col.names=NA)
head(total.up)
ko.down <- intersect(rownames(read.table("KO_down.tsv")), rownames(read.table("OE_up.tsv")))
length(ko.down)
total.down <- total.combined[ko.down,]
total.down <- total.down[order(total.down$Diff),]
write.table(file="dv_all_down.tsv", total.down, sep="\t", quote=FALSE, col.names=NA)
head(total.down)
```
-->

# Session information

```{r}
saveRDS(file="object.rds", sce)
sessionInfo()
```