---
title: Preparing to correct batch effects in single-cell RNA-seq data
author: Aaron T. L. Lun
output: 
  BiocStyle::html_document:
    fit_caption: false
---

```{r style, echo=FALSE, results='hide', message=FALSE}
library(BiocStyle)
library(knitr)
opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE)
#options(bitmapType="cairo", width=100) # if transparencies don't work on your machine.
```

# Setting up the Nestorowa data

## Data input

We download the counts from NCBI GEO corresponding to https://doi.org/10.1182/blood-2016-05-716480.

```{r}
fnameN <- "GSE81682_HTSeq_counts.txt.gz"
download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE81682&format=file&file=GSE81682%5FHTSeq%5Fcounts%2Etxt%2Egz", 
    fnameN)
```

We load in the counts:

```{r}
dataN <- read.table(fnameN, header=TRUE, row.names=1, check.names=FALSE)
dataN <- as.matrix(dataN)
```

We convert this to a sparse matrix to save space:

```{r}
library(Matrix)
dataN <- as(dataN, "dgCMatrix")
dim(dataN)
```

... and we get rid of non-genes at the end:


```{r}
dataN <- dataN[!grepl("^_", rownames(dataN)),]
dim(dataN)
```

We construct a `SingleCellExperiment` object:

```{r}
library(SingleCellExperiment)
sceN <- SingleCellExperiment(list(counts=dataN))
```

... and throw in some spike-in information:

```{r}
isSpike(sceN, "ERCC") <- grepl("^ERCC", rownames(sceN))
```

We also add information about the cell type:

```{r}
sceN$CellType <- sub("_.*", "", colnames(sceN))
```

## Normalizing for cell-specific biases

The authors have already done the quality control for us, so we skip straight to the normalization.
First, we break up the dataset with `quickCluster`:

```{r}
library(scran)
clustersN <- quickCluster(sceN, method="igraph")
table(clustersN)
```

Then we apply the deconvolution method:

```{r}
sceN <- computeSumFactors(sceN, clusters=clustersN)
summary(sizeFactors(sceN))
```

We plot the size factors against the library sizes as a sanity check:

```{r}
plot(Matrix::colSums(counts(sceN)), sizeFactors(sceN), log="xy")
```

We also generate size factors for the spike-ins separately.
Remember, it's fine that the spike-in size factors are not well-correlated to the deconvolution size factors.

```{r}
sceN <- computeSpikeFactors(sceN, general.use=FALSE, type="ERCC")
plot(sizeFactors(sceN), sizeFactors(sceN, "ERCC"), log="xy")
```

Finally, we compute log-normalized expression values:

```{r}
sceN <- normalize(sceN)
```

## Detecting highly variable genes

Let's pick out some highly variable genes.

```{r}
fitN <- trendVar(sceN, parametric=TRUE, loess.args=list(span=0.2))
decN <- decomposeVar(sceN, fitN)
head(decN)
```

Having a look at them on the plot:

```{r}
plot(decN$mean, decN$total, xlab="Mean log-expression",
    ylab="Variance of log-expression")
points(fitN$mean, fitN$var, col="red", pch=16)
curve(fitN$trend(x), col="red", add=TRUE)
```

```{r, echo=FALSE}
# Clearing the garbage to free up memory
gc()
```

# Setting up the Paul data

## Data input

We download the counts from NCBI GEO corresponding to https://doi.org/10.1016/j.cell.2015.11.013.

```{r}
fnameP <- "umitab_Amit.txt.gz"
download.file("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE72857&format=file&file=GSE72857%5Fumitab%2Etxt%2Egz", 
    fnameP)
```

We load in the counts:

```{r}
dataP <- read.table(fnameP, header=TRUE, row.names=1, check.names=FALSE)
dataP <- as.matrix(dataP)
```

We convert this to a sparse matrix, and take a random set of 2000 cells for the sake of speed:

```{r}
set.seed(0)
chosen <- sample(ncol(dataP), 2000)
dataP <- as(dataP[,chosen], "dgCMatrix")
dim(dataP)
```

We convert the gene names back to Ensembl:

```{r}
library(org.Mm.eg.db)
symbols <- sub(".*;", "", rownames(dataP))
ens <- mapIds(org.Mm.eg.db, keys=symbols, keytype="SYMBOL", column="ENSEMBL")
```

We remove duplicates or genes without any Ensembl:

```{r}
keep <- !is.na(ens) & !duplicated(ens)
summary(keep)
dataP <- dataP[keep,]
rownames(dataP) <- unname(ens[keep])
```

We now construct a `SingleCellExperiment` object:

```{r}
sceP <- SingleCellExperiment(list(counts=dataP))
```

## Removing low-quality cells

We will remove some low quality cells here:

```{r}
library(scater)
sceP <- calculateQCMetrics(sceP)
multiplot(cols=2,
    plotColData(sceP, "log10_total_counts"),
    plotColData(sceP, "log10_total_features_by_counts"))
```

We'll apply our outlier strategy:

```{r}
low.total <- isOutlier(sceP$log10_total_counts, type="lower", nmads=3)
low.nexpr <- isOutlier(sceP$log10_total_features_by_counts, type="lower", nmads=3)
discard <- low.total | low.nexpr
data.frame(LowLib=sum(low.total), LowNexprs=sum(low.nexpr), Lost=sum(discard))
```

... and discard the offending cells:

```{r}
sceP <- sceP[,!discard]
```

## Normalizing for cell-specific biases

Again, we break up the dataset with `quickCluster` - note `min.mean` is set lower than the default, as we are dealing with UMI counts.

```{r}
library(scran)
clustersP <- quickCluster(sceP, method="igraph", min.mean=0.1)
table(clustersP)
```

Then we apply the deconvolution method:

```{r}
sceP <- computeSumFactors(sceP, clusters=clustersP, min.mean=0.1)
summary(sizeFactors(sceP))
```

We plot the size factors against the library sizes as a sanity check:

```{r}
plot(Matrix::colSums(counts(sceP)), sizeFactors(sceP), log="xy")
```

Finally, we compute log-normalized expression values:

```{r}
sceP <- normalize(sceP)
```

## Detecting highly variable genes

Let's pick out some highly variable genes. 
There aren't any spike-ins, so we'll have to use `use.spikes=FALSE`.

```{r}
fitP <- trendVar(sceP, use.spikes=FALSE, loess.args=list(span=0.01))
decP <- decomposeVar(sceP, fitP)
head(decP)
```

Having a look at them on the plot:

```{r}
plot(decP$mean, decP$total, xlab="Mean log-expression",
    ylab="Variance of log-expression")
curve(fitP$trend(x), col="blue", add=TRUE)
```

# Cleaning up

We clean up after ourselves by deleting the two source files.

```{r}
unlink(fnameN)
unlink(fnameP)
```

We also save the objects for further use.

```{r}
save(file="objects.Rda", sceN, sceP, decN, decP, fitN, fitP)
```

Finally, as this procedure was somewhat involved, we will report the session information as well.

```{r}
sessionInfo()
```