--- title: Differential expression analysis in hESCs after silencing HUSH components author: Aaron Lun date: 7 November 2016 output: html_document: fig_caption: false --- ```{r, echo=FALSE, results='hide'} knitr::opts_chunk$set(error=FALSE, message=FALSE, fig.path="figure-de/") knitr::opts_knit$set(width=100) options(width=100) ``` # Background This project involves using shRNAs to silence components of the HUSH complex in human embryonic stem cells. Data were generated by Dan Greaves in the Lehner group (CIMR) and obtained from the CI Genomics Core servers on October 24, 2016. 125 bp paired-end reads were uniquely aligned to the hg38 build of the human genome using `subread`. Reads were counted into exonic regions of genes in the ENSEMBL GRCh38.83 annotation _and_ into repeat calls from the RepeatMasker software, using `featureCounts` with stringent settings (minimum mapping quality of 10, both reads in the pair mapped to the same feature). To reduce file size, repeat elements with fewer than 10 counts across all samples were removed. ```{r} ncols <- length(strsplit(readLines("genic_subcounts.tsv.gz", n=1), "\t")[[1]]) all.counts <- read.table("genic_subcounts.tsv.gz", row.names=1, header=TRUE, colClasses=rep(c("character", "integer"), c(1, ncols-1))) gene.length <- all.counts[,1] all.counts <- all.counts[,-1] dim(all.counts) ``` We collapse redundant technical replicates by summing their counts. ```{r} library(edgeR) pattern <- "HF[A-Z0-9]+\\.s_[0-9]+\\.r" bio.rep <- sub(pattern, "", colnames(all.counts)) all.counts <- sumTechReps(all.counts, bio.rep) dim(all.counts) ``` ```{r, results="hide", echo=FALSE} gc() ``` We report the mapping statistics for each file. ```{r} mapstat <- read.table("my_qual.tsv", header=TRUE) mapstat <- aggregate(mapstat, by=list(sub(pattern, "", rownames(mapstat))), FUN=sum) rownames(mapstat) <- mapstat[,1] actual.totals <- colSums(all.counts) totals <- rowSums(mapstat[,-1]) mapped <- totals - mapstat$Unassigned_Unmapped - mapstat$Unassigned_MappingQuality data.frame(Total=actual.totals, Mapped=mapped/totals, Genic=mapstat$Assigned/totals) ``` We determine the experimental design for this well layout. ```{r} wellpos <- sub(".*(D[0-9]+)_(D[0-9]+)\\..*", "\\1-\\2", colnames(all.counts)) metadata <- read.csv("contents.csv", header=TRUE, stringsAsFactors=FALSE) m <- match(wellpos, metadata$Barcode) stopifnot(all(!is.na(m))) groups <- metadata$Sample[m] groups <- sub("[0-9]([a-z]+)$", "\\1", groups) table(groups) ``` We set up an appropriate design matrix to reflect the groupings for each sample. We use a one-way layout without an intercept, for simplicity. ```{r} groups <- factor(groups) design <- model.matrix(~0 + groups) colnames(design) <- levels(groups) colnames(design) ``` We also identify which rows correspond to genes, and which correspond to repeat elements. As these can have quite different properties, we shall analyze them separately. ```{r} is.ens <- grepl("^ENS", rownames(all.counts)) sum(is.ens) ``` # Setting up the analysis of the genes ## Filtering and normalization Our first job is to filter out the low-abundance genes. We define these as those with an average count-per-million (CPM) less than 1. This is equivalent to an average count of around 5-10, given the library sizes involved here. ```{r} gy <- DGEList(all.counts[is.ens,], group=groups) keep <- aveLogCPM(gy) > 1 gy <- gy[keep,] sum(keep) ``` We proceed to normalize using the TMM normalization. This removes any biases between libraries due to differences in RNA composition. Sequencing depth is automatically normalized and does not need to be considered here. In general, the normalization factors are fairly mild which suggests that not much normalization was required. ```{r} gy <- calcNormFactors(gy) gy$samples ``` ## Data exploration and annotation We can have a look at the similarities and differences between samples. We colour each sample by its group and construct a MDS plot. Libraries that are close together on the plot are more similar. We would expect that replicates would cluster into distinct groups, though we don't really observe this here. ```{r, fig.width=9, fig.height=6} by.protein <- factor(sub("sh", "", sub("(plus|off|no)tet", "", levels(groups)))) by.type <- factor(sub(".*(plus|off|no)", "\\1", levels(groups))) colors <- rainbow(nlevels(by.protein)) pchs <- c(16, 4, 2) par(mar=c(5.1, 4.1, 1.1, 11.1), xpd=TRUE) plotMDS(cpm(gy, log=TRUE, prior.count=3), pch=pchs[by.type[groups]], col=colors[by.protein[groups]]) legend("topright", legend=levels(groups), col=colors[by.protein], inset=c(-0.4,0.1), pch=pchs[by.type]) ``` First, we convert the ENSEMBL identifiers into gene symbols that are more informative. ```{r} library(org.Hs.eg.db) anno <- select(org.Hs.eg.db, keys=rownames(gy$counts), columns="SYMBOL", keytype="ENSEMBL") m <- match(rownames(gy$counts), anno$ENSEMBL) gy$genes <- anno[m,] rownames(gy$counts) <- NULL head(gy$genes) ``` ## Log-transforming for linear modelling We then use the `voom` framework to convert the counts into a suitable format for linear modelling. This fits a mean-variance trend and computes precision weights to account for the greater precision (relative to the mean) for large counts. We also compute sample-specific quality weights to downweight outliers in each group. ```{r, fig.width=10, fig.height=6} gv <- voomWithQualityWeights(gy, design, plot=TRUE) ``` We go on to fit a linear model to the data. We use empirical Bayes shrinkage to stabilize the variance estimates -- the extent of shrinkage is determined by the prior degrees of freedom shown below. We perform robust shrinkage to reduce the effect of outliers. ```{r} gfit <- lmFit(gv, design) gfit <- eBayes(gfit, robust=TRUE) summary(gfit$df.prior) ``` As a diagnostic, we estimate the NB dispersion across replicates. Values of around 0.05 to 0.1 should be expected in well-replicated studies. This is consistent with what we observ here (note that the square-root of the dispersion is shown below). ```{r} gy <- estimateDisp(gy, design) plotBCV(gy, ylim=c(0, 0.5)) ``` ## Performing the differential analyses for periphilin knockdown ### Testing for differential induction response We have a look at the expression profile for periphilin. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000134283",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of periphilin. In other words, we look for genes where the effect of induction differs between the scrambled control and the periphilin-KD samples. ```{r} con <- makeContrasts(Target=shPHLplustet - shPHLnotet, Control=shControlplustet - shControlnotet, logFC=(shPHLplustet - shPHLnotet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_PHL_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and periphilin levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shPHLofftet - shPHLplustet, Control=shControlofftet - shControlplustet, logFC=(shPHLofftet - shPHLplustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_PHL_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_PHL_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ## Performing the differential analyses for TASOR knockdown ### Testing for differential induction response We have a look at the expression profile for TASOR. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000163946",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of TASOR. In other words, we look for genes where the effect of induction differs between the scrambled control and the TASOR-KD samples. ```{r} con <- makeContrasts(Target=shTASORplustet - shTASORnotet, Control=shControlplustet - shControlnotet, logFC=(shTASORplustet - shTASORnotet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_TASOR_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and TASOR levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shTASORofftet - shTASORplustet, Control=shControlofftet - shControlplustet, logFC=(shTASORofftet - shTASORplustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_TASOR_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_TASOR_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ## Performing the differential analyses for SETDB knockdown ### Testing for differential induction response We have a look at the expression profile for SETDB. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000143379",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of SETDB. In other words, we look for genes where the effect of induction differs between the scrambled control and the SETDB-KD samples. ```{r} con <- makeContrasts(Target=shSETDBplustet - shSETDBnotet, Control=shControlplustet - shControlnotet, logFC=(shSETDBplustet - shSETDBnotet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_SETDB_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and SETDB levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shSETDBofftet - shSETDBplustet, Control=shControlofftet - shControlplustet, logFC=(shSETDBofftet - shSETDBplustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_SETDB_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_SETDB_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ## Performing the differential analyses for MPP8 knockdown ### Testing for differential induction response We have a look at the expression profile for MPP8. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000196199",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of MPP8. In other words, we look for genes where the effect of induction differs between the scrambled control and the MPP8-KD samples. ```{r} con <- makeContrasts(Target=shMPP8plustet - shMPP8notet, Control=shControlplustet - shControlnotet, logFC=(shMPP8plustet - shMPP8notet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_MPP8_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and MPP8 levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shMPP8offtet - shMPP8plustet, Control=shControlofftet - shControlplustet, logFC=(shMPP8offtet - shMPP8plustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_MPP8_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_MPP8_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ## Performing the differential analyses for KAP knockdown ### Testing for differential induction response We have a look at the expression profile for KAP. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000130726",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of KAP. In other words, we look for genes where the effect of induction differs between the scrambled control and the KAP-KD samples. ```{r} con <- makeContrasts(Target=shKAPplustet - shKAPnotet, Control=shControlplustet - shControlnotet, logFC=(shKAPplustet - shKAPnotet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_KAP_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and KAP levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shKAPofftet - shKAPplustet, Control=shControlofftet - shControlplustet, logFC=(shKAPofftet - shKAPplustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_KAP_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_KAP_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ## Performing the differential analyses for MORC knockdown ### Testing for differential induction response We have a look at the expression profile for MORC2. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000133422",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of MORC. In other words, we look for genes where the effect of induction differs between the scrambled control and the MORC-KD samples. ```{r} con <- makeContrasts(Target=shMORCplustet - shMORCnotet, Control=shControlplustet - shControlnotet, logFC=(shMORCplustet - shMORCnotet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_MORC_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and MORC levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shMORCofftet - shMORCplustet, Control=shControlofftet - shControlplustet, logFC=(shMORCofftet - shMORCplustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_MORC_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_MORC_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ## Performing the differential analyses for ATF7IP knockdown ### Testing for differential induction response We have a look at the expression profile for ATF7IP. Ideally, the shRNA'd samples should show a decrease in expression values. ```{r, fig.width=10, fig.height=6} adj <- cpm(gy[gy$genes$ENSEMBL=="ENSG00000171681",]) par(mar=c(10.1, 4.1, 2.1, 2.1)) plot(as.integer(groups), adj, pch=16, xaxt="n", ylab="CPM", xlab="") axis(1, at=seq_len(nlevels(groups)), label=levels(groups), las=2) ``` We test for differences in the response to induction upon KD of ATF7IP. In other words, we look for genes where the effect of induction differs between the scrambled control and the ATF7IP-KD samples. ```{r} con <- makeContrasts(Target=shATF7IPplustet - shATF7IPnotet, Control=shControlplustet - shControlnotet, logFC=(shATF7IPplustet - shATF7IPnotet) - (shControlplustet - shControlnotet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.dr <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.dr$P.Value <- extra$P.Value out.dr$adj.P.Val <- extra$adj.P.Val out.dr$F <- NULL o <- order(out.dr$P.Value) write.table(out.dr[o,], file="knockdown_ATF7IP_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` ### Testing for recovery off tetracyclin When tetracyclin is removed, shRNA is no longer produced and ATF7IP levels should recover. We test for genes where there is any DE between the induced and off-tet samples, relative to the same experiment for the scrambled control. ```{r} con <- makeContrasts(Target=shATF7IPofftet - shATF7IPplustet, Control=shControlofftet - shControlplustet, logFC=(shATF7IPofftet - shATF7IPplustet) - (shControlofftet - shControlplustet), levels=design) gfit2 <- contrasts.fit(gfit, con) gfit2 <- eBayes(gfit2, robust=TRUE) summary(decideTests(gfit2))[,3,drop=FALSE] ``` We record the DE genes. ```{r} out.rec <- topTable(gfit2, n=Inf, sort.by="none", coef=1:3) extra <- topTable(gfit2, n=Inf, sort.by="none", coef=3) out.rec$P.Value <- extra$P.Value out.rec$adj.P.Val <- extra$adj.P.Val out.rec$F <- NULL o <- order(out.rec$P.Value) write.table(out.rec[o,], file="recovered_ATF7IP_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` We also look at the genes that respond significantly in opposite directions between the initial knockdown and recovery. Such genes are more likely to be genuine targets. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$P.Value, out.dr$P.Value) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(gfit$genes, Knockdown=out.dr$logFC, Recovery=out.rec$logFC, P.Value=pval, adj.P.Val=qval) o <- order(output$P.Value) write.table(output[o,], file="combined_ATF7IP_genes.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` # Setting up the analysis of the specific repeat elements ## Filtering, normalization and dispersion estimation We repeat the analysis on the counts for the repeat elements. We re-use the normalization factors from the genic analysis. This is because it may not be reasonable to assume that most repeats are not DE when you perturb the silencing machinery. In contrast, the assumption that most genes are not DE is potentially safer due to other layers of regulation. Also, the counts for the repeats are prohibitively low and preclude stable normalization. ```{r} ry <- DGEList(all.counts[!is.ens,], group=groups, lib.size=gy$samples$lib.size * gy$samples$norm.factors) ``` We filter to remove repeats with an average count below 5. This is less stringent than for the genic analysis, due to the fact that repeats are lowly expressed at best. We don't use `aveLogCPM` here as it requires too much memory when there are so many rows. ```{r} keep <- rowMeans(ry$counts) >= 5 ry <- ry[keep,] ``` We proceed to estimate the NB dispersion across all repeats. It's probably unwise to use `voom` here as the range of counts is too low to accurately model the mean-variance relationship. ```{r} ry <- estimateDisp(ry, design) plotBCV(ry) ``` We use the quasi-likelihood methods to account for the uncertainty of dispersion estimation. This also involves empirical Bayes shrinkage to stabilize the repeat-specific dispersions, the prior d.f. for which is shown below. ```{r} qfit <- glmQLFit(ry, design, robust=TRUE) summary(qfit$df.prior) ``` ## Testing for differential repeat expression ### Upon periphilin knockdown and recovery We repeat the contrasts from before, but using the repeat counts this time. First, we do periphilin as shown below. ```{r} con <- makeContrasts(T=shPHLplustet - shPHLnotet, C=shControlplustet - shControlnotet, (shPHLplustet - shPHLnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_PHL_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shPHLofftet - shPHLplustet, C=shControlofftet - shControlplustet, (shPHLofftet - shPHLplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_PHL_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_PHL_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon TASOR knockdown and recovery Next, TASOR. ```{r} con <- makeContrasts(T=shTASORplustet - shTASORnotet, C=shControlplustet - shControlnotet, (shTASORplustet - shTASORnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_TASOR_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shTASORofftet - shTASORplustet, C=shControlofftet - shControlplustet, (shTASORofftet - shTASORplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_TASOR_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_TASOR_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon SETDB knockdown and recovery Next, SETDB. ```{r} con <- makeContrasts(T=shSETDBplustet - shSETDBnotet, C=shControlplustet - shControlnotet, (shSETDBplustet - shSETDBnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_SETDB_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shSETDBofftet - shSETDBplustet, C=shControlofftet - shControlplustet, (shSETDBofftet - shSETDBplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_SETDB_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_SETDB_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon MPP8 knockdown and recovery Next, MPP8. ```{r} con <- makeContrasts(T=shMPP8plustet - shMPP8notet, C=shControlplustet - shControlnotet, (shMPP8plustet - shMPP8notet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_MPP8_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shMPP8offtet - shMPP8plustet, C=shControlofftet - shControlplustet, (shMPP8offtet - shMPP8plustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_MPP8_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_MPP8_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon KAP knockdown and recovery Next, KAP. ```{r} con <- makeContrasts(T=shKAPplustet - shKAPnotet, C=shControlplustet - shControlnotet, (shKAPplustet - shKAPnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_KAP_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shKAPofftet - shKAPplustet, C=shControlofftet - shControlplustet, (shKAPofftet - shKAPplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_KAP_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_KAP_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon MORC knockdown and recovery Next, MORC. ```{r} con <- makeContrasts(T=shMORCplustet - shMORCnotet, C=shControlplustet - shControlnotet, (shMORCplustet - shMORCnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_MORC_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shMORCofftet - shMORCplustet, C=shControlofftet - shControlplustet, (shMORCofftet - shMORCplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_MORC_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_MORC_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon ATF7IP knockdown and recovery Next, ATF7IP. ```{r} con <- makeContrasts(T=shATF7IPplustet - shATF7IPnotet, C=shControlplustet - shControlnotet, (shATF7IPplustet - shATF7IPnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE repeats. ```{r} out.dr <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_ATF7IP_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shATF7IPofftet - shATF7IPplustet, C=shControlofftet - shControlplustet, (shATF7IPofftet - shATF7IPplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE repeats. ```{r} out.rec <- data.frame(Repeat=rownames(ry), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_ATF7IP_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the repeats that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(ry), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_ATF7IP_repeats.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` # Setting up the analysis of the repeat families ## Filtering, normalization and dispersion estimation We repeat the analysis on the counts for the repeat families. Again, we re-use the normalization factors from the genic analysis. ```{r} family.counts <- read.table("../Families/analysis/genic_counts.tsv", header=TRUE, row.names=1) family.counts <- family.counts[,-1] bio.rep <- sub(pattern, "", colnames(family.counts)) family.counts <- sumTechReps(family.counts, bio.rep) dim(family.counts) fy <- DGEList(family.counts, group=groups, lib.size=gy$samples$lib.size * gy$samples$norm.factors) ``` We filter to remove repeat families with an average count below 5. ```{r} keep <- rowMeans(fy$counts) >= 5 fy <- fy[keep,] ``` We proceed to estimate the NB dispersion across all families. ```{r} fy <- estimateDisp(fy, design) plotBCV(fy) ``` We use the quasi-likelihood methods to account for the uncertainty of dispersion estimation. ```{r} qfit <- glmQLFit(fy, design, robust=TRUE) summary(qfit$df.prior) ``` ## Testing for differential family expression ### Upon periphilin knockdown and recovery We family the contrasts from before, but using the family counts this time. First, we do periphilin as shown below. ```{r} con <- makeContrasts(T=shPHLplustet - shPHLnotet, C=shControlplustet - shControlnotet, (shPHLplustet - shPHLnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_PHL_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shPHLofftet - shPHLplustet, C=shControlofftet - shControlplustet, (shPHLofftet - shPHLplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_PHL_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_PHL_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon TASOR knockdown and recovery Next, TASOR. ```{r} con <- makeContrasts(T=shTASORplustet - shTASORnotet, C=shControlplustet - shControlnotet, (shTASORplustet - shTASORnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_TASOR_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shTASORofftet - shTASORplustet, C=shControlofftet - shControlplustet, (shTASORofftet - shTASORplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_TASOR_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_TASOR_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon SETDB knockdown and recovery Next, SETDB. ```{r} con <- makeContrasts(T=shSETDBplustet - shSETDBnotet, C=shControlplustet - shControlnotet, (shSETDBplustet - shSETDBnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_SETDB_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shSETDBofftet - shSETDBplustet, C=shControlofftet - shControlplustet, (shSETDBofftet - shSETDBplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_SETDB_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_SETDB_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon MPP8 knockdown and recovery Next, MPP8. ```{r} con <- makeContrasts(T=shMPP8plustet - shMPP8notet, C=shControlplustet - shControlnotet, (shMPP8plustet - shMPP8notet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_MPP8_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shMPP8offtet - shMPP8plustet, C=shControlofftet - shControlplustet, (shMPP8offtet - shMPP8plustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_MPP8_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_MPP8_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon KAP knockdown and recovery Next, KAP. ```{r} con <- makeContrasts(T=shKAPplustet - shKAPnotet, C=shControlplustet - shControlnotet, (shKAPplustet - shKAPnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_KAP_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shKAPofftet - shKAPplustet, C=shControlofftet - shControlplustet, (shKAPofftet - shKAPplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_KAP_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_KAP_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon MORC knockdown and recovery Next, MORC. ```{r} con <- makeContrasts(T=shMORCplustet - shMORCnotet, C=shControlplustet - shControlnotet, (shMORCplustet - shMORCnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_MORC_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shMORCofftet - shMORCplustet, C=shControlofftet - shControlplustet, (shMORCofftet - shMORCplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_MORC_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_MORC_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` ### Upon ATF7IP knockdown and recovery Next, ATF7IP. ```{r} con <- makeContrasts(T=shATF7IPplustet - shATF7IPnotet, C=shControlplustet - shControlnotet, (shATF7IPplustet - shATF7IPnotet) - (shControlplustet - shControlnotet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.dr <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.dr)) ``` We record the DE families. ```{r} out.dr <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.dr$table$logFC, logCPM=qres.dr$table$logCPM, PValue=qres.dr$table$PValue, row.names=rownames(qres.dr$table)) out.dr$FDR <- p.adjust(out.dr$PValue, method="BH") o <- order(out.dr$PValue) write.table(out.dr[o,], file="knockdown_ATF7IP_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.dr[o,]) ``` Removing tetracycline. ```{r} con <- makeContrasts(T=shATF7IPofftet - shATF7IPplustet, C=shControlofftet - shControlplustet, (shATF7IPofftet - shATF7IPplustet) - (shControlofftet - shControlplustet), levels=design) qres.all <- glmQLFTest(qfit, contrast=con) qres.rec <- glmQLFTest(qfit, contrast=con[,3]) summary(decideTestsDGE(qres.rec)) ``` We record the DE families. ```{r} out.rec <- data.frame(Repeat=rownames(fy), Target=qres.all$table$logFC.T, Control=qres.all$table$logFC.C, logFC=qres.rec$table$logFC, logCPM=qres.rec$table$logCPM, PValue=qres.rec$table$PValue, row.names=rownames(qres.rec$table)) out.rec$FDR <- p.adjust(out.rec$PValue, method="BH") o <- order(out.rec$PValue) write.table(out.rec[o,], file="recovered_ATF7IP_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(out.rec[o,]) ``` Now looking at the families that change in consistent directions upon knockdown and recovery. ```{r} same.sign <- sign(out.rec$logFC)==sign(out.dr$logFC) pval <- pmax(out.rec$PValue, out.dr$PValue) pval[same.sign] <- 1 qval <- p.adjust(pval, method="BH") table(sign(out.rec$logFC)[qval <= 0.05]) output <- data.frame(Repeat=rownames(fy), Knockdown=out.dr$logFC, Recovery=out.rec$logFC, PValue=pval, FDR=qval) o <- order(output$PValue) write.table(output[o,], file="combined_ATF7IP_families.tsv", sep="\t", quote=FALSE, row.names=FALSE, col.names=TRUE) head(output[o,]) ``` # Session information ```{r} saveRDS(file="object.rds", list(genes=gy, repeats=ry, families=fy)) sessionInfo() ```