#############################################################
# Get the length and GC content of the ERCCs.

library(Biostrings)
epath <- "/lustre/jmlab/resources/genomes/sequences/spikes/ERCC92.fa"
erccs <- readDNAStringSet(epath)

elen <- width(erccs)
edead <- DNAStringSet(sub("A+$", "", as.character(erccs)))
tab <- alphabetFrequency(edead)
egc <- rowSums(tab[,c("G", "C")])/rowSums(tab)

# Get the length and GC content of the SIRVs (these need to be processed into transcripts).

spath <- "/lustre/jmlab/resources/genomes/sequences/spikes/SIRV_150601a.fasta"
raw.sirvs <- readDNAStringSet(spath)

library(rtracklayer)
sirv.gtf <- import("/lustre/jmlab/resources/annotation/original/SIRV_C_150601a.gtf")

library(BSgenome)
sub.sirvs <- getSeq(raw.sirvs, sirv.gtf)
by.transcript <- split(sub.sirvs, sirv.gtf$transcript_id)
sirvs <- DNAStringSet(unlist(lapply(by.transcript, paste0, collapse="")))

slen <- width(sirvs)
sdead <- DNAStringSet(sub("A+$", "", as.character(sirvs)))
tab <- alphabetFrequency(sdead)
sgc <- rowSums(tab[,c("G", "C")])/rowSums(tab)

# Get the length and GC content of endogenous transcripts.

library(BSgenome.Mmusculus.UCSC.mm10)
mm10.gtf <- import("/lustre/jmlab/resources/annotation/processed/mm10.gtf")
chosen.mm10.gtf <- mm10.gtf[grep("^chr", seqnames(mm10.gtf))]
sub.mm10 <- getSeq(BSgenome.Mmusculus.UCSC.mm10, chosen.mm10.gtf)
all.transcript <- split(sub.mm10, chosen.mm10.gtf$transcript_id)

set.seed(1000) # Picking 2000 transcripts to show, otherwise this takes too long.
by.transcript <- all.transcript[sample(length(all.transcript), 2000)]
mm10 <- DNAStringSet(unlist(lapply(by.transcript, paste0, collapse="")))

mlen <- width(mm10)
tab <- alphabetFrequency(mm10)
mgc <- rowSums(tab[,c("G", "C")])/rowSums(tab)

# Making a pretty plot of biophysical characteristics.

pdf("comparison.pdf")
cols <- c("dodgerblue", "salmon", "orange")
boxplot(list(elen/1e3, slen/1e3, mlen/1e3), ylab="Length (kbp)", col=cols, log="y", cex.axis=0.9, cex.lab=1.4, names=rep("", 3))
axis(1, at=1:3, labels=c("ERCC", "SIRV", "mm10"), cex.axis=1.2)
boxplot(list(ERCC=egc*100, SIRV=sgc*100, mm10=mgc*100), ylab="GC content (%)", col=cols, cex.axis=1.2, cex.lab=1.4, cex.names=1.4)
dev.off()