########################################################################################
# Print out important bits from the metadata for Liora's lot.
#
# For each batch directory under 'fpath', this script:
#   * reads the sample prefixes from the header line of the genic count table,
#   * reads per-sample annotation from the 'samples' element of 'full.rds',
#   * pairs every file listed in 'md5.all' with its sample prefix,
#   * appends 'ln -s' commands for the FASTQ and count files to a shell script.
# The per-file records are then arranged as SDRF columns and written to 'sdrf_Liora.tsv'.

# NOTE(review): presumably required so that the object stored in 'full.rds' can be
# subsetted by column below - confirm against how that object was created.
library(edgeR)

collected <- list()
fpath <- "../Liora"

# Shell script that recreates the FASTQ/count symlinks; started afresh on every run.
relink <- "make_links_Liora.sh"
write(c("set -e", "set -u", "mkdir fastq_Liora", "cd fastq_Liora"), file=relink, ncolumns=1)

for (sample in c("test_20160906", "test_20170201")) {
    # Only the header line of the count table is needed; the first two fields are
    # skipped and the remainder are taken as the sample prefixes.
    cpath <- file.path(fpath, sample, "analysis", "genic_counts.tsv")
    all.files <- read.table(cpath, nrows=1, stringsAsFactors=FALSE, comment.char="")
    prefixes <- as.character(all.files[-c(1:2)])
    batch <- sub(".*_", "", basename(sample)) # text after the last underscore

    # Loading in the corresponding object, with columns reordered to match 'prefixes'.
    full.obj <- readRDS(file.path(fpath, sample, "analysis", "full.rds"))
    m <- match(prefixes, colnames(full.obj))
    full.obj <- full.obj[,m]

    # Adding in the metadata.
    addition.mode <- character(length(prefixes))
    addition.mode[full.obj$samples$ercc.first] <- "ERCC+SIRV"
    addition.mode[full.obj$samples$sirv.first] <- "SIRV+ERCC"
    addition.mode[full.obj$samples$premixed] <- "Premixed"
    treatment <- "not applicable"

    well.type <- rep("single cell", length(prefixes))
    well.type[full.obj$samples$control=="+"] <- "50 cells"
    well.type[full.obj$samples$control=="-"] <- "empty"

    # Adding in the MD5 sums (first column is the checksum, second is the file name).
    md5.sums <- read.table(file.path(fpath, sample, "fastq", "md5.all"), header=FALSE,
        stringsAsFactors=FALSE, comment.char="")
    md5.sums <- md5.sums[order(gsub("_", "X", md5.sums[,2])),] # weird sorting order with underscores in UTF-8

    # One row per file: strip the read suffix to recover the sample prefix.
    # The dots are escaped so that only a literal '.fq.gz' suffix is recognised.
    m <- match(sub("_[12]\\.fq\\.gz$", "", md5.sums[,2]), prefixes)

    # Creating links to files.
    curpath <- file.path(fpath, sample, "fastq")
    chosen <- list.files(curpath, pattern="fq\\.gz$")
    write(paste0("ln -s ", file.path("..", curpath, chosen), " ", chosen),
        file=relink, append=TRUE, ncolumns=1)
    new.count.file <- paste0("counts_Liora_", batch, ".tsv")
    write(paste0("ln -s ", normalizePath(cpath), " ", new.count.file),
        file=relink, append=TRUE, ncolumns=1)

    # Setting manual fragment length mean and standard deviation values.
    if (sample=="test_20170201") {
        ave.frag <- 461
        sd.frag <- 182
    } else {
        ave.frag <- 402
        sd.frag <- 194
    }

    # Per-sample records are expanded to per-file records via 'm'.
    out <- data.frame(Sample=prefixes, Batch=batch, Addition=addition.mode, Treatment=treatment,
        Well=well.type, Counts=new.count.file, MeanFrag=ave.frag, SDFrag=sd.frag)[m,]
    out$File <- md5.sums[,2]
    out$MD5 <- md5.sums[,1]
    collected[[sample]] <- out
}

collected <- do.call(rbind, collected)

# Appends one column to the list of SDRF columns.
# Columns are added by position rather than with 'output[[header]] <- value' because
# the header "Protocol REF\tPerformer" occurs more than once, and assigning by name
# would overwrite the earlier column instead of adding a new one.
add.column <- function(columns, header, value) {
    columns[[length(columns) + 1L]] <- value
    names(columns)[length(columns)] <- header
    columns
}

# Headers or values containing a tab expand into several columns once the table is
# written out with sep="\t" and quote=FALSE.
output <- list()
output <- add.column(output, "Source Name", collected$Sample)
output <- add.column(output, "Characteristics[organism]", "Mus musculus")
output <- add.column(output, "Characteristics[cell line]", "Trophoblast stem cell")
output <- add.column(output, "Characteristics[single cell well quality]", collected$Well)
output <- add.column(output, "Material Type", "RNA")
output <- add.column(output,
    paste0(rep(c("Protocol REF", "Performer"), 5), collapse="\t"),
    paste0(c("Obtaining TSCs", "Liora Vilmovsky",
             "Culturing TSCs", "Liora Vilmovsky",
             "Reverse transcription", "Liora Vilmovsky",
             "Extracting RNA", "Liora Vilmovsky",
             "Creating libraries","Liora Vilmovsky"
             ), collapse="\t"))
output <- add.column(output, "Extract Name", collected$Sample)
output <- add.column(output, "Comment[LIBRARY_LAYOUT]", "PAIRED")
output <- add.column(output, "Comment[LIBRARY_SELECTION]", "Oligo-dT")
output <- add.column(output, "Comment[LIBRARY_SOURCE]", "TRANSCRIPTOMIC")
output <- add.column(output, "Comment[LIBRARY_STRAND]", "not applicable")
output <- add.column(output, "Comment[LIBRARY_STRATEGY]", "RNA-seq")
output <- add.column(output, "Comment[NOMINAL_LENGTH]", collected$MeanFrag)
output <- add.column(output, "Comment[NOMINAL_SDEV]", collected$SDFrag)
output <- add.column(output, "Comment[ORIENTATION]", "5'-3'-3'-5'")
output <- add.column(output, "Protocol REF\tPerformer", "Sequencing libraries\tLiora Vilmovsky")
output <- add.column(output, "Assay Name", collected$Sample)
output <- add.column(output, "Technology Type", "sequencing assay")
output <- add.column(output, "Array Data File", collected$File)
output <- add.column(output, "Protocol REF\tPerformer", "Assigning reads to genes\tAaron Lun")
output <- add.column(output, "Derived Array Data File", collected$Counts)
output <- add.column(output, "Comment[MD5]", collected$MD5)
output <- add.column(output, "Factor Value[spike-in addition]", collected$Addition)
output <- add.column(output, "Factor Value[treatment]", collected$Treatment)
output <- add.column(output, "Factor Value[block]", collected$Batch)

# Constructing the sdrf.tsv file.
# check.names=FALSE keeps the headers verbatim (spaces, brackets, repeated names);
# length-one values are recycled to one entry per file.
output[["check.names"]] <- FALSE
sdrf <- do.call(data.frame, output)
write.table(sdrf, file="sdrf_Liora.tsv", row.names=FALSE, sep="\t", quote=FALSE)