######################################################################################## # Print out important bits from the metadata. collected <- list() for (sample in c("2383", "2384", "2677", "2678", "2739", "2740", "2780", "2781")){ all.files <- read.table(file.path("..", "data", sprintf("real-%s_20161024", sample), "genic_counts.tsv"), nrows=1, stringsAsFactor=FALSE) prefixes <- as.character(all.files[-c(1:2)]) snames <- sub(".*_(3600STDY[0-9]+)_.*", "\\1", prefixes) if (sample %in% c("2383", "2384")) { batch <- 1 } else if (sample %in% c("2677", "2678", "2739", "2740")) { batch <- 2 } else { batch <- 3 } if (sample %in% c("2383", "2677", "2678")) { phenotype <- "naive" } else { phenotype <- "primed" } prelim <- data.frame(Sample=snames, Run=sample, Batch=batch, Phenotype=phenotype, stringsAsFactors=FALSE) collected[[sample]] <- rbind(data.frame(prelim, File=paste0(prefixes, "_R1.fastq.gz")), data.frame(prelim, File=paste0(prefixes, "_R2.fastq.gz"))) } collected <- do.call(rbind, collected) table(table(collected$Sample)) # should all be 4; two technical replicates for each paired-end library. ######################################################################################## # Getting all the file names and the MD5 sums. fpath <- "../data" all.md5 <- list() for (x in unique(collected$Run)) { curpath <- file.path(fpath, paste0("real-", x, "_20161024"), "fastq") curmd5 <- read.table(file.path(curpath, "md5.all"), stringsAsFactors=FALSE) all.md5[[x]] <- curmd5 } all.md5 <- do.call(rbind, all.md5) m <- match(collected$File, all.md5[,2]) any(is.na(m)) any(duplicated(m)) collected$MD5 <- all.md5[m,1] ######################################################################################## # Constructing the sdrf.tsv file. output <- list() output[["Source Name"]] <- collected$Sample output[["Characteristics[organism]"]] <- "Homo sapiens" output[["Characteristics[cell line]"]] <- "H9" output[["Characteristics[phenotype]"]] <- collected$Phenotype output[["Material Type"]] <- "cell" output[["Comment[spike in]"]] <- "ERCC" output[["Comment[spike in dilution]"]] <- "1:25,000,000" output[["Comment[spike sequence file]"]] <- "spikes.fa" output[["Comment[spike concentration file]"]] <- "spikes.txt" output[["Comment[experiment batch]"]] <- collected$Batch output[["Comment[sequencing run]"]] <- collected$Run output[[paste0(rep(c("Protocol REF", "Performer"), 5), collapse="\t")]] <- paste0(c("Obtaining H9 cells", "Ferdinand von Meyenn", "Culturing H9 cells", "Ferdinand von Meyenn", "Reverse transcription", "Ferdinand von Meyenn", "Extracting RNA", "Ferdinand von Meyenn", "Creating libraries","Ferdinand von Meyenn" ), collapse="\t") output[["Extract Name"]] <- collected$Sample output[["Comment[LIBRARY_LAYOUT]"]] <- "PAIRED" output[["Comment[LIBRARY_SELECTION]"]] <- "Oligo-dT" output[["Comment[LIBRARY_SOURCE]"]] <- "TRANSCRIPTOMIC SINGLE CELL" output[["Comment[LIBRARY_STRAND]"]] <- "not applicable" output[["Comment[LIBRARY_STRATEGY]"]] <- "RNA-seq" output[["Comment[NOMINAL_LENGTH]"]] <- 295 output[["Comment[NOMINAL_SDEV]"]] <- 25 output[["Comment[ORIENTATION]"]] <- "5'-3'-3'-5'" output[["Protocol REF\tPerformer"]] <- "Sequencing libraries\tFerdinand von Meyenn" output[["Assay Name"]] <- collected$Sample output[["Technology Type"]] <- "sequencing assay" output[["Array Data File"]] <- collected$File output[["Protocol REF"]] <- "Assigning reads to genes" output[["Derived Array Data File"]] <- paste0("genic_counts_", collected$Run, ".tsv") output[["Comment[MD5]"]] <- collected$MD5 output[["Factor Value[phenotype]"]] <- collected$Phenotype output$check.names <- FALSE sdrf <- do.call(data.frame, output) write.table(file="sdrf.tsv", sdrf, row.names=FALSE, sep="\t", quote=FALSE)