ExtractProtocol_v2.R 3.16 KB
#!/usr/bin/env Rscript
library("optparse")
library("GEOquery")
source("extraction_functions.R")
#-------------------------------------------------------------------------------
# Objective
# Extract banglines text-content from previous GSEnnn.soft.gz files
#
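# Input parameters
# -d, --downloadPath   download dir with GSE/GSEnnn.soft.gz files
# -o, --outputPath     directory to output files
# -i, --infoFile       GSE information file (gse, gsm columns)
# -b, --banglines      banglines file
# -r, --report         extraction report file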
#
# Parameters examples
# -d ../download/srr_galagan/ 
# -o ../output/srr_galagan 
# -i ../input/normalized_srr-gsm_des_v4.tsv 
# -b ../input/listMetaCampo.txt 
# -r ../reports/srr_galagan/extract_report.txt
#
# Example
# nohup Rscript ExtractProtocol_v2.R -d ../download/srr_galagan/ -o ../outputs/srr_galagan -i ../input/normalized_srr-gsm_des_v4.tsv -b ../input/listMetaCampo.txt -r ../reports/srr_galagan/extract_report.txt > extract_nohup.out &
#
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################

# Command-line options accepted by the script. Every option takes a character
# value and defaults to NULL, so an option that was not supplied can be
# detected with is.null().
option_list <- list(
  make_option(c("-d", "--downloadPath"), type = "character", default = NULL,
    help = "download directory", metavar = "character"),
  make_option(c("-o", "--outputPath"), type = "character", default = NULL,
    help = "directory to output files", metavar = "character"),
  make_option(c("-i", "--infoFile"), type = "character", default = NULL,
    help = "GSE id information file", metavar = "character"),
  make_option(c("-b", "--banglines"), type = "character", default = NULL,
    help = "banglines file", metavar = "character"),
  make_option(c("-r", "--report"), type = "character", default = NULL,
    help = "extraccion report file", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# parse_args() always returns at least the built-in `help` entry, so the
# length of `opt` can never reveal a missing argument. Check explicitly the
# options the rest of the script reads as paths.
required_opts <- c("downloadPath", "outputPath", "infoFile", "banglines")
missing_opts <- required_opts[
  vapply(required_opts, function(name) is.null(opt[[name]]), logical(1))
]

if (length(missing_opts) > 0) {
  print_help(opt_parser)
  stop(
    "Missing required argument(s): ",
    paste0("--", missing_opts, collapse = ", "), "\n",
    call. = FALSE
  )
}

#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################

## Input files and output directories
odir     <- opt$outputPath

# Download dir
ddir     <- opt$downloadPath

# Banglines list file
bglsfile <- opt$banglines

# GSE information file
infoFile <- opt$infoFile

## Load main variables

# Banglines
# NOTE(review): read_tsv() returns a data frame, so sapply() visits it column
# by column, and by default the first line of the file is taken as a header.
# Confirm this matches the layout of the banglines file.
bglslist <- readr::read_tsv(bglsfile)
bglsBase <- sapply(bglslist, GetBaseBagline)

# GSE-GSM table; rows containing any missing value are dropped.
gseInfo  <- read.table(infoFile, header = TRUE, sep = "\t")
gseInfo  <- gseInfo[complete.cases(gseInfo), ]


## Extraction

# Divert console output to the report file only when one was requested;
# without -r the report goes to standard output.
report_open <- !is.null(opt$report)
if (report_open) {
  sink(opt$report, append = FALSE, split = FALSE)
}

tryCatch({
  cat("total gse id: ", length(unique(gseInfo$gse)), "\n")
  cat("total gsm id: ", length(unique(gseInfo$gsm)), "\n")

  for (geoid in unique(gseInfo$gse)) {
    print(geoid)
    ## Filter GSMs belonging to the current GSE
    subs   <- as.vector(gseInfo$gsm[which(geoid == gseInfo$gse)])
    # A failure on one GSE must not abort the whole run; record the reason in
    # the report so the failure can be diagnosed afterwards.
    report <- tryCatch(
      ExtractMetafields(geoid, subs, ddir, odir, bglsBase),
      error = function(e) paste0("extraccion failed: ", conditionMessage(e))
    )
    print(report)
  }

  # Counts every file found under the output directory, recursively.
  cat("extracted gsm id: ",
      length(list.files(opt$outputPath, recursive = TRUE)), "\n")
}, finally = {
  # Restore normal console output even if the run stops with an error.
  if (report_open && sink.number() > 0) {
    sink()
  }
})