# ExtractProtocol_v3.R (4.06 KB)
#!/usr/bin/env Rscript
library("optparse")
library("GEOquery")
source("/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/bin/extraction_functions_v3.R")
#-------------------------------------------------------------------------------
# Objective
# Extract the text content of banglines from previously downloaded GSEnnn.soft.gz files
#
# Input parameters
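# -d, --downloadPath   download dir with GSE/GSEnnn.soft.gz files
# -o, --outputPath     directory to output files
# -i, --infoFile       GSE information file (gse, gsm columns)
# -b, --banglines      banglines file
# -r, --report         extraction report file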
#
# Parameters examples
# -d ../download/srr_galagan/ 
# -o ../output/srr_galagan 
# -i ../input/normalized_srr-gsm_des_v4.tsv 
# -b ../input/listMetaCampo.txt 
# -r ../reports/srr_galagan/extract_report.txt
#
# Example
# nohup Rscript ExtractProtocol_v3.R -d ../download/srr_htregulondb/ -o ../outputs/srr_htregulondb -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt -b ../input/srr_htregulondb/banglines.txt -r ../reports/srr_htregulondb/extract_report.txt > extract_nohup.out &
# Rscript tmp/bin/ExtractProtocol_v3.R -d tmp/download/ -o tmp/output/ext/ -i tmp/input/all_srr_geo_rnaseq_test.txt -b tmp/input/listMetaCampo.txt -r tmp/reports/extract_report.txt
# nohup Rscript ExtractProtocol_v3.R -d ../download/srr_htregulondb/ -o ../outputs/srr_htregulondb -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt -b ../input/srr_htregulondb/banglines.txt -r ../reports/srr_htregulondb/extract_report.txt > extract_nohup.out &
#
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################

# Command-line options. Every option is a character string that defaults to
# NULL, so an option the user did not supply can be detected after parsing.
option_list <- list(
  make_option(c("-d", "--downloadPath"), type = "character", default = NULL,
    help = "download directory", metavar = "character"),
  make_option(c("-o", "--outputPath"), type = "character", default = NULL,
    help = "directory to output files", metavar = "character"),
  make_option(c("-i", "--infoFile"), type = "character", default = NULL,
    help = "GSE id information file", metavar = "character"),
  make_option(c("-b", "--banglines"), type = "character", default = NULL,
    help = "banglines file", metavar = "character"),
  make_option(c("-r", "--report"), type = "character", default = NULL,
    help = "extraccion report file", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# parse_args() always returns at least the `help` entry, so testing
# length(opt) can never detect missing arguments. Instead, check each option
# that the rest of the script reads unconditionally. `--report` is left
# optional: without it the report is simply not diverted to a file.
required_opts <- c("downloadPath", "outputPath", "infoFile", "banglines")
is_missing <- vapply(
  required_opts,
  function(nm) is.null(opt[[nm]]),
  logical(1)
)
if (any(is_missing)) {
  print_help(opt_parser)
  stop(
    "Missing required argument(s): ",
    paste0("--", required_opts[is_missing], collapse = ", "),
    "\n",
    call. = FALSE
  )
}

#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################

## Input files and output directories
odir <- opt$outputPath

# Download dir
ddir <- opt$downloadPath

# Banglines list file
bglsfile <- opt$banglines

# GSE information file
infoFile <- opt$infoFile

# Report file (optional; when absent the report goes to standard output)
report_file <- opt$report

## Load main variables

# Banglines
bglslist <- readr::read_tsv(bglsfile)
# GetBaseBagline is not defined in this file (presumably it comes from the
# sourced extraction_functions_v3.R). It is applied to each column of the
# table returned by read_tsv().
# NOTE(review): sapply() is kept because the helper's return type is not
# visible here; switch to vapply() once that is confirmed.
bglsBase <- sapply(bglslist, GetBaseBagline)

# GSE-GSM table: keep only rows whose gse column contains "GSE", whose gsm
# column contains "GSM", and that have no missing values.
gseInfo <- read.table(infoFile, header = TRUE, sep = "\t")
gseInfo <- gseInfo[grepl("GSE", gseInfo$gse), ]
gseInfo <- gseInfo[grepl("GSM", gseInfo$gsm), ]
gseInfo <- gseInfo[complete.cases(gseInfo), ]

gse_ids <- unique(gseInfo$gse)
ngse <- length(gse_ids)
ngsm <- length(unique(gseInfo$gsm))

message("Required GSE: ", ngse)
message("Required GSM: ", ngsm)

## Extraction

# Divert printed output to the report file only when one was requested;
# sink(NULL) would try to remove a diversion that does not exist.
if (!is.null(report_file)) {
  sink(report_file, append = FALSE, split = FALSE)
}

# The finally clause guarantees the diversion is closed (and the report file
# flushed) even if something outside the per-GSE handler fails.
tryCatch(
  {
    cat("total gse id: ", ngse, "\n")
    cat("total gsm id: ", ngsm, "\n")

    for (geoid in gse_ids) {
      print(geoid)
      ## Filter GSMs belonging to the current GSE
      subs <- as.vector(gseInfo$gsm[which(geoid == gseInfo$gse)])
      # ExtractMetafields is not defined in this file (presumably sourced).
      # A failure for one GSE is recorded in the report and must not stop
      # the remaining ones; the underlying error is echoed to stderr so it
      # is not lost.
      report <- tryCatch(
        ExtractMetafields(geoid, subs, ddir, odir, bglsBase, gseInfo),
        error = function(e) {
          message("Extraction failed for ", geoid, ": ", conditionMessage(e))
          "extraccion failed"
        }
      )
      print(report)
    }

    # Counts every file found recursively under the output directory.
    cat(
      "extracted gsm id: ",
      length(list.files(odir, recursive = TRUE)),
      "\n"
    )
  },
  finally = {
    if (!is.null(report_file)) {
      sink()
    }
  }
)