extraction 5.76 KB
library(GEOquery)

# This function use regex expresion to include all multi-baglines
GetBaseBagline <- function( meta ){
  meta <- sub( "[.].*",      "", meta )
  meta <- sub( "_ch.*",      "", meta )
  meta <- sub( "_[0-9].*",   "", meta )
  meta <- sub( ":ch[0-9].*", "", meta )
  #print(meta)
  return( meta )
}
# This function make a list each content of multi-bagline
ResizeDF       <- function( M, baglines, outfile ){
  splitBagline <- function(x){
  	baglineList      <- data.frame( data = unlist( M[x] ) )
  	baglineList$meta <- paste( x, 1:nrow( baglineList ), sep='.' )
  	# Saving meta gsm baglines broken down in list
  	write.table(
      file      = outfile, baglineList, 
      sep       = "\t", 
      eol       = "\n", 
      append    = TRUE, 
      row.names = FALSE, 
      col.names = FALSE, 
      quote     = TRUE)
  }
  sapply( baglines, splitBagline) 
}
# This function download soft file on individual folder
DownloadGEO    <- function( geoid, ddir ){
  # Work directory
  wdir <- file.path( ddir, geoid, fsep = "/" )
  # Create individual folder
  if ( !dir.exists( wdir ) ) {
    dir.create( wdir )
  }	  
  # Removing downloaded files for geo ID
  file.remove( list.files( wdir, pattern = geoid, full.names = TRUE ) )  
  # Download GSE without neither expression values nor platform info
  GEO  <- getGEO( 
    GEO       = geoid,
    destdir   = wdir,
    AnnotGPL  = FALSE,
    getGPL    = FALSE,
    GSEMatrix = FALSE)
  
  #return(GEO)
  return("successful download")
}
# This function load GEOobject once softfile has downloaded
ReadGEO        <- function( geoid, ddir ){
  GEOfile <- file.path(ddir,geoid,paste(geoid,"soft","gz",sep = "."))
  if (!file.exists(GEOfile)){return(FALSE)}
  RGEO <- getGEO(filename = GEOfile)
  return(RGEO)
}
# This function
AccessMefields <- function(subs, GEO, odir, baglinesB){  
  # PMID available
  PMID <- tryCatch( 
  	GEO@header$pubmed_id, 
  	error = function( e ) print( FALSE )  )
  gpl  <- tryCatch( 
  	paste( GEO@header$platform_id, collapse = "-"), 
  	error = function( e ) print( FALSE ) )  
  # Download report
  print(data.frame( GSE=geoid, PMID=PMID, NGSM=length(subs)))

  # Sava Metafields
  for ( gsm in subs ) {	
    # Accesing metadata. It should be read it as soft (access options )
    MetaDF   <- tryCatch(
    	GEO@gsms[[gsm]]@header,
    	error = function( e ) print( FALSE )  )
    # check available baglies  
    if(is.logical(MetaDF)){
    	print(gsm)
    	return('unavailable gsm')
    } else{
    # output filename
      geoName  <- paste(geoid, gsm, gpl, PMID, sep='-')
      outfile  <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )  
      print(outfile)
      # Avoid append problems
      if ( file.exists( outfile ) ) { file.remove(outfile) }
      # Map baglines to download id
      baglines  <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
      baglines  <- as.vector( unlist( baglines ) )
      # filter and separate multi balines content. Resize GSM output
      ResizeDF(MetaDF, baglines, outfile)
      print( data.frame(GSM=gsm, ExtractedBaglines=length(baglines)) )
  }}
}  
# This function
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB ){
  print("GSE identifier:")
  print(geoid)
  # output directory
  odir <- file.path( odir, geoid, fsep = "/" )
  # Create individual folder
  if ( !dir.exists( odir ) ) {
    dir.create( odir )
  }
  print("saving outputs:")
  print(odir)
  # load GSE object
  GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
  if(is.logical(GEO)){
    print('unreadable GSE softfile')
    return("Unexpected end")
  }  
  # get gsms names
  gsmsList	<- names( GEO@gsms )
  if( is.logical( gsmsList ) ){ 
    print( 'unavailable gsms' )
    return("Unexpected end")
    }
  AccessMefields(subs, GEO, odir, baglinesB)
}

#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################


## Input files and output directories

odir     <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/outputs"
#odir     <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/GEO/outputs"
# Download dir
ddir     <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/download"
#ddir     <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/GEO/download"
# Baglines list file
bglsfile <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/input/listMetaCampo.txt"
#bglsfile <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/GEO/input/listMetaCampo.txt"
# GSE information file
infofile <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/input/gsm-gse-count_v4.txt"
#infofile <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/extraccion/gsm-gse-count_v4.txt"


## Load main variables

# Baglines
bglslist <- read.table( bglsfile, header = F, sep = "\t" )
bglsBase <- sapply( bglslist, GetBaseBagline )
# GSE-GSM
gseInfo  <- read.table( infofile,header = T, sep = "," )
gseInfo  <- gseInfo[complete.cases(gseInfo),]
# Single platform
#geoid    <- "GSE55199"
# Multuplatform
#geoid    <- "GSE74932"
# Super serie
geoid    <- "GSE65642"
# Test
geoid    <- "GSE55365"


## Single Extraction

# Download
GEO      <- tryCatch( DownloadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
# Show all available GSMs
subs     <- as.vector(gseInfo$gsm[which(geoid == gseInfo$gse)])
## Filter GSMs
subs <- subs[1]

ExtractMetafields( geoid, subs, ddir, odir, bglsBase)


## Download GSE-list
sink("download_report", append = FALSE, split = FALSE)
for (geoid in unique(gseInfo$gse)) {
	print(geoid)
	#report <- tryCatch( 
	#	DownloadGEO( geoid, ddir ), 
	#	error = function( e ) print( "download failed" ) )
	#print(report) 
}