ExtractProtocol_v1.R 5.74 KB
library(GEOquery)

# This function use regex expresion to include all multi-baglines
GetBaseBagline    <- function( meta ){
  meta <- sub( "[.].*",      "", meta )
  meta <- sub( "_ch.*",      "", meta )
  meta <- sub( "_[0-9].*",   "", meta )
  meta <- sub( ":ch[0-9].*", "", meta )
  return( meta )
}
# This function make a list each content of multi-bagline
ResizeDF          <- function( M, baglines, outfile ){
  splitBagline <- function(x){
    baglineList      <- data.frame( data = unlist( M[x] ) )
    baglineList$meta <- paste( x, 1:nrow( baglineList ), sep='.' )
    # Saving meta gsm baglines broken down in list
    write.table(
      file      = outfile, baglineList, 
      sep       = "\t", 
      eol       = "\n", 
      append    = TRUE, 
      row.names = FALSE, 
      col.names = FALSE, 
      quote     = TRUE)
  }
  sapply( baglines, splitBagline) 
}
# This function load GEOobject once softfile has downloaded
ReadGEO           <- function( geoid, ddir ){
  GEOfile <- file.path(ddir,geoid,paste(geoid,"soft","gz",sep = "."))
  if (!file.exists(GEOfile)){return(FALSE)}
  RGEO <- getGEO(filename = GEOfile)
  return(RGEO)
}
# This function
AccessMefields    <- function(subs, GEO, odir, baglinesB){  
  # PMID available
  PMID <- tryCatch( 
  	GEO@header$pubmed_id, 
  	error = function( e ) return( "unknwon" )  )
  gpl  <- tryCatch( 
  	paste( GEO@header$platform_id, collapse = "-"), 
  	error = function( e ) return( "unknwon" ) )  
  print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
  # Collapse multi GPL and mult PMID
  PMID <- paste( "PMID", PMID, sep = ":", collapse = "" )
  gpl  <- paste(  gpl,  sep = ":", collapse = "" )
  # Download report
  print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
  print( "Extraction..." )
  # Sava Metafields
  for ( gsm in subs ) {	
    print( gsm )
    # Accesing metadata. It should be read it as soft (access options )
    MetaDF   <- tryCatch(
    	GEO@gsms[[gsm]]@header,
    	error = function( e ) print( FALSE )  )
    # check available baglies  
    if(is.logical(MetaDF)){
    	print(gsm)
    	return( "Unavailable gsm" )
    } else{
    # output filename
      geoName  <- paste(geoid, gsm, gpl, PMID, sep='-')
      outfile  <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )
      # Show outfile
      print(paste("outfile", outfile, sep = ": ", collapse = ""))
      # Avoid append problems
      if ( file.exists( outfile ) ) { file.remove(outfile) }
      # Map baglines to download id
      baglines  <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
      baglines  <- as.vector( unlist( baglines ) )
      # filter and separate multi balines content. Resize GSM output
      ResizeDF(MetaDF, baglines, outfile)      
      print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
  }}
  return(TRUE)
}  
# This function
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB ){  
  print(paste("ID", geoid, sep = ": ", collapse = "" ))
  #ddir <- file.path( ddir, geoid, fsep = "/" )
  # output directory
  odir <- file.path( odir, geoid, fsep = "/" )
  # Create individual folder
  if ( !dir.exists( odir ) ) {
    dir.create( odir )
  }
  # load GSE object
  GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
  if(is.logical(GEO)){
    print( "Unreadable GSE softfile")
    return("Error: Unexpected end")
  }  
  # get gsms names
  gsmsList	<- names( GEO@gsms )
  if( is.logical( gsmsList ) ){ 
    print( "Unavailable gsms" )
    return("Error: Unexpected end")
    }
  print("successful load")
  
  report <- tryCatch( 
    AccessMefields(subs, GEO, odir, baglinesB),
    error=function( e ) return( FALSE ) )

  if(!report){
    # Remove unused folder
    unlink(odir, recursive=TRUE)
    return( "extraccion failed..." )
  }else{
    return( "successful extraccion..")
  }
}

#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################


## Input files and output directories

odir     <- "/export/storage/users/egaytan/GEO-MetaData/outputs"
#odir     <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/outputs"
#odir     <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/GEO/outputs"
# Download dir
ddir     <- "/export/storage/users/egaytan/GEO-MetaData/download"
#ddir     <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/download"
#ddir     <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/GEO/download"
# Baglines list file
bglsfile <- "/export/storage/users/egaytan/GEO-MetaData/input/listMetaCampo.txt"
#bglsfile <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/input/listMetaCampo.txt"
#bglsfile <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/GEO/input/listMetaCampo.txt"
# GSE information file
infofile <- "/export/storage/users/egaytan/GEO-MetaData/input/gsm-gse-count_v4.txt"
#infofile <- "/home/egaytan/automatic-extraction-growth-conditions/Extraction/input/gsm-gse-count_v4.txt"
#infofile <- "/home/egaytan/Documentos/EGAYTAN-GROWTH-CONDITIONS/extraccion/gsm-gse-count_v4.txt"


## Load main variables

# Baglines
bglslist <- read.table( bglsfile, header = F, sep = "\t" )
bglsBase <- sapply( bglslist, GetBaseBagline )
# GSE-GSM
gseInfo  <- read.table( infofile,header = T, sep = "," )
gseInfo  <- gseInfo[complete.cases(gseInfo),]


## Extraction

sink("extract_report.txt", append = FALSE, split = FALSE)
for (geoid in unique(gseInfo$gse)) {
  ## Filter GSMs  
  subs   <- as.vector(gseInfo$gsm[which(geoid == gseInfo$gse)])
  report <- tryCatch(
    ExtractMetafields( geoid, subs, ddir, odir, bglsBase),
    error = function( e ) return( "extraccion failed" ) )
  print(report)
}