# extraction_functions_v2.R 4.8 KB
# Download the GEO record `geoid` into its own sub-folder of `ddir`.
#
# Arguments:
#   geoid  GEO accession; used as folder name and as the file-name pattern
#          of the previously downloaded files that get deleted.
#   ddir   Base download directory; files are stored in <ddir>/<geoid>.
#
# Returns the string "successful download".
# NOTE(review): getGEO is not defined in this file (presumably GEOquery);
# an error raised by it propagates to the caller.
DownloadGEO    <- function( geoid, ddir ){
  # Work directory
  wdir <- file.path( ddir, geoid, fsep = "/" )
  # Create individual folder; recursive so that a missing `ddir` is created
  # as well instead of leaving the download without a destination
  if ( !dir.exists( wdir ) ) {
    dir.create( wdir, recursive = TRUE )
  }
  print("Saving...")
  print(ddir)
  # Removing downloaded files for geo ID (`geoid` is used as a regex pattern)
  stale <- list.files( wdir, pattern = geoid, full.names = TRUE )
  file.remove( stale )
  # Download GSE with neither expression values nor platform info.
  # The returned object is not needed here, only the files written to `wdir`.
  getGEO(
    GEO       = geoid,
    destdir   = wdir,
    AnnotGPL  = FALSE,
    getGPL    = FALSE,
    GSEMatrix = FALSE)
  return("successful download")
}

# Reduce a multi-bagline field name to its base name.
# The suffix patterns are applied one after another; each one deletes
# everything from its first match to the end of the string:
#   "."  ->  "_ch"  ->  "_<digit>"  ->  ":ch<digit>"
GetBaseBagline    <- function( meta ){
  suffix_patterns <- c( "[.].*", "_ch.*", "_[0-9].*", ":ch[0-9].*" )
  for ( suffix in suffix_patterns ) {
    meta <- sub( suffix, "", meta )
  }
  meta
}
# Replace problematic characters in bagline content: every line break,
# double quote and single quote becomes the literal token "linebreak".
# NOTE(review): quotes are mapped to "linebreak" too, exactly as the
# replacement strings in this function always specified - confirm that this
# is the wanted token. Tabs are not touched here.
remove_characters <- function(bagline_content){
  # A single pass is enough: the replacement contains none of the matched
  # characters, so the result equals three consecutive substitutions
  clean_text <- gsub("[\n\"']", "linebreak", bagline_content)
  return(clean_text)
}
# Write the values of each requested bagline to `outfile`, one row per value.
#
# Arguments:
#   M         List-like metadata holder, indexed by bagline name (M[x]).
#   baglines  Character vector with the bagline names to export.
#   outfile   Path of the tab-separated output file; rows are appended.
#
# Every row has two quoted columns and no header: the cleaned value and a
# "<bagline>.<index>" label numbering the values of that bagline.
# Returns the result of sapply() over `baglines` (no meaningful content).
ResizeDF          <- function( M, baglines, outfile ){
  splitBagline <- function(x){
    values <- unlist( M[x], use.names = FALSE )
    # An empty bagline has no rows to write; building the label column for
    # it would fail, so skip it
    if ( length( values ) == 0 ) {
      return( invisible( NULL ) )
    }
    baglineList <- data.frame(
      # filter line breaks and quote characters
      data             = remove_characters( values ),
      meta             = paste( x, seq_along( values ), sep = "." ),
      stringsAsFactors = FALSE )
    # Saving meta gsm baglines broken down in list
    write.table(
      x         = baglineList,
      file      = outfile,
      sep       = "\t",
      eol       = "\n",
      append    = TRUE,
      row.names = FALSE,
      col.names = FALSE,
      quote     = TRUE)
  }
  sapply( baglines, splitBagline)
}
# Load the SOFT file of `geoid` once it has been downloaded.
#
# Arguments:
#   geoid  GEO accession; the file is looked up in <ddir>/<geoid>.
#   ddir   Base download directory.
#   gz     TRUE  -> read <geoid>.soft.gz (default)
#          FALSE -> read the uncompressed <geoid>.soft
#
# Returns FALSE when the file does not exist, otherwise whatever
# getGEO( filename = ... ) returns.
# NOTE(review): getGEO is not defined in this file (presumably GEOquery).
ReadGEO           <- function( geoid, ddir, gz = TRUE ){
  # Build the file name directly instead of stripping ".gz" afterwards:
  # as a regex, ".gz" matched ANY character followed by "gz" anywhere in the
  # path, so directories such as "bigzdata" were silently mangled
  softName <- paste0( geoid, ".soft", if ( gz ) ".gz" else "" )
  GEOfile  <- file.path( ddir, geoid, softName )
  if ( !file.exists( GEOfile ) ) {
    return( FALSE )
  }
  RGEO <- getGEO( filename = GEOfile )
  return( RGEO )
}
# Export the selected metadata fields (baglines) of every requested GSM to
# its own .tsv file named <geoid>-<gsm>-<gpl>-PMID:<pmid>.tsv inside `odir`.
#
# Arguments:
#   subs       Character vector with the GSM ids to export.
#   GEO        Object read through @header (pubmed_id, platform_id) and
#              @gsms (one entry per GSM, each with its own @header).
#   odir       Directory where the .tsv files are written.
#   baglinesB  Patterns matched with grep() against the field names of each
#              GSM header; every matching field is exported.
#   geoid      Accession used as prefix of the output file names. Defaults
#              to the last component of `odir`. This used to be read from a
#              variable that is neither an argument nor a local, so the
#              function failed unless a global `geoid` happened to exist.
#
# Returns TRUE when all GSMs were processed, or the string "Unavailable gsm"
# as soon as the header of one GSM cannot be read (remaining GSMs are
# skipped). An existing output file of a GSM is deleted before writing.
AccessMefields    <- function(subs, GEO, odir, baglinesB, geoid = basename(odir)){
  # PMID available
  PMID <- tryCatch(
    GEO@header$pubmed_id,
    error = function( e ) return( "unknwon" )  )
  gpl  <- tryCatch(
    paste( GEO@header$platform_id, collapse = "-"),
    error = function( e ) return( "unknwon" ) )
  print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
  # Collapse multi GPL and multi PMID
  PMID <- paste( "PMID", PMID, sep = ":", collapse = "" )
  gpl  <- paste(  gpl,  sep = ":", collapse = "" )
  # Download report
  print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
  print( "Extraction..." )
  # Save metafields
  for ( gsm in subs ) {
    print( gsm )
    # Accessing metadata. The handler yields FALSE, which doubles as the
    # "header not readable" marker tested right below
    MetaDF   <- tryCatch(
      GEO@gsms[[gsm]]@header,
      error = function( e ) print( FALSE )  )
    # check available baglines
    if ( is.logical( MetaDF ) ) {
      print( gsm )
      return( "Unavailable gsm" )
    }
    # output filename
    geoName  <- paste( geoid, gsm, gpl, PMID, sep = "-" )
    outfile  <- file.path( odir, paste0( geoName, ".tsv" ) )
    # Show outfile
    print( paste( "outfile", outfile, sep = ": ", collapse = "" ) )
    # Avoid append problems: rows are appended by ResizeDF, so start clean
    if ( file.exists( outfile ) ) {
      file.remove( outfile )
    }
    # Map baglines to download id
    fieldNames <- names( MetaDF )
    baglines   <- lapply( baglinesB, function(x){ grep( x, fieldNames, value = TRUE ) } )
    baglines   <- as.vector( unlist( baglines, use.names = FALSE ) )
    # filter and separate multi bagline content. Resize GSM output
    ResizeDF( MetaDF, baglines, outfile )
    print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
  }
  return(TRUE)
}
# Read the downloaded record of `geoid` and export the selected metadata
# fields of the requested GSMs into <odir>/<geoid>.
#
# Arguments:
#   geoid      GEO accession to process.
#   subs       GSM ids to export (forwarded to AccessMefields).
#   ddir       Base download directory (forwarded to ReadGEO).
#   odir       Base output directory; a <geoid> sub-folder is created in it.
#   baglinesB  Field-name patterns (forwarded to AccessMefields).
#
# Returns one of the strings
#   "Error: Unexpected end"    record unreadable or without GSMs
#   "extraccion failed..."     export failed; the output folder is removed
#   "successful extraccion.."  everything was exported
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB ){
  print(paste("ID", geoid, sep = ": ", collapse = "" ))
  #ddir <- file.path( ddir, geoid, fsep = "/" )
  # output directory
  odir <- file.path( odir, geoid, fsep = "/" )
  # Create individual folder; recursive so a missing base `odir` is created too
  if ( !dir.exists( odir ) ) {
    dir.create( odir, recursive = TRUE )
  }
  # load GSE object; ReadGEO returns FALSE for a missing file and the
  # handler maps read errors to FALSE as well
  GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
  if ( is.logical( GEO ) ) {
    print( "Unreadable GSE softfile")
    return("Error: Unexpected end")
  }
  # get gsms names. names() yields NULL or a character vector, never a
  # logical, so test for "no names" instead of is.logical()
  gsmsList  <- names( GEO@gsms )
  if ( length( gsmsList ) == 0 ) {
    print( "Unavailable gsms" )
    return("Error: Unexpected end")
  }
  print("successful load")

  report <- tryCatch(
    AccessMefields(subs, GEO, odir, baglinesB),
    error = function( e ) {
      # Show why the export failed instead of discarding the message
      print( conditionMessage( e ) )
      FALSE
    } )

  # AccessMefields returns TRUE on success but a character string when a GSM
  # is unavailable; `!report` raised "invalid argument type" on that string,
  # so anything other than TRUE is treated as a failure
  if ( !isTRUE( report ) ) {
    # Remove unused folder
    unlink(odir, recursive=TRUE)
    return( "extraccion failed..." )
  }
  return( "successful extraccion..")
}