extraction_functions_v3.R 5.59 KB
# This function use regex expresion to include all multi-baglines
GetBaseBagline    <- function( meta ){
  meta <- sub( "[.].*",      "", meta )
  meta <- sub( "_ch.*",      "", meta )
  meta <- sub( "_[0-9].*",   "", meta )
  meta <- sub( ":ch[0-9].*", "", meta )
  return( meta )
}
# Remove problematic characters as " or tabs
remove_characters <- function(bagline_content){
  clean_text <- gsub("\n.", "[linebreak]", bagline_content)
  clean_text <- gsub("\"", "[linebreak]", clean_text)
  clean_text <- gsub("\'", "[linebreak]", clean_text)
  clean_text <- gsub("\t", "[linebreak]", clean_text)
  return(clean_text)
}

# This function make a list each content of multi-bagline
ResizeDF          <- function( srr, gse, gsm, gpl, PMID, 
  gsm_title, gse_title, gpl_title,
  M, baglines, outfile ){
  splitBagline <- function(x){
    full_text_bg = unlist( M[x] )
    baglineList   <- data.frame( data = full_text_bg )
    baglineList$srr <- srr
    baglineList$a <- gse
    baglineList$b <- gsm
    baglineList$c <- gpl
    baglineList$d <- PMID
    baglineList$e <- gsm_title
    baglineList$f <- gse_title
    baglineList$g <- gpl_title
    baglineList$h <- remove_characters(x)
    baglineList$h <- GetBaseBagline(baglineList$h)
    baglineList$i <- remove_characters(baglineList$data)
    #add string end
    baglineList$i <- paste(baglineList$i, "PGCGROWTHCONDITIONS", sep = " ")
    baglineList$data <- NULL
    # Saving meta gsm baglines broken down in list
    write.table(
      file      = outfile, baglineList, 
      sep       = "\t", 
      eol       = "\n", 
      append    = TRUE, 
      row.names = FALSE, 
      col.names = FALSE, 
      quote     = FALSE)
  }
  sapply( baglines, splitBagline) 
}
# This function load GEOobject once softfile has downloaded
ReadGEO           <- function( geoid, ddir, gz = TRUE ){
  GEOfile <- file.path(ddir,geoid,paste(geoid,"soft","gz",sep = "."))
  if(!gz){  
    GEOfile <- gsub(pattern = ".gz", replacement="", x=GEOfile)
  }
  if (!file.exists(GEOfile)){return(FALSE)}
  RGEO <- getGEO(filename = GEOfile)
  return(RGEO)
}
# This function
AccessMefields    <- function(subs, GEO, odir, baglinesB, meta_id){
  
  geoid <- GEO@header$geo_accession
  # PMID available
  PMID <- tryCatch( 
    GEO@header$pubmed_id, 
    error = function( e ) return( "unknwon" )  )
  
  gpl  <- tryCatch( 
    paste( GEO@header$platform_id, collapse = "-"), 
    error = function( e ) return( "unknwon" ) ) 
  
  
  gpl_title  <- tryCatch( 
    paste(sapply(GEO@gpls, FUN = function(x){paste( x@header$geo_accession, x@header$title, sep = ": ")}), collapse = ". "),
    error = function( e ) return( "unknwon" ) ) 
  
  
  print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
  # Collapse multi GPL and mult PMID
  PMID <- paste( "PMID", PMID, sep = "_", collapse = "" )
  gpl  <- paste(  gpl,  sep = "_", collapse = "" )
  
  # Download report
  print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
  print( "Extraction..." )
  
  # Sava Metafields
  for ( gsm in subs ) {
    srr <- meta_id$srr[meta_id$gsm==gsm]
    srr <- paste(  srr,  sep = "_", collapse = "" )
    
    print( gsm )
    # Accesing metadata. It should be read it as soft (access options )
    MetaDF   <- tryCatch(
      GEO@gsms[[gsm]]@header,
      error = function( e ) print( FALSE )  )
    
    #save sample title 
    gse_title <- tryCatch( 
      paste( GEO@header$title, collapse = "-"), 
      error = function( e ) return( "unknwon" ) ) 
    
    gsm_title <- tryCatch( 
      paste( GEO@gsms[[gsm]]@header$title, collapse = "-"), 
      error = function( e ) return( "unknwon" ) )
    # check available banglies  
    if(is.logical(MetaDF)){
      print(gsm)
      return( "Unavailable gsm" )
    } else{
    # output filename
      geoName  <- paste(geoid, gsm, gpl, PMID, sep='-')
      outfile  <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )
      # Show outfile
      print(paste("outfile", outfile, sep = ": ", collapse = ""))
      # Avoid append problems
      if ( file.exists( outfile ) ) { file.remove(outfile) }
      # Map baglines to download id
      baglines  <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
      baglines  <- as.vector( unlist( baglines ) )
      # filter and separate multi balines content. Resize GSM output
      ResizeDF(srr, geoid, gsm, gpl, PMID, gsm_title, gse_title, gpl_title, MetaDF, baglines, outfile)
      print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
  }}
  return(TRUE)
}  
# This function
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB, meta_id ){  
  print(paste("ID", geoid, sep = ": ", collapse = "" ))
  #ddir <- file.path( ddir, geoid, fsep = "/" )
  # output directory
  odir <- file.path( odir, geoid, fsep = "/" )
  # Create individual folder
  if ( !dir.exists( odir ) ) {
    dir.create( odir )
  }
  # load GSE object
  GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
  if(is.logical(GEO)){
    print( "Unreadable GSE softfile")
    return("Error: Unexpected end")
  }  
  # get gsms names
  gsmsList  <- names( GEO@gsms )
  if( is.logical( gsmsList ) ){ 
    print( "Unavailable gsms" )
    return("Error: Unexpected end")
    }
  print("successful load")
  
  report <- tryCatch( 
    AccessMefields(subs, GEO, odir, baglinesB, meta_id),
    error=function( e ) return( FALSE ) )

  if(!report){
    # Remove unused folder
    unlink(odir, recursive=TRUE)
    return( "extraccion failed..." )
  }else{
    return( "successful extraccion..")
  }
}