Estefani Gaytan Nunez

upload


echo
echo
echo
echo "===================================Extraction============================================ "
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/all_srr/
echo "Access to output extracted baglines"
echo "directory: "$(pwd);
#all output-extraction files
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all_srr/all-output-index.txt"
#input sentences to run CoreNLP
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt"
#GSE index by bg_sentence row
mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_all_srr.txt"
#Number of fields by bagline
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all_srr/bg_report.txt"
echo
echo
echo
echo "==============================Baglines index files======================================= "
# index the absolute path of every extracted file, one path per line
for gse in $(ls -1)
do
cd $gse; ls -d $PWD/*; cd ..;
done > $index
echo "Number if extracted files"
wc -l $index
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
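# A reading of the one-liner below (hedged; the exact field positions assume the all_srr output layout above):
#   - grep -E ".*" over every indexed file prefixes each bagline with its file path
#   - sed 's/\//\t/7' splits that path prefix from the GSE/filename part
#   - the cut/sed chain splits the GSE-GSM-GPL-PMID filename at its hyphens, detaches the .tsv: suffix, and drops quotes and \null\ placeholders
#   - sort | uniq deduplicates; the awk/grep -vw pair tags field 4 to discard header-like rows (V1, WT), and the final sed collapses the GSEnnn/GSEnnn prefix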
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq | awk 'BEGIN{FS="\t"; OFS="\t"}{ print $4"PGC",$0 }' | grep -vw "^V1PGC" | grep -vw "^WTPGC"| cut -f2- | sed 's/GSE[0-9]*\/GSE/GSE/1')
echo "$cext" > "/home/egaytan/ot"
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
wc $output
echo "$cext" cut -f1-3,5 > $mapping
wc $mapping
echo
echo
echo "Number of total baglines: "$(wc -l $output );
echo
echo "Baglines report"
for gsef in $( cat $index)
do
cat $gsef | sort | uniq ;
done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report
cat $report
echo
echo
echo "Saving file: "$output;
@@ -34,10 +34,13 @@ echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq)
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq | awk 'BEGIN{FS="\t"; OFS="\t"}{ print $4"PGC",$0 }' | grep -vw "^V1PGC" | grep -vw "^WTPGC"| cut -f2- | sed 's/GSE[0-9]*\/GSE/GSE/1')
echo "$cext" > "/home/egaytan/ot"
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
wc $output
echo "$cext" | cut -f1-3,5 > $mapping
echo "$cext" cut -f1-3,5 > $mapping
wc $mapping
echo
echo
@@ -55,3 +58,4 @@ cat $report
echo
echo
echo "Saving file: "$output;
......
echo
echo
echo "==============================Run CoreNLP======================================= ";
echo
echo
input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt"
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
echo
echo "input file: " $input;
echo
echo "output directory: " $output;
echo
echo "regex file: " $regexfile;
echo
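# corenlp.sh is the Stanford CoreNLP wrapper script (it expands to the java command
# shown in the log further below); regexner tags tokens from the dictionary in
# $regexfile, and the CoNLL output format keeps one token per line for downstream use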
corenlp.sh -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file $input -outputDirectory $output -regexner.mapping $regexfile -outputExtension .ner;
echo
echo
echo "==============================Run CoreNLP======================================= ";
echo
echo
input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/srr_htregulondb/bg_sentences.txt"
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/srr_htregulondb/"
regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
echo
echo "input file: " $input;
echo
echo "output directory: " $output;
echo
echo "regex file: " $regexfile;
echo
corenlp.sh -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file $input -outputDirectory $output -regexner.mapping $regexfile -outputExtension .ner;
===================================Extraction============================================
Access to output extracted baglines
directory: /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/all_srr
==============================Baglines index files=======================================
Number of extracted files
1208 /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all_srr/all-output-index.txt
==============================Baglines extraction=======================================
Add sentence-end-tag PGCGROWTHCONDITIONS
19510 396548 2913905 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
19510 455080 3805057 /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_all_srr.txt
Number of total baglines: 19510 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
Baglines report
3478 characteristics
6798 data_processing
2445 extract_protocol
1165 growth_protocol
1208 library_strategy
1208 organism
1208 source_name
1208 title
798 treatment_protocol
Saving file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
@@ -17,11 +17,11 @@ Number of extracted files
Add sentence-end-tag PGCGROWTHCONDITIONS
18006 380932 2801258 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt
18006 72024 1340105 /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_srr_galagan.txt
17999 35998 703982 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt
17999 416918 3448391 /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_srr_galagan.txt
Number of total baglines: 18006 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt
Number of total baglines: 17999 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt
Baglines report
3254 characteristics
......
==============================Run CoreNLP=======================================
input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt
java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner
@@ -20,15 +20,14 @@ java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.
[main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns.
Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_srr_galagan.txt.ner
Untokenizable:  (U+7, decimal: 7)
Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... done [46.9 sec].
Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... done [60.8 sec].
Annotation pipeline timing information:
TokenizerAnnotator: 0.7 sec.
WordsToSentencesAnnotator: 0.4 sec.
POSTaggerAnnotator: 43.0 sec.
MorphaAnnotator: 1.2 sec.
TokensRegexNERAnnotator: 1.6 sec.
TOTAL: 46.8 sec. for 479434 tokens at 10233.6 tokens/sec.
TokenizerAnnotator: 0.3 sec.
WordsToSentencesAnnotator: 0.1 sec.
POSTaggerAnnotator: 57.9 sec.
MorphaAnnotator: 0.5 sec.
TokensRegexNERAnnotator: 2.0 sec.
TOTAL: 60.8 sec. for 98059 tokens at 1612.7 tokens/sec.
Pipeline setup: 0.8 sec.
Total time for StanfordCoreNLP pipeline: 48.1 sec.
Total time for StanfordCoreNLP pipeline: 61.9 sec.
......
@@ -18,8 +18,9 @@ source("extraction_functions.R")
# -r ../reports/srr_galagan/extract_report.txt
#
# Examples
# Rscript ExtractProtocol_v2.R -d ../download/srr_galagan/ -o ../outputs/srr_galagan -i ../input/normalized_srr-gsm_des_v4.tsv -b ../input/listMetaCampo.txt -r ../reports/srr_galagan/extract_report.txt
#
# nohup Rscript DownloadProtocol_v2.R -d ../download/srr_galagan/ -i ../input/normalized_srr-gsm_des_v4.tsv -r ../reports/srr_galagan/download_report.txt > download_nohup.out
# nohup Rscript DownloadProtocol_v2.R -d ../download/all_srr/ -i ../input/all_srr_geo_rnaseq.txt -r ../reports/all_srr/download_report.txt > download_nohup.out
# nohup Rscript DownloadProtocol_v2.R -d ../download/srr_htregulondb/ -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt -r ../reports/srr_htregulondb/download_report.txt > download_nohup.out &
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################
......
#!/usr/bin/env Rscript
library("optparse")
library("GEOquery")
source("/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/bin/download_functions_v3.R")
#-------------------------------------------------------------------------------
# Objective
# download GSEnnn.soft.gz files for a list of GSE ids
#
# Input parameters
# -d download dir
# -i GSE information file
#
# Parameters example
# -d ../download/srr_galagan/
# -i ../input/normalized_srr-gsm_des_v4.tsv
# -r ../reports/srr_galagan/download_report.txt
#
# Examples
# nohup Rscript DownloadProtocol_v2.R -d ../download/srr_galagan/ -i ../input/normalized_srr-gsm_des_v4.tsv -r ../reports/srr_galagan/download_report.txt > download_nohup.out
# nohup Rscript DownloadProtocol_v2.R -d ../download/all_srr/ -i ../input/all_srr_geo_rnaseq.txt -r ../reports/all_srr/download_report.txt > download_nohup.out
# nohup Rscript DownloadProtocol_v3.R -d ../download/srr_htregulondb/ -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt -r ../reports/srr_htregulondb/download_report.txt > download_nohup.out &
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################
option_list = list( make_option(c("-d", "--downloadPath"),
type="character",
default=NULL,
help="download directory",
metavar="character"),
make_option(c("-i", "--infoFile"),
type="character",
default=NULL,
help="GSE id information file",
metavar="character"),
make_option(c("-r", "--report"),
type="character",
default=NULL,
help="download report file",
metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$infoFile) || is.null(opt$downloadPath)){
print_help(opt_parser)
stop("At least one argument must be supplied (input file).\n", call.=FALSE)
}
#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################
## Input files and output directories
infoFile <- opt$infoFile
## Load main variables
# GSE-GSM
gseInfo <- read.table( infoFile,header = T, sep = "\t" )
gseInfo <- gseInfo[grep("GSE", gseInfo$gse, value=F), ]
gseInfo <- gseInfo[grep("GSM", gseInfo$gsm, value=F), ]
gseInfo <- gseInfo[complete.cases(gseInfo),]
ngse <- length(unique(gseInfo$gse))
ngsm <- length(unique(gseInfo$gsm))
message("Required GSE: ", ngse)
message("Required GSM: ", ngsm)
#gseInfo
## Download GSE-list
sink(opt$report, append = FALSE, split = FALSE)
cat("total gse id: ", (length(unique(gseInfo$gse))), "\n")
ngse_down=0
for (geoid in unique(gseInfo$gse)) {
print(geoid)
report <- tryCatch(
DownloadGEO( geoid, opt$downloadPath ),
error = function( e ) return( "download failed" ) )
print(report)
if(report == "successful download"){
ngse_down = ngse_down + 1
}
}
cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE)))
message("Required GSE: ", ngse_down)
\ No newline at end of file
#from download screen-output
echo "GSE required: $(grep -c GSE download_report.txt)"
echo "GSE successfully downloaded $(grep -c "successful" download_report.txt)"
echo "double check: $(ls ../../download/all_srr/ | wc -l)"
echo "GSM required: $(grep GSM ../../input/all_srr_geo_rnaseq_v2.txt | cut -f2 | sort | uniq | wc -l)"
grep "Found" download_nohup.out | cut -f2 -d ' ' | awk '{sum+=$1} END {print "GSM found: " sum " associated to the GSE requiered"}'
echo "GSM download: $(grep -e "\"GSM[1-9]" extract_report.txt|sort|uniq|wc -l)"
#from extraction screen-output
echo "GSM successfully loaded $(grep -e "\"GSM[1-9]" extract_report.txt|sort|uniq|wc -l)"
echo "GSM required: $(grep GSM ../../input/all_srr_geo_rnaseq_v2.txt | cut -f2 | sort | uniq | wc -l)"
#!/usr/bin/env Rscript
library("optparse")
library("GEOquery")
source("/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/bin/extraction_functions_v3.R")
#-------------------------------------------------------------------------------
# Objective
# Extract baglines text content from previously downloaded GSEnnn.soft.gz files
#
# Input parameters
# -d download dir with GSE/GSEnnn.soft.gz files
# -i GSE information file (gse, gsm columns)
#
# Parameters example
# -d ../download/srr_galagan/
# -o ../output/srr_galagan
# -i ../input/normalized_srr-gsm_des_v4.tsv
# -b ../input/listMetaCampo.txt
# -r ../reports/srr_galagan/extract_report.txt
#
# Examples
# nohup Rscript ExtractProtocol_v3.R -d ../download/srr_htregulondb/ -o ../outputs/srr_htregulondb -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt -b ../input/srr_htregulondb/banglines.txt -r ../reports/srr_htregulondb/extract_report.txt > extract_nohup.out &
# Rscript tmp/bin/ExtractProtocol_v3.R -d tmp/download/ -o tmp/output/ext/ -i tmp/input/all_srr_geo_rnaseq_test.txt -b tmp/input/listMetaCampo.txt -r tmp/reports/extract_report.txt
#
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################
option_list = list(
make_option(c("-d", "--downloadPath"),type="character", default=NULL,
help="download directory", metavar="character"),
make_option(c("-o", "--outputPath"), type="character", default=NULL,
help="directory to output files", metavar="character"),
make_option(c("-i", "--infoFile"), type="character", default=NULL,
help="GSE id information file", metavar="character"),
make_option(c("-b", "--banglines"), type="character", default=NULL,
help="banglines file", metavar="character"),
make_option(c("-r", "--report"), type="character", default=NULL,
help="extraccion report file", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
if (is.null(opt$infoFile) || is.null(opt$downloadPath) || is.null(opt$outputPath)){
print_help(opt_parser)
stop("At least one argument must be supplied (input file).\n", call.=FALSE)
}
#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################
## Input files and output directories
odir <- opt$outputPath
# Download dir
ddir <- opt$downloadPath
# Baglines list file
bglsfile <- opt$banglines
# GSE information file
infoFile <- opt$infoFile
## Load main variables
# Baglines
bglslist <- readr::read_tsv( bglsfile )
bglsBase <- sapply( bglslist, GetBaseBagline )
# GSE-GSM
gseInfo <- read.table( infoFile,header = T, sep = "\t" )
gseInfo <- gseInfo[grep("GSE", gseInfo$gse, value=F), ]
gseInfo <- gseInfo[grep("GSM", gseInfo$gsm, value=F), ]
gseInfo <- gseInfo[complete.cases(gseInfo),]
ngse <- length(unique(gseInfo$gse))
ngsm <- length(unique(gseInfo$gsm))
message("Required GSE: ", ngse)
message("Required GSM: ", ngsm)
## Extraction
sink(opt$report, append = FALSE, split = FALSE)
cat("total gse id: ", (length(unique(gseInfo$gse))), "\n")
cat("total gsm id: ", (length(unique(gseInfo$gsm))), "\n")
for (geoid in unique(gseInfo$gse)) {
print(geoid)
## Filter GSMs
subs <- as.vector(gseInfo$gsm[which(geoid == gseInfo$gse)])
report <- tryCatch(
ExtractMetafields( geoid, subs, ddir, odir, bglsBase, gseInfo),
error = function( e ) return( "extraccion failed" ) )
print(report)
}
cat("extracted gsm id: ", length(list.files(opt$outputPath, recursive = TRUE)))
\ No newline at end of file
DownloadGEO <- function( geoid, ddir ){
# Work directory
wdir <- file.path( ddir, geoid, fsep = "/" )
# Create individual folder
if ( !dir.exists( wdir ) ) {
dir.create( wdir )
}
print("Saving...")
print(ddir)
# Remove any previously downloaded files for this GEO id
file.remove( list.files( wdir, pattern = geoid, full.names = TRUE ) )
# Download the GSE soft file without expression values or platform info
GEO <- getGEO(
GEO = geoid,
destdir = wdir,
AnnotGPL = FALSE,
getGPL = FALSE,
GSEMatrix = FALSE)
return("successful download")
}
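# Usage sketch (hypothetical id and path; GEOquery already attached):
#   DownloadGEO("GSEnnnnn", "../download/all_srr/")
# should leave ../download/all_srr/GSEnnnnn/GSEnnnnn.soft.gz on disk and
# return "successful download"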
\ No newline at end of file
DownloadGEO <- function( geoid, ddir ){
# Work directory
wdir <- file.path( ddir, geoid, fsep = "/" )
# Create individual folder
if ( !dir.exists( wdir ) ) {
dir.create( wdir )
}
print("Saving...")
print(ddir)
# Remove any previously downloaded files for this GEO id
file.remove( list.files( wdir, pattern = geoid, full.names = TRUE ) )
# Download the GSE soft file without expression values or platform info
GEO <- getGEO(
GEO = geoid,
destdir = wdir,
AnnotGPL = FALSE,
getGPL = FALSE,
GSEMatrix = FALSE)
return("successful download")
}
# This function uses regular expressions to collapse all multi-bagline names to a base bagline
GetBaseBagline <- function( meta ){
meta <- sub( "[.].*", "", meta )
meta <- sub( "_ch.*", "", meta )
meta <- sub( "_[0-9].*", "", meta )
meta <- sub( ":ch[0-9].*", "", meta )
return( meta )
}
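# Examples (assumed inputs, following the substitutions above):
#   GetBaseBagline("characteristics_ch1.2")   # -> "characteristics"
#   GetBaseBagline("treatment_protocol_ch1")  # -> "treatment_protocol"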
# Remove problematic characters such as quotes or tabs
remove_characters <- function(bagline_content){
clean_text <- gsub("\n", "linebreak", bagline_content)
clean_text <- gsub("\"", "linebreak", clea_text)
clean_text <- gsub("\'", "linebreak", clea_text)
return(clean_text)
}
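# Example (assumed input): remove_characters("OD600 = 0.4\n\"mid-log\"")
# -> "OD600 = 0.4linebreaklinebreakmid-loglinebreak"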
# This function splits each multi-bagline's content into a list and writes it out
ResizeDF <- function( M, baglines, outfile ){
splitBagline <- function(x){
baglineList <- data.frame( data = unlist( M[x] ) )
baglineList$meta <- paste( x, 1:nrow( baglineList ), sep='.' )
#filter tabs and " characters
baglineList[,1] <- remove_characters(baglineList[,1])
# Saving meta gsm baglines broken down in list
write.table(
file = outfile, baglineList,
sep = "\t",
eol = "\n",
append = TRUE,
row.names = FALSE,
col.names = FALSE,
quote = TRUE)
}
sapply( baglines, splitBagline)
}
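# Output sketch: each multi-bagline element is appended to outfile as a quoted,
# tab-separated pair: "<bagline content>"  "<bagline name>.<position>"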
# This function loads the GEO object once the soft file has been downloaded
ReadGEO <- function( geoid, ddir, gz = TRUE ){
GEOfile <- file.path(ddir,geoid,paste(geoid,"soft","gz",sep = "."))
if(!gz){
GEOfile <- gsub(pattern = ".gz", replacement="", x=GEOfile)
}
if (!file.exists(GEOfile)){return(FALSE)}
RGEO <- getGEO(filename = GEOfile)
return(RGEO)
}
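# ReadGEO assumes the layout DownloadGEO produced, <ddir>/<geoid>/<geoid>.soft.gz
# (the uncompressed .soft when gz = FALSE), and returns FALSE if the file is missing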
# This function accesses each GSM's metadata fields and writes one .tsv per sample
AccessMefields <- function(subs, GEO, odir, baglinesB){
# GSE accession (used below to build output filenames)
geoid <- GEO@header$geo_accession
# PMID available
PMID <- tryCatch(
GEO@header$pubmed_id,
error = function( e ) return( "unknown" ) )
gpl <- tryCatch(
paste( GEO@header$platform_id, collapse = "-"),
error = function( e ) return( "unknown" ) )
print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
# Collapse multiple GPL and PMID values
PMID <- paste( "PMID", PMID, sep = ":", collapse = "" )
gpl <- paste( gpl, sep = ":", collapse = "" )
# Download report
print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
print( "Extraction..." )
# Save metafields
for ( gsm in subs ) {
print( gsm )
# Accessing metadata; the object must have been read from the SOFT file
MetaDF <- tryCatch(
GEO@gsms[[gsm]]@header,
error = function( e ) print( FALSE ) )
# check available baglines
if(is.logical(MetaDF)){
print(gsm)
return( "Unavailable gsm" )
} else{
# output filename
geoName <- paste(geoid, gsm, gpl, PMID, sep='-')
outfile <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )
# Show outfile
print(paste("outfile", outfile, sep = ": ", collapse = ""))
# Avoid append problems
if ( file.exists( outfile ) ) { file.remove(outfile) }
# Map baglines to download id
baglines <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
baglines <- as.vector( unlist( baglines ) )
# filter and separate multi-bagline content. Resize GSM output
ResizeDF(MetaDF, baglines, outfile)
print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
}}
return(TRUE)
}
# This function drives extraction for one GSE: it loads the soft file and extracts every GSM's metafields
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB ){
print(paste("ID", geoid, sep = ": ", collapse = "" ))
#ddir <- file.path( ddir, geoid, fsep = "/" )
# output directory
odir <- file.path( odir, geoid, fsep = "/" )
# Create individual folder
if ( !dir.exists( odir ) ) {
dir.create( odir )
}
# load GSE object
GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
if(is.logical(GEO)){
print( "Unreadable GSE softfile")
return("Error: Unexpected end")
}
# get gsms names
gsmsList <- names( GEO@gsms )
if( is.logical( gsmsList ) ){
print( "Unavailable gsms" )
return("Error: Unexpected end")
}
print("successful load")
report <- tryCatch(
AccessMefields(subs, GEO, odir, baglinesB),
error=function( e ) return( FALSE ) )
if(!isTRUE(report)){
# Remove unused folder
unlink(odir, recursive=TRUE)
return( "extraction failed..." )
}else{
return( "successful extraction.." )
}
}
}
# This function uses regular expressions to collapse all multi-bagline names to a base bagline
GetBaseBagline <- function( meta ){
meta <- sub( "[.].*", "", meta )
meta <- sub( "_ch.*", "", meta )
meta <- sub( "_[0-9].*", "", meta )
meta <- sub( ":ch[0-9].*", "", meta )
return( meta )
}
# Remove problematic characters such as quotes or tabs
remove_characters <- function(bagline_content){
clean_text <- gsub("\n.", "[linebreak]", bagline_content)
clean_text <- gsub("\"", "[linebreak]", clean_text)
clean_text <- gsub("\'", "[linebreak]", clean_text)
clean_text <- gsub("\t", "[linebreak]", clean_text)
return(clean_text)
}
# This function splits each multi-bagline's content into rows and writes them out with their identifiers
ResizeDF <- function( srr, gse, gsm, gpl, PMID,
gsm_title, gse_title, gpl_title,
M, baglines, outfile ){
splitBagline <- function(x){
full_text_bg = unlist( M[x] )
baglineList <- data.frame( data = full_text_bg )
baglineList$srr <- srr
baglineList$a <- gse
baglineList$b <- gsm
baglineList$c <- gpl
baglineList$d <- PMID
baglineList$e <- gsm_title
baglineList$f <- gse_title
baglineList$g <- gpl_title
baglineList$h <- remove_characters(x)
baglineList$h <- GetBaseBagline(baglineList$h)
baglineList$i <- remove_characters(baglineList$data)
# add sentence-end tag
baglineList$i <- paste(baglineList$i, "PGCGROWTHCONDITIONS", sep = " ")
baglineList$data <- NULL
# Saving meta gsm baglines broken down in list
write.table(
file = outfile, baglineList,
sep = "\t",
eol = "\n",
append = TRUE,
row.names = FALSE,
col.names = FALSE,
quote = FALSE)
}
sapply( baglines, splitBagline)
}
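# Output sketch: one unquoted tab-separated row per bagline element, in column order
#   srr  gse  gsm  gpl  PMID  gsm_title  gse_title  gpl_title  base_bagline  content PGCGROWTHCONDITIONS
# (the raw data column is dropped before writing)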
# This function loads the GEO object once the soft file has been downloaded
ReadGEO <- function( geoid, ddir, gz = TRUE ){
GEOfile <- file.path(ddir,geoid,paste(geoid,"soft","gz",sep = "."))
if(!gz){
GEOfile <- gsub(pattern = ".gz", replacement="", x=GEOfile)
}
if (!file.exists(GEOfile)){return(FALSE)}
RGEO <- getGEO(filename = GEOfile)
return(RGEO)
}
# This function accesses each GSM's metadata fields and writes one .tsv per sample
AccessMefields <- function(subs, GEO, odir, baglinesB, meta_id){
geoid <- GEO@header$geo_accession
# PMID available
PMID <- tryCatch(
GEO@header$pubmed_id,
error = function( e ) return( "unknown" ) )
gpl <- tryCatch(
paste( GEO@header$platform_id, collapse = "-"),
error = function( e ) return( "unknown" ) )
gpl_title <- tryCatch(
paste(sapply(GEO@gpls, FUN = function(x){paste( x@header$geo_accession, x@header$title, sep = ": ")}), collapse = ". "),
error = function( e ) return( "unknown" ) )
print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
# Collapse multiple GPL and PMID values
PMID <- paste( "PMID", PMID, sep = "_", collapse = "" )
gpl <- paste( gpl, sep = "_", collapse = "" )
# Download report
print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
print( "Extraction..." )
# Save metafields
for ( gsm in subs ) {
srr <- meta_id$srr[meta_id$gsm==gsm]
srr <- paste( srr, sep = "_", collapse = "" )
print( gsm )
# Accessing metadata; the object must have been read from the SOFT file
MetaDF <- tryCatch(
GEO@gsms[[gsm]]@header,
error = function( e ) print( FALSE ) )
# save series and sample titles
gse_title <- tryCatch(
paste( GEO@header$title, collapse = "-"),
error = function( e ) return( "unknown" ) )
gsm_title <- tryCatch(
paste( GEO@gsms[[gsm]]@header$title, collapse = "-"),
error = function( e ) return( "unknown" ) )
# check available baglines
if(is.logical(MetaDF)){
print(gsm)
return( "Unavailable gsm" )
} else{
# output filename
geoName <- paste(geoid, gsm, gpl, PMID, sep='-')
outfile <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )
# Show outfile
print(paste("outfile", outfile, sep = ": ", collapse = ""))
# Avoid append problems
if ( file.exists( outfile ) ) { file.remove(outfile) }
# Map baglines to download id
baglines <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
baglines <- as.vector( unlist( baglines ) )
# filter and separate multi-bagline content. Resize GSM output
ResizeDF(srr, geoid, gsm, gpl, PMID, gsm_title, gse_title, gpl_title, MetaDF, baglines, outfile)
print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
}}
return(TRUE)
}
# This function drives extraction for one GSE: it loads the soft file and extracts every GSM's metafields
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB, meta_id ){
print(paste("ID", geoid, sep = ": ", collapse = "" ))
#ddir <- file.path( ddir, geoid, fsep = "/" )
# output directory
odir <- file.path( odir, geoid, fsep = "/" )
# Create individual folder
if ( !dir.exists( odir ) ) {
dir.create( odir )
}
# load GSE object
GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
if(is.logical(GEO)){
print( "Unreadable GSE softfile")
return("Error: Unexpected end")
}
# get gsms names
gsmsList <- names( GEO@gsms )
if( is.logical( gsmsList ) ){
print( "Unavailable gsms" )
return("Error: Unexpected end")
}
print("successful load")
report <- tryCatch(
AccessMefields(subs, GEO, odir, baglinesB, meta_id),
error=function( e ) return( FALSE ) )
if(!isTRUE(report)){
# Remove unused folder
unlink(odir, recursive=TRUE)
return( "extraction failed..." )
}else{
return( "successful extraction.." )
}
}
}