Estefani Gaytan Nunez

upload

Showing 1000 changed files with 700 additions and 17 deletions

Too many changes to show.

To preserve performance only 1000 of 1000+ files are displayed.

echo
echo
echo
echo "===================================Extraction============================================ "

# All per-GSE extraction output lives here; the loop below runs relative to it.
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/all_srr/

echo "Access to output extracted baglines"
echo "directory: "$(pwd);

# all output-extraction files
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all_srr/all-output-index.txt"
# input sentences to run CoreNLP
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt"
# GSE index by bg_sentence row
mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_all_srr.txt"
# Number of fields by bagline
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all_srr/bg_report.txt"
echo
echo
echo
echo "==============================Baglines index files======================================= "
# Index every extracted file by absolute path, one line per file.
for gse in $(ls -1)
do
    cd $gse; ls -d $PWD/*; cd ..;
done > $index
echo "Number of extracted files"
wc -l $index
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq | awk 'BEGIN{FS="\t"; OFS="\t"}{ print $4"PGC",$0 }' | grep -vw "^V1PGC" | grep -vw "^WTPGC"| cut -f2- | sed 's/GSE[0-9]*\/GSE/GSE/1')

# Debug snapshot of the parsed table (development leftover).
echo "$cext" > "/home/egaytan/ot"
# Column 4 holds the sentence text; append the sentence-end tag.
# (awk's $0 is the whole record; the original's "$_" is a Perlism that only
# worked because an unset awk variable coerces to field number 0.)
echo "$cext" | cut -f4 | awk '{ print $0 " PGCGROWTHCONDITIONS" }' > $output

wc $output
# FIX: the pipe into cut was missing ('echo "$cext" cut -f1-3,5'), so the
# mapping file received the literal words "cut -f1-3,5" appended to the data
# instead of the selected id columns.
echo "$cext" | cut -f1-3,5 > $mapping
wc $mapping
echo
echo
echo "Number of total baglines: "$(wc -l $output );
echo
echo "Baglines report"

# Count how many lines each base bagline contributes across all files
# (collapse _chN suffixes before counting).
for gsef in $( cat $index)
do
    cat $gsef | sort | uniq ;
done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report

cat $report
echo
echo
echo "Saving file: "$output;
...@@ -34,10 +34,13 @@ echo ...@@ -34,10 +34,13 @@ echo
34 echo 34 echo
35 echo "Add sentence-end-tag PGCGROWTHCONDITIONS" 35 echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
36 #cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' ) 36 #cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
37 -cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq) 37 +cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq | awk 'BEGIN{FS="\t"; OFS="\t"}{ print $4"PGC",$0 }' | grep -vw "^V1PGC" | grep -vw "^WTPGC"| cut -f2- | sed 's/GSE[0-9]*\/GSE/GSE/1')
38 -echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output 38 +
39 +echo "$cext" > "/home/egaytan/ot"
40 +echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
41 +
39 wc $output 42 wc $output
40 -echo "$cext" | cut -f1-3,5 > $mapping 43 +echo "$cext" cut -f1-3,5 > $mapping
41 wc $mapping 44 wc $mapping
42 echo 45 echo
43 echo 46 echo
...@@ -55,3 +58,4 @@ cat $report ...@@ -55,3 +58,4 @@ cat $report
55 echo 58 echo
56 echo 59 echo
57 echo "Saving file: "$output; 60 echo "Saving file: "$output;
61 +
......
echo
echo
echo "==============================Run CoreNLP======================================= ";
echo
echo

# Sentence file produced by the extraction stage.
nlp_input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt"
# Directory where CoreNLP writes one .ner CoNLL file per input.
nlp_output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
# TokensRegexNER mapping with the known entities.
nlp_regex="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"

echo
echo "input file: " $nlp_input;
echo
echo "output directory: " $nlp_output;
echo
echo "regex file: " $nlp_regex;
echo
# Tokenize, sentence-split, POS-tag, lemmatize and regex-NER the sentences.
corenlp.sh -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file $nlp_input -outputDirectory $nlp_output -regexner.mapping $nlp_regex -outputExtension .ner;
echo
echo
echo "==============================Run CoreNLP======================================= ";
echo
echo

# Sentences extracted for the srr_htregulondb set.
in_file="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/srr_htregulondb/bg_sentences.txt"
# CoreNLP writes one .ner CoNLL file per input file here.
out_dir="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/srr_htregulondb/"
# TokensRegexNER entity mapping.
ner_map="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"

echo
echo "input file: " $in_file;
echo
echo "output directory: " $out_dir;
echo
echo "regex file: " $ner_map;
echo
# Tokenize, sentence-split, POS-tag, lemmatize and regex-NER the sentences.
corenlp.sh -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file $in_file -outputDirectory $out_dir -regexner.mapping $ner_map -outputExtension .ner;
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +
2 +
3 +
4 +===================================Extraction============================================
5 +Access to output extracted baglines
6 +directory: /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/all_srr
7 +
8 +
9 +
10 +==============================Baglines index files=======================================
11 +Number if extracted files
12 +1208 /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all_srr/all-output-index.txt
13 +
14 +
15 +
16 +==============================Baglines extraction=======================================
17 +
18 +
19 +Add sentence-end-tag PGCGROWTHCONDITIONS
20 + 19510 396548 2913905 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
21 + 19510 455080 3805057 /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_all_srr.txt
22 +
23 +
24 +Number of total baglines: 19510 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
25 +
26 +Baglines report
27 +3478 characteristics
28 +6798 data_processing
29 +2445 extract_protocol
30 +1165 growth_protocol
31 +1208 library_strategy
32 +1208 organism
33 +1208 source_name
34 +1208 title
35 +798 treatment_protocol
36 +
37 +
38 +Saving file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
...@@ -17,11 +17,11 @@ Number if extracted files ...@@ -17,11 +17,11 @@ Number if extracted files
17 17
18 18
19 Add sentence-end-tag PGCGROWTHCONDITIONS 19 Add sentence-end-tag PGCGROWTHCONDITIONS
20 - 18006 380932 2801258 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt 20 + 17999 35998 703982 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt
21 - 18006 72024 1340105 /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_srr_galagan.txt 21 + 17999 416918 3448391 /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_srr_galagan.txt
22 22
23 23
24 -Number of total baglines: 18006 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt 24 +Number of total baglines: 17999 /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt
25 25
26 Baglines report 26 Baglines report
27 3254 characteristics 27 3254 characteristics
......
1 +
2 +
3 +==============================Run CoreNLP=======================================
4 +
5 +
6 +
7 +input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt
8 +
9 +output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
10 +
11 +regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt
12 +
13 +java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_all_srr.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner
...@@ -20,15 +20,14 @@ java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp. ...@@ -20,15 +20,14 @@ java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.
20 [main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns. 20 [main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns.
21 21
22 Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_srr_galagan.txt.ner 22 Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_srr_galagan.txt.ner
23 -Untokenizable:  (U+7, decimal: 7) 23 +Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... done [60.8 sec].
24 -Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt ... done [46.9 sec].
25 24
26 Annotation pipeline timing information: 25 Annotation pipeline timing information:
27 -TokenizerAnnotator: 0.7 sec. 26 +TokenizerAnnotator: 0.3 sec.
28 -WordsToSentencesAnnotator: 0.4 sec. 27 +WordsToSentencesAnnotator: 0.1 sec.
29 -POSTaggerAnnotator: 43.0 sec. 28 +POSTaggerAnnotator: 57.9 sec.
30 -MorphaAnnotator: 1.2 sec. 29 +MorphaAnnotator: 0.5 sec.
31 -TokensRegexNERAnnotator: 1.6 sec. 30 +TokensRegexNERAnnotator: 2.0 sec.
32 -TOTAL: 46.8 sec. for 479434 tokens at 10233.6 tokens/sec. 31 +TOTAL: 60.8 sec. for 98059 tokens at 1612.7 tokens/sec.
33 Pipeline setup: 0.8 sec. 32 Pipeline setup: 0.8 sec.
34 -Total time for StanfordCoreNLP pipeline: 48.1 sec. 33 +Total time for StanfordCoreNLP pipeline: 61.9 sec.
......
...@@ -18,8 +18,9 @@ source("extraction_functions.R") ...@@ -18,8 +18,9 @@ source("extraction_functions.R")
18 # -r ../reports/srr_galagan/extract_report.txt 18 # -r ../reports/srr_galagan/extract_report.txt
19 # 19 #
20 # Examples 20 # Examples
21 -# Rscript ExtractProtocol_v2.R -d ../download/srr_galagan/ -o ../outputs/srr_galagan -i ../input/normalized_srr-gsm_des_v4.tsv -b ../input/listMetaCampo.txt -r ../reports/srr_galagan/extract_report.txt 21 +# nohup Rscript DownloadProtocol_v2.R -d ../download/srr_galagan/ -i ../input/normalized_srr-gsm_des_v4.tsv -r ../reports/srr_galagan/download_report.txt > download_nohup.out
22 -# 22 +# nohup Rscript DownloadProtocol_v2.R -d ../download/all_srr/ -i ../input/all_srr_geo_rnaseq.txt -r ../reports/all_srr/download_report.txt > download_nohup.out
23 +# nohup Rscript DownloadProtocol_v2.R -d ../download/srr_htregulondb/ -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt -r ../reports/srr_htregulondb/download_report.txt > download_nohup.out &
23 ####################################################################################### 24 #######################################################################################
24 #-----------------------------------------ARGS----------------------------------------- 25 #-----------------------------------------ARGS-----------------------------------------
25 ####################################################################################### 26 #######################################################################################
......
#!/usr/bin/env Rscript
library("optparse")
library("GEOquery")
source("/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/bin/download_functions_v3.R")
#-------------------------------------------------------------------------------
# Objective
#   Download GSEnnn.soft.gz files for a list of GSE ids.
#
# Input parameters
#   -d  download dir
#   -i  GSE information file (tab separated, with 'gse' and 'gsm' columns)
#   -r  download report file
#
# Example
#   nohup Rscript DownloadProtocol_v3.R -d ../download/srr_htregulondb/ \
#     -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt \
#     -r ../reports/srr_htregulondb/download_report.txt > download_nohup.out &
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################

option_list <- list(
  make_option(c("-d", "--downloadPath"), type = "character", default = NULL,
              help = "download directory", metavar = "character"),
  make_option(c("-i", "--infoFile"), type = "character", default = NULL,
              help = "GSE id information file", metavar = "character"),
  make_option(c("-r", "--report"), type = "character", default = NULL,
              help = "download report file", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# FIX: parse_args() always returns a non-empty list (it carries `help`), so
# the original `!length(opt)` guard could never fire. Check each required
# option explicitly.
if (is.null(opt$downloadPath) || is.null(opt$infoFile) || is.null(opt$report)) {
  print_help(opt_parser)
  stop("Options -d, -i and -r must all be supplied.\n", call. = FALSE)
}

#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################

## Input files and output directories
infoFile <- opt$infoFile

## Load main variables

# GSE-GSM table: keep only rows with well-formed GSE/GSM ids and no NAs.
gseInfo <- read.table(infoFile, header = TRUE, sep = "\t")
gseInfo <- gseInfo[grep("GSE", gseInfo$gse, value = FALSE), ]
gseInfo <- gseInfo[grep("GSM", gseInfo$gsm, value = FALSE), ]
gseInfo <- gseInfo[complete.cases(gseInfo), ]

ngse <- length(unique(gseInfo$gse))
ngsm <- length(unique(gseInfo$gsm))

message("Required GSE: ", ngse)
message("Required GSM: ", ngsm)

## Download GSE list; the per-id status goes to the report file via sink().
sink(opt$report, append = FALSE, split = FALSE)
cat("total gse id: ", (length(unique(gseInfo$gse))), "\n")
ngse_down <- 0
for (geoid in unique(gseInfo$gse)) {
  print(geoid)
  report <- tryCatch(
    DownloadGEO(geoid, opt$downloadPath),
    error = function(e) return("download failed"))
  print(report)
  if (report == "successful download") {
    ngse_down <- ngse_down + 1
  }
}
cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE)))
# FIX: restore stdout; the original never closed the sink.
sink()

# FIX: this counter is the number of successful downloads, not the number
# requested; the original labeled it "Required GSE".
message("Downloaded GSE: ", ngse_down)
...\ No newline at end of file ...\ No newline at end of file
# Summaries of the download stage, derived from its report and screen output.
echo "GSE required: $(grep -c GSE download_report.txt)"
echo "GSE successfully downloaded $(grep -c "successful" download_report.txt)"
echo "double check: $(ls ../../download/all_srr/ | wc -l)"
echo "GSM required: $(grep GSM ../../input/all_srr_geo_rnaseq_v2.txt | cut -f2 | sort -u | wc -l)"
# Sum the per-GSE "Found N ..." counts from the nohup log (presumably the
# GEOquery sample-count lines — confirm against an actual log).
grep "Found" download_nohup.out | cut -f2 -d ' ' | awk '{sum+=$1} END {print "GSM found: " sum " associated to the GSE requiered"}'
echo "GSM download: $(grep -e "\"GSM[1-9]" extract_report.txt | sort -u | wc -l)"
# Cross-check extracted GSMs against the requested list (from screen output).
echo "GSM successfully loaded $(grep -e "\"GSM[1-9]" extract_report.txt | sort -u | wc -l)"
echo "GSM required: $(grep GSM ../../input/all_srr_geo_rnaseq_v2.txt | cut -f2 | sort -u | wc -l)"
#!/usr/bin/env Rscript
library("optparse")
library("GEOquery")
source("/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/bin/extraction_functions_v3.R")
#-------------------------------------------------------------------------------
# Objective
#   Extract bagline text content from previously downloaded GSEnnn.soft.gz files.
#
# Input parameters
#   -d  download dir with GSE/GSEnnn.soft.gz files
#   -o  output dir (one folder per GSE, one tsv per GSM)
#   -i  GSE information file (gse, gsm columns)
#   -b  baglines list file
#   -r  extraction report file
#
# Example
#   nohup Rscript ExtractProtocol_v3.R -d ../download/srr_htregulondb/ \
#     -o ../outputs/srr_htregulondb \
#     -i ../input/srr_htregulondb/SRR_GEO_RNASeq_Expressed.txt \
#     -b ../input/srr_htregulondb/banglines.txt \
#     -r ../reports/srr_htregulondb/extract_report.txt > extract_nohup.out &
#
#######################################################################################
#-----------------------------------------ARGS-----------------------------------------
#######################################################################################

option_list <- list(
  make_option(c("-d", "--downloadPath"), type = "character", default = NULL,
              help = "download directory", metavar = "character"),
  make_option(c("-o", "--outputPath"), type = "character", default = NULL,
              help = "directory to output files", metavar = "character"),
  make_option(c("-i", "--infoFile"), type = "character", default = NULL,
              help = "GSE id information file", metavar = "character"),
  make_option(c("-b", "--banglines"), type = "character", default = NULL,
              help = "banglines file", metavar = "character"),
  make_option(c("-r", "--report"), type = "character", default = NULL,
              help = "extraction report file", metavar = "character")
)

opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

# FIX: parse_args() always returns a non-empty list (it carries `help`), so
# the original `!length(opt)` guard could never fire. Verify each required
# option instead.
required <- c("downloadPath", "outputPath", "infoFile", "banglines", "report")
if (any(vapply(required, function(o) is.null(opt[[o]]), logical(1)))) {
  print_help(opt_parser)
  stop("Options -d, -o, -i, -b and -r must all be supplied.\n", call. = FALSE)
}

#######################################################################################
#-----------------------------------------MAIN-----------------------------------------
#######################################################################################

## Input files and output directories
odir <- opt$outputPath        # extraction output dir
ddir <- opt$downloadPath      # download dir
bglsfile <- opt$banglines     # baglines list file
infoFile <- opt$infoFile      # GSE information file

## Load main variables

# Baglines, reduced to their base names (channel/replicate suffixes stripped)
bglslist <- readr::read_tsv( bglsfile )
bglsBase <- sapply( bglslist, GetBaseBagline )
# GSE-GSM table: keep only rows with well-formed ids and no missing fields
gseInfo <- read.table( infoFile, header = TRUE, sep = "\t" )
gseInfo <- gseInfo[grep("GSE", gseInfo$gse, value = FALSE), ]
gseInfo <- gseInfo[grep("GSM", gseInfo$gsm, value = FALSE), ]
gseInfo <- gseInfo[complete.cases(gseInfo), ]

ngse <- length(unique(gseInfo$gse))
ngsm <- length(unique(gseInfo$gsm))

message("Required GSE: ", ngse)
message("Required GSM: ", ngsm)

## Extraction; per-GSE status goes to the report file via sink()

sink(opt$report, append = FALSE, split = FALSE)
cat("total gse id: ", (length(unique(gseInfo$gse))), "\n")
cat("total gsm id: ", (length(unique(gseInfo$gsm))), "\n")

for (geoid in unique(gseInfo$gse)) {
  print(geoid)
  ## GSMs belonging to this GSE
  subs <- as.vector(gseInfo$gsm[which(geoid == gseInfo$gse)])
  report <- tryCatch(
    ExtractMetafields( geoid, subs, ddir, odir, bglsBase, gseInfo),
    error = function( e ) return( "extraccion failed" ) )
  print(report)
}
cat("extracted gsm id: ", length(list.files(opt$outputPath, recursive = TRUE)))
# FIX: restore stdout; the original never closed the sink.
sink()
...\ No newline at end of file ...\ No newline at end of file
# Fetch the soft file for one GEO series id into <ddir>/<geoid>/.
# Always returns the literal status string "successful download"; errors
# raised by getGEO() propagate to the caller (which wraps this in tryCatch).
DownloadGEO <- function( geoid, ddir ){
  # Per-series destination folder, created on first use.
  gse_dir <- file.path(ddir, geoid, fsep = "/")
  if (!dir.exists(gse_dir)) {
    dir.create(gse_dir)
  }
  print("Saving...")
  print(ddir)
  # Drop any stale files from a previous attempt for this id.
  stale <- list.files(gse_dir, pattern = geoid, full.names = TRUE)
  file.remove(stale)
  # Soft-format download only: no expression matrix, no platform annotation.
  GEO <- getGEO(GEO = geoid,
                destdir = gse_dir,
                AnnotGPL = FALSE,
                getGPL = FALSE,
                GSEMatrix = FALSE)
  return("successful download")
}
...\ No newline at end of file ...\ No newline at end of file
# Download the soft file for a single GEO series into <ddir>/<geoid>/,
# clearing any previous files for that id first. Returns the status string
# "successful download"; getGEO() errors bubble up to the caller.
DownloadGEO <- function( geoid, ddir ){
  target <- file.path(ddir, geoid, fsep = "/")
  # One folder per series id.
  if (!dir.exists(target)) dir.create(target)
  print("Saving...")
  print(ddir)
  # Remove leftovers from earlier download attempts for this id.
  file.remove(list.files(target, pattern = geoid, full.names = TRUE))
  # Soft file only: skip expression values and platform info.
  GEO <- getGEO(GEO = geoid, destdir = target,
                AnnotGPL = FALSE, getGPL = FALSE, GSEMatrix = FALSE)
  return("successful download")
}

# Normalize a bagline/metafield name to its base form by stripping channel
# and replicate suffixes (".1", "_ch1", "_2", ":ch1", ...), so multi-channel
# variants collapse onto one name.
GetBaseBagline <- function( meta ){
  suffix_patterns <- c("[.].*", "_ch.*", "_[0-9].*", ":ch[0-9].*")
  for (pat in suffix_patterns) {
    meta <- sub(pat, "", meta)
  }
  return( meta )
}
# Replace characters that break the TSV output (newlines, double quotes,
# single quotes) with the literal token "linebreak".
remove_characters <- function(bagline_content){
  clean_text <- gsub("\n", "linebreak", bagline_content)
  # FIX: the two lines below read from the misspelled `clea_text`, which
  # raised "object 'clea_text' not found" at runtime.
  clean_text <- gsub("\"", "linebreak", clean_text)
  clean_text <- gsub("\'", "linebreak", clean_text)
  return(clean_text)
}
# Break each (possibly multi-valued) bagline of GSM metadata `M` into a
# two-column (data, meta) table and append it to `outfile` as quoted TSV.
ResizeDF <- function( M, baglines, outfile ){
  splitBagline <- function(x){
    baglineList <- data.frame( data = unlist( M[x] ) )
    # "<bagline>.<k>" labels each value of a multi-valued bagline.
    baglineList$meta <- paste( x, seq_len( nrow( baglineList ) ), sep='.' )
    # FIX: was `bagLineist[,1] <- ...` — the misspelled target raised
    # "object 'bagLineist' not found", so the sanitizer never ran.
    baglineList[,1] <- remove_characters(baglineList[,1])
    # Saving meta gsm baglines broken down in list
    write.table(
      file = outfile, baglineList,
      sep = "\t",
      eol = "\n",
      append = TRUE,
      row.names = FALSE,
      col.names = FALSE,
      quote = TRUE)
  }
  sapply( baglines, splitBagline)
}
# Load a previously downloaded GEO soft file for `geoid` from <ddir>/<geoid>/.
# Returns FALSE when the expected file is absent; pass gz = FALSE to read the
# uncompressed variant instead of the .gz one.
ReadGEO <- function( geoid, ddir, gz = TRUE ){
  soft_name <- paste(geoid, "soft", "gz", sep = ".")
  soft_path <- file.path(ddir, geoid, soft_name)
  if (!gz) {
    soft_path <- gsub(pattern = ".gz", replacement = "", x = soft_path)
  }
  if (!file.exists(soft_path)) {
    return(FALSE)
  }
  getGEO(filename = soft_path)
}
# Extract the configured metafields for every GSM in `subs` from a loaded
# GSE object `GEO` and write one tsv per GSM under `odir`.
#   baglinesB : base bagline names to look for in each GSM header
# Returns TRUE on success, or "Unavailable gsm" when a sample has no header.
AccessMefields <- function(subs, GEO, odir, baglinesB){
  # FIX: `geoid` was a free variable resolved from the global environment
  # (it only worked because the calling script happened to define a global
  # of that name); take it from the series header instead, as v3 does.
  geoid <- GEO@header$geo_accession
  # PMID available
  PMID <- tryCatch(
    GEO@header$pubmed_id,
    error = function( e ) return( "unknwon" ) )
  gpl <- tryCatch(
    paste( GEO@header$platform_id, collapse = "-"),
    error = function( e ) return( "unknwon" ) )
  print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
  # Collapse multi GPL and multi PMID
  PMID <- paste( "PMID", PMID, sep = ":", collapse = "" )
  gpl <- paste( gpl, sep = ":", collapse = "" )
  # Download report
  print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
  print( "Extraction..." )
  # Save metafields, one output file per GSM
  for ( gsm in subs ) {
    print( gsm )
    # Accessing metadata: the soft-format header, or FALSE when unavailable
    MetaDF <- tryCatch(
      GEO@gsms[[gsm]]@header,
      error = function( e ) print( FALSE ) )
    # check available baglines
    if(is.logical(MetaDF)){
      print(gsm)
      return( "Unavailable gsm" )
    } else{
      # output filename: GSE-GSM-GPL-PMID.tsv
      geoName <- paste(geoid, gsm, gpl, PMID, sep='-')
      outfile <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )
      # Show outfile
      print(paste("outfile", outfile, sep = ": ", collapse = ""))
      # Avoid append problems
      if ( file.exists( outfile ) ) { file.remove(outfile) }
      # Map configured base baglines to the names present in this GSM
      baglines <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
      baglines <- as.vector( unlist( baglines ) )
      # Filter and separate multi-bagline content; resize GSM output
      ResizeDF(MetaDF, baglines, outfile)
      print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
    }}
  return(TRUE)
}
# Drive the extraction for one GSE: load its soft file, then write per-GSM
# metafield tables under <odir>/<geoid>/. Returns a human-readable status.
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB ){
  print(paste("ID", geoid, sep = ": ", collapse = "" ))
  #ddir <- file.path( ddir, geoid, fsep = "/" )
  # output directory, one folder per GSE
  odir <- file.path( odir, geoid, fsep = "/" )
  if ( !dir.exists( odir ) ) {
    dir.create( odir )
  }
  # load GSE object (FALSE when the soft file is missing or unreadable)
  GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
  if(is.logical(GEO)){
    print( "Unreadable GSE softfile")
    return("Error: Unexpected end")
  }
  # get gsms names
  gsmsList <- names( GEO@gsms )
  # FIX: names() returns NULL (not a logical) when there are no GSMs, so
  # the original is.logical() test could never detect that case.
  if( is.null( gsmsList ) || is.logical( gsmsList ) ){
    print( "Unavailable gsms" )
    return("Error: Unexpected end")
  }
  print("successful load")

  report <- tryCatch(
    AccessMefields(subs, GEO, odir, baglinesB),
    error=function( e ) return( FALSE ) )

  # FIX: AccessMefields can return the string "Unavailable gsm"; `!report`
  # on a character vector raised "invalid argument type" instead of taking
  # the cleanup branch.
  if(!isTRUE(report)){
    # Remove unused folder
    unlink(odir, recursive=TRUE)
    return( "extraccion failed..." )
  }else{
    return( "successful extraccion..")
  }
}
# Strip replicate/channel suffixes so multi-channel bagline names collapse
# onto one base name (e.g. "characteristics_ch1.1" -> "characteristics").
GetBaseBagline <- function( meta ){
  patterns <- c("[.].*", "_ch.*", "_[0-9].*", ":ch[0-9].*")
  Reduce(function(acc, pat) sub(pat, "", acc), patterns, init = meta)
}
# Replace characters that would corrupt the TSV output (newlines, double and
# single quotes, tabs) with the token "[linebreak]".
# NOTE(review): the first pattern is "\n." (a newline PLUS the following
# character), so the character after each newline is consumed as well —
# presumably intentional, but the v1 helper replaced "\n" alone; confirm.
remove_characters <- function(bagline_content){
  clean_text <- bagline_content
  for (pattern in c("\n.", "\"", "\'", "\t")) {
    clean_text <- gsub(pattern, "[linebreak]", clean_text)
  }
  return(clean_text)
}

# Append one row per value of each requested bagline to `outfile`, carrying
# the sample identifiers (srr/gse/gsm/gpl/pmid), the three titles, the
# sanitized base bagline name, and the sanitized, end-tagged content.
# Output column order matches the original: srr, gse, gsm, gpl, pmid,
# gsm_title, gse_title, gpl_title, bagline, text.
ResizeDF <- function( srr, gse, gsm, gpl, PMID,
                      gsm_title, gse_title, gpl_title,
                      M, baglines, outfile ){
  splitBagline <- function(x){
    values <- unlist( M[x] )
    # Sanitize the bagline name, then collapse it to its base form.
    base_name <- GetBaseBagline(remove_characters(x))
    # Sanitize the content and append the sentence-end tag.
    tagged <- paste(remove_characters(values), "PGCGROWTHCONDITIONS", sep = " ")
    row_block <- data.frame(
      srr = srr, gse = gse, gsm = gsm, gpl = gpl, pmid = PMID,
      gsm_title = gsm_title, gse_title = gse_title, gpl_title = gpl_title,
      bagline = base_name, text = tagged)
    # Append as unquoted TSV without headers.
    write.table(
      file = outfile, row_block,
      sep = "\t",
      eol = "\n",
      append = TRUE,
      row.names = FALSE,
      col.names = FALSE,
      quote = FALSE)
  }
  sapply( baglines, splitBagline)
}
# Load a downloaded GEO soft file for `geoid` from <ddir>/<geoid>/.
# Returns FALSE when the file does not exist. With gz = FALSE the ".gz"
# suffix is dropped and the uncompressed file is read instead.
ReadGEO <- function( geoid, ddir, gz = TRUE ){
  path <- file.path(ddir, geoid, paste(geoid, "soft", "gz", sep = "."))
  if (!gz) path <- gsub(pattern = ".gz", replacement = "", x = path)
  if (!file.exists(path)) return(FALSE)
  RGEO <- getGEO(filename = path)
  return(RGEO)
}
# Extract the configured metafields for every GSM in `subs` from the loaded
# GSE object `GEO`, writing one tsv per GSM under `odir`.
#   baglinesB : base bagline names to look for in each GSM header
#   meta_id   : data.frame mapping gsm -> srr run ids
# Returns TRUE on success, or "Unavailable gsm" when a sample has no header.
AccessMefields <- function(subs, GEO, odir, baglinesB, meta_id){

  geoid <- GEO@header$geo_accession
  # PMID available
  PMID <- tryCatch(
    GEO@header$pubmed_id,
    error = function( e ) return( "unknwon" ) )

  gpl <- tryCatch(
    paste( GEO@header$platform_id, collapse = "-"),
    error = function( e ) return( "unknwon" ) )

  # "GPLxxx: <title>" for every platform of the series, joined by ". "
  gpl_title <- tryCatch(
    paste(sapply(GEO@gpls, FUN = function(x){paste( x@header$geo_accession, x@header$title, sep = ": ")}), collapse = ". "),
    error = function( e ) return( "unknwon" ) )

  print( paste( "PMID", PMID, sep = ": ", collapse = "" ) )
  # Collapse multi GPL and multi PMID
  PMID <- paste( "PMID", PMID, sep = "_", collapse = "" )
  gpl <- paste( gpl, sep = "_", collapse = "" )

  # FIX: the series title is constant for all GSMs; the original recomputed
  # it inside the per-GSM loop on every iteration.
  gse_title <- tryCatch(
    paste( GEO@header$title, collapse = "-"),
    error = function( e ) return( "unknwon" ) )

  # Download report
  print( paste( "GSM", length(subs), sep = ":", collapse = "" ) )
  print( "Extraction..." )

  # Save metafields, one output file per GSM
  for ( gsm in subs ) {
    # SRR run ids mapped to this GSM.
    # NOTE(review): collapse = "" joins multiple SRRs with no separator even
    # though sep = "_" suggests "_" was intended — confirm with the authors
    # before changing the output format.
    srr <- meta_id$srr[meta_id$gsm==gsm]
    srr <- paste( srr, sep = "_", collapse = "" )

    print( gsm )
    # Accessing metadata: the soft-format header, or FALSE when unavailable
    MetaDF <- tryCatch(
      GEO@gsms[[gsm]]@header,
      error = function( e ) print( FALSE ) )

    # sample title (per GSM)
    gsm_title <- tryCatch(
      paste( GEO@gsms[[gsm]]@header$title, collapse = "-"),
      error = function( e ) return( "unknwon" ) )
    # check available baglines
    if(is.logical(MetaDF)){
      print(gsm)
      return( "Unavailable gsm" )
    } else{
      # output filename: GSE-GSM-GPL-PMID.tsv
      geoName <- paste(geoid, gsm, gpl, PMID, sep='-')
      outfile <- file.path( odir, "/" , geoName, ".tsv", fsep = "" )
      # Show outfile
      print(paste("outfile", outfile, sep = ": ", collapse = ""))
      # Avoid append problems
      if ( file.exists( outfile ) ) { file.remove(outfile) }
      # Map configured base baglines to the names present in this GSM
      baglines <- sapply( baglinesB, function(x){ grep( x, names(MetaDF), value=TRUE ) } )
      baglines <- as.vector( unlist( baglines ) )
      # Filter and separate multi-bagline content; resize GSM output
      ResizeDF(srr, geoid, gsm, gpl, PMID, gsm_title, gse_title, gpl_title, MetaDF, baglines, outfile)
      print( paste( "Baglines", length(baglines), sep = ": ", collapse = "") )
    }}
  return(TRUE)
}
# Drive the extraction for one GSE: load its soft file, then write per-GSM
# metafield tables under <odir>/<geoid>/ via AccessMefields.
#   meta_id : data.frame mapping gsm -> srr, forwarded to AccessMefields.
# Returns a human-readable status string.
ExtractMetafields <- function( geoid, subs, ddir, odir, baglinesB, meta_id ){
  print(paste("ID", geoid, sep = ": ", collapse = "" ))
  #ddir <- file.path( ddir, geoid, fsep = "/" )
  # output directory, one folder per GSE
  odir <- file.path( odir, geoid, fsep = "/" )
  if ( !dir.exists( odir ) ) {
    dir.create( odir )
  }
  # load GSE object (FALSE when the soft file is missing or unreadable)
  GEO <- tryCatch( ReadGEO( geoid, ddir ), error=function( e ) print( FALSE ) )
  if(is.logical(GEO)){
    print( "Unreadable GSE softfile")
    return("Error: Unexpected end")
  }
  # get gsms names
  gsmsList <- names( GEO@gsms )
  # FIX: names() returns NULL (not a logical) when there are no GSMs, so
  # the original is.logical() test could never detect that case.
  if( is.null( gsmsList ) || is.logical( gsmsList ) ){
    print( "Unavailable gsms" )
    return("Error: Unexpected end")
  }
  print("successful load")

  report <- tryCatch(
    AccessMefields(subs, GEO, odir, baglinesB, meta_id),
    error=function( e ) return( FALSE ) )

  # FIX: AccessMefields can also return the string "Unavailable gsm";
  # `!report` on a character vector raised "invalid argument type" instead
  # of taking the failure branch.
  if(!isTRUE(report)){
    # Remove unused folder
    unlink(odir, recursive=TRUE)
    return( "extraccion failed..." )
  }else{
    return( "successful extraccion..")
  }
}
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.