organizacion nueva

Estefani Gaytan Nunez
Commit 13412450bd4d13e34c9e3aaeecefe759c7a79b42 13412450 1 parent b4a0ecbb
Showing 53 changed files with 0 additions and 364 deletions
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/label-split_training_test_v1.py → CRF/bin/label-split_training_test_v1.py
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/params.py → CRF/bin/params.py
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/training_validation_v3.py → CRF/bin/training_validation_v3.py
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/check/sentences-405-order-rep.txt → CRF/check/sentences-405-order-rep.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets/test-data-set-30.txt → CRF/data-sets/test-data-set-30.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets/training-data-set-70.txt → CRF/data-sets/training-data-set-70.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod → CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/get-raw-sentences.sh → CoreNLP/bin/get-raw-sentences.sh
GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/single_run.sh → CoreNLP/bin/single_run.sh
GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt → CoreNLP/input/raw-metadata-senteneces.txt
GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/raw-metadata-senteneces.txt.conll → CoreNLP/output/raw-metadata-senteneces.txt.conll
data-sets/file_output/exit_file.txt
data-sets/file_output/exit_file.xml
data-sets/gzip-data/borrame.txt
report-manually-tagged-gcs/GSE11230_family.report.csv → data-sets/report-manually-tagged-gcs/GSE11230_family.report.csv
report-manually-tagged-gcs/GSE19053_family.report.csv → data-sets/report-manually-tagged-gcs/GSE19053_family.report.csv
report-manually-tagged-gcs/GSE26054_family.report.csv → data-sets/report-manually-tagged-gcs/GSE26054_family.report.csv
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/label-split_training_test_v1.py → CRF/bin/label-split_training_test_v1.py
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/label-split_training_test_v1.py → CRF/bin/label-split_training_test_v1.py
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/params.py → CRF/bin/params.py
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/params.py → CRF/bin/params.py
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/training_validation_v3.py → CRF/bin/training_validation_v3.py
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/bin/training_validation_v3.py → CRF/bin/training_validation_v3.py
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/check/sentences-405-order-rep.txt → CRF/check/sentences-405-order-rep.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/check/sentences-405-order-rep.txt → CRF/check/sentences-405-order-rep.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets/test-data-set-30.txt → CRF/data-sets/test-data-set-30.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets/test-data-set-30.txt → CRF/data-sets/test-data-set-30.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets/training-data-set-70.txt → CRF/data-sets/training-data-set-70.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets/training-data-set-70.txt → CRF/data-sets/training-data-set-70.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod → CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod → CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt → CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/get-raw-sentences.sh → CoreNLP/bin/get-raw-sentences.sh
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/get-raw-sentences.sh → CoreNLP/bin/get-raw-sentences.sh
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/single_run.sh → CoreNLP/bin/single_run.sh
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/single_run.sh → CoreNLP/bin/single_run.sh
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt → CoreNLP/input/raw-metadata-senteneces.txt
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt → CoreNLP/input/raw-metadata-senteneces.txt
View file @1341245
--- a/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/raw-metadata-senteneces.txt.conll → CoreNLP/output/raw-metadata-senteneces.txt.conll
View file @1341245
+++ b/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/raw-metadata-senteneces.txt.conll → CoreNLP/output/raw-metadata-senteneces.txt.conll
View file @1341245
--- a/data-sets/file_output/exit_file.txt deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/file_output/exit_file.txt deleted 100644 → 0
View file @b4a0ecb
--- a/data-sets/file_output/exit_file.xml deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/file_output/exit_file.xml deleted 100644 → 0
View file @b4a0ecb
--- a/data-sets/gzip-data/borrame.txt deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/gzip-data/borrame.txt deleted 100644 → 0
View file @b4a0ecb
--- a/report-manually-tagged-gcs/GSE11230_family.report.csv → data-sets/report-manually-tagged-gcs/GSE11230_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE11230_family.report.csv → data-sets/report-manually-tagged-gcs/GSE11230_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE19053_family.report.csv → data-sets/report-manually-tagged-gcs/GSE19053_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE19053_family.report.csv → data-sets/report-manually-tagged-gcs/GSE19053_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE26054_family.report.csv → data-sets/report-manually-tagged-gcs/GSE26054_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE26054_family.report.csv → data-sets/report-manually-tagged-gcs/GSE26054_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE26589_family.report.csv → data-sets/report-manually-tagged-gcs/GSE26589_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE26589_family.report.csv → data-sets/report-manually-tagged-gcs/GSE26589_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE41186_family.report.csv → data-sets/report-manually-tagged-gcs/GSE41186_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE41186_family.report.csv → data-sets/report-manually-tagged-gcs/GSE41186_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE41190_family.report.csv → data-sets/report-manually-tagged-gcs/GSE41190_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE41190_family.report.csv → data-sets/report-manually-tagged-gcs/GSE41190_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE41195_family.report.csv → data-sets/report-manually-tagged-gcs/GSE41195_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE41195_family.report.csv → data-sets/report-manually-tagged-gcs/GSE41195_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE4321_family.report.csv → data-sets/report-manually-tagged-gcs/GSE4321_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE4321_family.report.csv → data-sets/report-manually-tagged-gcs/GSE4321_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE54899_family.report.csv → data-sets/report-manually-tagged-gcs/GSE54899_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE54899_family.report.csv → data-sets/report-manually-tagged-gcs/GSE54899_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE54901_family.report.csv → data-sets/report-manually-tagged-gcs/GSE54901_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE54901_family.report.csv → data-sets/report-manually-tagged-gcs/GSE54901_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE55199_family.report.csv → data-sets/report-manually-tagged-gcs/GSE55199_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE55199_family.report.csv → data-sets/report-manually-tagged-gcs/GSE55199_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE55365_family.report.csv → data-sets/report-manually-tagged-gcs/GSE55365_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE55365_family.report.csv → data-sets/report-manually-tagged-gcs/GSE55365_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE55366_family.report.csv → data-sets/report-manually-tagged-gcs/GSE55366_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE55366_family.report.csv → data-sets/report-manually-tagged-gcs/GSE55366_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE60546_family.report.csv → data-sets/report-manually-tagged-gcs/GSE60546_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE60546_family.report.csv → data-sets/report-manually-tagged-gcs/GSE60546_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE65641_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65641_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE65641_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65641_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE65642_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65642_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE65642_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65642_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE65710_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65710_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE65710_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65710_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE65711_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65711_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE65711_family.report.csv → data-sets/report-manually-tagged-gcs/GSE65711_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE66441_family.report.csv → data-sets/report-manually-tagged-gcs/GSE66441_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE66441_family.report.csv → data-sets/report-manually-tagged-gcs/GSE66441_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE66481_family.report.csv → data-sets/report-manually-tagged-gcs/GSE66481_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE66481_family.report.csv → data-sets/report-manually-tagged-gcs/GSE66481_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE74930_family.report.csv → data-sets/report-manually-tagged-gcs/GSE74930_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE74930_family.report.csv → data-sets/report-manually-tagged-gcs/GSE74930_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE74931_family.report.csv → data-sets/report-manually-tagged-gcs/GSE74931_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE74931_family.report.csv → data-sets/report-manually-tagged-gcs/GSE74931_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE74932_family.report.csv → data-sets/report-manually-tagged-gcs/GSE74932_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE74932_family.report.csv → data-sets/report-manually-tagged-gcs/GSE74932_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE88979_family.report.csv → data-sets/report-manually-tagged-gcs/GSE88979_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE88979_family.report.csv → data-sets/report-manually-tagged-gcs/GSE88979_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE88980_family.report.csv → data-sets/report-manually-tagged-gcs/GSE88980_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE88980_family.report.csv → data-sets/report-manually-tagged-gcs/GSE88980_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE93506_family.report.csv → data-sets/report-manually-tagged-gcs/GSE93506_family.report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE93506_family.report.csv → data-sets/report-manually-tagged-gcs/GSE93506_family.report.csv
View file @1341245
--- a/report-manually-tagged-gcs/GSE_family.complete-report.csv → data-sets/report-manually-tagged-gcs/GSE_family.complete-report.csv
View file @1341245
+++ b/report-manually-tagged-gcs/GSE_family.complete-report.csv → data-sets/report-manually-tagged-gcs/GSE_family.complete-report.csv
View file @1341245
--- a/report-manually-tagged-gcs/tagged-series-samples.txt → data-sets/report-manually-tagged-gcs/tagged-series-samples.txt
View file @1341245
+++ b/report-manually-tagged-gcs/tagged-series-samples.txt → data-sets/report-manually-tagged-gcs/tagged-series-samples.txt
View file @1341245
--- a/data-sets/scripts/extract-manually-tagged-gcs.py deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/scripts/extract-manually-tagged-gcs.py deleted 100644 → 0
View file @b4a0ecb
- # -*- coding: UTF-8 -*-
- 
- from optparse import OptionParser
- import os
- import sys
- import re
- 
- __author__ = 'CMendezC'
- 
- # Objective: extract manually tagged growth conditions.
- 
- # Parameters:
- #   1) --inputPath input path
- #   2) --outputPath output path
- 
- # Output:
- #   1) Tab separated file
- 
- # Execution:
- # python extract-manually-tagged-gcs.py
- # --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
- # --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
- # c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
- 
- # python extract-manually-tagged-gcs.py
- # --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs"
- # --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
- # python extract-manually-tagged-gcs.py --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
- 
- ###########################################################
- #                       MAIN PROGRAM                      #
- ###########################################################
- 
- if __name__ == "__main__":
-     # Entry point: walk --inputPath looking for manually tagged "*_family.xml"
-     # files, collect the tagged growth-condition values of every serie/sample,
-     # and write one CSV report per input file plus a combined report
-     # ("GSE_family.complete-report.csv") into --outputPath.
- 
-     # Parameter definition
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path to read input files", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Path to place output files", metavar="PATH")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         # parser.error() prints the message and terminates the program itself.
-         parser.error("None parameter entered.")
-     if not options.inputPath or not options.outputPath:
-         # Without this check os.walk()/os.path.join() fail later with a TypeError.
-         parser.error("Both --inputPath and --outputPath are required.")
- 
-     # Printing parameter values
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path to read input files: " + str(options.inputPath))
-     print("Path to place output files: " + str(options.outputPath))
- 
-     def csvRow(fields):
-         # Build one CSV line with every field quoted. Embedded double quotes
-         # (e.g. decoded from &quot;) are doubled so the row stays well formed.
-         return ','.join('"' + field.replace('"', '""') + '"' for field in fields)
- 
-     def unescapeXml(text):
-         # Decode the five predefined XML entities. "&amp;" is decoded last;
-         # otherwise "&amp;lt;" (a literal "&lt;" in the original text) would be
-         # decoded twice and end up as "<".
-         for entity, char in (("&lt;", "<"), ("&gt;", ">"), ("&quot;", "\""),
-                              ("&apos;", "\'"), ("&amp;", "&")):
-             text = text.replace(entity, char)
-         return text
- 
-     # hashGcs[serie][sample][tag] -> list of distinct contents, in reading order
-     hashGcs = {}
-     regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
-     regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
-     regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')
-     # Tags from esquema-gcs.xsd at 11/09/2018
-     tags = ["Name", "Anti", "Orgn", "Strain", "Substrain", "Gtype", "Gversion", "Med", "Technique", "Supp", "Air", "Temp", "pH", "Press", "OD", "Phase", "Rate", "Vess", "Agit", ]
-     header = csvRow(["Serie", "Sample"] + tags)
-     processed_files = 0
-     saved_files = 0
-     complete_report = []
-     # Walk directory to read files
-     for path, dirs, files in os.walk(options.inputPath):
-         for f in files:
-             if not f.endswith("_family.xml"):
-                 continue
-             print("Processing...{} {}".format(path, f))
-             # No serie/sample is open until the file declares one.
-             serie = None
-             sample = None
-             # Join with the directory yielded by os.walk(), not with the root,
-             # so that files found in subdirectories can be opened.
-             with open(os.path.join(path, f), "r", errors='replace') as iFile:
-                 for line in iFile:
-                     line = line.strip('\n')
-                     result = regexSerie.match(line)
-                     if result:
-                         serie = result.group('serie')
-                         # A new serie closes the previous sample.
-                         sample = None
-                         if serie in hashGcs:
-                             print("WARNING! duplicate serie")
-                         else:
-                             hashGcs[serie] = {}
-                         continue
-                     result = regexSample.match(line)
-                     if result:
-                         if serie is None:
-                             print("WARNING! sample found before any serie")
-                             continue
-                         sample = result.group('sample')
-                         if sample in hashGcs[serie]:
-                             print("WARNING! duplicate sample")
-                         else:
-                             hashGcs[serie][sample] = {}
-                         continue
-                     for m in regexTagContent.finditer(line):
-                         if serie is None or sample is None:
-                             # Tagged text outside of a sample has no place in
-                             # the report; skip the line instead of crashing.
-                             print("WARNING! tagged content outside of a sample")
-                             break
-                         tag = m.group('tag')
-                         content = unescapeXml(m.group('content').strip())
-                         contents = hashGcs[serie][sample].setdefault(tag, [])
-                         if content not in contents:
-                             contents.append(content)
-             processed_files += 1
-             with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w") as oFile:
-                 oFile.write(header + "\n")
-                 complete_report.append(header)
-                 # NOTE(review): hashGcs is never cleared between input files, so
-                 # this report (and the complete report) also repeats the series
-                 # read from previously processed files - confirm this is intended.
-                 for reportSerie, hashSample in hashGcs.items():
-                     print("Serie: {}".format(reportSerie))
-                     for reportSample, hashTag in sorted(hashSample.items()):
-                         print("\tSample: {}".format(reportSample))
-                         # One column per known tag; missing tags stay empty and
-                         # multiple contents are joined with ", ".
-                         pTags = [', '.join(hashTag.get(tag, [])) for tag in tags]
-                         output = csvRow([reportSerie, reportSample] + pTags)
-                         oFile.write(output + "\n")
-                         complete_report.append(output)
-             saved_files += 1
- 
-     with open(os.path.join(options.outputPath, "GSE_family.complete-report.csv"), "w") as oFile:
-         for line in complete_report:
-             oFile.write(line + "\n")
- 
-     print("Processed files: {}".format(processed_files))
-     print("Saved files: {}".format(saved_files))
- 
- 
--- a/data-sets/scripts/file_output.py deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/scripts/file_output.py deleted 100644 → 0
View file @b4a0ecb
- # -*- coding: UTF-8 -*-
- import os
- import sys
- import argparse
- import re
- import numpy as np 
- from datetime import *
- __author__ = 'KevinML'
- 
- # Objective: extract the metadata key and the content of every line containing <Tags/> in a file.
- 
- # Parameters:
- #   1) --inputPath input path
- #   2) --outputPath output path
- 
- # Output:
- #   1) Tab-separated file (exit_file.xml) with file name, metadata key and content
- 
- # Execution:
- #Example 1
- #python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
- #--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
- 
- #Example 2
- #python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
- #--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
- #--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
- 
- ###########################################################
- #                       MAIN PROGRAM                      #
- ###########################################################
- 
- # Command-line interface: both paths are mandatory.
- parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
-                                  epilog='Bien Hecho!')
- parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required=True,
-                     help='Ingrese el archivo de entrada.')
- parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required=True,
-                     help='Ingrese el archivo de salida.')
- 
- args = parser.parse_args()
- 
- # Printing parameter values
- print('-------------------------------- PARAMETERS --------------------------------')
- print("Path to read input files: " + str(args.inputPath))
- print("Path to place output files: " + str(args.outputPath))
- 
- # archivo[file name] -> {metadata key: [contents]} for every file read
- archivo = {}
- # A line is of interest when it contains at least one opening <Tag>.
- regexTag = re.compile(r'<[A-Za-z]+>')
- exit_file = r"exit_file.xml"
- 
- # The report is opened once and written as UTF-8, the same encoding used to
- # read the input, so non-ASCII content does not depend on the platform locale.
- with open(os.path.join(args.outputPath, exit_file), mode="w", encoding="utf-8") as oFile:
-     oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
- 
-     for path, dirs, files in os.walk(args.inputPath):
-         for f in files:
-             metadatos = {}
-             # Join with the directory yielded by os.walk(), not with the root,
-             # so that files found in subdirectories can be opened.
-             with open(os.path.join(path, f), mode='r', encoding="utf-8") as iFile:
-                 for line in iFile:
-                     line = line.strip('\n')
-                     if not regexTag.search(line):
-                         continue
-                     # Split on the first " = " only: the content may itself
-                     # contain " = ", and lines without it carry no key/value.
-                     clave, separador, valor = line.partition(" = ")
-                     if not separador:
-                         continue
-                     metadatos.setdefault(clave, []).append(valor)
- 
-             archivo[f] = metadatos
- 
-             # Write only the rows of the file just read, under its own name.
-             for k, v in sorted(metadatos.items()):
-                 for x in v:
-                     oFile.write('{}\t{}\t{}\n'.format(f, k, x))
--- a/data-sets/scripts/gzip-2-soft.py deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/scripts/gzip-2-soft.py deleted 100644 → 0
View file @b4a0ecb
- # -*- coding: UTF-8 -*-
- 
- from optparse import OptionParser
- import os
- import sys
- import gzip
- import shutil
- 
- __author__ = 'CMendezC'
- 
- # Objective: uncompress gzip soft file to text soft file
- 
- # Parameters:
- #   1) --inputPath input path
- #   2) --outputPath output path
- 
- # Output:
- #   1) Text soft file
- 
- # Execution:
- # python gzip-2-soft.py
- # --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data
- # --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
- # python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
- 
- ###########################################################
- #                       MAIN PROGRAM                      #
- ###########################################################
- 
- if __name__ == "__main__":
-     # Entry point: uncompress every "*.gz" file found under --inputPath into a
-     # text file placed in --outputPath.
- 
-     # Local import: only needed to recognise corrupt-archive errors below.
-     import zlib
- 
-     # Parameter definition
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path to read input files", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Path to place output files", metavar="PATH")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         # parser.error() prints the message and terminates the program itself.
-         parser.error("None parameter entered.")
-     if not options.inputPath or not options.outputPath:
-         # Without this check os.walk()/os.path.join() fail later with a TypeError.
-         parser.error("Both --inputPath and --outputPath are required.")
- 
-     # Printing parameter values
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path to read input files: " + str(options.inputPath))
-     print("Path to place output files: " + str(options.outputPath))
- 
-     processedFiles = 0
-     failedFiles = 0
-     # Walk directory to read files
-     for path, dirs, files in os.walk(options.inputPath):
-         for f in files:
-             if not f.endswith(".gz"):
-                 continue
-             print("Processing...{}/{}".format(path, f))
-             # "x.soft.gz" -> "x.txt"; any other "y.gz" -> "y.txt", so the
-             # uncompressed output never keeps the ".gz" name of its input.
-             if f.endswith('.soft.gz'):
-                 outName = f[:-len('.soft.gz')] + '.txt'
-             else:
-                 outName = f[:-len('.gz')] + '.txt'
-             try:
-                 # Join with the directory yielded by os.walk(), not with the
-                 # root, so that files found in subdirectories can be opened.
-                 with gzip.open(os.path.join(path, f), 'rb') as f_in:
-                     with open(os.path.join(options.outputPath, outName), 'wb') as f_out:
-                         shutil.copyfileobj(f_in, f_out)
-             except (OSError, EOFError, zlib.error) as err:
-                 # Keep going with the remaining files, but report the failure
-                 # instead of hiding it; the output file may be incomplete.
-                 print("WARNING! could not uncompress {}: {}".format(f, err))
-                 failedFiles += 1
-             else:
-                 processedFiles += 1
-     print("Processed files: {}".format(processedFiles))
-     print("Failed files: {}".format(failedFiles))
--- a/data-sets/scripts/output.txt deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/scripts/output.txt deleted 100644 → 0
View file @b4a0ecb
--- a/data-sets/scripts/soft-2-xml.py deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/scripts/soft-2-xml.py deleted 100644 → 0
View file @b4a0ecb
- # -*- coding: UTF-8 -*-
- 
- from optparse import OptionParser
- import os
- import sys
- 
- __author__ = 'CMendezC'
- 
- # Objective: convert soft file to XML file:
- #   include headings, tags, substitute & and <
- 
- # Parameters:
- #   1) --inputPath input path
- #   2) --outputPath output path
- 
- # Output:
- #   1) XML File with soft file content
- 
- # Execution:
- # python soft-2-xml.py
- # --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional
- # --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
- # python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
- # Additional files
- # python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
- 
- ###########################################################
- #                       MAIN PROGRAM                      #
- ###########################################################
- 
- if __name__ == "__main__":
-     # Entry point: convert every "*_family.soft" file found under --inputPath
-     # into an XML file in --outputPath, wrapping the text in a <gse> root
-     # element and escaping "&" and "<".
- 
-     # Parameter definition
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path to read input files", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Path to place output files", metavar="PATH")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         # parser.error() prints the message and terminates the program itself.
-         parser.error("None parameter entered.")
-     if not options.inputPath or not options.outputPath:
-         # Without this check os.walk()/os.path.join() fail later with a TypeError.
-         parser.error("Both --inputPath and --outputPath are required.")
- 
-     # Printing parameter values
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path to read input files: " + str(options.inputPath))
-     print("Path to place output files: " + str(options.outputPath))
- 
-     # Walk directory to read files
-     processedFiles = 0
-     for path, dirs, files in os.walk(options.inputPath):
-         for f in files:
-             if not f.endswith("_family.soft"):
-                 continue
-             print("Processing...{}/{}".format(path, f))
-             # Swap only the trailing ".soft" extension for ".xml".
-             xmlName = f[:-len(".soft")] + ".xml"
-             # Join with the directory yielded by os.walk(), not with the root,
-             # so that files found in subdirectories can be opened.
-             with open(os.path.join(path, f), "r", encoding="utf-8", errors="replace") as iFile:
-                 with open(os.path.join(options.outputPath, xmlName), "w",
-                           encoding="utf-8") as oFile:
-                     oFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n")
-                     for line in iFile:
-                         # "&" must be escaped first so the "&" introduced by
-                         # "&lt;" is not escaped again. ">" and the quote
-                         # characters are written unescaped.
-                         line = line.replace("&", "&amp;")
-                         line = line.replace("<", "&lt;")
-                         oFile.write(line)
-                     oFile.write("\n</gse>\n")
-             processedFiles += 1
-     print("Processed files: {}".format(processedFiles))
--- a/data-sets/soft-data-additional/borrame.txt deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/soft-data-additional/borrame.txt deleted 100644 → 0
View file @b4a0ecb
--- a/data-sets/soft-data/borrame.txt.txt deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/soft-data/borrame.txt.txt deleted 100644 → 0
View file @b4a0ecb
--- a/data-sets/xml-data/borrame.txt deleted 100644 → 0
View file @b4a0ecb
+++ b/data-sets/xml-data/borrame.txt deleted 100644 → 0
View file @b4a0ecb