GCs GEO extraction

Carlos-Francisco Méndez-Cruz
Commit 9fef2047f22db0808c1022316eee572e778ac154 9fef2047 1 parent d7bcdf3b
Showing 12 changed files with 80 additions and 139 deletions
scripts/extract-manually-tagged-gcs.py → data-sets/scripts/extract-manually-tagged-gcs.py
scripts/file_output.py → data-sets/scripts/file_output.py
scripts/gzip-2-soft.py → data-sets/scripts/gzip-2-soft.py
data-sets/scripts/output.txt
scripts/soft-2-xml.py → data-sets/scripts/soft-2-xml.py
training-evaluation-data-sets/.gitkeep
training-evaluation-data-sets/bin/.gitkeep
training-evaluation-data-sets/bin/parsed_sentences_from_labels_v3.py
training-evaluation-data-sets/data-sets-1/.gitkeep
training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv
training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv.conll
training-evaluation-data-sets/data-sets-1/sentences_labeled_v1.tsv
--- a/scripts/extract-manually-tagged-gcs.py → data-sets/scripts/extract-manually-tagged-gcs.py
View file @9fef204
+++ b/scripts/extract-manually-tagged-gcs.py → data-sets/scripts/extract-manually-tagged-gcs.py
View file @9fef204
--- a/scripts/file_output.py → data-sets/scripts/file_output.py
View file @9fef204
+++ b/scripts/file_output.py → data-sets/scripts/file_output.py
View file @9fef204
- # -*- coding: UTF-8 -*-
- import os
- import sys
- import argparse
- import re
- import numpy as np 
- from datetime import *
- __author__ = 'KevinML'
- 
- # Objective: collect the metadata name and the content of every line that
- # contains an opening tag such as <Med> in the files under the input path,
- # and write them to a single tab-separated report.
- 
- # Parameters:
- #   1) --inputPath input path (directory walked with os.walk)
- #   2) --outputPath output path (directory where exit_file.xml is written)
- 
- # Output:
- #   1) <outputPath>/exit_file.xml: a commented header with the run date and
- #      the column names, followed by rows "file<TAB>metadata<TAB>content".
- 
- # Execution:
- #Example 1
- #python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
- #--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
- 
- #Example 2
- #python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
- #--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
- #--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
- 
- ###########################################################
- #                       MAIN PROGRAM                      #
- ###########################################################
- 
- # Both options are required, so argparse itself rejects a call that omits one.
- parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
- 								epilog= 'Bien Hecho!')
- parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True,
-                     help='Ingrese el archivo de entrada.')
- parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True,
-                     help='Ingrese el archivo de salida.')
- 
- args = parser.parse_args()
- 
- # Disabled argument-count check, kept as found (args is a Namespace, so
- # len(args) would not work as written).
- #if len(args) != 2:
- #	parser.error("Se introdujeron mas o menos de 2 parametros.")
- #	sys.exit(1)
- 
- # Printing parameter values
- print('-------------------------------- PARAMETERS --------------------------------')
- print("Path to read input files: " + str(args.inputPath))
- print("Path to place output files: " + str(args.outputPath))
- 
- # TEMPORARY MODIFICATION
- 
- # archivo maps each input file name to its {metadata name: [content, ...]} dict.
- archivo = {}
- # Matches an opening tag made only of letters, e.g. <Med>; closing tags
- # (</Med>) and self-closing tags do not match this pattern.
- regexTag = re.compile(r'<[A-Za-z]+>')
- exit_file = r"exit_file.xml"
- 
- # Create (or truncate) the report and write its commented header: run date
- # and column names.
- with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile:
-   oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
- 
- for path, dirs, files in os.walk(args.inputPath):
-     for f in files:
-       metadatos = {}
-       # NOTE(review): the file is opened relative to args.inputPath, not to the
-       # `path` yielded by os.walk, so a file located in a subdirectory will not
-       # be found.
-       with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile:
-         for line in iFile:
-           line = line.strip('\n')
-           if regexTag.search(line):
-             # A tagged line is expected to look like "name = content".
-             # NOTE(review): renglon[1] raises IndexError when " = " is absent,
-             # and anything after a second " = " is dropped.
-             renglon = line.split(" = ")
-             if renglon[0] in metadatos:
-               metadatos[renglon[0]].append(renglon[1])
-             else:
-               metadatos[renglon[0]] = [renglon[1]]
- 
-         archivo[f] = metadatos
- 
-       # The report is reopened in append mode once per input file.
-       with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile:
-         #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido')
-         # NOTE(review): this loops over every file name collected so far but
-         # always prints the current file's `metadatos`, so each file's rows are
-         # repeated under the names of all previously processed files.
-         for arch in sorted(archivo):
-           for k,v in sorted(metadatos.items()):
-             for x in v:
-               oFile.write('{}\t{}\t{}\n'.format(arch, k, x))
+ # -*- coding: UTF-8 -*-
+ import os
+ import sys
+ import argparse
+ import re
+ import numpy as np 
+ from datetime import *
+ __author__ = 'KevinML'
+ 
+ # Objective: collect the metadata name and the content of every line that
+ # contains an opening tag such as <Med> in the files under the input path,
+ # and write them to a single tab-separated report.
+ 
+ # Parameters:
+ #   1) --inputPath   directory that is walked recursively for input files
+ #   2) --outputPath  existing directory where exit_file.xml is written
+ 
+ # Output:
+ #   1) <outputPath>/exit_file.xml: a commented header with the run date and
+ #      the column names, followed by one row per tagged line:
+ #      "file<TAB>metadata<TAB>content".
+ 
+ # Execution:
+ #Example 1
+ #python3 file_output.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
+ #--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
+ 
+ #Example 2
+ #python3 /home/kevinml/automatic-extraction-growth-conditions/data-sets/scripts/file_output.py
+ #--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
+ #--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
+ 
+ # Matches an opening tag made only of letters, e.g. <Med>; closing tags
+ # (</Med>) and self-closing tags do not match this pattern.
+ regexTag = re.compile(r'<[A-Za-z]+>')
+ exit_file = r"exit_file.xml"
+ 
+ 
+ def _read_tagged_metadata(file_path):
+     """Return {metadata name: [content, ...]} for the tagged lines of one file.
+ 
+     A tagged line is expected to look like "name = content". Lines without
+     an opening tag, or without the " = " separator, are ignored.
+     """
+     metadatos = {}
+     with open(file_path, mode='r', encoding="utf-8") as iFile:
+         for line in iFile:
+             line = line.strip('\n')
+             if not regexTag.search(line):
+                 continue
+             # Split on the first " = " only, so content that itself contains
+             # " = " is kept whole instead of being truncated.
+             name, separator, content = line.partition(" = ")
+             if not separator:
+                 # No "name = content" layout: there is nothing to report, and
+                 # indexing the missing content would raise IndexError.
+                 continue
+             metadatos.setdefault(name, []).append(content)
+     return metadatos
+ 
+ 
+ ###########################################################
+ #                       MAIN PROGRAM                      #
+ ###########################################################
+ 
+ # Both options are required, so argparse itself rejects a call that omits one.
+ parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
+                                  epilog='Bien Hecho!')
+ parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required=True,
+                     help='Ingrese el archivo de entrada.')
+ parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required=True,
+                     help='Ingrese el archivo de salida.')
+ 
+ args = parser.parse_args()
+ 
+ # Printing parameter values
+ print('-------------------------------- PARAMETERS --------------------------------')
+ print("Path to read input files: " + str(args.inputPath))
+ print("Path to place output files: " + str(args.outputPath))
+ 
+ # archivo maps each input file name to its {metadata name: [content, ...]} dict.
+ archivo = {}
+ 
+ # The report is opened once and written as UTF-8, matching the encoding used
+ # to read the input files, so non-ASCII content does not depend on the locale.
+ with open(os.path.join(args.outputPath, exit_file), mode="w", encoding="utf-8") as oFile:
+     oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
+ 
+     for path, dirs, files in os.walk(args.inputPath):
+         # Sorting dirs in place fixes the traversal order of the top-down
+         # walk, which makes the report reproducible between runs.
+         dirs.sort()
+         for f in sorted(files):
+             # Join with `path` (the directory being visited), not with the
+             # root input path, so files in subdirectories are found.
+             metadatos = _read_tagged_metadata(os.path.join(path, f))
+             archivo[f] = metadatos
+ 
+             # Write this file's rows exactly once, labelled with its own name.
+             for k, v in sorted(metadatos.items()):
+                 for x in v:
+                     oFile.write('{}\t{}\t{}\n'.format(f, k, x))
--- a/scripts/gzip-2-soft.py → data-sets/scripts/gzip-2-soft.py
View file @9fef204
+++ b/scripts/gzip-2-soft.py → data-sets/scripts/gzip-2-soft.py
View file @9fef204
--- a/data-sets/scripts/output.txt 0 → 100644
View file @9fef204
+++ b/data-sets/scripts/output.txt 0 → 100644
View file @9fef204
--- a/scripts/soft-2-xml.py → data-sets/scripts/soft-2-xml.py
View file @9fef204
+++ b/scripts/soft-2-xml.py → data-sets/scripts/soft-2-xml.py
View file @9fef204
--- a/training-evaluation-data-sets/.gitkeep deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/.gitkeep deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/bin/.gitkeep deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/bin/.gitkeep deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/bin/parsed_sentences_from_labels_v3.py deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/bin/parsed_sentences_from_labels_v3.py deleted 100644 → 0
View file @d7bcdf3
-  #!/bin/python2.7
- # Purpose: read a tab-separated token file (inpath) and write each token to
- # outpath together with the label of the tag it is currently inside, as
- # "field|field|field|label " items.
- #
- # Closing tags: reaching one of these resets the current label to 'O'.
- out_labels = {
-  '</Air>': 'O',
-  '</Gtype>': 'O',
-  '</Gversion>': 'O',
-  '</Med>': 'O',
-  '</Orgn>': 'O',
-  '</Phase>': 'O',
-  '</Sample>': 'O',
-  '</Serie>': 'O',
-  '</Strain>': 'O',
-  '</Substrain>': 'O',
-  '</Supp>': 'O',
-  '</Technique>': 'O',
-  '</Temp>': 'O',
-  '</Name>': 'O',
-  '</OD>': 'O',
-  '</Anti>': 'O',
-  '</Agit>': 'O',
-  '</Vess>': 'O'}
- # Opening tags: reaching one of these sets the current label to the tag name.
- in_labels = {
-  '<Air>': 'Air',
-  '<Gtype>': 'Gtype',
-  '<Gversion>': 'Gversion',
-  '<Med>': 'Med',
-  '<Orgn>': 'Orgn',
-  '<Phase>': 'Phase',
-  '<Sample>': 'Sample',
-  '<Serie>': 'Serie',
-  '<Strain>': 'Strain',
-  '<Substrain>': 'Substrain',
-  '<Supp>': 'Supp',
-  '<Technique>': 'Technique',
-  '<Temp>': 'Temp',
-  '<Name>': 'Name',
-  '<OD>': 'OD',
-  '<Anti>': 'Anti',
-  '<Agit>': 'Agit',
-  '<Vess>': 'Vess'}
- 
- # NOTE(review): `re` is imported but not used in the code visible here.
- import re
- # "Contenido" column of "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt"
- # Input and output locations are hard-coded absolute paths.
- inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll'
- outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv'
- # Current label; 'O' means "outside any tag". It persists across lines.
- flag = 'O'
- with open(outpath, 'w') as out:
- 	with open(inpath, 'r') as input_file:
- 		for line in input_file:
- 			# Lines with fewer than two tab-separated fields are skipped.
- 			if len(line.split('\t')) > 1:
- 				# The second field is compared against the tag tables; presumably
- 				# it is the token text of a CoNLL-style row - TODO confirm.
- 				w = line.split('\t')[1]
- 				if w in in_labels or w in out_labels:
- 					# Tag rows only update the current label; they are not written.
- 					if w in in_labels.keys(): flag = in_labels[w]
- 					if w in out_labels: flag = out_labels[w]
- 
- 				else:
- 					# This marker token ends the current output line; presumably
- 					# it delimits sentences/records in the input - TODO confirm.
- 					if w == "PGCGROWTHCONDITIONS":	out.write('\n')
- 					else: 
- 						# Fields 2-4 joined with '|', then the current label and a
- 						# trailing space. NOTE(review): if the row has four fields
- 						# or fewer, the last one still carries the line's '\n'.
- 						out.write('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
- 						#print('\t'.join(line.split('\t')[1:4])+'\t'+flag)
\ No newline at end of file
--- a/training-evaluation-data-sets/data-sets-1/.gitkeep deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/.gitkeep deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv.conll deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv.conll deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/data-sets-1/sentences_labeled_v1.tsv deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/sentences_labeled_v1.tsv deleted 100644 → 0
View file @d7bcdf3