GCs GEO extraction

Carlos-Francisco Méndez-Cruz
Commit 9fef2047f22db0808c1022316eee572e778ac154 9fef2047 1 parent d7bcdf3b
Showing 12 changed files with 80 additions and 139 deletions
scripts/extract-manually-tagged-gcs.py → data-sets/scripts/extract-manually-tagged-gcs.py
scripts/file_output.py → data-sets/scripts/file_output.py
scripts/gzip-2-soft.py → data-sets/scripts/gzip-2-soft.py
data-sets/scripts/output.txt
scripts/soft-2-xml.py → data-sets/scripts/soft-2-xml.py
training-evaluation-data-sets/.gitkeep
training-evaluation-data-sets/bin/.gitkeep
training-evaluation-data-sets/bin/parsed_sentences_from_labels_v3.py
training-evaluation-data-sets/data-sets-1/.gitkeep
training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv
training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv.conll
training-evaluation-data-sets/data-sets-1/sentences_labeled_v1.tsv
--- a/scripts/extract-manually-tagged-gcs.py → data-sets/scripts/extract-manually-tagged-gcs.py
View file @9fef204
+++ b/scripts/extract-manually-tagged-gcs.py → data-sets/scripts/extract-manually-tagged-gcs.py
View file @9fef204
--- a/scripts/file_output.py → data-sets/scripts/file_output.py
View file @9fef204
+++ b/scripts/file_output.py → data-sets/scripts/file_output.py
View file @9fef204
-# -*- coding: UTF-8 -*-
+# -*- coding: UTF-8 -*-
-import os
+import os
-import sys
+import sys
-import argparse
+import argparse
-import re
+import re
-import numpy as np 
+import numpy as np 
-from datetime import *
+from datetime import *
-__author__ = 'KevinML'
+__author__ = 'KevinML'
-
+
-# Objective: Obtención del metadato y del contenido de todas las líneas con <Tags/> dentro de un archivo.
+# Objective: Obtención del metadato y del contenido de todas las líneas con <Tags/> dentro de un archivo.
-
+
-# Parameters:
+# Parameters:
-#   1) --inputPath input path
+#   1) --inputPath input path
-#   2) --outputPath output path
+#   2) --outputPath output path
-
+
-# Output:
+# Output:
-#   1) 
+#   1) 
-
+
-# Execution:
+# Execution:
-#Example 1
+#Example 1
-#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
+#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
-#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
+#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
-
+
-#Example 2
+#Example 2
-#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
+#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
-#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
+#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ 
-#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
+#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
-
+
-###########################################################
+###########################################################
-#                       MAIN PROGRAM                      #
+#                       MAIN PROGRAM                      #
-###########################################################
+###########################################################
-
+
-parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
+parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
-								epilog= 'Bien Hecho!')
+								epilog= 'Bien Hecho!')
-parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True,
+parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True,
-                    help='Ingrese el archivo de entrada.')
+                    help='Ingrese el archivo de entrada.')
-parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True,
+parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True,
-                    help='Ingrese el archivo de salida.')
+                    help='Ingrese el archivo de salida.')
-
+
-args = parser.parse_args()
+args = parser.parse_args()
-
+
-#if len(args) != 2:
+#if len(args) != 2:
-#	parser.error("Se introdujeron mas o menos de 2 parametros.")
+#	parser.error("Se introdujeron mas o menos de 2 parametros.")
-#	sys.exit(1)
+#	sys.exit(1)
-
+
-# Printing parameter values
+# Printing parameter values
-print('-------------------------------- PARAMETERS --------------------------------')
+print('-------------------------------- PARAMETERS --------------------------------')
-print("Path to read input files: " + str(args.inputPath))
+print("Path to read input files: " + str(args.inputPath))
-print("Path to place output files: " + str(args.outputPath))
+print("Path to place output files: " + str(args.outputPath))
-
+
-#Modificación TEMPORAL
+#Modificación TEMPORAL
-
+
-archivo = {}
+archivo = {}
-regexTag = re.compile(r'<[A-Za-z]+>')
+regexTag = re.compile(r'<[A-Za-z]+>')
-exit_file = r"exit_file.xml"
+exit_file = r"exit_file.xml"
-
+
-with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile:
+with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile:
-  oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
+  oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
-
+
-for path, dirs, files in os.walk(args.inputPath):
+for path, dirs, files in os.walk(args.inputPath):
-    for f in files:
+    for f in files:
-      metadatos = {}
+      metadatos = {}
-      with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile:
+      with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile:
-        for line in iFile:
+        for line in iFile:
-          line = line.strip('\n')
+          line = line.strip('\n')
-          if regexTag.search(line):
+          if regexTag.search(line):
-            renglon = line.split(" = ")
+            renglon = line.split(" = ")
-            if renglon[0] in metadatos:
+            if renglon[0] in metadatos:
-              metadatos[renglon[0]].append(renglon[1])
+              metadatos[renglon[0]].append(renglon[1])
-            else:
+            else:
-              metadatos[renglon[0]] = [renglon[1]]
+              metadatos[renglon[0]] = [renglon[1]]
-
+
-        archivo[f] = metadatos
+        archivo[f] = metadatos
-
+
-      with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile:
+      with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile:
-        #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido')
+        #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido')
-        for arch in sorted(archivo):
+        for arch in sorted(archivo):
-          for k,v in sorted(metadatos.items()):
+          for k,v in sorted(metadatos.items()):
-            for x in v:
+            for x in v:
-              oFile.write('{}\t{}\t{}\n'.format(arch, k, x))
+              oFile.write('{}\t{}\t{}\n'.format(arch, k, x))
--- a/scripts/gzip-2-soft.py → data-sets/scripts/gzip-2-soft.py
View file @9fef204
+++ b/scripts/gzip-2-soft.py → data-sets/scripts/gzip-2-soft.py
View file @9fef204
--- a/data-sets/scripts/output.txt 0 → 100644
View file @9fef204
+++ b/data-sets/scripts/output.txt 0 → 100644
View file @9fef204
--- a/scripts/soft-2-xml.py → data-sets/scripts/soft-2-xml.py
View file @9fef204
+++ b/scripts/soft-2-xml.py → data-sets/scripts/soft-2-xml.py
View file @9fef204
--- a/training-evaluation-data-sets/.gitkeep deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/.gitkeep deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/bin/.gitkeep deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/bin/.gitkeep deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/bin/parsed_sentences_from_labels_v3.py deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/bin/parsed_sentences_from_labels_v3.py deleted 100644 → 0
View file @d7bcdf3
- #!/bin/python2.7
-out_labels = {
- '</Air>': 'O',
- '</Gtype>': 'O',
- '</Gversion>': 'O',
- '</Med>': 'O',
- '</Orgn>': 'O',
- '</Phase>': 'O',
- '</Sample>': 'O',
- '</Serie>': 'O',
- '</Strain>': 'O',
- '</Substrain>': 'O',
- '</Supp>': 'O',
- '</Technique>': 'O',
- '</Temp>': 'O',
- '</Name>': 'O',
- '</OD>': 'O',
- '</Anti>': 'O',
- '</Agit>': 'O',
- '</Vess>': 'O'}
-in_labels = {
- '<Air>': 'Air',
- '<Gtype>': 'Gtype',
- '<Gversion>': 'Gversion',
- '<Med>': 'Med',
- '<Orgn>': 'Orgn',
- '<Phase>': 'Phase',
- '<Sample>': 'Sample',
- '<Serie>': 'Serie',
- '<Strain>': 'Strain',
- '<Substrain>': 'Substrain',
- '<Supp>': 'Supp',
- '<Technique>': 'Technique',
- '<Temp>': 'Temp',
- '<Name>': 'Name',
- '<OD>': 'OD',
- '<Anti>': 'Anti',
- '<Agit>': 'Agit',
- '<Vess>': 'Vess'}
-
-import re
-#columna Contenido de "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt"
-inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll'
-outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv'
-flag = 'O'
-with open(outpath, 'w') as out:
-	with open(inpath, 'r') as input_file:
-		for line in input_file:
-			if len(line.split('\t')) > 1:
-				w = line.split('\t')[1]
-				if w in in_labels or w in out_labels:
-					if w in in_labels.keys(): flag = in_labels[w]
-					if w in out_labels: flag = out_labels[w]
-
-				else:
-					if w == "PGCGROWTHCONDITIONS":	out.write('\n')
-					else: 
-						out.write('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
-						#print('\t'.join(line.split('\t')[1:4])+'\t'+flag)
\ No newline at end of file
--- a/training-evaluation-data-sets/data-sets-1/.gitkeep deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/.gitkeep deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv.conll deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/content_colum_data_set.tsv.conll deleted 100644 → 0
View file @d7bcdf3
--- a/training-evaluation-data-sets/data-sets-1/sentences_labeled_v1.tsv deleted 100644 → 0
View file @d7bcdf3
+++ b/training-evaluation-data-sets/data-sets-1/sentences_labeled_v1.tsv deleted 100644 → 0
View file @d7bcdf3