Carlos-Francisco Méndez-Cruz

GCs GEO extraction

# -*- coding: UTF-8 -*-
import os
import sys
import argparse
import re
import numpy as np
from datetime import datetime
__author__ = 'KevinML'

# Objective: extract the metadata key and the content of every line that
# contains a <Tag> in each file found under a directory tree.

# Parameters:
# 1) --inputPath   directory that is walked recursively for input files
# 2) --outputPath  directory where exit_file.xml is written

# Output:
# 1) <outputPath>/exit_file.xml: a dated header followed by one tab-separated
#    row (file, metadata key, content) for every tagged line.

# Execution:
# Example 1
# python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
# --outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/

# Example 2
# python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
# --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
# --outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/

# A line is processed only if it contains an opening tag such as <Med>.
regexTag = re.compile(r'<[A-Za-z]+>')
exit_file = r"exit_file.xml"


def parse_metadata(lines):
    """Group the content of tagged ``key = content`` lines by key.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        dict mapping each key to the list of its contents, in input order.
        Lines without a tag, and tagged lines without the " = " separator,
        are skipped.
    """
    metadatos = {}
    for line in lines:
        line = line.strip('\n')
        if not regexTag.search(line):
            continue
        # Split on the first separator only so that content which itself
        # contains " = " is kept whole instead of being truncated.
        key, separator, content = line.partition(" = ")
        if not separator:
            # No "key = content" shape: there is nothing to index.
            continue
        metadatos.setdefault(key, []).append(content)
    return metadatos


def collect_metadata(input_path):
    """Parse every file under *input_path*.

    Args:
        input_path: root directory, walked recursively.

    Returns:
        dict mapping each file's path relative to *input_path* to the
        result of ``parse_metadata`` for that file.
    """
    archivo = {}
    for path, _dirs, files in os.walk(input_path):
        for f in files:
            # Join with the directory os.walk is currently visiting; joining
            # with the root breaks for files located in subdirectories.
            full_path = os.path.join(path, f)
            with open(full_path, mode='r', encoding="utf-8") as iFile:
                # The relative path equals the bare file name for files that
                # sit directly in the root, and keeps same-named files from
                # different subdirectories apart.
                archivo[os.path.relpath(full_path, input_path)] = parse_metadata(iFile)
    return archivo


def format_rows(archivo):
    """Return the report rows, sorted by file and then by metadata key.

    Args:
        archivo: dict as returned by ``collect_metadata``.

    Returns:
        list of ``file<TAB>key<TAB>content<NEWLINE>`` strings.
    """
    rows = []
    for arch in sorted(archivo):
        # Each file is reported with its own metadata.
        for k, v in sorted(archivo[arch].items()):
            for x in v:
                rows.append('{}\t{}\t{}\n'.format(arch, k, x))
    return rows


def write_report(archivo, output_file):
    """Write the dated header and all report rows to *output_file*."""
    with open(output_file, mode="w", encoding="utf-8") as oFile:
        oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
        oFile.writelines(format_rows(archivo))


###########################################################
#                      MAIN PROGRAM                       #
###########################################################

def main():
    """Parse the command line, collect the metadata and write the report."""
    parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
                                     epilog= 'Bien Hecho!')
    parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True,
                        help='Ingrese el archivo de entrada.')
    parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True,
                        help='Ingrese el archivo de salida.')

    args = parser.parse_args()

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("Path to place output files: " + str(args.outputPath))

    archivo = collect_metadata(args.inputPath)
    write_report(archivo, os.path.join(args.outputPath, exit_file))


if __name__ == "__main__":
    main()
......
This diff is collapsed. Click to expand it.
#!/bin/python2.7
# Labels every token row of a tab-separated input file with the tag that is
# currently open and writes the labelled tokens to an output file.
import re

# Tag names recognised in the second column of the input.
TAG_NAMES = (
    'Air', 'Gtype', 'Gversion', 'Med', 'Orgn', 'Phase', 'Sample', 'Serie',
    'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'Name', 'OD', 'Anti',
    'Agit', 'Vess',
)

# A closing tag resets the running label to 'O'.
out_labels = {'</{}>'.format(name): 'O' for name in TAG_NAMES}
# An opening tag sets the running label to the tag name.
in_labels = {'<{}>'.format(name): name for name in TAG_NAMES}

# "Contenido" column of "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt"
inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll'
outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv'

# Running label; it is kept across lines until another tag row changes it.
flag = 'O'
with open(outpath, 'w') as out, open(inpath, 'r') as input_file:
    for line in input_file:
        columns = line.split('\t')
        if len(columns) <= 1:
            # Rows without a second column carry no token.
            continue
        w = columns[1]
        if w in in_labels:
            flag = in_labels[w]
        elif w in out_labels:
            flag = out_labels[w]
        elif w == "PGCGROWTHCONDITIONS":
            # This marker row starts a new output line.
            out.write('\n')
        else:
            # Columns 2-4 joined by '|', then the running label and a space.
            out.write('|'.join(columns[1:4]) + '|' + flag + ' ')
...\ No newline at end of file ...\ No newline at end of file
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.