Showing
12 changed files
with
80 additions
and
139 deletions
File moved
1 | -# -*- coding: UTF-8 -*- | 1 | +# -*- coding: UTF-8 -*- |
2 | -import os | 2 | +import os |
3 | -import sys | 3 | +import sys |
4 | -import argparse | 4 | +import argparse |
5 | -import re | 5 | +import re |
6 | -import numpy as np | 6 | +import numpy as np |
7 | -from datetime import * | 7 | +from datetime import * |
8 | -__author__ = 'KevinML' | 8 | +__author__ = 'KevinML' |
9 | - | 9 | + |
10 | -# Objective: Obtencion del metadato y del contenido de todas las lineas con <Tags/> dentro de un archivo. | 10 | +# Objective: Obtencion del metadato y del contenido de todas las lineas con <Tags/> dentro de un archivo. |
11 | - | 11 | + |
12 | -# Parameters: | 12 | +# Parameters: |
13 | -# 1) --inputPath input path | 13 | +# 1) --inputPath input path |
14 | -# 2) --outputPath output path | 14 | +# 2) --outputPath output path |
15 | - | 15 | + |
16 | -# Output: | 16 | +# Output: |
17 | -# 1) | 17 | +# 1) |
18 | - | 18 | + |
19 | -# Execution: | 19 | +# Execution: |
20 | -#Example 1 | 20 | +#Example 1 |
21 | -#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | 21 | +#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ |
22 | -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ | 22 | +#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ |
23 | - | 23 | + |
24 | -#Example 2 | 24 | +#Example 2 |
25 | -#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py | 25 | +#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py |
26 | -#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | 26 | +#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ |
27 | -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ | 27 | +#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ |
28 | - | 28 | + |
29 | -########################################################### | 29 | +########################################################### |
30 | -# MAIN PROGRAM # | 30 | +# MAIN PROGRAM # |
31 | -########################################################### | 31 | +########################################################### |
32 | - | 32 | + |
33 | -parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>', | 33 | +parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>', |
34 | - epilog= 'Bien Hecho!') | 34 | + epilog= 'Bien Hecho!') |
35 | -parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True, | 35 | +parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True, |
36 | - help='Ingrese el archivo de entrada.') | 36 | + help='Ingrese el archivo de entrada.') |
37 | -parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True, | 37 | +parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True, |
38 | - help='Ingrese el archivo de salida.') | 38 | + help='Ingrese el archivo de salida.') |
39 | - | 39 | + |
40 | -args = parser.parse_args() | 40 | +args = parser.parse_args() |
41 | - | 41 | + |
42 | -#if len(args) != 2: | 42 | +#if len(args) != 2: |
43 | -# parser.error("Se introdujeron mas o menos de 2 parametros.") | 43 | +# parser.error("Se introdujeron mas o menos de 2 parametros.") |
44 | -# sys.exit(1) | 44 | +# sys.exit(1) |
45 | - | 45 | + |
46 | -# Printing parameter values | 46 | +# Printing parameter values |
47 | -print('-------------------------------- PARAMETERS --------------------------------') | 47 | +print('-------------------------------- PARAMETERS --------------------------------') |
48 | -print("Path to read input files: " + str(args.inputPath)) | 48 | +print("Path to read input files: " + str(args.inputPath)) |
49 | -print("Path to place output files: " + str(args.outputPath)) | 49 | +print("Path to place output files: " + str(args.outputPath)) |
50 | - | 50 | + |
51 | -#Modificacion TEMPORAL | 51 | +#Modificacion TEMPORAL |
52 | - | 52 | + |
53 | -archivo = {} | 53 | +archivo = {} |
54 | -regexTag = re.compile(r'<[A-Za-z]+>') | 54 | +regexTag = re.compile(r'<[A-Za-z]+>') |
55 | -exit_file = r"exit_file.xml" | 55 | +exit_file = r"exit_file.xml" |
56 | - | 56 | + |
57 | -with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile: | 57 | +with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile: |
58 | - oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today())) | 58 | + oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today())) |
59 | - | 59 | + |
60 | -for path, dirs, files in os.walk(args.inputPath): | 60 | +for path, dirs, files in os.walk(args.inputPath): |
61 | - for f in files: | 61 | + for f in files: |
62 | - metadatos = {} | 62 | + metadatos = {} |
63 | - with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile: | 63 | + with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile: |
64 | - for line in iFile: | 64 | + for line in iFile: |
65 | - line = line.strip('\n') | 65 | + line = line.strip('\n') |
66 | - if regexTag.search(line): | 66 | + if regexTag.search(line): |
67 | - renglon = line.split(" = ") | 67 | + renglon = line.split(" = ") |
68 | - if renglon[0] in metadatos: | 68 | + if renglon[0] in metadatos: |
69 | - metadatos[renglon[0]].append(renglon[1]) | 69 | + metadatos[renglon[0]].append(renglon[1]) |
70 | - else: | 70 | + else: |
71 | - metadatos[renglon[0]] = [renglon[1]] | 71 | + metadatos[renglon[0]] = [renglon[1]] |
72 | - | 72 | + |
73 | - archivo[f] = metadatos | 73 | + archivo[f] = metadatos |
74 | - | 74 | + |
75 | - with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile: | 75 | + with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile: |
76 | - #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido') | 76 | + #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido') |
77 | - for arch in sorted(archivo): | 77 | + for arch in sorted(archivo): |
78 | - for k,v in sorted(metadatos.items()): | 78 | + for k,v in sorted(metadatos.items()): |
79 | - for x in v: | 79 | + for x in v: |
80 | - oFile.write('{}\t{}\t{}\n'.format(arch, k, x)) | 80 | + oFile.write('{}\t{}\t{}\n'.format(arch, k, x)) | ... | ... |
data-sets/scripts/output.txt
0 → 100644
This diff is collapsed. Click to expand it.
File mode changed
File mode changed
1 | - #!/bin/python2.7 | ||
2 | -out_labels = { | ||
3 | - '</Air>': 'O', | ||
4 | - '</Gtype>': 'O', | ||
5 | - '</Gversion>': 'O', | ||
6 | - '</Med>': 'O', | ||
7 | - '</Orgn>': 'O', | ||
8 | - '</Phase>': 'O', | ||
9 | - '</Sample>': 'O', | ||
10 | - '</Serie>': 'O', | ||
11 | - '</Strain>': 'O', | ||
12 | - '</Substrain>': 'O', | ||
13 | - '</Supp>': 'O', | ||
14 | - '</Technique>': 'O', | ||
15 | - '</Temp>': 'O', | ||
16 | - '</Name>': 'O', | ||
17 | - '</OD>': 'O', | ||
18 | - '</Anti>': 'O', | ||
19 | - '</Agit>': 'O', | ||
20 | - '</Vess>': 'O'} | ||
21 | -in_labels = { | ||
22 | - '<Air>': 'Air', | ||
23 | - '<Gtype>': 'Gtype', | ||
24 | - '<Gversion>': 'Gversion', | ||
25 | - '<Med>': 'Med', | ||
26 | - '<Orgn>': 'Orgn', | ||
27 | - '<Phase>': 'Phase', | ||
28 | - '<Sample>': 'Sample', | ||
29 | - '<Serie>': 'Serie', | ||
30 | - '<Strain>': 'Strain', | ||
31 | - '<Substrain>': 'Substrain', | ||
32 | - '<Supp>': 'Supp', | ||
33 | - '<Technique>': 'Technique', | ||
34 | - '<Temp>': 'Temp', | ||
35 | - '<Name>': 'Name', | ||
36 | - '<OD>': 'OD', | ||
37 | - '<Anti>': 'Anti', | ||
38 | - '<Agit>': 'Agit', | ||
39 | - '<Vess>': 'Vess'} | ||
40 | - | ||
41 | -import re | ||
42 | -#columna Contenido de "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt" | ||
43 | -inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll' | ||
44 | -outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv' | ||
45 | -flag = 'O' | ||
46 | -with open(outpath, 'w') as out: | ||
47 | - with open(inpath, 'r') as input_file: | ||
48 | - for line in input_file: | ||
49 | - if len(line.split('\t')) > 1: | ||
50 | - w = line.split('\t')[1] | ||
51 | - if w in in_labels or w in out_labels: | ||
52 | - if w in in_labels.keys(): flag = in_labels[w] | ||
53 | - if w in out_labels: flag = out_labels[w] | ||
54 | - | ||
55 | - else: | ||
56 | - if w == "PGCGROWTHCONDITIONS": out.write('\n') | ||
57 | - else: | ||
58 | - out.write('|'.join(line.split('\t')[1:4])+'|'+flag+' ') | ||
59 | - #print('\t'.join(line.split('\t')[1:4])+'\t'+flag) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
File mode changed
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
-
Please register or login to post a comment