Carlos-Francisco Méndez-Cruz

GCs GEO extraction

# -*- coding: UTF-8 -*-
import os
import sys
import argparse
import re
import numpy as np
from datetime import *
__author__ = 'KevinML'
# Objective: Extract the metadata name and the content of every line that contains <Tags/> within a file.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) exit_file.xml inside --outputPath: tab-separated rows of Archivo, Metadato, Contenido
# Execution:
#Example 1
#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
#Example 2
#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
###########################################################
# MAIN PROGRAM #
###########################################################
# Command-line interface. Both options are declared with required=True, so
# argparse itself rejects an invocation that omits either of them.
parser = argparse.ArgumentParser(
    description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
    epilog='Bien Hecho!',
)
for option_name, help_text in (
    ('inputPath', 'Ingrese el archivo de entrada.'),
    ('outputPath', 'Ingrese el archivo de salida.'),
):
    parser.add_argument(
        '--' + option_name,
        dest=option_name,
        metavar='PATH',
        required=True,
        help=help_text,
    )
args = parser.parse_args()

# Echo the parameter values that will be used for this run.
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: {}".format(args.inputPath))
print("Path to place output files: {}".format(args.outputPath))
# Temporary modification
# Per-file results: {file path relative to inputPath: {metadata name: [contents]}}.
archivo = {}
# A line is of interest when it contains at least one opening tag such as <Med>.
regexTag = re.compile(r'<[A-Za-z]+>')
exit_file = r"exit_file.xml"
report_path = os.path.join(args.outputPath, exit_file)

# Create (or truncate) the report and write its header; the data rows are
# appended once every input file has been scanned.
with open(report_path, mode="w", encoding="utf-8") as oFile:
    oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))

for path, dirs, files in os.walk(args.inputPath):
    for f in files:
        # os.walk reports file names relative to `path` (the directory being
        # visited), not to inputPath, so the file must be opened through `path`
        # or anything inside a sub-directory cannot be found.
        source_path = os.path.join(path, f)
        metadatos = {}
        with open(source_path, mode='r', encoding="utf-8") as iFile:
            for line in iFile:
                line = line.strip('\n')
                if not regexTag.search(line):
                    continue
                # Split on the first " = " only, so content that itself
                # contains " = " is kept whole.
                nombre, separador, contenido = line.partition(" = ")
                if not separador:
                    # Tagged line without a "name = content" shape: nothing
                    # to record (previously this raised IndexError).
                    continue
                metadatos.setdefault(nombre, []).append(contenido)
        # Key by the path relative to inputPath: identical to the bare file
        # name for top-level files, and unambiguous for nested ones.
        archivo[os.path.relpath(source_path, args.inputPath)] = metadatos

with open(report_path, mode="a", encoding="utf-8") as oFile:
    for arch in sorted(archivo):
        # Report each file's own metadata, not the dict left over from the
        # last file that was scanned.
        for k, v in sorted(archivo[arch].items()):
            for x in v:
                oFile.write('{}\t{}\t{}\n'.format(arch, k, x))
# -*- coding: UTF-8 -*-
import os
import sys
import argparse
import re
import numpy as np
from datetime import *
__author__ = 'KevinML'
# Objective: Extract the metadata name and the content of every line that contains <Tags/> within a file.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) exit_file.xml inside --outputPath: tab-separated rows of Archivo, Metadato, Contenido
# Execution:
#Example 1
#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
#Example 2
#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
###########################################################
# MAIN PROGRAM #
###########################################################
# Command-line interface. Both options are declared with required=True, so
# argparse itself rejects an invocation that omits either of them.
parser = argparse.ArgumentParser(
    description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
    epilog='Bien Hecho!',
)
for option_name, help_text in (
    ('inputPath', 'Ingrese el archivo de entrada.'),
    ('outputPath', 'Ingrese el archivo de salida.'),
):
    parser.add_argument(
        '--' + option_name,
        dest=option_name,
        metavar='PATH',
        required=True,
        help=help_text,
    )
args = parser.parse_args()

# Echo the parameter values that will be used for this run.
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: {}".format(args.inputPath))
print("Path to place output files: {}".format(args.outputPath))
# Temporary modification
# Per-file results: {file path relative to inputPath: {metadata name: [contents]}}.
archivo = {}
# A line is of interest when it contains at least one opening tag such as <Med>.
regexTag = re.compile(r'<[A-Za-z]+>')
exit_file = r"exit_file.xml"
report_path = os.path.join(args.outputPath, exit_file)

# Create (or truncate) the report and write its header; the data rows are
# appended once every input file has been scanned.
with open(report_path, mode="w", encoding="utf-8") as oFile:
    oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))

for path, dirs, files in os.walk(args.inputPath):
    for f in files:
        # os.walk reports file names relative to `path` (the directory being
        # visited), not to inputPath, so the file must be opened through `path`
        # or anything inside a sub-directory cannot be found.
        source_path = os.path.join(path, f)
        metadatos = {}
        with open(source_path, mode='r', encoding="utf-8") as iFile:
            for line in iFile:
                line = line.strip('\n')
                if not regexTag.search(line):
                    continue
                # Split on the first " = " only, so content that itself
                # contains " = " is kept whole.
                nombre, separador, contenido = line.partition(" = ")
                if not separador:
                    # Tagged line without a "name = content" shape: nothing
                    # to record (previously this raised IndexError).
                    continue
                metadatos.setdefault(nombre, []).append(contenido)
        # Key by the path relative to inputPath: identical to the bare file
        # name for top-level files, and unambiguous for nested ones.
        archivo[os.path.relpath(source_path, args.inputPath)] = metadatos

with open(report_path, mode="a", encoding="utf-8") as oFile:
    for arch in sorted(archivo):
        # Report each file's own metadata, not the dict left over from the
        # last file that was scanned.
        for k, v in sorted(archivo[arch].items()):
            for x in v:
                oFile.write('{}\t{}\t{}\n'.format(arch, k, x))
......
This diff is collapsed. Click to expand it.
#!/bin/python2.7
# Closing tags. Every one of them maps to the same label, 'O'.
out_labels = dict.fromkeys(
    (
        '</Air>',
        '</Gtype>',
        '</Gversion>',
        '</Med>',
        '</Orgn>',
        '</Phase>',
        '</Sample>',
        '</Serie>',
        '</Strain>',
        '</Substrain>',
        '</Supp>',
        '</Technique>',
        '</Temp>',
        '</Name>',
        '</OD>',
        '</Anti>',
        '</Agit>',
        '</Vess>',
    ),
    'O',
)
# Opening tags. Each '<Name>' key maps to its bare tag name 'Name'.
in_labels = {
    '<{}>'.format(tag_name): tag_name
    for tag_name in (
        'Air',
        'Gtype',
        'Gversion',
        'Med',
        'Orgn',
        'Phase',
        'Sample',
        'Serie',
        'Strain',
        'Substrain',
        'Supp',
        'Technique',
        'Temp',
        'Name',
        'OD',
        'Anti',
        'Agit',
        'Vess',
    )
}
import re
# "Contenido" column of "/home/egaytan/Dropbox/PGC/data-sets/file_output/exit_file.txt"
# Tab-separated input that is read line by line, and the labelled output file.
inpath = '/home/egaytan/Dropbox/PGC/data-sets_1/content_colum_data_set.tsv.conll'
outpath = '/home/egaytan/Dropbox/PGC/data-sets_1/sentences_labeled_v1.tsv'
# Label attached to every emitted token; it changes whenever an opening or a
# closing tag is read and stays in effect until the next tag.
flag = 'O'
with open(outpath, 'w') as out, open(inpath, 'r') as input_file:
    for line in input_file:
        columns = line.split('\t')
        # Lines with a single column carry no token in column 1: skip them.
        if len(columns) <= 1:
            continue
        w = columns[1]
        if w in in_labels or w in out_labels:
            # Tag tokens are not written out; they only switch the label.
            if w in in_labels:
                flag = in_labels[w]
            if w in out_labels:
                flag = out_labels[w]
        elif w == "PGCGROWTHCONDITIONS":
            # This marker token ends the current output line.
            out.write('\n')
        else:
            # Emit columns 1-3 and the current label, '|'-joined, followed by
            # a space.
            out.write('|'.join(columns[1:4]) + '|' + flag + ' ')
#print('\t'.join(line.split('\t')[1:4])+'\t'+flag)
\ No newline at end of file
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.