Kevin Meza Landeros

upload

-------------- REPORT --------------
Training Dataset:
Air: 111
Gtype: 261
Gversion: 17
Med: 119
Phase: 37
Strain: 3
Supp: 294
Technique: 58
Temp: 56
OD: 65
Anti: 38
Agit: 4
Vess: 2
Substrain: 4
pH: 26
Mean tags per sentence: 3
Testing Dataset:
Air: 88
Gtype: 78
Gversion: 6
Med: 56
Phase: 21
Strain: 0
Supp: 136
Technique: 33
Temp: 15
OD: 21
Anti: 13
Agit: 7
Vess: 0
Substrain: 0
pH: 10
Mean tags per sentence: 4
# Importacion de librerias
import pandas as pd
import numpy as np
import re
import argparse
import os
__author__ = 'kevinml'
# Objective
# Take CoreNLP processed files and extract two things.
# 1.- Number of Each Tag within Training and Testing Datasets
# 2.- Mean Sentence Tag within Training and Testing Datasets
# Examples
# python Tags_Training_Testing.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/CRF/data-sets --outputPath /home/kevinml/automatic-extraction-growth-conditions/CRF/analysis_report --TrainingFile training-data-set-70-NER.txt --TestingFile test-data-set-30-NER.txt
####################################################################################
# MAIN PROGRAM #
####################################################################################
# Tag labels counted in both datasets, in report order.
TAGS = ("Air", "Gtype", "Gversion", "Med", "Phase", "Strain", "Supp",
        "Technique", "Temp", "OD", "Anti", "Agit", "Vess", "Substrain", "pH")

# One compiled pattern per tag ("|Tag" as emitted by CoreNLP), plus a single
# alternation matching any tag, used to count tags per sentence.  Compiled
# once here instead of rebuilding the patterns for every input line.
TAG_PATTERNS = {tag: re.compile(r"\|" + re.escape(tag)) for tag in TAGS}
ANY_TAG = re.compile("|".join(r"\|" + re.escape(tag) for tag in TAGS))


def count_tags(path):
    """Count tag occurrences in the CoreNLP-processed file at *path*.

    Returns a pair ``(counts, mean_tags)`` where ``counts`` maps each tag
    name in TAGS to its total number of occurrences and ``mean_tags`` is the
    average number of tags per sentence (one sentence per line).
    """
    counts = dict.fromkeys(TAGS, 0)
    per_sentence = []  # number of tags found on each line
    with open(path, mode="r") as handle:
        for line in handle:
            for tag in TAGS:
                counts[tag] += len(TAG_PATTERNS[tag].findall(line))
            # NOTE: the same "|"-prefixed patterns are used for both the
            # per-tag totals and the per-sentence count (the original code
            # dropped the "\|" prefix for the Testing file, counting bare
            # words instead of tags).
            per_sentence.append(len(ANY_TAG.findall(line)))
    # Guard against an empty file: np.mean([]) is nan and would break "%d".
    mean_tags = float(np.mean(per_sentence)) if per_sentence else 0.0
    return counts, mean_tags


def write_report(path, training, testing):
    """Write the tag-count report to *path*.

    *training* and *testing* are ``(counts, mean_tags)`` pairs as returned
    by :func:`count_tags`.  The output format matches the historical report
    byte-for-byte.
    """
    with open(path, mode="w+") as report:
        report.write(
            "-------------- REPORT --------------\nTraining Dataset:\n")
        counts, mean_tags = training
        for tag in TAGS:
            report.write("%s: %d\n" % (tag, counts[tag]))
        report.write("\n")
        report.write("Mean tags per sentence: %d\n\n\n" % mean_tags)
        report.write("Testing Dataset:\n")
        counts, mean_tags = testing
        for tag in TAGS:
            report.write("%s: %d\n" % (tag, counts[tag]))
        report.write("\n")
        report.write("Mean tags per sentence: %d\n" % mean_tags)


if __name__ == '__main__':
    # Command-line parameters.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--inputPath', help="Ruta donde se encuentran los archivos a procesar. Ej: --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets", required=True)
    parser.add_argument(
        '--outputPath', help="Ruta donde se depositaran los archivos resultantes. Ej: --outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets", required=True)
    parser.add_argument(
        '--TrainingFile', help="Archivo a procesar. Ej: --TrainingFile test-data-set-30.txt", required=True)
    parser.add_argument(
        '--TestingFile', help="Archivo a procesar. Ej: --TestingFile training-data-set-70.txt", required=True)
    args = parser.parse_args()
    # Echo the parameters that were supplied.
    print('\n-------------------------------- PARAMETERS --------------------------------\n')
    print('Input Path: ' + str(args.inputPath))
    print('Training File: ' + str(args.TrainingFile))
    print('Testing File: ' + str(args.TestingFile))
    print('Output Path: ' + str(args.outputPath))
    print('\n-------------------------------- PROCESSING --------------------------------\n')
    # Count tags and tags-per-sentence means for both datasets, then emit
    # the combined report.
    training = count_tags(os.path.join(args.inputPath, args.TrainingFile))
    testing = count_tags(os.path.join(args.inputPath, args.TestingFile))
    write_report(os.path.join(args.outputPath, "Train_Test_tags.txt"),
                 training, testing)
    print("Done.\n")