Showing 2 changed files with 182 additions and 0 deletions
CRF/analysis_report/Train_Test_tags.txt
0 → 100644
+-------------- REPORT --------------
+Training Dataset:
+Air: 111
+Gtype: 261
+Gversion: 17
+Med: 119
+Phase: 37
+Strain: 3
+Supp: 294
+Technique: 58
+Temp: 56
+OD: 65
+Anti: 38
+Agit: 4
+Vess: 2
+Substrain: 4
+pH: 26
+
+Mean tags per sentence: 3
+
+
+Testing Dataset:
+Air: 88
+Gtype: 78
+Gversion: 6
+Med: 56
+Phase: 21
+Strain: 0
+Supp: 136
+Technique: 33
+Temp: 15
+OD: 21
+Anti: 13
+Agit: 7
+Vess: 0
+Substrain: 0
+pH: 10
+
+Mean tags per sentence: 4
CRF/bin/analysis/Tags_Training_Testing.py
0 → 100644
+# Import required libraries
+import pandas as pd
+import numpy as np
+import re
+import argparse
+import os
+
+__author__ = 'kevinml'
+
+# Objective
+# Take CoreNLP-processed files and extract two things:
+# 1.- The number of occurrences of each tag within the Training and Testing datasets.
+# 2.- The mean number of tags per sentence within the Training and Testing datasets.
+
+# Example
+# python Tags_Training_Testing.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/CRF/data-sets --outputPath /home/kevinml/automatic-extraction-growth-conditions/CRF/analysis_report --TrainingFile training-data-set-70-NER.txt --TestingFile test-data-set-30-NER.txt
+
+####################################################################################
+#                                   MAIN PROGRAM                                    #
+####################################################################################
+
+if __name__ == '__main__':
+    # Argument definition
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--inputPath', help="Path where the input files are located. E.g.: --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets", required=True)
+    parser.add_argument(
+        '--outputPath', help="Path where the output files will be written. E.g.: --outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets", required=True)
+    parser.add_argument(
+        '--TrainingFile', help="Training file to process. E.g.: --TrainingFile training-data-set-70.txt", required=True)
+    parser.add_argument(
+        '--TestingFile', help="Testing file to process. E.g.: --TestingFile test-data-set-30.txt", required=True)
+    args = parser.parse_args()
+
+    # Print the provided parameters
+    print('\n-------------------------------- PARAMETERS --------------------------------\n')
+    print('Input Path: ' + str(args.inputPath))
+    print('Training File: ' + str(args.TrainingFile))
+    print('Testing File: ' + str(args.TestingFile))
+    print('Output Path: ' + str(args.outputPath))
+    print('\n-------------------------------- PROCESSING --------------------------------\n')
+
+    # Per-tag counters for the Training dataset (no suffix) and the Testing dataset (_2 suffix)
+    Air, Gtype, Gversion, Med, Phase, Strain, Supp, Technique, Temp, OD, Anti, Agit, Vess, Substrain, pH = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    Air_2, Gtype_2, Gversion_2, Med_2, Phase_2, Strain_2, Supp_2, Technique_2, Temp_2, OD_2, Anti_2, Agit_2, Vess_2, Substrain_2, pH_2 = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    # Arrays holding the number of tags found in each sentence
+    taggs_by_sentence_Testing, taggs_by_sentence_Training = np.array([]), np.array([])
+
+    with open(os.path.join(args.inputPath, args.TrainingFile), mode="r") as Training_file:
+        for line in Training_file:
+            # Uncomment the lines below to print every sentence in the file.
+            # print(line)
+            # print("\n")
+            # Count how many times each tag appears in the Training file
+            Air += len(re.findall(r"\|Air", line))
+            Gtype += len(re.findall(r"\|Gtype", line))
+            Gversion += len(re.findall(r"\|Gversion", line))
+            Med += len(re.findall(r"\|Med", line))
+            Phase += len(re.findall(r"\|Phase", line))
+            Strain += len(re.findall(r"\|Strain", line))
+            Supp += len(re.findall(r"\|Supp", line))
+            Technique += len(re.findall(r"\|Technique", line))
+            Temp += len(re.findall(r"\|Temp", line))
+            OD += len(re.findall(r"\|OD", line))
+            Anti += len(re.findall(r"\|Anti", line))
+            Agit += len(re.findall(r"\|Agit", line))
+            Vess += len(re.findall(r"\|Vess", line))
+            Substrain += len(re.findall(r"\|Substrain", line))
+            pH += len(re.findall(r"\|pH", line))
+            # Record the number of tags found in this sentence of the Training file
+            taggs_by_sentence_Training = np.append(taggs_by_sentence_Training, len(re.findall(
+                r"\|Air|\|Gtype|\|Gversion|\|Med|\|Phase|\|Strain|\|Supp|\|Technique|\|Temp|\|OD|\|Anti|\|Agit|\|Vess|\|Substrain|\|pH", line)))
+    # Mean number of tags per sentence in the Training file
+    MeanTags_by_Sent_Training = np.mean(taggs_by_sentence_Training)
+    # Uncomment the line below to print the list with the number of tags in each sentence.
+    # print(taggs_by_sentence_Training)
+
+    with open(os.path.join(args.inputPath, args.TestingFile), mode="r") as Testing_file:
+        for line in Testing_file:
+            # Uncomment the lines below to print every sentence in the file.
+            # print(line)
+            # print("\n")
+            # Count how many times each tag appears in the Testing file
+            Air_2 += len(re.findall(r"\|Air", line))
+            Gtype_2 += len(re.findall(r"\|Gtype", line))
+            Gversion_2 += len(re.findall(r"\|Gversion", line))
+            Med_2 += len(re.findall(r"\|Med", line))
+            Phase_2 += len(re.findall(r"\|Phase", line))
+            Strain_2 += len(re.findall(r"\|Strain", line))
+            Supp_2 += len(re.findall(r"\|Supp", line))
+            Technique_2 += len(re.findall(r"\|Technique", line))
+            Temp_2 += len(re.findall(r"\|Temp", line))
+            OD_2 += len(re.findall(r"\|OD", line))
+            Anti_2 += len(re.findall(r"\|Anti", line))
+            Agit_2 += len(re.findall(r"\|Agit", line))
+            Vess_2 += len(re.findall(r"\|Vess", line))
+            Substrain_2 += len(re.findall(r"\|Substrain", line))
+            pH_2 += len(re.findall(r"\|pH", line))
+            # Record the number of tags found in this sentence of the Testing file
+            taggs_by_sentence_Testing = np.append(taggs_by_sentence_Testing, len(re.findall(
+                r"\|Air|\|Gtype|\|Gversion|\|Med|\|Phase|\|Strain|\|Supp|\|Technique|\|Temp|\|OD|\|Anti|\|Agit|\|Vess|\|Substrain|\|pH", line)))
+    # Mean number of tags per sentence in the Testing file
+    MeanTags_by_Sent_Testing = np.mean(taggs_by_sentence_Testing)
+    # Uncomment the line below to print the list with the number of tags in each sentence.
+    # print(taggs_by_sentence_Testing)
+
+    with open(os.path.join(args.outputPath, "Train_Test_tags.txt"), mode="w+") as report:
+        report.write("-------------- REPORT --------------\nTraining Dataset:\n")
+        report.write("Air: %d\n" % Air)
+        report.write("Gtype: %d\n" % Gtype)
+        report.write("Gversion: %d\n" % Gversion)
+        report.write("Med: %d\n" % Med)
+        report.write("Phase: %d\n" % Phase)
+        report.write("Strain: %d\n" % Strain)
+        report.write("Supp: %d\n" % Supp)
+        report.write("Technique: %d\n" % Technique)
+        report.write("Temp: %d\n" % Temp)
+        report.write("OD: %d\n" % OD)
+        report.write("Anti: %d\n" % Anti)
+        report.write("Agit: %d\n" % Agit)
+        report.write("Vess: %d\n" % Vess)
+        report.write("Substrain: %d\n" % Substrain)
+        report.write("pH: %d\n\n" % pH)
+        report.write("Mean tags per sentence: %d\n\n\n" % MeanTags_by_Sent_Training)
+
+        report.write("Testing Dataset:\n")
+        report.write("Air: %d\n" % Air_2)
+        report.write("Gtype: %d\n" % Gtype_2)
+        report.write("Gversion: %d\n" % Gversion_2)
+        report.write("Med: %d\n" % Med_2)
+        report.write("Phase: %d\n" % Phase_2)
+        report.write("Strain: %d\n" % Strain_2)
+        report.write("Supp: %d\n" % Supp_2)
+        report.write("Technique: %d\n" % Technique_2)
+        report.write("Temp: %d\n" % Temp_2)
+        report.write("OD: %d\n" % OD_2)
+        report.write("Anti: %d\n" % Anti_2)
+        report.write("Agit: %d\n" % Agit_2)
+        report.write("Vess: %d\n" % Vess_2)
+        report.write("Substrain: %d\n" % Substrain_2)
+        report.write("pH: %d\n\n" % pH_2)
+        report.write("Mean tags per sentence: %d\n" % MeanTags_by_Sent_Testing)
+
+    print("Done.\n")
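
As a minimal standalone sketch (not part of the committed script), the tag-counting approach above can be exercised on a single line; this assumes the NER files mark each annotation as a "|Tag" suffix on a token, and the sample sentence below is purely hypothetical:

import re

# Hypothetical sample sentence in the assumed token|Tag format
sentence = "cells|O grown|O aerobically|Air in|O LB|Med at|O 37|Temp"
TAGS = ["Air", "Gtype", "Gversion", "Med", "Phase", "Strain", "Supp", "Technique",
        "Temp", "OD", "Anti", "Agit", "Vess", "Substrain", "pH"]
# Same idea as the combined regex in the script: match "|" followed by any tag name
tag_pattern = re.compile(r"\|(?:" + "|".join(TAGS) + ")")
print(len(tag_pattern.findall(sentence)))  # -> 3 (Air, Med, Temp)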