Estefani Gaytan Nunez
1 +-------------- REPORT --------------
2 +Training Dataset:
3 +Air: 111
4 +Gtype: 261
5 +Gversion: 17
6 +Med: 119
7 +Phase: 37
8 +Strain: 3
9 +Supp: 294
10 +Technique: 58
11 +Temp: 56
12 +OD: 65
13 +Anti: 38
14 +Agit: 4
15 +Vess: 2
16 +Substrain: 4
17 +pH: 26
18 +
19 +Mean tags per sentence: 3
20 +
21 +
22 +Testing Dataset:
23 +Air: 88
24 +Gtype: 78
25 +Gversion: 6
26 +Med: 56
27 +Phase: 21
28 +Strain: 0
29 +Supp: 136
30 +Technique: 33
31 +Temp: 15
32 +OD: 21
33 +Anti: 13
34 +Agit: 7
35 +Vess: 0
36 +Substrain: 0
37 +pH: 10
38 +
39 +Mean tags per sentence: 4
1 +# Importacion de librerias
2 +import pandas as pd
3 +import numpy as np
4 +import re
5 +import argparse
6 +import os
7 +
8 +__author__ = 'kevinml'
9 +
10 +# Objective
11 +# Take CoreNLP processed files and extract two things.
12 +# 1.- Number of Each Tag within Training and Testing Datasets
13 +# 2.- Mean Tags per Sentence within Training and Testing Datasets
14 +
15 +# Examples
16 +# python Tags_Training_Testing.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/CRF/data-sets --outputPath /home/kevinml/automatic-extraction-growth-conditions/CRF/analysis_report --TrainingFile training-data-set-70-NER.txt --TestingFile test-data-set-30-NER.txt
17 +
18 +####################################################################################
19 +# MAIN PROGRAM #
20 +####################################################################################
21 +
# Tags counted in the data sets. In the CoreNLP output a tagged token looks
# like "word|POS|Tag", so every real tag occurrence is preceded by "|".
TAGS = ("Air", "Gtype", "Gversion", "Med", "Phase", "Strain", "Supp",
        "Technique", "Temp", "OD", "Anti", "Agit", "Vess", "Substrain", "pH")

# One compiled pattern per tag ("|Tag"), plus a combined alternation used to
# count how many tags appear in each sentence (one sentence per line).
# Patterns are compiled once here instead of on every line of every file.
_TAG_RES = {tag: re.compile(r"\|" + tag) for tag in TAGS}
_ANY_TAG_RE = re.compile("|".join(r"\|" + tag for tag in TAGS))


def _count_tags(path):
    """Scan one CoreNLP-processed file.

    Returns (counts, mean_tags_per_sentence): `counts` maps each tag in TAGS
    to its total number of occurrences in the file, and the mean is taken over
    lines (one sentence per line). An empty file yields a mean of 0.0 (guards
    the NaN + RuntimeWarning that np.mean([]) would produce).
    """
    counts = {tag: 0 for tag in TAGS}
    tags_per_sentence = []
    with open(path, mode="r") as handle:
        for line in handle:
            for tag in TAGS:
                counts[tag] += len(_TAG_RES[tag].findall(line))
            # BUGFIX: the original testing-set loop used an unanchored pattern
            # ("Air|Gtype|...") here, so bare words like "Air" in free text
            # were counted as tags; both data sets now use the "|"-anchored
            # pattern, matching every per-tag count above.
            tags_per_sentence.append(len(_ANY_TAG_RE.findall(line)))
    mean = float(np.mean(tags_per_sentence)) if tags_per_sentence else 0.0
    return counts, mean


def _write_section(report, counts, mean, mean_suffix):
    """Write one "Tag: count" section followed by its mean-tags line.

    `mean_suffix` carries the section-specific trailing newlines so the report
    layout stays byte-identical to the original format. %d deliberately keeps
    the original integer truncation of the mean.
    """
    for tag in TAGS:
        report.write("%s: %d\n" % (tag, counts[tag]))
    report.write("\n")
    report.write("Mean tags per sentence: %d" % mean)
    report.write(mean_suffix)


def main(argv=None):
    """Parse the command line, count tags in both data sets, write the report.

    `argv` defaults to sys.argv[1:] (standard argparse behaviour); it is
    exposed as a parameter only so the program can be driven programmatically.
    Writes "Train_Test_tags.txt" into --outputPath.
    """
    # Parameter definition (help texts kept as in the original, except the
    # example file names, which were swapped between the two options:
    # --TrainingFile takes the 70% split, --TestingFile the 30% split).
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--inputPath', help="Ruta donde se encuentran los archivos a procesar. Ej: --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets", required=True)
    parser.add_argument(
        '--outputPath', help="Ruta donde se depositaran los archivos resultantes. Ej: --outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets", required=True)
    parser.add_argument(
        '--TrainingFile', help="Archivo a procesar. Ej: --TrainingFile training-data-set-70.txt", required=True)
    parser.add_argument(
        '--TestingFile', help="Archivo a procesar. Ej: --TestingFile test-data-set-30.txt", required=True)
    args = parser.parse_args(argv)

    # Echo the supplied parameters so runs are self-documenting.
    print('\n-------------------------------- PARAMETERS --------------------------------\n')
    print('Input Path: ' + str(args.inputPath))
    print('Training File: ' + str(args.TrainingFile))
    print('Testing File: ' + str(args.TestingFile))
    print('Output Path: ' + str(args.outputPath))
    print('\n-------------------------------- PROCESSING --------------------------------\n')

    train_counts, train_mean = _count_tags(
        os.path.join(args.inputPath, args.TrainingFile))
    test_counts, test_mean = _count_tags(
        os.path.join(args.inputPath, args.TestingFile))

    with open(os.path.join(args.outputPath, "Train_Test_tags.txt"), mode="w") as report:
        report.write(
            "-------------- REPORT --------------\nTraining Dataset:\n")
        _write_section(report, train_counts, train_mean, "\n\n\n")
        report.write("Testing Dataset:\n")
        _write_section(report, test_counts, test_mean, "\n")

    print("Done.\n")


if __name__ == '__main__':
    main()