Showing
23 changed files
with
1043 additions
and
69 deletions
| ... | @@ -7,38 +7,36 @@ from pandas import DataFrame as DF | ... | @@ -7,38 +7,36 @@ from pandas import DataFrame as DF |
| 7 | import matplotlib.pyplot as plt | 7 | import matplotlib.pyplot as plt |
| 8 | 8 | ||
| 9 | # Objective | 9 | # Objective |
| 10 | -# Drawn figures of grid reports | 10 | +# Draw figures of grid reports |
| 11 | # | 11 | # |
| 12 | # Input parameters | 12 | # Input parameters |
| 13 | -# --inputPath=PATH Path of inputfiles | 13 | +# --inputPath Path of inputfiles |
| 14 | -# --outputPath=PATH Path to place output figures | 14 | +# --outputPath Path to place output figures |
| 15 | -# --figureName single run specific name figure, multifigure first part of name | 15 | +# --figureName single run specific name figure, multifigure first part of name |
| 16 | -# --inputFile Use it for a single report | 16 | +# --table boolean, save the score table |
| 17 | -# --version CRF-script version of reports | ||
| 18 | # | 17 | # |
| 19 | # Output | 18 | # Output |
| 20 | # training and test data set | 19 | # training and test data set |
| 21 | # | 20 | # |
| 22 | # Examples | 21 | # Examples |
| 23 | # python figures-reports.py | 22 | # python figures-reports.py |
| 24 | -# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/ | 23 | +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/nov13 |
| 25 | -# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/ | 24 | +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/nov13 |
| 26 | # --figureName FiguresGrid | 25 | # --figureName FiguresGrid |
| 27 | -# --inputFile report_Run1_v11.txt | 26 | +# --table |
| 28 | -# -version v11 | ||
| 29 | 27 | ||
| 30 | -# python figures-reports.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/ --figureName FiguresGrid_v1 --inputFile report_Run1_v11.txt ..version v11 | 28 | + |
| 29 | + | ||
| 30 | +# python figures-reports.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/nov13 --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/nov13 --figureName FiguresGrid --table | ||
| 31 | __author__ = 'egaytan' | 31 | __author__ = 'egaytan' |
| 32 | 32 | ||
| 33 | #################################################################################### | 33 | #################################################################################### |
| 34 | # FUNCTIONS # | 34 | # FUNCTIONS # |
| 35 | #################################################################################### | 35 | #################################################################################### |
| 36 | -def Filter(rfile, options,v): | 36 | + |
| 37 | - if options[0]=='all': | 37 | +def savescreen(output, dic, path): |
| 38 | - if rfile[0:6]=='report' and rfile[-7:-4]==v: return(True) | 38 | + if output: |
| 39 | - elif rfile in options: | 39 | + DF.from_dict(dic).to_csv(path+'.csv', sep = "\t", index = True) |
| 40 | - return(True) | ||
| 41 | - return(False) | ||
| 42 | 40 | ||
| 43 | #################################################################################### | 41 | #################################################################################### |
| 44 | # MAIN PROGRAM # | 42 | # MAIN PROGRAM # |
| ... | @@ -50,8 +48,7 @@ if __name__ == '__main__': | ... | @@ -50,8 +48,7 @@ if __name__ == '__main__': |
| 50 | parser.add_option('--inputPath', dest='inputPath', help='Path of output from CoreNLP', metavar='PATH') | 48 | parser.add_option('--inputPath', dest='inputPath', help='Path of output from CoreNLP', metavar='PATH') |
| 51 | parser.add_option('--outputPath', dest='outputPath', help='Path to place output figures', metavar='PATH') | 49 | parser.add_option('--outputPath', dest='outputPath', help='Path to place output figures', metavar='PATH') |
| 52 | parser.add_option('--figureName', dest='figureName', help='Specific or first part of figurename', metavar='FILE') | 50 | parser.add_option('--figureName', dest='figureName', help='Specific or first part of figurename', metavar='FILE') |
| 53 | - parser.add_option('--version', dest='version', help='script version', metavar='FILE') | 51 | + parser.add_option('--table', dest='table', help='save score-table', action='store_true', default=False) |
| 54 | - parser.add_option('--inputFile', dest='inputFile', help='Use it for a specific report files', metavar='FILE', default='all,') | ||
| 55 | 52 | ||
| 56 | (options, args) = parser.parse_args() | 53 | (options, args) = parser.parse_args() |
| 57 | if len(args) > 0: | 54 | if len(args) > 0: |
| ... | @@ -61,56 +58,40 @@ if __name__ == '__main__': | ... | @@ -61,56 +58,40 @@ if __name__ == '__main__': |
| 61 | print('-------------------------------- PARAMETERS --------------------------------') | 58 | print('-------------------------------- PARAMETERS --------------------------------') |
| 62 | print('Path of output from CoreNLP: ' + str(options.inputPath)) | 59 | print('Path of output from CoreNLP: ' + str(options.inputPath)) |
| 63 | print('Path to place output figures: ' + str(options.outputPath)) | 60 | print('Path to place output figures: ' + str(options.outputPath)) |
| 64 | - print('Specific or first part of figurename: ' + str(options.figureName)) | 61 | + print('Figurename: ' + str(options.figureName)) |
| 65 | - print('CRF-script version: ' + str(options.version)) | ||
| 66 | - | ||
| 67 | print('-------------------------------- PROCESSING --------------------------------') | 62 | print('-------------------------------- PROCESSING --------------------------------') |
| 63 | + reportFileList = [ rfile for rfile in os.listdir(options.inputPath) if rfile[0:7] == "report_"] | ||
| 64 | + print(','.join(reportFileList)) | ||
| 68 | 65 | ||
| 69 | - rawInputRepotsList = str(options.inputFile).split(',') | 66 | + for inputFile in reportFileList: |
| 70 | - reportFileList = [ rfile for rfile in os.listdir(options.inputPath) if Filter(rfile, rawInputRepotsList, str(options.version)) ] | 67 | + scores = df(dict) |
| 71 | - scores = df(dict) | 68 | + for report in reportFileList: |
| 72 | - #CV={} | 69 | + with open(os.path.join(options.inputPath, report), 'r') as File: |
| 73 | - print('Report files: ' + str(options.inputFile )) | 70 | + string = File.read() |
| 74 | - print('\n'.join(reportFileList)) | 71 | + scores[report[7:11]]['CV']=re.findall('best\sCV\sscore\:(\d+\.\d+)', string)[0] |
| 75 | - print('----------------------------------- NOTE -----------------------------------') | 72 | + summaryScores = re.findall('avg\s\/\stotal\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)', string)[0] |
| 76 | - print('\n-------- All chosen report files should be in inputPath given---------------\n') | 73 | + scores[report[7:11]]['precision']=summaryScores[0] |
| 74 | + scores[report[7:11]]['recall']=summaryScores[1] | ||
| 75 | + scores[report[7:11]]['f1-score']=summaryScores[2] | ||
| 77 | 76 | ||
| 78 | - print('------------------------------- SAVING DATA --------------------------------\n') | ||
| 79 | - for report in reportFileList: | ||
| 80 | - with open(os.path.join(options.inputPath, report), 'r') as File: | ||
| 81 | - string = File.read() | ||
| 82 | - scores[report[7:11]]['CV']=re.findall('best\sCV\sscore\:(\d+\.\d+)', string)[0] | ||
| 83 | - summaryScores = re.findall('avg\s\/\stotal\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)', string)[0] | ||
| 84 | - scores[report[7:11]]['precision']=summaryScores[0] | ||
| 85 | - scores[report[7:11]]['recall']=summaryScores[1] | ||
| 86 | - scores[report[7:11]]['f1-score']=summaryScores[2] | ||
| 87 | - | ||
| 88 | print(DF(scores).T) | 77 | print(DF(scores).T) |
| 89 | - print('------------------------------- SAVING TABLE --------------------------------\n') | 78 | + scoresTable = DF(scores).T |
| 90 | - with open(os.path.join(options.inputPath, str(options.figureName) ), 'w') as File: | 79 | + print('------------------------------- SAVING DATA --------------------------------') |
| 91 | - | 80 | + print('Saving score-table: ' + str(options.table)) |
| 92 | - scoresTable = DF(scores).T | 81 | + imageName = os.path.join(options.outputPath, options.figureName) |
| 93 | - | 82 | + savescreen(options.table, scores, imageName) |
| 94 | - imageName=os.path.join(options.outputPath, options.figureName) | 83 | + fig = plt.figure() |
| 95 | - ylab = "score", | 84 | + fig.set_figheight(13) |
| 96 | - fig = plt.figure() | 85 | + fig.set_figwidth(20) |
| 97 | - plt.grid(False) | 86 | + plt.ylim(0.7, 1.1) |
| 98 | - plt.rcParams.update({'font.size': 15}) | 87 | + plt.xlabel("Runs") |
| 99 | - fig.set_figheight(13) | 88 | + plt.ylabel("score") |
| 100 | - fig.set_figwidth(20) | 89 | + plt.rcParams.update() |
| 101 | - plt.xlabel("Runs") | 90 | + plt.grid() |
| 102 | - plt.ylabel("score") | 91 | + plt.plot(scoresTable['precision'],'o--', label='precision', linewidth=3, markersize=15) |
| 103 | - plt.xticks(range(8),scoresTable["CV"].index) | 92 | + plt.plot(scoresTable['f1-score'], 'o--', label='F1', linewidth=3, markersize=15) |
| 104 | - plt.plot(scoresTable['CV'], "--", color="red", label="CV") | 93 | + plt.plot(scoresTable['recall'], 'o--', label='recall' , linewidth=3, markersize=15) |
| 105 | - plt.plot(scoresTable['precision'], color="blue", label="precision") | 94 | + plt.plot(scoresTable['CV'], 'o--', label='CV' , linewidth=3, markersize=15) |
| 106 | - plt.plot(scoresTable['f1-score'], color="orange", label="F1") | 95 | + plt.legend(loc='lower right') |
| 107 | - plt.plot(scoresTable['recall'], color="g", label="recall") | 96 | + plt.xticks(range(8),['run1', 'run2', 'run3', 'run4', 'run5', 'run6', 'run7', 'run8']) |
| 108 | - plt.legend(loc='lower right') | 97 | + fig.savefig(imageName, bbox_inches='tight', pad_inches = 0.5) |
| 109 | - plt.tight_layout() | ||
| 110 | - fig.savefig(imageName, pad_inches=0.5) | ||
| 111 | - | ||
| 112 | - | ||
| 113 | - | ||
| 114 | - | ||
| 115 | - | ||
| 116 | - | ... | ... |
CRF/bin/figures/rplots/line-plots-CRF-v1.0.R
0 → 100644
| 1 | +# Based on http://www.sthda.com/english/wiki/ggplot2-line-plot-quick-start-guide-r-software-and-data-visualization | ||
| 2 | + | ||
| 3 | +library(ggplot2) | ||
| 4 | +#library(ggpubr) | ||
| 5 | +#library(cowplot) | ||
| 6 | + | ||
| 7 | +######### BEST MODELS ########## | ||
| 8 | + | ||
| 9 | +# Run1 | ||
| 10 | +# Todas las condiciones | ||
| 11 | +dfa <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=15), | ||
| 12 | + Strategy=rep(c( | ||
| 13 | + "Agit", | ||
| 14 | + "Gversion", | ||
| 15 | + "Substrain", | ||
| 16 | + "Vess", | ||
| 17 | + "OD", | ||
| 18 | + "Anti", | ||
| 19 | + "Supp", | ||
| 20 | + "Air", | ||
| 21 | + "Gtype", | ||
| 22 | + "Med", | ||
| 23 | + "Temp", | ||
| 24 | + "Technique", | ||
| 25 | + "Phase", | ||
| 26 | + "pH", | ||
| 27 | + "Strain" | ||
| 28 | + ),3), | ||
| 29 | + Score=c( | ||
| 30 | + 0, | ||
| 31 | + 0, | ||
| 32 | + 0, | ||
| 33 | + 0, | ||
| 34 | + 1, | ||
| 35 | + 1, | ||
| 36 | + 0.883, | ||
| 37 | + 0.92, | ||
| 38 | + 0.905, | ||
| 39 | + 0.852, | ||
| 40 | + 0.818, | ||
| 41 | + 0.88, | ||
| 42 | + 1, | ||
| 43 | + 1, | ||
| 44 | + 1, | ||
| 45 | + 0, | ||
| 46 | + 0, | ||
| 47 | + 0, | ||
| 48 | + 0, | ||
| 49 | + 0.405, | ||
| 50 | + 0.444, | ||
| 51 | + 0.669, | ||
| 52 | + 0.742, | ||
| 53 | + 0.811, | ||
| 54 | + 0.912, | ||
| 55 | + 1, | ||
| 56 | + 1, | ||
| 57 | + 0.947, | ||
| 58 | + 1, | ||
| 59 | + 1, | ||
| 60 | + 0, | ||
| 61 | + 0, | ||
| 62 | + 0, | ||
| 63 | + 0, | ||
| 64 | + 0.577, | ||
| 65 | + 0.615, | ||
| 66 | + 0.762, | ||
| 67 | + 0.821, | ||
| 68 | + 0.856, | ||
| 69 | + 0.881, | ||
| 70 | + 0.9, | ||
| 71 | + 0.936, | ||
| 72 | + 0.973, | ||
| 73 | + 1, | ||
| 74 | + 1 | ||
| 75 | + )) | ||
| 76 | + | ||
| 77 | +# Solo condiciones con F1-score > 0 | ||
| 78 | +# Run 1 | ||
| 79 | +df <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=11), | ||
| 80 | + Strategy=rep(c( | ||
| 81 | + "OD", | ||
| 82 | + "Anti", | ||
| 83 | + "Supp", | ||
| 84 | + "Air", | ||
| 85 | + "Gtype", | ||
| 86 | + "Med", | ||
| 87 | + "Temp", | ||
| 88 | + "Technique", | ||
| 89 | + "Phase", | ||
| 90 | + "pH", | ||
| 91 | + "Strain" | ||
| 92 | + ),3), | ||
| 93 | + Score=c( | ||
| 94 | + 1, | ||
| 95 | + 1, | ||
| 96 | + 0.883, | ||
| 97 | + 0.92, | ||
| 98 | + 0.905, | ||
| 99 | + 0.852, | ||
| 100 | + 0.818, | ||
| 101 | + 0.88, | ||
| 102 | + 1, | ||
| 103 | + 1, | ||
| 104 | + 1, | ||
| 105 | + 0.405, | ||
| 106 | + 0.444, | ||
| 107 | + 0.669, | ||
| 108 | + 0.742, | ||
| 109 | + 0.811, | ||
| 110 | + 0.912, | ||
| 111 | + 1, | ||
| 112 | + 1, | ||
| 113 | + 0.947, | ||
| 114 | + 1, | ||
| 115 | + 1, | ||
| 116 | + 0.577, | ||
| 117 | + 0.615, | ||
| 118 | + 0.762, | ||
| 119 | + 0.821, | ||
| 120 | + 0.856, | ||
| 121 | + 0.881, | ||
| 122 | + 0.9, | ||
| 123 | + 0.936, | ||
| 124 | + 0.973, | ||
| 125 | + 1, | ||
| 126 | + 1 | ||
| 127 | + )) | ||
| 128 | + | ||
| 129 | +head(df) | ||
| 130 | + | ||
| 131 | +pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) + | ||
| 132 | + geom_line(aes(color=Measure))+ | ||
| 133 | + geom_point(aes(color=Measure))+ | ||
| 134 | + scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+ | ||
| 135 | + #scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+ | ||
| 136 | + #geom_text(aes(label = Score))+ | ||
| 137 | + labs(title="Scores by condition (Best model, Run1)",x="Condition", y = "Score")+ | ||
| 138 | + theme( | ||
| 139 | + legend.position="top", | ||
| 140 | + # Centrar título: plot.title = element_text(hjust = 0.5), | ||
| 141 | + axis.line = element_line(colour = "gray"), | ||
| 142 | + panel.background = element_blank(), | ||
| 143 | + panel.grid.major = element_blank(), | ||
| 144 | + panel.grid.minor = element_blank(), | ||
| 145 | + panel.border = element_blank() | ||
| 146 | + ) | ||
| 147 | +pa | ||
| 148 | + | ||
| 149 | +ggsave(".png") | ||
| 150 | + | ||
| 151 | +# Solo condiciones con F1-score > 0 | ||
| 152 | +# Run 7 | ||
| 153 | +df <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=11), | ||
| 154 | + Strategy=rep(c( | ||
| 155 | + "Anti", | ||
| 156 | + "OD", | ||
| 157 | + "Supp", | ||
| 158 | + "Air", | ||
| 159 | + "Gtype", | ||
| 160 | + "Temp", | ||
| 161 | + "Med", | ||
| 162 | + "Technique", | ||
| 163 | + "Phase", | ||
| 164 | + "pH", | ||
| 165 | + "Strain" | ||
| 166 | + ),3), | ||
| 167 | + Score=c( | ||
| 168 | + 0.571, | ||
| 169 | + 1, | ||
| 170 | + 0.886, | ||
| 171 | + 0.939, | ||
| 172 | + 0.876, | ||
| 173 | + 0.818, | ||
| 174 | + 0.897, | ||
| 175 | + 0.952, | ||
| 176 | + 1, | ||
| 177 | + 1, | ||
| 178 | + 1, | ||
| 179 | + 0.444, | ||
| 180 | + 0.405, | ||
| 181 | + 0.684, | ||
| 182 | + 0.742, | ||
| 183 | + 0.802, | ||
| 184 | + 1, | ||
| 185 | + 0.912, | ||
| 186 | + 0.909, | ||
| 187 | + 0.947, | ||
| 188 | + 1, | ||
| 189 | + 1, | ||
| 190 | + 0.5, | ||
| 191 | + 0.577, | ||
| 192 | + 0.772, | ||
| 193 | + 0.829, | ||
| 194 | + 0.837, | ||
| 195 | + 0.9, | ||
| 196 | + 0.904, | ||
| 197 | + 0.93, | ||
| 198 | + 0.973, | ||
| 199 | + 1, | ||
| 200 | + 1 | ||
| 201 | + )) | ||
| 202 | + | ||
| 203 | +head(df) | ||
| 204 | + | ||
| 205 | +pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) + | ||
| 206 | + geom_line(aes(color=Measure))+ | ||
| 207 | + geom_point(aes(color=Measure))+ | ||
| 208 | + scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+ | ||
| 209 | + #scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+ | ||
| 210 | + #geom_text(aes(label = Score))+ | ||
| 211 | + labs(title="Scores by condition (Best model, Run7)",x="Condition", y = "Score")+ | ||
| 212 | + theme( | ||
| 213 | + legend.position="top", | ||
| 214 | + # Centrar título: plot.title = element_text(hjust = 0.5), | ||
| 215 | + axis.line = element_line(colour = "gray"), | ||
| 216 | + panel.background = element_blank(), | ||
| 217 | + panel.grid.major = element_blank(), | ||
| 218 | + panel.grid.minor = element_blank(), | ||
| 219 | + panel.border = element_blank() | ||
| 220 | + ) | ||
| 221 | +pa | ||
| 222 | + | ||
| 223 | +ggsave(".png") | ||
| 224 | + |
CRF/bin/figures/rplots/line-plots-CRF-v2.0.R
0 → 100644
| 1 | +# Based on http://www.sthda.com/english/wiki/ggplot2-line-plot-quick-start-guide-r-software-and-data-visualization | ||
| 2 | + | ||
| 3 | +library(ggplot2) | ||
| 4 | +#library(ggpubr) | ||
| 5 | +#library(cowplot) | ||
| 6 | + | ||
| 7 | +######### BEST MODEL ########## | ||
| 8 | + | ||
| 9 | +# Solo condiciones con F1-score > 0 | ||
| 10 | +# Run 6 (report_Run6_v11.txt) | ||
| 11 | +df <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=11), | ||
| 12 | + Strategy=rep(c( | ||
| 13 | + "Air", | ||
| 14 | + "Anti", | ||
| 15 | + "Gtype", | ||
| 16 | + "Med", | ||
| 17 | + "OD", | ||
| 18 | + "pH", | ||
| 19 | + "Phase", | ||
| 20 | + "Supp", | ||
| 21 | + "Technique", | ||
| 22 | + "Temp", | ||
| 23 | + "Vess" | ||
| 24 | + ),3), | ||
| 25 | + Score=c( | ||
| 26 | + 0.565, | ||
| 27 | + 1, | ||
| 28 | + 0.889, | ||
| 29 | + 1, | ||
| 30 | + 1, | ||
| 31 | + 1, | ||
| 32 | + 0.882, | ||
| 33 | + 0.811, | ||
| 34 | + 1, | ||
| 35 | + 0.923, | ||
| 36 | + 1, | ||
| 37 | + 0.377, | ||
| 38 | + 1, | ||
| 39 | + 0.847, | ||
| 40 | + 0.943, | ||
| 41 | + 0.818, | ||
| 42 | + 1, | ||
| 43 | + 1, | ||
| 44 | + 0.799, | ||
| 45 | + 0.913, | ||
| 46 | + 0.828, | ||
| 47 | + 1, | ||
| 48 | + 0.452, | ||
| 49 | + 1, | ||
| 50 | + 0.867, | ||
| 51 | + 0.971, | ||
| 52 | + 0.9, | ||
| 53 | + 1, | ||
| 54 | + 0.938, | ||
| 55 | + 0.805, | ||
| 56 | + 0.955, | ||
| 57 | + 0.873, | ||
| 58 | + 1 | ||
| 59 | + )) | ||
| 60 | + | ||
| 61 | +head(df) | ||
| 62 | + | ||
| 63 | +pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) + | ||
| 64 | + geom_line(aes(color=Measure))+ | ||
| 65 | + geom_point(aes(color=Measure))+ | ||
| 66 | + scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+ | ||
| 67 | + #scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+ | ||
| 68 | + #geom_text(aes(label = Score))+ | ||
| 69 | + labs(title="Scores by condition (Best model, Run6)",x="Condition", y = "Score")+ | ||
| 70 | + theme( | ||
| 71 | + legend.position="top", | ||
| 72 | + # Centrar título: plot.title = element_text(hjust = 0.5), | ||
| 73 | + axis.line = element_line(colour = "gray"), | ||
| 74 | + panel.background = element_blank(), | ||
| 75 | + panel.grid.major = element_blank(), | ||
| 76 | + panel.grid.minor = element_blank(), | ||
| 77 | + panel.border = element_blank() | ||
| 78 | + ) | ||
| 79 | +pa | ||
| 80 | + | ||
| 81 | +ggsave(".png") | ||
| 82 | + |
| 1 | +# Based on http://zevross.com/blog/2019/04/02/easy-multi-panel-plots-in-r-using-facet_wrap-and-facet_grid-from-ggplot2/ | ||
| 2 | + | ||
| 3 | +library(ggplot2) | ||
| 4 | +#library(ggpubr) | ||
| 5 | +#library(cowplot) | ||
| 6 | + | ||
| 7 | +organism = 'ECO' | ||
| 8 | + | ||
| 9 | +if (organism == 'ECO') { | ||
| 10 | +######### ECO DEVELOPMENT DATASET ########## | ||
| 11 | + | ||
| 12 | +# ECO-DEV-WITH-EFFECT-COMBINATION: Combination of strategies with effect in E. coli development dataset | ||
| 13 | +df <- data.frame(Panel=rep(c("Combination of strategies (effect)", "Separated strategies (effect)", "Combination of strategies (no effect)", "Separated strategies (no effect)"), each=12), | ||
| 14 | + Measure=rep(c("Precision", "Recall", "F1-score"), each=4), | ||
| 15 | + Strategy=c(rep(c("D", "D+V", "D+V+At", "D+V+At+Au"),3),rep(c("D", "V", "At", "Au"),3)), | ||
| 16 | + Score=c( | ||
| 17 | + 0.78, 0.79, 0.81, 0.81, 0.41, 0.56, 0.63, 0.63, 0.53, 0.65, 0.71, 0.71, | ||
| 18 | + 0.78, 0.89, 0.93, 1.00, 0.41, 0.35, 0.13, 0.01, 0.53, 0.50, 0.23, 0.02, | ||
| 19 | + 0.82, 0.82, 0.84, 0.84, 0.55, 0.66, 0.72, 0.72, 0.66, 0.73, 0.78, 0.78, | ||
| 20 | + 0.82, 0.88, 0.94, 1.00, 0.55, 0.39, 0.20, 0.01, 0.66, 0.54, 0.33, 0.02)) | ||
| 21 | +filename = "ECO-dev-multi-panel.png" | ||
| 22 | +title_plot = "E. coli development dataset" | ||
| 23 | +} else if (organism == 'STM') | ||
| 24 | +{ | ||
| 25 | +######### STM DEVELOPMENT DATASET ########## | ||
| 26 | + | ||
| 27 | +# STM-DEV-WITH-EFFECT-COMBINATION: Combination of strategies with effect in Salmonella evaluation dataset | ||
| 28 | +df <- data.frame(Panel=rep(c("Combination of strategies (effect)", "Separated strategies (effect)", "Combination of strategies (no effect)", "Separated strategies (no effect)"), each=12), | ||
| 29 | + Measure=rep(c("Precision", "Recall", "F1-score"), each=4), | ||
| 30 | + Strategy=c(rep(c("D", "D+V", "D+V+At", "D+V+At+Au"),3),rep(c("D", "V", "At", "Au"),3)), | ||
| 31 | + Score=c( | ||
| 32 | + 0.78, 0.77, 0.76, 0.76, 0.33, 0.49, 0.54, 0.54, 0.47, 0.60, 0.63, 0.63, | ||
| 33 | + 0.78, 0.81, 0.70, 0.88, 0.33, 0.33, 0.10, 0.01, 0.47, 0.47, 0.18, 0.02, | ||
| 34 | + 0.84, 0.82, 0.81, 0.81, 0.47, 0.59, 0.65, 0.65, 0.60, 0.68, 0.72, 0.72, | ||
| 35 | + 0.84, 0.84, 0.77, 0.86, 0.47, 0.40, 0.17, 0.01, 0.60, 0.55, 0.27, 0.02)) | ||
| 36 | +filename = "STM-dev-multi-panel.png" | ||
| 37 | +title_plot = "Salmonella evaluation dataset" | ||
| 38 | +} | ||
| 39 | + | ||
| 40 | +head(df) | ||
| 41 | + | ||
| 42 | +pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) + | ||
| 43 | + geom_line(aes(color=Measure))+ | ||
| 44 | + geom_point(aes(color=Measure))+ | ||
| 45 | + scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+ | ||
| 46 | + #scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+ | ||
| 47 | + geom_text(aes(label = Score))+ | ||
| 48 | + labs(title=title_plot,x="Strategies", y = "Score")+ | ||
| 49 | + #theme_classic()+ | ||
| 50 | + theme( | ||
| 51 | + legend.position="top", | ||
| 52 | + # Centrar título: plot.title = element_text(hjust = 0.5), | ||
| 53 | + axis.line = element_line(colour = "gray"), | ||
| 54 | + panel.background = element_blank(), | ||
| 55 | + panel.grid.major = element_blank(), | ||
| 56 | + panel.grid.minor = element_blank(), | ||
| 57 | + panel.border = element_blank(), | ||
| 58 | + )+ | ||
| 59 | + facet_wrap(~Panel, scale="free") | ||
| 60 | + | ||
| 61 | +ggsave(filename) | ||
| 62 | + |
CRF/bin/grid/grid_v14.sh
0 → 100644
| 1 | +#========================================variant10===================================== | ||
| 2 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt | ||
| 3 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 10 --S1 > ../../outputs/enero/Run2_v10.txt | ||
| 4 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 10 --S2 > ../../outputs/enero/Run3_v10.txt | ||
| 5 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 10 --S1 --S2 > ../../outputs/enero/Run4_v10.txt | ||
| 6 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 10 --S3 > ../../outputs/enero/Run5_v10.txt | ||
| 7 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 10 --S1 --S3 > ../../outputs/enero/Run6_v10.txt | ||
| 8 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 10 --S2 --S3 > ../../outputs/enero/Run7_v10.txt | ||
| 9 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 10 --S1 --S2 --S3 > ../../outputs/enero/Run8_v10.txt | ||
| 10 | +#=======================================S4 v10======================================= | ||
| 11 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 10 --S4 > ../../outputs/enero/Run9_v10.txt | ||
| 12 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 10 --S4 --S1 > ../../outputs/enero/Run10_v10.txt | ||
| 13 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 10 --S4 --S2 > ../../outputs/enero/Run11_v10.txt | ||
| 14 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 10 --S4 --S1 --S2 > ../../outputs/enero/Run12_v10.txt | ||
| 15 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 10 --S4 --S3 > ../../outputs/enero/Run13_v10.txt | ||
| 16 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 10 --S4 --S1 --S3 > ../../outputs/enero/Run14_v10.txt | ||
| 17 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 10 --S4 --S2 --S3 > ../../outputs/enero/Run15_v10.txt | ||
| 18 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 10 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v10.txt | ||
| 19 | +#========================================variant11===================================== | ||
| 20 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 11 > ../../outputs/enero/Run1_v11.txt | ||
| 21 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 11 --S1 > ../../outputs/enero/Run2_v11.txt | ||
| 22 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 11 --S2 > ../../outputs/enero/Run3_v11.txt | ||
| 23 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 11 --S1 --S2 > ../../outputs/enero/Run4_v11.txt | ||
| 24 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 11 --S3 > ../../outputs/enero/Run5_v11.txt | ||
| 25 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 11 --S1 --S3 > ../../outputs/enero/Run6_v11.txt | ||
| 26 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 11 --S2 --S3 > ../../outputs/enero/Run7_v11.txt | ||
| 27 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 11 --S1 --S2 --S3 > ../../outputs/enero/Run8_v11.txt | ||
| 28 | +#=======================================S4 v11======================================= | ||
| 29 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 11 --S4 > ../../outputs/enero/Run9_v11.txt | ||
| 30 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 11 --S4 --S1 > ../../outputs/enero/Run10_v11.txt | ||
| 31 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 11 --S4 --S2 > ../../outputs/enero/Run11_v11.txt | ||
| 32 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 11 --S4 --S1 --S2 > ../../outputs/enero/Run12_v11.txt | ||
| 33 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 11 --S4 --S3 > ../../outputs/enero/Run13_v11.txt | ||
| 34 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 11 --S4 --S1 --S3 > ../../outputs/enero/Run14_v11.txt | ||
| 35 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 11 --S4 --S2 --S3 > ../../outputs/enero/Run15_v11.txt | ||
| 36 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 11 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v11.txt | ||
| 37 | +#========================================variant12===================================== | ||
| 38 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 12 > ../../outputs/enero/Run1_v12.txt | ||
| 39 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 12 --S1 > ../../outputs/enero/Run2_v12.txt | ||
| 40 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 12 --S2 > ../../outputs/enero/Run3_v12.txt | ||
| 41 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 12 --S1 --S2 > ../../outputs/enero/Run4_v12.txt | ||
| 42 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 12 --S3 > ../../outputs/enero/Run5_v12.txt | ||
| 43 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 12 --S1 --S3 > ../../outputs/enero/Run6_v12.txt | ||
| 44 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 12 --S2 --S3 > ../../outputs/enero/Run7_v12.txt | ||
| 45 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 12 --S1 --S2 --S3 > ../../outputs/enero/Run8_v12.txt | ||
| 46 | +#=======================================S4 v12======================================= | ||
| 47 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 12 --S4 > ../../outputs/enero/Run9_v12.txt | ||
| 48 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 12 --S4 --S1 > ../../outputs/enero/Run10_v12.txt | ||
| 49 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 12 --S4 --S2 > ../../outputs/enero/Run11_v12.txt | ||
| 50 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 12 --S4 --S1 --S2 > ../../outputs/enero/Run12_v12.txt | ||
| 51 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 12 --S4 --S3 > ../../outputs/enero/Run13_v12.txt | ||
| 52 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 12 --S4 --S1 --S3 > ../../outputs/enero/Run14_v12.txt | ||
| 53 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 12 --S4 --S2 --S3 > ../../outputs/enero/Run15_v12.txt | ||
| 54 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 12 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v12.txt | ||
| 55 | +#========================================variant13===================================== | ||
| 56 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 13 > ../../outputs/enero/Run1_v13.txt | ||
| 57 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 13 --S1 > ../../outputs/enero/Run2_v13.txt | ||
| 58 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 13 --S2 > ../../outputs/enero/Run3_v13.txt | ||
| 59 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 13 --S1 --S2 > ../../outputs/enero/Run4_v13.txt | ||
| 60 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 13 --S3 > ../../outputs/enero/Run5_v13.txt | ||
| 61 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 13 --S1 --S3 > ../../outputs/enero/Run6_v13.txt | ||
| 62 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 13 --S2 --S3 > ../../outputs/enero/Run7_v13.txt | ||
| 63 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 13 --S1 --S2 --S3 > ../../outputs/enero/Run8_v13.txt | ||
| 64 | +#=======================================S4 v13======================================= | ||
| 65 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 13 --S4 > ../../outputs/enero/Run9_v13.txt | ||
| 66 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 13 --S4 --S1 > ../../outputs/enero/Run10_v13.txt | ||
| 67 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 13 --S4 --S2 > ../../outputs/enero/Run11_v13.txt | ||
| 68 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 13 --S4 --S1 --S2 > ../../outputs/enero/Run12_v13.txt | ||
| 69 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 13 --S4 --S3 > ../../outputs/enero/Run13_v13.txt | ||
| 70 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 13 --S4 --S1 --S3 > ../../outputs/enero/Run14_v13.txt | ||
| 71 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 13 --S4 --S2 --S3 > ../../outputs/enero/Run15_v13.txt | ||
| 72 | +python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 13 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v13.txt |
File moved
CRF/bin/training/nohup.out
0 → 100644
This diff could not be displayed because it is too large.
| 1 | # -*- coding: UTF-8 -*- | 1 | # -*- coding: UTF-8 -*- |
| 2 | 2 | ||
| 3 | import os | 3 | import os |
| 4 | -from itertools import chain | 4 | +#from itertools import chain |
| 5 | from optparse import OptionParser | 5 | from optparse import OptionParser |
| 6 | from time import time | 6 | from time import time |
| 7 | from collections import Counter | 7 | from collections import Counter | ... | ... |
| 1 | +# -*- coding: UTF-8 -*- | ||
| 2 | + | ||
| 3 | +import os # Access operative sistem | ||
| 4 | +#from itertools import chain # No se ocupa | ||
| 5 | +from optparse import OptionParser # Number of transitions | ||
| 6 | +from time import time # Return the time in seconds since the epoch as a float | ||
| 7 | +from collections import Counter # Dict subclass for counting hashable objects | ||
| 8 | +#import re # No se ocupa | ||
| 9 | + | ||
| 10 | +import nltk # Natural Language Toolkit platform to work with human language data | ||
| 11 | +import sklearn # Free software machine learning | ||
| 12 | +import scipy.stats # library of statistical functions | ||
| 13 | +import sys # to exit from Python. | ||
| 14 | + | ||
| 15 | +from sklearn.externals import joblib # provide lightweight pipelining | ||
| 16 | +from sklearn.metrics import make_scorer # Make a scorer from a performance metric or loss function | ||
| 17 | +from sklearn.cross_validation import cross_val_score # Evaluate a score by cross-validation | ||
| 18 | +from sklearn.grid_search import RandomizedSearchCV # Randomized search on hyper parameters | ||
| 19 | + | ||
| 20 | +import sklearn_crfsuite # Thin CRFsuite | ||
| 21 | +from sklearn_crfsuite import scorers # Added scorers.sequence_accuracy | ||
| 22 | +from sklearn_crfsuite import metrics # Add flat recall score to metrics | ||
| 23 | + | ||
| 24 | +from pandas import DataFrame as DF # Contruct dataframe object | ||
| 25 | +from nltk.corpus import stopwords # To exclude top words | ||
| 26 | + | ||
| 27 | +#------------------------------------------------------------------------------- | ||
| 28 | +# Objective | ||
| 29 | +# Training and evaluation of CRFs with sklearn-crfsuite. | ||
| 30 | +# | ||
| 31 | +# Input parameters | ||
| 32 | +# (1) --inputPath Path of training and test data set | ||
| 33 | +# (2) --outputPath Output path to place output files | ||
| 34 | +# (3) --trainingFile File with training data set | ||
| 35 | +# (4) --testFile File with test data set | ||
| 36 | +# (5) --reportName Number of run | ||
| 37 | +# (6) --variant Part of S2 variant | ||
| 38 | +# (7) --nrules Number of crf transitions | ||
| 39 | +# (8) --S1 Inner word features set | ||
| 40 | +# (9) --S2 Complete word features | ||
| 41 | +# (10) --S3 Extended context features | ||
| 42 | +# (11) --S4 Semantic features | ||
| 43 | +# (12) --excludeStopWords | ||
| 44 | +# (13) --excludeSymbols | ||
| 45 | + | ||
| 46 | +# Output | ||
| 47 | +# 1) Best model | ||
| 48 | +# 2) Report | ||
| 49 | + | ||
| 50 | +# Examples | ||
| 51 | +# python3 training_validation_v14.0.1.py | ||
| 52 | +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets | ||
| 53 | +# --trainingFile training-data-set-70-NER.txt | ||
| 54 | +# --testFile test-data-set-30-NER.txt | ||
| 55 | +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | ||
| 56 | +# --nrules 500 | ||
| 57 | +# --reportName Run1 | ||
| 58 | +# --variant 11 | ||
| 59 | +# --S1 | ||
| 60 | +# --S2 | ||
| 61 | +# --S3 | ||
| 62 | +# --S4 | ||
| 63 | + | ||
| 64 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt | ||
| 65 | + | ||
| 66 | +################################################################## | ||
| 67 | +# FEATURES # | ||
| 68 | +################################################################## | ||
| 69 | + | ||
| 70 | +#================== COMPLETE WORD FEATURES ======================# | ||
| 71 | + | ||
# Upper- and lower-case Greek letters (final sigma included) accepted by isGreek.
_GREEK_LETTERS = frozenset(
    'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'
    'αβγδεζηθικλμνξοπρςστυφχψω'
)


def isGreek(word):
    """Return True when ``word`` is exactly one Greek letter.

    The whole string is compared against the alphabet, so multi-character
    words and the empty string yield False.
    """
    return word in _GREEK_LETTERS
| 80 | + | ||
| 81 | +#================ INNER OF THE WORD FEATURES ====================# | ||
| 82 | + | ||
# Greek letters (both cases, final sigma included) searched for by hGreek.
_GREEK_CHARS = frozenset(
    'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'
    'αβγδεζηθικλμνξοπρςστυφχψω'
)


def hGreek(word):
    """Return True when ``word`` contains at least one Greek letter.

    An empty word yields False.
    """
    return any(char in _GREEK_CHARS for char in word)
| 92 | + | ||
def hNumber(word):
    """Return True when ``word`` contains at least one digit character.

    An empty word yields False.
    """
    return any(char.isdigit() for char in word)
| 99 | + | ||
def hUpper(word):
    """Return True when ``word`` contains at least one upper-case character.

    An empty word yields False.
    """
    return any(char.isupper() for char in word)
| 105 | + | ||
def hLower(word):
    """Return True when ``word`` contains at least one lower-case character.

    An empty word yields False.
    """
    return any(char.islower() for char in word)
| 111 | + | ||
| 112 | +#============================FEATURES===========================# | ||
| 113 | + | ||
def word2features(sent, i, S1, S2, S3, S4, v):
    """Build the feature dictionary for the token at position ``i`` of ``sent``.

    Every element of ``sent`` is a pipe-separated string. This function reads
    column 0 as the word, column 1 as the lemma, column 2 as the POS tag and
    column 3 as the NER annotation.

    Parameters:
        sent: sequence of pipe-separated token strings.
        i:    index of the token to describe.
        S1:   truthy to add inner-word features (plus the prefix features
              selected by ``v``).
        S2:   truthy to add complete-word features and neighbouring words.
        S3:   truthy to add lemma/POS of the tokens two positions away.
        S4:   truthy to add the NER column of the tokens one and two
              positions away.
        v:    variant number (10, 11, 12 or 13) choosing which prefix
              features S1 adds; any other value adds none.

    Returns:
        dict mapping feature names to values.
    """
    last = len(sent) - 1
    has_prev = i > 0
    has_next = i < last
    has_prev2 = i > 1
    has_next2 = i < last - 1

    def columns(offset):
        # Columns of the token `offset` positions away from the current one.
        return sent[i + offset].split('|')

    current = columns(0)
    word, lemma, postag = current[0], current[1], current[2]

    #=========================== G =============================#
    ## General features: lemma and POS tag of the token and of its
    ## immediate neighbours (always present).
    features = {
        'lemma': lemma,
        'postag': postag,
    }
    if has_prev:
        previous = columns(-1)
        features['-1:lemma'] = previous[1]
        features['-1:postag'] = previous[2]
    if has_next:
        following = columns(1)
        features['+1:lemma'] = following[1]
        features['+1:postag'] = following[2]

    #=========================== S1 ============================#
    ## Inner word features
    if S1:
        features.update({
            'hUpper': hUpper(word),
            'hLower': hLower(word),
            'hGreek': hGreek(word),
            # NOTE(review): the key is named 'symb' but the value is True
            # when the word is purely alphanumeric -- confirm the intent.
            'symb': word.isalnum(),
        })
        ## Variants: which strings contribute 1- and 2-character prefixes
        prefix_sources = {
            10: (('word', word),),
            11: (('lemma', lemma), ('postag', postag)),
            12: (('word', word), ('postag', postag)),
            13: (('lemma', lemma),),
        }
        for name, text in prefix_sources.get(v, ()):
            features[name + '[:1]'] = text[:1]
            # The 2-character prefix is only added when it differs in
            # length from the 1-character one.
            if len(text) > 1:
                features[name + '[:2]'] = text[:2]

    #=========================== S2 ============================#
    ## Complete word features
    if S2:
        features.update({
            'word': word,
            'isUpper': word.isupper(),
            'isLower': word.islower(),
            'isGreek': isGreek(word),
            'isNumber': word.isdigit(),
        })
        if has_prev:
            features['-1:word'] = columns(-1)[0]
        if has_next:
            features['+1:word'] = columns(1)[0]

    #=========================== S3 ============================#
    ## Extended context features: lemma and POS tag two positions away
    if S3:
        if has_prev2:
            two_back = columns(-2)
            features['-2:lemma'] = two_back[1]
            features['-2:postag'] = two_back[2]
        if has_next2:
            two_ahead = columns(2)
            features['+2:lemma'] = two_ahead[1]
            features['+2:postag'] = two_ahead[2]

    #=========================== S4 ============================#
    ## NER features of the surrounding tokens
    if S4:
        if has_prev:
            # Use the previous token's NER column; the earlier code stored
            # the current token's value under this key.
            features['-1:ner'] = columns(-1)[3]
        if has_next:
            features['+1:ner'] = columns(1)[3]
        if has_prev2:
            features['-2:ner'] = columns(-2)[3]
        if has_next2:
            features['+2:ner'] = columns(2)[3]

    return features
| 323 | + | ||
def sent2features(sent, S1, S2, S3, S4, v):
    """Return one feature dictionary per token position of ``sent``.

    Every position is delegated to ``word2features`` together with the
    feature-level switches (S1-S4) and the run variant ``v``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position, S1, S2, S3, S4, v))
    return per_token
| 327 | + | ||
def sent2labels(sent):
    """Return the tag of each token: its last '|'-separated field."""
    tags = []
    for annotated in sent:
        fields = annotated.split('|')
        tags.append(fields[-1])
    return tags
| 331 | + | ||
def sent2tokens(sent):
    """Return the surface word of every token in ``sent``.

    Tokens in this script are '|'-separated annotation strings whose first
    field is the word (the same layout ``sent2labels`` reads the tag from).
    The previous implementation unpacked each element as a
    ``(token, postag, label)`` triple, which raises ValueError for such
    strings. Sequence-style tokens are still accepted for backward
    compatibility: their first item is returned.
    """
    return [
        elem.split('|')[0] if isinstance(elem, str) else elem[0]
        for elem in sent
    ]
| 334 | + | ||
def print_transitions(trans_features, f):
    """Write one line per weighted label transition to the open file ``f``.

    ``trans_features`` is an iterable of ``((label_from, label_to), weight)``
    pairs; each is rendered as ``from -> to weight`` with six decimals.
    """
    row = "{:6} -> {:7} {:0.6f}\n"
    for labels, weight in trans_features:
        source, target = labels
        f.write(row.format(source, target, weight))
| 338 | + | ||
def print_state_features(state_features, f):
    """Write one line per weighted state feature to the open text file ``f``.

    ``state_features`` is an iterable of ``((attribute, label), weight)``
    pairs; each is rendered as ``weight label attribute``.

    The attribute is written as text. Encoding it to UTF-8 bytes before
    formatting (as was done previously) makes Python 3 emit the bytes repr,
    e.g. ``b'word'``, into the report. Byte attributes are decoded so they
    are rendered the same way as text ones.
    """
    for (attr, label), weight in state_features:
        text = attr.decode("utf-8") if isinstance(attr, bytes) else attr
        f.write("{:0.6f} {:8} {}\n".format(weight, label, text))
| 342 | + | ||
| 343 | + | ||
| 344 | +__author__ = 'egaytan' | ||
| 345 | + | ||
| 346 | +################################################################## | ||
| 347 | +# MAIN PROGRAM # | ||
| 348 | +################################################################## | ||
| 349 | + | ||
def _read_sentences(path, stop_words, symbols, exclude_stop_words, exclude_symbols):
    """Read a corpus file into a list of sentences (lists of token strings).

    Each line of the file is one sentence made of whitespace-separated
    tokens; each token is a '|'-separated annotation whose second field is
    the lemma. A token is dropped when its lemma is in ``stop_words`` (if
    ``exclude_stop_words``) or in ``symbols`` (if ``exclude_symbols``).
    A line whose tokens are all dropped still yields an empty sentence.
    """
    filtering = exclude_stop_words or exclude_symbols
    sentences = []
    with open(path, "r") as handle:
        for line in handle:
            kept = []
            for token in line.strip('\n').split():
                if filtering:
                    # The lemma is only needed (and only parsed) when filtering.
                    lemma = token.split('|')[1]
                    if exclude_stop_words and lemma in stop_words:
                        continue
                    if exclude_symbols and lemma in symbols:
                        continue
                kept.append(token)
            sentences.append(kept)
    return sentences


def _extremes(weights, limit):
    """Return the ``limit`` highest and ``limit`` lowest weighted entries.

    ``weights`` maps a feature to its weight. Both results are lists of
    ``(feature, weight)`` pairs in descending weight order. ``limit=None``
    returns every entry in both lists; ``limit=0`` returns two empty lists
    (a plain ``[-0:]`` slice would return everything instead).
    """
    ranked = Counter(weights).most_common()
    if limit is None:
        return ranked, ranked
    if limit == 0:
        return [], []
    return ranked[:limit], ranked[-limit:]


if __name__ == "__main__":
    ## Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
    parser.add_option("--reportName", dest="reportName", help="Report number run", metavar="FILE")
    parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
    parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
    parser.add_option("--excludeStopWords", dest="excludeStopWords", help="Exclude stop words", action="store_true", default=False)
    parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
    parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")

    (options, args) = parser.parse_args()

    # parser.error() prints the usage message and terminates the process,
    # so no explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional argument(s): " + " ".join(args))

    # Fail early with a usage message instead of a TypeError further down.
    required = ("inputPath", "outputPath", "trainingFile", "testFile", "variant")
    missing = ["--" + name for name in required if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): " + ", ".join(missing))

    # The variant is parsed once so the feature preview and the training and
    # test feature extraction all receive the same integer value.
    try:
        variant = int(options.variant)
    except ValueError:
        parser.error("--variant must be an integer, got: " + options.variant)

    if options.nrules is not None and options.nrules < 0:
        parser.error("--nrules must not be negative")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of test and training data sets: " + options.inputPath)
    print("Path of outputs: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("File with test data set: " + str(options.testFile))
    print("reportName: " + str(options.reportName))
    print("Exclude stop words: " + str(options.excludeStopWords))
    print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
    print("Run variant: " + str(options.variant))
    print("Number of rules on report file: " + str(options.nrules))

    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Exclude symbols: " + str(options.excludeSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    # A frozenset gives constant-time lemma lookups while filtering.
    # NOTE(review): this rebinds the module-level name `stopwords`, exactly as
    # the original list did; kept in case code outside this block reads the
    # name as a word collection - confirm and rename if nothing does.
    stopwords = frozenset(stopwords.words('english'))

    sentencesTrainingData = _read_sentences(
        os.path.join(options.inputPath, options.trainingFile),
        stopwords, symbols, options.excludeStopWords, options.excludeSymbols)
    print(" Sentences training data: " + str(len(sentencesTrainingData)))

    sentencesTestData = _read_sentences(
        os.path.join(options.inputPath, options.testFile),
        stopwords, symbols, options.excludeStopWords, options.excludeSymbols)
    print(" Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    print('-------------------------------- FEATURES --------------------------------')

    # Preview the features of one token (the third when available) of the
    # first sentence of each data set; short or missing sentences are
    # tolerated instead of raising IndexError.
    previews = (
        ('--------------------------Features Training ---------------------------', sentencesTrainingData),
        ('--------------------------- FeaturesTest -----------------------------', sentencesTestData),
    )
    for title, sentences in previews:
        print(title)
        first = sentences[0] if sentences else []
        tokenFeatures = sent2features(first, options.S1, options.S2, options.S3, options.S4, variant)
        if tokenFeatures:
            sample = tokenFeatures[min(2, len(tokenFeatures) - 1)]
            print(DF(list(sample.items())))

    t0 = time()

    X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, variant) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, variant) for s in sentencesTestData]
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Hyperparameter optimization: c1 and c2 are sampled by the randomized
    # search below rather than being fixed on the estimator.
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Fixed list of labels that are scored and reported; 'O' is not included.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=5,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=100,
                            scoring=f1_scorer,
                            random_state=42)

    rs.fit(X_train, y_train)

    nameReport = str(options.reportName) + '_v' + str(options.variant) + '.txt'
    reportPath = os.path.join(options.outputPath, "reports", "report_" + nameReport)

    # The report is created right after training, so the search results are
    # already on disk before the model is saved and evaluated.
    with open(reportPath, mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Continue with the best estimator found by the search
    crf = rs.best_estimator_

    # Saving model
    print(" Saving training model...")
    t1 = time()
    nameModel = 'model_' + str(options.reportName) + '_v' + str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) + '.mod'
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print(" Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    print("Prediction done in: %fs" % (time() - t0))

    # Rank each weight table once; the top and bottom lists come from the
    # same ordering.
    topTransitions, bottomTransitions = _extremes(crf.transition_features_, options.nrules)
    topStates, bottomStates = _extremes(crf.state_features_, options.nrules)

    with open(reportPath, mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        # Labels are ordered by their name without the first character,
        # then by that first character.
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )
        oFile.write(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(topTransitions, oFile)
        oFile.write('\n')

        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(bottomTransitions, oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(topStates, oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(bottomStates, oFile)
        oFile.write('\n')
| 553 | + |
File moved
-
Please register or login to post a comment