Estefani Gaytan Nunez

upload

@@ -10,35 +10,33 @@ import matplotlib.pyplot as plt
# Drawn figures of grid reports
#
# Input parameters
# --inputPath=PATH Path of input files
# --outputPath=PATH Path to place output figures
# --inputPath Path of input files
# --outputPath Path to place output figures
# --figureName figure name for a single report, or the first part of the name for multiple figures
# --inputFile Use it for a single report
# --version CRF-script version of reports
# --join flag to draw all figures together
#
# Output
# Grid-report score figures (and an optional score table)
#
# Examples
# python figures-reports.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/nov13
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/nov13
# --figureName FiguresGrid
# --inputFile report_Run1_v11.txt
# --version v11
# --join
# python figures-reports.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/ --figureName FiguresGrid_v1 --inputFile report_Run1_v11.txt --version v11
# python figures-reports.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/nov13 --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/nov13 --figureName FiguresGrid --join
__author__ = 'egaytan'
####################################################################################
# FUNCTIONS #
####################################################################################
def Filter(rfile, options, v):
## Keep a report file when --inputFile is 'all' and the file matches the requested version, or when it is listed explicitly
if options[0]=='all':
if rfile[0:6]=='report' and rfile[-7:-4]==v: return(True)
elif rfile in options:
return(True)
return(False)
def savescreen(output, dic, path):
## Dump the score dictionary as a tab-separated .csv when --table is given
if output:
DF.from_dict(dic).to_csv(path+'.csv', sep = "\t", index = True)
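## --- Illustrative sketch (assumption, not part of the original script) ---
## Filter() expects report files named like report_Run1_v11.txt, and the MAIN
## section below collects one dict of metrics per run, which savescreen() writes
## as a tab-separated .csv. Hypothetical values showing that shape:
# Filter('report_Run1_v11.txt', ['all'], 'v11') -> True
# Filter('report_Run1_v11.txt', ['report_Run2_v11.txt'], 'v11') -> False
exampleScores = {'Run1': {'CV': 0.83, 'precision': 0.90, 'recall': 0.78, 'f1-score': 0.84},
                 'Run2': {'CV': 0.85, 'precision': 0.91, 'recall': 0.80, 'f1-score': 0.85}}
# DF(exampleScores).T has one row per run and one column per metric, which is what the
# plotting code below indexes as scoresTable['CV'], scoresTable['precision'], etc.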
####################################################################################
# MAIN PROGRAM #
@@ -50,8 +48,7 @@ if __name__ == '__main__':
parser.add_option('--inputPath', dest='inputPath', help='Path of the report files', metavar='PATH')
parser.add_option('--outputPath', dest='outputPath', help='Path to place output figures', metavar='PATH')
parser.add_option('--figureName', dest='figureName', help='Specific or first part of figurename', metavar='FILE')
parser.add_option('--version', dest='version', help='script version', metavar='FILE')
parser.add_option('--inputFile', dest='inputFile', help='Use it for specific report files', metavar='FILE', default='all,')
parser.add_option('--table', dest='table', help='save score-table', action='store_true', default=False)
(options, args) = parser.parse_args()
if len(args) > 0:
@@ -61,21 +58,13 @@ if __name__ == '__main__':
print('-------------------------------- PARAMETERS --------------------------------')
print('Path of the report files: ' + str(options.inputPath))
print('Path to place output figures: ' + str(options.outputPath))
print('Specific or first part of figurename: ' + str(options.figureName))
print('CRF-script version: ' + str(options.version))
print('Figurename: ' + str(options.figureName))
print('-------------------------------- PROCESSING --------------------------------')
reportFileList = [ rfile for rfile in os.listdir(options.inputPath) if rfile[0:7] == "report_"]
print(','.join(reportFileList))
rawInputReportsList = str(options.inputFile).split(',')
reportFileList = [ rfile for rfile in os.listdir(options.inputPath) if Filter(rfile, rawInputReportsList, str(options.version)) ]
for inputFile in reportFileList:
scores = df(dict)
#CV={}
print('Report files: ' + str(options.inputFile ))
print('\n'.join(reportFileList))
print('----------------------------------- NOTE -----------------------------------')
print('\n-------- All chosen report files should be in the given inputPath ----------\n')
print('------------------------------- SAVING DATA --------------------------------\n')
for report in reportFileList:
with open(os.path.join(options.inputPath, report), 'r') as File:
string = File.read()
@@ -86,31 +75,23 @@ if __name__ == '__main__':
scores[report[7:11]]['f1-score']=summaryScores[2]
print(DF(scores).T)
print('------------------------------- SAVING TABLE --------------------------------\n')
with open(os.path.join(options.inputPath, str(options.figureName) ), 'w') as File:
scoresTable = DF(scores).T
imageName=os.path.join(options.outputPath, options.figureName)
ylab = "score",
print('------------------------------- SAVING DATA --------------------------------')
print('Saving score-table: ' + str(options.table))
imageName = os.path.join(options.outputPath, options.figureName)
savescreen(options.table, scores, imageName)
fig = plt.figure()
plt.grid(False)
plt.rcParams.update({'font.size': 15})
fig.set_figheight(13)
fig.set_figwidth(20)
plt.ylim(0.7, 1.1)
plt.xlabel("Runs")
plt.ylabel("score")
plt.xticks(range(8),scoresTable["CV"].index)
plt.plot(scoresTable['CV'], "--", color="red", label="CV")
plt.plot(scoresTable['precision'], color="blue", label="precision")
plt.plot(scoresTable['f1-score'], color="orange", label="F1")
plt.plot(scoresTable['recall'], color="g", label="recall")
plt.rcParams.update()
plt.grid()
plt.plot(scoresTable['precision'],'o--', label='precision', linewidth=3, markersize=15)
plt.plot(scoresTable['f1-score'], 'o--', label='F1', linewidth=3, markersize=15)
plt.plot(scoresTable['recall'], 'o--', label='recall' , linewidth=3, markersize=15)
plt.plot(scoresTable['CV'], 'o--', label='CV' , linewidth=3, markersize=15)
plt.legend(loc='lower right')
plt.tight_layout()
fig.savefig(imageName, pad_inches=0.5)
plt.xticks(range(8),['run1', 'run2', 'run3', 'run4', 'run5', 'run6', 'run7', 'run8'])
fig.savefig(imageName, bbox_inches='tight', pad_inches = 0.5)
......
# Based on http://www.sthda.com/english/wiki/ggplot2-line-plot-quick-start-guide-r-software-and-data-visualization
library(ggplot2)
#library(ggpubr)
#library(cowplot)
######### BEST MODELS ##########
# Run1
# All conditions
dfa <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=15),
Strategy=rep(c(
"Agit",
"Gversion",
"Substrain",
"Vess",
"OD",
"Anti",
"Supp",
"Air",
"Gtype",
"Med",
"Temp",
"Technique",
"Phase",
"pH",
"Strain"
),3),
Score=c(
0,
0,
0,
0,
1,
1,
0.883,
0.92,
0.905,
0.852,
0.818,
0.88,
1,
1,
1,
0,
0,
0,
0,
0.405,
0.444,
0.669,
0.742,
0.811,
0.912,
1,
1,
0.947,
1,
1,
0,
0,
0,
0,
0.577,
0.615,
0.762,
0.821,
0.856,
0.881,
0.9,
0.936,
0.973,
1,
1
))
# Only conditions with F1-score > 0
# Run 1
df <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=11),
Strategy=rep(c(
"OD",
"Anti",
"Supp",
"Air",
"Gtype",
"Med",
"Temp",
"Technique",
"Phase",
"pH",
"Strain"
),3),
Score=c(
1,
1,
0.883,
0.92,
0.905,
0.852,
0.818,
0.88,
1,
1,
1,
0.405,
0.444,
0.669,
0.742,
0.811,
0.912,
1,
1,
0.947,
1,
1,
0.577,
0.615,
0.762,
0.821,
0.856,
0.881,
0.9,
0.936,
0.973,
1,
1
))
head(df)
pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) +
geom_line(aes(color=Measure))+
geom_point(aes(color=Measure))+
scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+
#scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+
#geom_text(aes(label = Score))+
labs(title="Scores by condition (Best model, Run1)",x="Condition", y = "Score")+
theme(
legend.position="top",
# To center the title: plot.title = element_text(hjust = 0.5),
axis.line = element_line(colour = "gray"),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()
)
pa
ggsave(".png")
# Only conditions with F1-score > 0
# Run 7
df <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=11),
Strategy=rep(c(
"Anti",
"OD",
"Supp",
"Air",
"Gtype",
"Temp",
"Med",
"Technique",
"Phase",
"pH",
"Strain"
),3),
Score=c(
0.571,
1,
0.886,
0.939,
0.876,
0.818,
0.897,
0.952,
1,
1,
1,
0.444,
0.405,
0.684,
0.742,
0.802,
1,
0.912,
0.909,
0.947,
1,
1,
0.5,
0.577,
0.772,
0.829,
0.837,
0.9,
0.904,
0.93,
0.973,
1,
1
))
head(df)
pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) +
geom_line(aes(color=Measure))+
geom_point(aes(color=Measure))+
scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+
#scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+
#geom_text(aes(label = Score))+
labs(title="Scores by condition (Best model, Run7)",x="Condition", y = "Score")+
theme(
legend.position="top",
# To center the title: plot.title = element_text(hjust = 0.5),
axis.line = element_line(colour = "gray"),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()
)
pa
ggsave(".png")
# Based on http://www.sthda.com/english/wiki/ggplot2-line-plot-quick-start-guide-r-software-and-data-visualization
library(ggplot2)
#library(ggpubr)
#library(cowplot)
######### BEST MODEL ##########
# Only conditions with F1-score > 0
# Run 6 (report_Run6_v11.txt)
df <- data.frame(Measure=rep(c("Precision", "Recall", "F1-score"), each=11),
Strategy=rep(c(
"Air",
"Anti",
"Gtype",
"Med",
"OD",
"pH",
"Phase",
"Supp",
"Technique",
"Temp",
"Vess"
),3),
Score=c(
0.565,
1,
0.889,
1,
1,
1,
0.882,
0.811,
1,
0.923,
1,
0.377,
1,
0.847,
0.943,
0.818,
1,
1,
0.799,
0.913,
0.828,
1,
0.452,
1,
0.867,
0.971,
0.9,
1,
0.938,
0.805,
0.955,
0.873,
1
))
head(df)
pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) +
geom_line(aes(color=Measure))+
geom_point(aes(color=Measure))+
scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+
#scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+
#geom_text(aes(label = Score))+
labs(title="Scores by condition (Best model, Run1)",x="Condition", y = "Score")+
theme(
legend.position="top",
# To center the title: plot.title = element_text(hjust = 0.5),
axis.line = element_line(colour = "gray"),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()
)
pa
ggsave(".png")
# Based on http://zevross.com/blog/2019/04/02/easy-multi-panel-plots-in-r-using-facet_wrap-and-facet_grid-from-ggplot2/
library(ggplot2)
#library(ggpubr)
#library(cowplot)
organism = 'ECO'
if (organism == 'ECO') {
######### ECO DEVELOPMENT DATASET ##########
# ECO-DEV-WITH-EFFECT-COMBINATION: Combination of strategies with effect in E. coli development dataset
df <- data.frame(Panel=rep(c("Combination of strategies (effect)", "Separated strategies (effect)", "Combination of strategies (no effect)", "Separated strategies (no effect)"), each=12),
Measure=rep(c("Precision", "Recall", "F1-score"), each=4),
Strategy=c(rep(c("D", "D+V", "D+V+At", "D+V+At+Au"),3),rep(c("D", "V", "At", "Au"),3)),
Score=c(
0.78, 0.79, 0.81, 0.81, 0.41, 0.56, 0.63, 0.63, 0.53, 0.65, 0.71, 0.71,
0.78, 0.89, 0.93, 1.00, 0.41, 0.35, 0.13, 0.01, 0.53, 0.50, 0.23, 0.02,
0.82, 0.82, 0.84, 0.84, 0.55, 0.66, 0.72, 0.72, 0.66, 0.73, 0.78, 0.78,
0.82, 0.88, 0.94, 1.00, 0.55, 0.39, 0.20, 0.01, 0.66, 0.54, 0.33, 0.02))
filename = "ECO-dev-multi-panel.png"
title_plot = "E. coli development dataset"
} else if (organism == 'STM')
{
######### STM DEVELOPMENT DATASET ##########
# STM-DEV-WITH-EFFECT-COMBINATION: Combination of strategies with effect in Salmonella evaluation dataset
df <- data.frame(Panel=rep(c("Combination of strategies (effect)", "Separated strategies (effect)", "Combination of strategies (no effect)", "Separated strategies (no effect)"), each=12),
Measure=rep(c("Precision", "Recall", "F1-score"), each=4),
Strategy=c(rep(c("D", "D+V", "D+V+At", "D+V+At+Au"),3),rep(c("D", "V", "At", "Au"),3)),
Score=c(
0.78, 0.77, 0.76, 0.76, 0.33, 0.49, 0.54, 0.54, 0.47, 0.60, 0.63, 0.63,
0.78, 0.81, 0.70, 0.88, 0.33, 0.33, 0.10, 0.01, 0.47, 0.47, 0.18, 0.02,
0.84, 0.82, 0.81, 0.81, 0.47, 0.59, 0.65, 0.65, 0.60, 0.68, 0.72, 0.72,
0.84, 0.84, 0.77, 0.86, 0.47, 0.40, 0.17, 0.01, 0.60, 0.55, 0.27, 0.02))
filename = "STM-dev-multi-panel.png"
title_plot = "Salmonella evaluation dataset"
}
head(df)
pa<-ggplot(df, aes(x=Strategy, y=Score, group=Measure)) +
geom_line(aes(color=Measure))+
geom_point(aes(color=Measure))+
scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9"))+
#scale_color_manual(values=c("#e6194b", "#3cb44b", "#0082c8"))+
geom_text(aes(label = Score))+
labs(title=title_plot,x="Strategies", y = "Score")+
#theme_classic()+
theme(
legend.position="top",
# To center the title: plot.title = element_text(hjust = 0.5),
axis.line = element_line(colour = "gray"),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()
)+
facet_wrap(~Panel, scale="free")
ggsave(filename)
#========================================variant10=====================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 10 --S1 > ../../outputs/enero/Run2_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 10 --S2 > ../../outputs/enero/Run3_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 10 --S1 --S2 > ../../outputs/enero/Run4_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 10 --S3 > ../../outputs/enero/Run5_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 10 --S1 --S3 > ../../outputs/enero/Run6_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 10 --S2 --S3 > ../../outputs/enero/Run7_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 10 --S1 --S2 --S3 > ../../outputs/enero/Run8_v10.txt
#=======================================S4 v10=======================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 10 --S4 > ../../outputs/enero/Run9_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 10 --S4 --S1 > ../../outputs/enero/Run10_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 10 --S4 --S2 > ../../outputs/enero/Run11_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 10 --S4 --S1 --S2 > ../../outputs/enero/Run12_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 10 --S4 --S3 > ../../outputs/enero/Run13_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 10 --S4 --S1 --S3 > ../../outputs/enero/Run14_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 10 --S4 --S2 --S3 > ../../outputs/enero/Run15_v10.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 10 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v10.txt
#========================================variant11=====================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 11 > ../../outputs/enero/Run1_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 11 --S1 > ../../outputs/enero/Run2_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 11 --S2 > ../../outputs/enero/Run3_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 11 --S1 --S2 > ../../outputs/enero/Run4_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 11 --S3 > ../../outputs/enero/Run5_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 11 --S1 --S3 > ../../outputs/enero/Run6_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 11 --S2 --S3 > ../../outputs/enero/Run7_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 11 --S1 --S2 --S3 > ../../outputs/enero/Run8_v11.txt
#=======================================S4 v11=======================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 11 --S4 > ../../outputs/enero/Run9_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 11 --S4 --S1 > ../../outputs/enero/Run10_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 11 --S4 --S2 > ../../outputs/enero/Run11_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 11 --S4 --S1 --S2 > ../../outputs/enero/Run12_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 11 --S4 --S3 > ../../outputs/enero/Run13_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 11 --S4 --S1 --S3 > ../../outputs/enero/Run14_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 11 --S4 --S2 --S3 > ../../outputs/enero/Run15_v11.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 11 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v11.txt
#========================================variant12=====================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 12 > ../../outputs/enero/Run1_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 12 --S1 > ../../outputs/enero/Run2_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 12 --S2 > ../../outputs/enero/Run3_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 12 --S1 --S2 > ../../outputs/enero/Run4_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 12 --S3 > ../../outputs/enero/Run5_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 12 --S1 --S3 > ../../outputs/enero/Run6_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 12 --S2 --S3 > ../../outputs/enero/Run7_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 12 --S1 --S2 --S3 > ../../outputs/enero/Run8_v12.txt
#=======================================S4 v12=======================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 12 --S4 > ../../outputs/enero/Run9_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 12 --S4 --S1 > ../../outputs/enero/Run10_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 12 --S4 --S2 > ../../outputs/enero/Run11_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 12 --S4 --S1 --S2 > ../../outputs/enero/Run12_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 12 --S4 --S3 > ../../outputs/enero/Run13_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 12 --S4 --S1 --S3 > ../../outputs/enero/Run14_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 12 --S4 --S2 --S3 > ../../outputs/enero/Run15_v12.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 12 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v12.txt
#========================================variant13=====================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 13 > ../../outputs/enero/Run1_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run2 --variant 13 --S1 > ../../outputs/enero/Run2_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run3 --variant 13 --S2 > ../../outputs/enero/Run3_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run4 --variant 13 --S1 --S2 > ../../outputs/enero/Run4_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run5 --variant 13 --S3 > ../../outputs/enero/Run5_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run6 --variant 13 --S1 --S3 > ../../outputs/enero/Run6_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run7 --variant 13 --S2 --S3 > ../../outputs/enero/Run7_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run8 --variant 13 --S1 --S2 --S3 > ../../outputs/enero/Run8_v13.txt
#=======================================S4 v13=======================================
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run9 --variant 13 --S4 > ../../outputs/enero/Run9_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run10 --variant 13 --S4 --S1 > ../../outputs/enero/Run10_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run11 --variant 13 --S4 --S2 > ../../outputs/enero/Run11_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run12 --variant 13 --S4 --S1 --S2 > ../../outputs/enero/Run12_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run13 --variant 13 --S4 --S3 > ../../outputs/enero/Run13_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run14 --variant 13 --S4 --S1 --S3 > ../../outputs/enero/Run14_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run15 --variant 13 --S4 --S2 --S3 > ../../outputs/enero/Run15_v13.txt
python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run16 --variant 13 --S4 --S1 --S2 --S3 > ../../outputs/enero/Run16_v13.txt
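# The 64 commands above follow one pattern: for each feature variant (10-13), Run1..Run16
# enumerate every combination of the --S1..--S4 switches (run = 1 + S1 + 2*S2 + 4*S3 + 8*S4).
# A minimal Python sketch (an aside, not one of the files in this commit) that regenerates
# the command grid, assuming the same paths and file names used above (flag order may differ):
from itertools import product

base = ("python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py"
        " --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets"
        " --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt"
        " --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500")

for variant in (10, 11, 12, 13):
    for s4, s3, s2, s1 in product((0, 1), repeat=4):
        run = 1 + s1 + 2*s2 + 4*s3 + 8*s4
        flags = "".join(f" --S{i}" for i, on in ((1, s1), (2, s2), (3, s3), (4, s4)) if on)
        print(f"{base} --reportName Run{run} --variant {variant}{flags}"
              f" > ../../outputs/enero/Run{run}_v{variant}.txt")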
This diff could not be displayed because it is too large.
# -*- coding: UTF-8 -*-
import os
from itertools import chain
#from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
......
# -*- coding: UTF-8 -*-
import os # Access to the operating system
#from itertools import chain # Not used
from optparse import OptionParser # Command-line option parser
from time import time # Return the time in seconds since the epoch as a float
from collections import Counter # Dict subclass for counting hashable objects
#import re # Not used
import nltk # Natural Language Toolkit platform to work with human language data
import sklearn # Machine learning library (scikit-learn)
import scipy.stats # library of statistical functions
import sys # to exit from Python.
from sklearn.externals import joblib # provide lightweight pipelining
from sklearn.metrics import make_scorer # Make a scorer from a performance metric or loss function
from sklearn.cross_validation import cross_val_score # Evaluate a score by cross-validation
from sklearn.grid_search import RandomizedSearchCV # Randomized search on hyper parameters
import sklearn_crfsuite # Thin CRFsuite (python-crfsuite) wrapper with a scikit-learn API
from sklearn_crfsuite import scorers # Scorers such as sequence_accuracy
from sklearn_crfsuite import metrics # Sequence metrics (flat F1, flat recall, ...)
from pandas import DataFrame as DF # Construct DataFrame objects
from nltk.corpus import stopwords # To exclude stop words
#-------------------------------------------------------------------------------
# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# (1) --inputPath Path of training and test data set
# (2) --outputPath Output path to place output files
# (3) --trainingFile File with training data set
# (4) --testFile File with test data set
# (5) --reportName Report name (run identifier)
# (6) --variant Variant of the inner-word features (S1 block)
# (7) --nrules Number of CRF transitions/state features shown in the report
# (8) --S1 Inner word features set
# (9) --S2 Complete word features
# (10) --S3 Extended context features
# (11) --S4 Semantic features
# (12) --excludeStopWords
# (13) --excludeSymbols
# Output
# 1) Best model
# 2) Report
# Examples
# python3 training_validation_v14.0.1.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
# --trainingFile training-data-set-70-NER.txt
# --testFile test-data-set-30-NER.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --nrules 500
# --reportName Run1
# --variant 11
# --S1
# --S2
# --S3
# --S4
# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt
##################################################################
# FEATURES #
##################################################################
#================== COMPLETE WORD FEATURES ======================#
def isGreek(word):
## The complete word is a Greek letter
alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
if word in alphabet:
return True
else:
return False
#=================== INNER WORD FEATURES ========================#
def hGreek(word):
## The word contains at least one Greek letter
alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω','α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
# Check every character against the Greek alphabet list
matches = [letter for letter in word if letter in alphabet]
if (len(matches) > 0):
return(True)
else: return(False)
def hNumber(word):
## The word contains at least one digit
for l in word:
if l.isdigit():
return True
return False
def hUpper(word):
## At least one uppercase letter
for l in word:
if l.isupper(): return True
return False
def hLower(word):
## At least one lowercase letter
for l in word:
if l.islower(): return True
return False
#============================FEATURES===========================#
def word2features(sent, i, S1, S2, S3, S4, v): #SA, v
## Getting word features
## Saving CoreNLP annotations
listElem = sent[i].split('|')
## Split CoreNLP output by columns
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
ner = listElem[3]
#=========================== G =============================#
## NAME LEVEL G
## FEATURE TYPE General features
## Adding to features dictionary
features = {
## basal features
'lemma': lemma,
'postag': postag
}
## Anterior lemma and postag
## the sentence needs more than one word
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## Saving CoreNLP annotations
lemma0 = listElem[1]
postag0 = listElem[2]
## Adding features to dictionary
features.update({
# Previous lemma
'-1:lemma': lemma0,
# Previous postag
'-1:postag': postag0,
})
## Posterior lemma and postag
## is not the last word
if i < len(sent) - 1:
## Posterior word
listElem = sent[i + 1].split('|')
## Saving CoreNLP annotations
lemma2 = listElem[1]
postag2 = listElem[2]
## Adding to features dictionary
features.update({
# Next lemma
'+1:lemma': lemma2,
# Next postag
'+1:postag': postag2,
})
#=========================== S1 =============================#
## NAME LEVEL S1
## FEATURE TYPE Inner word features
if S1:
## Adding features to dictionary
features.update({
'hUpper' : hUpper(word),
'hLower' : hLower(word),
'hGreek' : hGreek(word),
'symb' : word.isalnum()
})
#========== Variants of inner words features ============#
if v == 10:
#word first character
features['word[:1]']= word[:1]
#word second character
if len(word)>1:
features['word[:2]']= word[:2]
if v == 11:
#lemma and postag first character
features['lemma[:1]']= lemma[:1]
features['postag[:1]']= postag[:1]
#lemma and postag second character
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
if len(postag)>1:
features['postag[:2]']= postag[:2]
if v == 12:
#word first character
features['word[:1]']= word[:1]
#word second character
if len(word)>1:
features['word[:2]']= word[:2]
#postag first character
features['postag[:1]']= postag[:1]
#postag second character
if len(postag)>1:
features['postag[:2]']= postag[:2]
if v == 13:
#lemma first character
features['lemma[:1]']= lemma[:1]
#lemma second character
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
#=========================== S2 =============================#
## NAME LEVEL S2
## FEATURE TYPE Complete word features
if S2:
#Add features to dictionary
features.update({
'word' : word,
'isUpper' : word.isupper(),
'isLower' : word.islower(),
'isGreek' : isGreek(word),
'isNumber' : word.isdigit()
})
## Anterior word
## the sentence needs more than one word
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## Saving CoreNLP annotations
word0 = listElem[0]
features['-1:word']= word0
## Posterior word
## is not the last word
if i < len(sent)-1:
## Split CoreNLP output by columns
listElem = sent[i + 1].split('|')
## Saving CoreNLP annotations
word2 = listElem[0]
features['+1:word']= word2
#=========================== S3 =============================#
## NAME LEVEL S3
## FEATURE TYPE Extended context features
if S3:
## more than two words in sentence
if i > 1:
## Split CoreNLP output by columns
listElem = sent[i - 2].split('|')
## Saving CoreNLP annotations
## two anterior lemma and postag
lemma01 = listElem[1]
postag01 = listElem[2]
features['-2:lemma']= lemma01
features['-2:postag']= postag01
## is not the penultimate word
if i < len(sent) - 2:
## Split CoreNLP output by columns
listElem = sent[i + 2].split('|')
## Saving CoreNLP annotations
lemma02 = listElem[1]
postag02 = listElem[2]
## two posterior lemma and postag
features['+2:lemma']= lemma02
features['+2:postag']= postag02
#=========================== S4 =============================#
## NAME LEVEL S4
## FEATURE TYPE NER
if S4:
## more than one word in sentence
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## =============== Anterior ner ====================##
## Saving CoreNLP annotations according column position
ner0 = listElem[3]
## Adding to features dictionary
features['-1:ner'] = ner0
## is not the last word
if i < len(sent) - 1:
## Split CoreNLP output by columns
listElem = sent[i + 1].split('|')
## ============= Posterior ner ====================##
## Saving CoreNLP annotations according column position
ner2 = listElem[3]
## Adding to features dictionary
features['+1:ner'] = ner2
if i > 1:
## Split CoreNLP output by columns
listElem = sent[i - 2].split('|')
## Saving CoreNLP annotations
## =============== 2 Anterior ner =================##
ner01 = listElem[3]
features['-2:ner']= ner01
## is not the penultimate word
if i < len(sent) - 2:
## Split CoreNLP output by columns
listElem = sent[i + 2].split('|')
## Saving CoreNLP annotations
ner02 = listElem[3]
## ============= 2 Posterior ner =================##
features['+2:ner']= ner02
return features
def sent2features(sent, S1, S2, S3, S4, v):
## Itering in sentence for each word and saving its features
return [word2features(sent, i, S1, S2, S3, S4, v) for i in range(len(sent))]
def sent2labels(sent):
## Save tag, last position by word tokens
return [elem.split('|')[-1] for elem in sent]
def sent2tokens(sent):
## Save the word, first position of each token
return [elem.split('|')[0] for elem in sent]
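## --- Illustrative sketch (assumption, not taken from the original script) ---
## Each token is expected as a pipe-separated string, word|lemma|postag|ner|tag:
## word2features() reads columns 0-3 and sent2labels() reads the last column.
## A hypothetical two-token sentence and the calls made on it:
exampleSent = ['LB|lb|NN|Med|Med', 'medium|medium|NN|O|O']
exampleX = sent2features(exampleSent, True, False, False, False, 10) # S1 features only, variant 10
exampleY = sent2labels(exampleSent)                                  # ['Med', 'O']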
def print_transitions(trans_features, f):
for (label_from, label_to), weight in trans_features:
f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
def print_state_features(state_features, f):
for (attr, label), weight in state_features:
f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
__author__ = 'egaytan'
##################################################################
# MAIN PROGRAM #
##################################################################
if __name__ == "__main__":
## Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
parser.add_option("--reportName", dest="reportName", help="Report number run", metavar="FILE")
parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path of test and training data sets: " + options.inputPath)
print("Path of outputs: " + options.outputPath)
print("File with training data set: " + str(options.trainingFile))
print("File with test data set: " + str(options.testFile))
print("reportName: " + str(options.reportName))
print("Exclude stop words: " + str(options.excludeStopWords))
print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
print("Run variant: " + str(options.variant))
print("Number of rules on report file: " + str(options.nrules))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Exclude symbols: " + str(options.excludeSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
t0 = time()
sentencesTrainingData = []
sentencesTestData = []
stopwords = [word for word in stopwords.words('english')]
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
continue
listLine.append(token)
sentencesTrainingData.append(listLine)
print(" Sentences training data: " + str(len(sentencesTrainingData)))
with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
continue
listLine.append(token)
sentencesTestData.append(listLine)
print(" Sentences test data: " + str(len(sentencesTestData)))
print("Reading corpus done in: %fs" % (time() - t0))
print('-------------------------------- FEATURES --------------------------------')
Dtraining = sent2features(sentencesTrainingData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
Dtest = sent2features(sentencesTestData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
print('-------------------------- Features Training --------------------------')
print(DF(list(Dtraining.items())))
print('--------------------------- Features Test -----------------------------')
print(DF(list(Dtest.items())))
t0 = time()
X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesTrainingData]
y_train = [sent2labels(s) for s in sentencesTrainingData]
X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesTestData]
# print X_test
y_test = [sent2labels(s) for s in sentencesTestData]
'''
Fixed parameters
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=100,
all_possible_transitions=True
)
'''
# Hyperparameter Optimization
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
max_iterations=100,
all_possible_transitions=True
)
params_space = {
'c1': scipy.stats.expon(scale=0.5),
'c2': scipy.stats.expon(scale=0.05),
}
# Original: labels = list(crf.classes_)
# Original: labels.remove('O')
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
# search
rs = RandomizedSearchCV(crf, params_space,
cv=5,
verbose=3,
n_jobs=-1,
n_iter=100,
scoring=f1_scorer,
random_state=42)
rs.fit(X_train, y_train)
# Fixed parameters
# crf.fit(X_train, y_train)
# Best hyperparameters
# crf = rs.best_estimator_
nameReport = str(options.reportName) + '_v'+ str(options.variant) + '.txt'
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
oFile.write("********** TRAINING AND TESTING REPORT **********\n")
oFile.write("Training file: " + options.trainingFile + '\n')
oFile.write('\n')
oFile.write('best params:' + str(rs.best_params_) + '\n')
oFile.write('best CV score:' + str(rs.best_score_) + '\n')
oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
print("Training done in: %fs" % (time() - t0))
t0 = time()
# Update best crf
crf = rs.best_estimator_
# Saving model
print(" Saving training model...")
t1 = time()
nameModel = 'model_' + str(options.reportName) + '_v'+ str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) +'.mod'
joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
print(" Saving training model done in: %fs" % (time() - t1))
# Evaluation against test data
y_pred = crf.predict(X_test)
print("*********************************")
print("Prediction done in: %fs" % (time() - t0))
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
oFile.write('\n')
oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
oFile.write('\n')
# labels = list(crf.classes_)
sorted_labels = sorted(
labels,
key=lambda name: (name[1:], name[0])
)
oFile.write(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3))
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
print_state_features(Counter(crf.state_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop negative:\n")
print_state_features(Counter(crf.state_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
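## --- Illustrative sketch (assumption, not part of the original script) ---
## Reusing a model saved above with joblib.dump() on a new CoreNLP-formatted sentence.
## The function name, model path and example token are hypothetical; the real file name
## follows the nameModel pattern built above from the chosen options.
def tagNewSentence(modelPath, sent, S1=False, S2=False, S3=False, S4=False, v=10):
    ## Load a saved CRF model and predict the tag sequence for one pipe-formatted sentence
    crfLoaded = joblib.load(modelPath)
    X_new = [sent2features(sent, S1, S2, S3, S4, v)]
    return crfLoaded.predict(X_new)[0]
# Example (hypothetical path):
# tagNewSentence('/home/egaytan/automatic-extraction-growth-conditions/CRF/models/model_Run1_v10.mod', ['LB|lb|NN|Med|O'])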