Showing 5 changed files with 1092 additions and 1 deletions
CRF/bin/label-split_training_test_v2.py
0 → 100644
#!/bin/python3
from optparse import OptionParser
import os
import random
import sys


# Objective
# Label tokens separated by '|' and split the sentences from CoreNLP tagging
# 70/30 into training and test files; build the data sets using only
# sentences with at least one true tag.
#
# Input parameters
# --inputPath=PATH         Path of input file
# --outputPath=PATH        Path to place output files
# --trainingFile=FILE      Output training data set
# --testFile=FILE          Output test data set
#
# Output
# Training and test data sets
#
# Examples
# python label-split_training_test_v2.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile sentences.tsv_pakal_.conll
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
#
#
# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagged sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagged sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')
    # Opening tags: entering one of these switches the current label on
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti'
    }
    # Closing and ignored tags: these switch the label back to 'O'
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '<Agit>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Anti>': 'O',
        '<Vess>': 'O',
        '</Vess>': 'O'}

    # Label for tokens outside any entity
    flag = 'O'
    # Sentences with at least one true tag
    lista = []
    # First sentence
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging: switch the current label
                    if w in in_labels: flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        words = sentence.split(' ')
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # Keep only sentences with at least one true tag
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    else:
                        # Append the token as word|lemma|pos|label
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')

    print("Number of sentences: " + str(len(lista)))


    # Split the sentences 70/30 into training and test sets
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
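For reference, a minimal, self-contained sketch of the token encoding and the 70/30 split this script performs. The sentence strings below are invented for illustration; real tokens come from the CoreNLP .conll output.

import random

# Each token is encoded as word|lemma|pos|label; a sentence is a
# space-joined string of such tokens (hypothetical examples).
sentences = [
    'grown|grow|VBN|O in|in|IN|O LB|lb|NN|Med',
    'OD600|od600|NN|OD of|of|IN|O 0.3|0.3|CD|OD',
    'wild-type|wild-type|JJ|Gtype strain|strain|NN|O',
]

# 70/30 split by index, mirroring the script's random.sample approach
trainingIndex = random.sample(range(len(sentences)), int(len(sentences) * .70))
training = [sentences[i] for i in trainingIndex]
test = [s for i, s in enumerate(sentences) if i not in trainingIndex]
print(len(training), len(test))  # prints: 2 1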
CRF/bin/label-split_training_test_v2.py.save
0 → 100644
#!/bin/python3
from optparse import OptionParser
import os
import random
import sys


# Objective
# Label tokens separated by '|' and split the sentences from CoreNLP tagging
# 70/30 into training and test files; build the data sets using only
# sentences with at least one true tag.
#
# Input parameters
# --inputPath=PATH         Path of input file
# --outputPath=PATH        Path to place output files
# --trainingFile=FILE      Output training data set
# --testFile=FILE          Output test data set
#
# Output
# Training and test data sets
#
# Examples
# python label-split_training_test_v2.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile sentences.tsv_pakal_.conll
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
#
#
# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagged sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagged sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')
    # Opening tags: entering one of these switches the current label on
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Vess>': 'Vess'
    }
    # Closing and ignored tags: these switch the label back to 'O'
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Vess>': 'O'}

    # Label for tokens outside any entity
    flag = 'O'
    # Sentence counter
    n = 0
    # Sentences with at least one true tag
    lista = []
    # First sentence
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging: switch the current label
                    if w in in_labels: flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        words = sentence.split(' ')
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # Keep only sentences with at least one true tag
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                        n = n + 1
                    else:
                        # Build the tagged sentence token by token as word|lemma|pos|label
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')

    print("Number of sentences: " + str(n) + "; with at least one true tag: " + str(len(lista)))


    # Split the sentences 70/30 into training and test sets
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
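The filter that keeps only sentences containing at least one true tag can be checked in isolation; a tiny sketch with invented tokens and the label values from this file's in_labels:

in_labels_values = ['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess']

sentence = 'grown|grow|VBN|O in|in|IN|O LB|lb|NN|Med'
tags = [tok for tok in sentence.split(' ') if tok.split('|')[-1] in in_labels_values]
print(len(tags) > 0)  # True: this sentence would be kept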
@@ -299,7 +299,7 @@ if __name__ == "__main__":

     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-    labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess'])
+    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])

     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
CRF/bin/training_validation_v4.py
0 → 100644
# -*- coding: UTF-8 -*-

import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import re

import nltk
import sklearn
import scipy.stats
import sys

# Note: on recent scikit-learn releases, joblib is the standalone 'joblib'
# package and cross_val_score / RandomizedSearchCV live in sklearn.model_selection.
from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords


# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH       Path of training and test data sets
# --trainingFile         File with training data set
# --testFile             File with test data set
# --outputPath=PATH      Output path to place output files

# Output
# 1) Best model

# Examples
# python training_validation_v4.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
# python3.4 training_validation_v4.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/

#################################
#           FUNCTIONS           #
#################################

def isGreek(word):
    # True if the token is a single Greek letter
    alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
                'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
    return word in alphabet

def word2features(sent, i):
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]

    features = {
        # Suffixes
        #'word[-3:]': word[-3:],
        #'word[-2:]': word[-2:],
        #'word[-1:]': word[-1:],
        #'word.isupper()': word.isupper(),
        'word': word,
        'lemma': lemma,
        #'postag': postag,
        #'lemma[-3:]': lemma[-3:],
        #'lemma[-2:]': lemma[-2:],
        #'lemma[-1:]': lemma[-1:],
        #'lemma[+3:]': lemma[:3],
        #'lemma[+2:]': lemma[:2],
        #'lemma[+1:]': lemma[:1],
        #'word[:3]': word[:3],
        #'word[:2]': word[:2],
        #'word[:1]': word[:1],
        #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
        'isNumber()': word.isdigit(),
        'isGreek(){}'.format(isGreek(word)): isGreek(word),
        'isupper()': word.isupper(),
        'islower()': word.islower()
    }
    if i > 0:
        # Features of the previous token
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
        })

    if i < len(sent) - 1:
        # Features of the next token
        listElem = sent[i + 1].split('|')
        #word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
        })

    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })

    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })

    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })

        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


def sent2labels(sent):
    return [elem.split('|')[3] for elem in sent]


def sent2tokens(sent):
    return [elem.split('|')[0] for elem in sent]


def print_transitions(trans_features, f):
    for (label_from, label_to), weight in trans_features:
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))


def print_state_features(state_features, f):
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))


__author__ = 'CMendezC'

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--excludeStopWords", default=False,
                      action="store_true", dest="excludeStopWords",
                      help="Exclude stop words")
    parser.add_option("--excludeSymbols", default=False,
                      action="store_true", dest="excludeSymbols",
                      help="Exclude punctuation marks")
    parser.add_option("--reportFile", dest="reportFile",
                      help="Report file", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Exclude stop words: " + str(options.excludeStopWords))
    print("Report file: " + str(options.reportFile))

    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    #print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
    print("Exclude symbols: " + str(options.excludeSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    sentencesTrainingData = []
    sentencesTestData = []

    stopwords = stopwords.words('english')

    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.strip('\n')
            for token in line.split():
                if options.excludeStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in stopwords:
                        continue
                if options.excludeSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        continue
                listLine.append(token)
            sentencesTrainingData.append(listLine)
    print("   Sentences training data: " + str(len(sentencesTrainingData)))

    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.strip('\n')
            for token in line.split():
                if options.excludeStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in stopwords:
                        continue
                if options.excludeSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        continue
                listLine.append(token)
            sentencesTestData.append(listLine)
    print("   Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    print(sent2features(sentencesTrainingData[0])[0])
    print(sent2features(sentencesTestData[0])[0])
    t0 = time()

    X_train = [sent2features(s) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s) for s in sentencesTestData]
    # print X_test
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # Fixed parameters
    # crf.fit(X_train, y_train)

    # Best hyperparameters
    # crf = rs.best_estimator_
    #nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.txt')
    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.reportFile))
    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print("   Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols) + '.mod')
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("   Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    # labels = list(crf.classes_)
    # labels.remove('O')

    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        # labels = list(crf.classes_)
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )
        oFile.write(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted_labels, digits=3
        ))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
        oFile.write('\n')

        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(Counter(crf.state_features_).most_common(200), oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
        oFile.write('\n')
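To make the feature template above concrete, a small standalone sketch of (roughly) the dict word2features builds for one token. The tagged sentence is invented, and the isGreek-formatted key is omitted for brevity:

# Hypothetical sentence in the word|lemma|pos|label format the script reads
sent = ['LB|lb|NN|Med', 'medium|medium|NN|O']

word, lemma, postag = sent[0].split('|')[0:3]
features = {
    'word': word,
    'lemma': lemma,
    'isNumber()': word.isdigit(),
    'isupper()': word.isupper(),
    'islower()': word.islower(),
}
# Context features from the following token, as in word2features
lemma1, postag1 = sent[1].split('|')[1:3]
features.update({'+1:lemma': lemma1, '+1:postag': postag1})
print(features)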
CRF/bin/training_validation_v5.py
0 → 100644
# -*- coding: UTF-8 -*-

import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import re

import nltk
import sklearn
import scipy.stats
import sys

# Note: on recent scikit-learn releases, joblib is the standalone 'joblib'
# package and cross_val_score / RandomizedSearchCV live in sklearn.model_selection.
from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords


# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH       Path of training and test data sets
# --trainingFile         File with training data set
# --testFile             File with test data set
# --outputPath=PATH      Output path to place output files
# --reportFile           Report filename

# Output
# 1) Best model

# Examples
# python training_validation_v5.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
# --reportFile report_1
# python3.4 training_validation_v5.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/

#################################
#           FUNCTIONS           #
#################################

def isGreek(word):
    # True if the token is a single Greek letter
    alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
                'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
    return word in alphabet

def hUpper(word):
    # True if the word has at least one uppercase letter
    for l in word:
        if l.isupper(): return True
    return False

def hLower(word):
    # True if the word has at least one lowercase letter
    for l in word:
        if l.islower(): return True
    return False

def hGreek(word):
    # True if the word has at least one Greek letter
    for l in word:
        if isGreek(l): return True
    return False


def word2features(sent, i, S1, S2):
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]
    ner = listElem[3]  # gold label; not used as a feature

    features = {
        # General
        'lemma': lemma,
        'postag': postag
    }

    if S1:
        # S1 feature set
        features['word'] = word
        features['hUpper'] = hUpper(word)
        features['hLower'] = hLower(word)
        features['hGreek'] = hGreek(word)
        #features['hAlfNum'] = hAlfNum(word)

    if S2:
        # S2 feature set
        features['isUpper'] = word.isupper()
        features['isLower'] = word.islower()
        features['isGreek'] = isGreek(word)
        features['isNumber'] = word.isdigit()

    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            # Previous word
            '-1:word': word1,
            # Previous lemma
            '-1:lemma': lemma1,
            # Previous POS tag
            '-1:postag': postag1,
        })

    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            # Next word
            '+1:word': word1,
            # Next lemma
            '+1:lemma': lemma1,
            # Next POS tag
            '+1:postag': postag1,
        })
    return features


def sent2features(sent, S1, S2):
    return [word2features(sent, i, S1, S2) for i in range(len(sent))]


def sent2labels(sent):
    return [elem.split('|')[3] for elem in sent]


def sent2tokens(sent):
    return [elem.split('|')[0] for elem in sent]


def print_transitions(trans_features, f):
    for (label_from, label_to), weight in trans_features:
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))


def print_state_features(state_features, f):
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))


__author__ = 'CMendezC'

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--excludeStopWords", default=False,
                      action="store_true", dest="excludeStopWords",
                      help="Exclude stop words")
    parser.add_option("--excludeSymbols", default=False,
                      action="store_true", dest="excludeSymbols",
                      help="Exclude punctuation marks")
    parser.add_option("--reportFile", dest="reportFile",
                      help="Report file", metavar="FILE")
    parser.add_option("--S1", default=False,
                      action="store_true", dest="S1",
                      help="Level specificity: disable the S1 feature set")
    parser.add_option("--S2", default=False,
                      action="store_true", dest="S2",
                      help="Level specificity: disable the S2 feature set")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Exclude stop words: " + str(options.excludeStopWords))
    print("Levels: " + str(options.S1) + " " + str(options.S2))
    print("Report file: " + str(options.reportFile))


    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Exclude symbols: " + str(options.excludeSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    sentencesTrainingData = []
    sentencesTestData = []

    stopwords = stopwords.words('english')

    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.strip('\n')
            for token in line.split():
                if options.excludeStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in stopwords:
                        continue
                if options.excludeSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        continue
                listLine.append(token)
            sentencesTrainingData.append(listLine)
    print("   Sentences training data: " + str(len(sentencesTrainingData)))

    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.strip('\n')
            for token in line.split():
                if options.excludeStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in stopwords:
                        continue
                if options.excludeSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        continue
                listLine.append(token)
            sentencesTestData.append(listLine)
    print("   Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    # Both feature sets are included by default; passing --S1/--S2
    # excludes the corresponding set.
    if options.S1: S1 = 0
    else: S1 = 1
    if options.S2: S2 = 0
    else: S2 = 1

    print(sent2features(sentencesTrainingData[0], S1, S2)[0])
    print(sent2features(sentencesTestData[0], S1, S2)[0])
    t0 = time()

    X_train = [sent2features(s, S1, S2) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s, S1, S2) for s in sentencesTestData]
    # print X_test
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # Fixed parameters
    # crf.fit(X_train, y_train)

    # Best hyperparameters
    # crf = rs.best_estimator_
    nameReport = options.trainingFile.replace('.txt', str(options.reportFile) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print("   Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols) + '.mod')
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("   Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    # labels = list(crf.classes_)
    # labels.remove('O')

    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        # labels = list(crf.classes_)
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )
        oFile.write(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted_labels, digits=3
        ))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
        oFile.write('\n')

        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(Counter(crf.state_features_).most_common(200), oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
        oFile.write('\n')
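Once a model has been dumped under models/, it can be reloaded for prediction. A minimal sketch; the model filename below is hypothetical (use whatever nameModel the training run actually produced), and on recent scikit-learn the import would be the standalone joblib package rather than sklearn.externals:

from sklearn.externals import joblib  # or: import joblib (newer scikit-learn)

# Hypothetical path to a model dumped by the script above
crf = joblib.load('CRF/models/training-data-set-70_v2.fStopWords_False.fSymbols_False.mod')

# Features must be built exactly as at training time (sent2features with the
# same S1/S2 switches); only the 'General' lemma/postag features are shown here.
sentence = ['acetate|acetate|NN|O', 'minimal|minimal|JJ|O', 'medium|medium|NN|O']
X = [[{'lemma': tok.split('|')[1], 'postag': tok.split('|')[2]} for tok in sentence]]
print(crf.predict(X))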