Estefani Gaytan Nunez

upload

...@@ -9,9 +9,11 @@ import random
  9   9  #
 10  10  # Input parameters
 11  11  # --inputPath=PATH          Path of input file
     12 +# --inputFile               CoreNLP output file with tagged sentences
 12  13  # --outputPath=PATH         Path to place output files
 13  14  # --trainingFile=testFile   Output training data set
 14  15  # --testFile=testFile       Output test data set
     16 +# --index                   Column limit for the CoreNLP output fields to keep
 15  17  #
 16  18  # Output
 17  19  # training and test data set
...@@ -23,7 +25,7 @@ import random
 23  25  # --trainingFile training-data-set-70_v4.txt
 24  26  # --testFile test-data-set-30_v4.txt
 25  27  # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
 26     -#
     28 +# --index 5
 27  29  #
 28  30  # python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
 29  31
......
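
For context, the --index option added above bounds how many CoreNLP output columns are kept when each token is re-joined with '|'. A minimal sketch of that column selection, assuming the usual CoreNLP CoNLL layout of index, word, lemma, POS and NER columns (the exact layout of the .conll files is not shown in this diff):

    # Hypothetical tab-separated CoreNLP CoNLL line: index, word, lemma, POS, NER
    conll_line = "1\tacetate\tacetate\tNN\tSupp"

    def join_columns(line, index):
        # Join columns 1..index-1 with '|', mirroring line.split('\t')[1:options.index]
        return '|'.join(line.split('\t')[1:index])

    print(join_columns(conll_line, 5))   # 'acetate|acetate|NN|Supp'
    print(join_columns(conll_line, 4))   # 'acetate|acetate|NN'
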
...@@ -11,6 +11,7 @@ from optparse import OptionParser
 11  11  # --outputFile=File   Output data set
 12  12  # --minWordLen        Minimum word length
 13  13  # --minSenLen         Minimum sentence length
     14 +# --index             Column limit for the CoreNLP output fields to keep
 14  15  #
 15  16  # Output
 16  17  # Tagged sentences reconstruction
...@@ -23,6 +24,7 @@ from optparse import OptionParser
 23  24  # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
 24  25  # --minWordLen 2
 25  26  # --minSenLen 1
     27 +# --index 5
 26  28  #
 27  29  # python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
 28  30
...@@ -39,7 +41,7 @@ if __name__ == "__main__":
 39  41      parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
 40  42      parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
 41  43      parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int")
 42     -
     44 +    parser.add_option("--index", dest="index", help="Column limit for the CoreNLP output fields to keep", metavar="N", type="int")
 43  45
 44  46      (options, args) = parser.parse_args()
 45  47      if len(args) > 0:
...@@ -58,13 +60,16 @@ if __name__ == "__main__":
 58  60      lista = []
 59  61      # First sentence
 60  62      sentence = ''
     63 +    # Sentence counter
     64 +    i = 0
 61  65      with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
 62  66          for line in input_file:
 63  67              if len(line.split('\t')) > 1:
 64  68                  w = line.split('\t')[1]
 65  69                  if w == "PGCGROWTHCONDITIONS":
     70 +                    i = i + 1
 66  71                      if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
 67     -                        print( "EXCLUDE: " + sentence.lstrip() )
     72 +                        print( "EXCLUDE: line " + str(i) + ": " + sentence.lstrip() )
 68  73                      else:
 69  74                          # End of sentence
 70  75                          lista.append(sentence.lstrip())
...@@ -74,7 +79,7 @@ if __name__ == "__main__":
 74  79                      sentence = ''
 75  80                  else:
 76  81                      # Build and save the tagged sentence
 77     -                    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
     82 +                    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]))
 78  83
 79  84      print("Number of sentences: " + str(n))
 80  85
......
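
Putting the hunks above together, the sentence-reconstruction loop accumulates pipe-joined tokens until it meets the PGCGROWTHCONDITIONS sentinel in the word column, then either stores or excludes the sentence using the --minSenLen / --minWordLen thresholds. A self-contained sketch of that logic, with illustrative names only (not part of the repository):

    def rebuild_sentences(conll_lines, index=5, min_sen_len=1, min_word_len=2):
        # Rebuild pipe-joined sentences from tab-separated CoreNLP lines (sketch).
        sentences, sentence, count = [], '', 0
        for line in conll_lines:
            cols = line.split('\t')
            if len(cols) <= 1:
                continue
            if cols[1] == "PGCGROWTHCONDITIONS":            # end-of-sentence sentinel
                count += 1
                words = sentence.lstrip().split(' ')
                first_word = words[0].split('|')[0]
                if len(words) <= min_sen_len and len(first_word) <= min_word_len:
                    print("EXCLUDE: line %d: %s" % (count, sentence.lstrip()))
                else:
                    sentences.append(sentence.lstrip())
                sentence = ''
            else:
                sentence += ' ' + '|'.join(cols[1:index])   # keep columns 1..index-1
        return sentences

    lines = ["1\tacetate\tacetate\tNN\tSupp", "2\tPGCGROWTHCONDITIONS\t_\t_\t_"]
    print(rebuild_sentences(lines))   # ['acetate|acetate|NN|Supp']
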
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from pandas import DataFrame as DF
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +
9 +import nltk
10 +import sklearn
11 +import scipy.stats
12 +import sys
13 +
14 +import joblib
15 +from sklearn.metrics import make_scorer
16 +from sklearn.model_selection import cross_val_score
17 +from sklearn.model_selection import RandomizedSearchCV
18 +
19 +import sklearn_crfsuite
20 +from sklearn_crfsuite import scorers
21 +from sklearn_crfsuite import metrics
22 +
23 +from nltk.corpus import stopwords
24 +
25 +import training_validation_v14 as training
26 +
27 +#-------------------------------------------------------------------------------
28 +# Objective
29 +# Tag transformed files with a CRF model using sklearn-crfsuite.
30 +#
31 +# Input parameters
32 +# --inputPath=PATH Path of transformed files x|y|z
33 +# --modelPath Path to CRF model
34 +# --modelName Model name
35 +# --outputPath=PATH Output path to place output files
36 +# --filterStopWords Filter stop words
37 +# --filterSymbols Filter punctuation marks
38 +
39 +# Output
40 +# 1) Tagged files in transformed format
41 +
42 +# Examples
43 +# python3 tagging.py
44 +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
45 +# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
46 +# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
47 +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
48 +# --filterSymbols
49 +
50 +# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
51 +
52 +__author__ = 'egaytan'
53 +
54 +##########################################
55 +# MAIN PROGRAM #
56 +##########################################
57 +
58 +if __name__ == "__main__":
59 + # Defining parameters
60 + parser = OptionParser()
61 + parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
62 + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
63 + parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
64 + parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
65 + parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
66 + parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
67 + parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
68 + parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
69 + parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
70 + parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
71 + parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)
72 +
73 + (options, args) = parser.parse_args()
74 + if len(args) > 0:
75 + parser.error("Unexpected positional arguments given.")
76 + sys.exit(1)
77 +
78 + print('-------------------------------- PARAMETERS --------------------------------')
79 + print("Path to read input files: " + options.inputPath)
80 + print("Mode name: " + str(options.modelName))
81 + print("Model path: " + options.modelPath)
82 + print("Path to place output files: " + options.outputPath)
83 + print("Filtering stop words: " + str(options.filterStopWords))
84 + print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
85 + print("Run variant: " + str(options.variant))
86 +
87 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
88 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
89 +
90 + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
91 +
92 + print('-------------------------------- PROCESSING --------------------------------')
93 +
94 + stopwords = [word for word in stopwords.words('english')]
95 +
96 + # Read CRF model
97 + t0 = time()
98 + print('Reading CRF model...')
99 + crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
100 + print("Reading CRF model done in: %fs" % (time() - t0))
101 +
102 + # Reading sentences
103 + print('Processing corpus...')
104 + t0 = time()
105 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
106 + # Walk directory to read files
107 + for path, dirs, files in os.walk(options.inputPath):
108 + # For each file in dir
109 + for file in files:
110 + print("Preprocessing file..." + str(file))
111 + sentencesInputData = []
112 + sentencesOutputData = []
113 + with open(os.path.join(options.inputPath, file), "r") as iFile:
114 + lines = iFile.readlines()
115 + for line in lines:
116 + listLine = []
117 + for token in line.strip('\n').split():
118 + if options.filterStopWords:
119 + listToken = token.split('|')
120 + lemma = listToken[1]
121 + if lemma in stopwords:
122 + continue
123 + if options.filterSymbols:
124 + listToken = token.split('|')
125 + lemma = listToken[1]
126 + if lemma in symbols:
127 + if lemma == ',':
128 + print("Coma , identificada")
129 + continue
130 + listLine.append(token)
131 + sentencesInputData.append(listLine)
132 + X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
133 + print("Sentences input data: " + str(len(sentencesInputData)))
134 +
135 +
136 + # Predicting tags
137 + t1 = time()
138 + print("Predicting tags with model")
139 + y_pred = crf.predict(X_input)
140 + print("Prediction done in: %fs" % (time() - t1))
141 +
142 +
143 + # Tagging with CRF model
144 + print("Tagging file")
145 + for line, tagLine in zip(lines, y_pred):
146 + Ltags = set(labels).intersection(set(tagLine))
147 + outputLine = ''
148 + line = line.strip('\n')
149 + #print("\nLine: " + str(line))
150 + #print ("CRF tagged line: " + str(tagLine))
151 + tb = 'O'
152 + i = 0
153 + if len(tagLine)==1:
154 + if tagLine[0] in labels:
155 + start = '<' + tagLine[0] + '> '
156 + end = '<' + tagLine[0] + '/>'
157 + word = line.split('|')[0] + ' '
158 + outputLine = start + word + end
159 + else:
160 + outputLine = line.split(' ')[0]
161 + #print(outputLine + '\t' + ', '.join(Ltags))
162 + sentencesOutputData.append([outputLine, ', '.join(Ltags)])
163 + continue
164 +
165 + for word,tag in zip(line.split(' '), tagLine):
166 + # start tagging
167 + if tag in labels and tb == 'O':
168 + # start tagging
169 + outputLine += '<' + tag + '> '
170 + tb = tag
171 + outputLine += word.split('|')[0] + ' '
172 + i += 1
173 + continue
174 + # end tagging
175 + elif tb in labels:
176 + if i+1==len(tagLine):
177 + # end tagging
178 + outputLine += word.split('|')[0] + ' '
179 + outputLine += '<' + tag + '/> '
180 + tb = 'O'
181 + i += 1
182 + continue
183 + elif tagLine[i+1]=='O':
184 + # end tagging
185 + outputLine += word.split('|')[0] + ' '
186 + outputLine += '<' + tag + '/> '
187 + tb = 'O'
188 + i += 1
189 + continue
190 + # word tagged
191 + outputLine += word.split('|')[0] + ' '
192 + i += 1
193 + #print(outputLine + '\t' + ', '.join(Ltags))
194 + sentencesOutputData.append([outputLine, ', '.join(Ltags)])
195 +
196 + print( DF(sentencesOutputData) )
197 +
198 + # Save tags
199 + '''
200 + with open(os.path.join(options.outputPath, file), "w") as oFile:
201 + for line in sentencesOutputData:
202 + oFile.write(line + '\n')
203 +
204 + print("Processing corpus done in: %fs" % (time() - t0))
205 +'''
206 +
207 +
208 +
209 +
210 +
211 +
212 +
213 +
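
To make the inline-tag convention that tagging.py writes out concrete, here is a small hedged sketch of how a tagged span ends up in the output; the sentence and the predicted tags are invented, and only the word|lemma|POS|NER token format and the <Label> ... <Label/> wrapping follow the code above:

    # Invented pipe-formatted tokens and a made-up CRF prediction for them
    words = ["antibody|antibody|NN|O", ":|:|:|O", "Flag|flag|NN|O"]
    tags  = ["Gtype", "Gtype", "Gtype"]            # shape of crf.predict() output

    out, open_tag = [], None
    for i, (word, tag) in enumerate(zip(words, tags)):
        if tag != 'O' and open_tag is None:        # open a tagged span
            out.append('<' + tag + '>')
            open_tag = tag
        out.append(word.split('|')[0])             # keep only the surface word
        last = (i == len(words) - 1)
        if open_tag and (last or tags[i + 1] == 'O'):
            out.append('<' + open_tag + '/>')      # close the span
            open_tag = None

    print(' '.join(out))   # <Gtype> antibody : Flag <Gtype/>
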
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from optparse import OptionParser
5 +from time import time
6 +from collections import Counter
7 +
8 +import nltk
9 +import sklearn
10 +import scipy.stats
11 +import sys
12 +
13 +#from sklearn.externals import joblib
14 +import joblib
15 +from sklearn.metrics import make_scorer
16 +#from sklearn.cross_validation import cross_val_score
17 +from sklearn.model_selection import cross_val_score
18 +#from sklearn.grid_search import RandomizedSearchCV
19 +from sklearn.model_selection import RandomizedSearchCV
20 +
21 +import sklearn_crfsuite
22 +from sklearn_crfsuite import scorers
23 +from sklearn_crfsuite import metrics
24 +
25 +from nltk.corpus import stopwords
26 +
27 +#################################
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os # Access the operating system
4 +#from itertools import chain # Not used
5 +from optparse import OptionParser # Command-line option parsing
6 +from time import time # Return the time in seconds since the epoch as a float
7 +from collections import Counter # Dict subclass for counting hashable objects
8 +#import re # Not used
9 +
10 +import nltk # Natural Language Toolkit platform to work with human language data
11 +import sklearn # Machine learning library (scikit-learn)
12 +import scipy.stats # library of statistical functions
13 +import sys # to exit from Python.
14 +
15 +import joblib # provide lightweight pipelining
16 +from sklearn.metrics import make_scorer # Make a scorer from a performance metric or loss function
17 +from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation
18 +from sklearn.model_selection import RandomizedSearchCV # Randomized search on hyper parameters
19 +
20 +import sklearn_crfsuite # Thin wrapper around CRFsuite
21 +from sklearn_crfsuite import scorers # Added scorers.sequence_accuracy
22 +from sklearn_crfsuite import metrics # Add flat recall score to metrics
23 +
24 +from pandas import DataFrame as DF # Construct DataFrame objects
25 +from nltk.corpus import stopwords # To exclude stop words
26 +
27 +#-------------------------------------------------------------------------------
28 +# Objective
29 +# Training and evaluation of CRFs with sklearn-crfsuite.
30 +#
31 +# Input parameters
32 +# (1) --inputPath Path of training and test data set
33 +# (2) --outputPath Output path to place output files
34 +# (3) --trainingFile File with training data set
35 +# (4) --testFile File with test data set
36 +# (5) --reportName Run name for the report
37 +# (6) --variant Feature variant number
38 +# (7) --nrules Number of crf transitions
39 +# (8) --S1 Inner word features set
40 +# (9) --S2 Complete word features
41 +# (10) --S3 Extended context features
42 +# (11) --S4 Semantic features
43 +# (12) --excludeStopWords Exclude stop words
44 +# (13) --excludeSymbols Exclude punctuation marks
45 +
46 +# Output
47 +# 1) Best model
48 +# 2) Report
49 +
50 +# Examples
51 +# python3 training_validation_v14.0.1.py
52 +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/input/
53 +# --trainingFile training-data-set-70-NER.txt
54 +# --testFile test-data-set-30-NER.txt
55 +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
56 +# --nrules 500
57 +# --reportName Run1
58 +# --variant 11
59 +# --S1
60 +# --S2
61 +# --S3
62 +# --S4
63 +
64 +# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt
65 +
66 +##################################################################
67 +# FEATURES #
68 +##################################################################
69 +
70 +#================== COMPLETE WORD FEATURES ======================#
71 +
72 +def isGreek(word):
73 + ## The complete word is a Greek letter
74 + alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
75 + 'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
76 + if word in alphabet:
77 + return True
78 + else:
79 + return False
80 +
81 +#================ INNER OF THE WORD FEATURES ====================#
82 +
83 +def hGreek(word):
84 + ## Check whether the word contains at least one Greek letter
85 + alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω','α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
86 + # Check each character against the Greek alphabet
87 + matches = [letter for letter in word if letter in alphabet]
88 + if (len(matches) > 0):
89 + return(True)
90 + else: return(False)
91 + ## At least a greek letter
92 +
93 +def hNumber(word):
94 + ## The word contains at least one digit
95 + for l in word:
96 + if l.isdigit():
97 + return True
98 + return False
99 +
100 +def hUpper(word):
101 + ## At least an upper letter
102 + for l in word:
103 + if l.isupper(): return True
104 + return False
105 +
106 +def hLower(word):
107 + ## At least a lower letter
108 + for l in word:
109 + if l.islower(): return True
110 + return False
111 +
112 +#============================FEATURES===========================#
113 +
114 +def word2features(sent, i, S1, S2, S3, S4, v): #SA, v
115 + ## Getting word features
116 +
117 + ## Saving CoreNLP annotations
118 + listElem = sent[i].split('|')
119 + ## Split CoreNLP output by columns
120 + word = listElem[0]
121 + lemma = listElem[1]
122 + postag = listElem[2]
123 + ner = listElem[3]
124 +
125 + #=========================== G =============================#
126 + ## NAME LEVEL G
127 + ## FEATURE TYPE General features
128 +
129 + ## Adding to features dictionary
130 + features = {
131 + ## basal features
132 + 'lemma': lemma,
133 + 'postag': postag
134 + }
135 +
136 + ## Anterior lemma and postag
137 + ## needs more than one word in the sentence
138 + if i > 0:
139 + ## Split CoreNLP output by columns
140 + listElem = sent[i - 1].split('|')
141 +
142 + ## Saving CoreNLP annotations
143 + lemma0 = listElem[1]
144 + postag0 = listElem[2]
145 + ## Adding features to dictionary
146 + features.update({
147 + # Previous lemma
148 + '-1:lemma': lemma0,
149 + # Previous POS tag
150 + '-1:postag': postag0,
151 + })
152 +
153 + ## Posterior lemma and postag
154 + ## is not the last word
155 + if i < len(sent) - 1:
156 + ## Posterior word
157 + listElem = sent[i + 1].split('|')
158 + ## Saving CoreNLP annotations
159 + lemma2 = listElem[1]
160 + postag2 = listElem[2]
161 + ## Adding to features dictionary
162 + features.update({
163 + # Next lemma
164 + '+1:lemma': lemma2,
165 + # Next POS tag
166 + '+1:postag': postag2,
167 + })
168 +
169 + #=========================== S1 =============================#
170 + ## NAME LEVEL S1
171 + ## FEATURE TYPE Inner word features
172 +
173 + if S1:
174 + ## Adding features to dictionary
175 + features.update({
176 + 'hUpper' : hUpper(word),
177 + 'hLower' : hLower(word),
178 + 'hGreek' : hGreek(word),
179 + 'symb' : word.isalnum()
180 + })
181 + #========== Variants of inner words features ============#
182 + if v == 10:
183 + #word first character
184 + features['word[:1]']= word[:1]
185 +
186 + #word second character
187 + if len(word)>1:
188 + features['word[:2]']= word[:2]
189 +
190 + if v == 11:
191 + #lemma and postag first character
192 + features['lemma[:1]']= lemma[:1]
193 + features['postag[:1]']= postag[:1]
194 +
195 + #lemma and postag second character
196 + if len(lemma)>1:
197 + features['lemma[:2]']= lemma[:2]
198 + if len(postag)>1:
199 + features['postag[:2]']= postag[:2]
200 +
201 + if v == 12:
202 + #word first character
203 + features['word[:1]']= word[:1]
204 +
205 + #word second character
206 + if len(word)>1:
207 + features['word[:2]']= word[:2]
208 +
209 + #postag first character
210 + features['postag[:1]']= postag[:1]
211 +
212 + #postag second character
213 + if len(postag)>1:
214 + features['postag[:2]']= postag[:2]
215 +
216 + if v == 13:
217 + #lemma first character
218 + features['lemma[:1]']= lemma[:1]
219 +
220 + #lemma second character
221 + if len(lemma)>1:
222 + features['lemma[:2]']= lemma[:2]
223 +
224 + #=========================== S2 =============================#
225 + ## NAME LEVEL S2
226 + ## FEATURE TYPE Complete word features
227 +
228 + if S2:
229 + #Add features to dictionary
230 + features.update({
231 + 'word' : word,
232 + 'isUpper' : word.isupper(),
233 + 'isLower' : word.islower(),
234 + 'isGreek' : isGreek(word),
235 + 'isNumber' : word.isdigit()
236 + })
237 + ## Anterior word
238 + ## sentence needs more than one word
239 + if i > 0:
240 + ## Split CoreNLP output by columns
241 + listElem = sent[i - 1].split('|')
242 + ## Saving CoreNLP annotations
243 + word0 = listElem[0]
244 + features['-1:word']= word0
245 +
246 + ## Posterior word
247 + ## is not the last word
248 + if i < len(sent)-1:
249 + ## Split CoreNLP output by columns
250 + listElem = sent[i + 1].split('|')
251 + ## Saving CoreNLP annotations
252 + word2 = listElem[0]
253 + features['+1:word']= word2
254 +
255 + #=========================== S3 =============================#
256 + ## NAME LEVEL S3
257 + ## FEATURE TYPE Extended context features
258 + if S3:
259 + ## more than two words in sentence
260 + if i > 1:
261 + ## Split CoreNLP output by columns
262 + listElem = sent[i - 2].split('|')
263 + ## Saving CoreNLP annotations
264 + ## two anterior lemma and postag
265 + lemma01 = listElem[1]
266 + postag01 = listElem[2]
267 + features['-2:lemma']= lemma01
268 + features['-2:postag']= postag01
269 +
270 + ## is not the penultimate word
271 + if i < len(sent) - 2:
272 + ## Split CoreNLP output by columns
273 + listElem = sent[i + 2].split('|')
274 + ## Saving CoreNLP annotations
275 + lemma02 = listElem[1]
276 + postag02 = listElem[2]
277 + ## two posterior lemma and postag
278 + features['+2:lemma']= lemma02
279 + features['+2:postag']= postag02
280 +
281 + #=========================== S4 =============================#
282 + ## NAME LEVEL S4
283 + ## FEATURE TYPE NER
284 + if S4:
285 + ## more than one word in sentence
286 + if i > 0:
287 + ## Split CoreNLP output by columns
288 + listElem = sent[i - 1].split('|')
289 + ## =============== Anterior ner ====================##
290 + ## Saving CoreNLP annotations according column position
291 + ner0 = listElem[3]
292 + ## Adding to features dictionary
293 + features['-1:ner'] = ner0
294 +
295 + ## is not the last word
296 + if i < len(sent) - 1:
297 + ## Split CoreNLP output by columns
298 + listElem = sent[i + 1].split('|')
299 + ## ============= Posterior ner ====================##
300 + ## Saving CoreNLP annotations according column position
301 + ner2 = listElem[3]
302 + ## Adding to features dictionary
303 + features['+1:ner'] = ner2
304 +
305 + if i > 1:
306 + ## Split CoreNLP output by columns
307 + listElem = sent[i - 2].split('|')
308 + ## Saving CoreNLP annotations
309 + ## =============== 2 Anterior ner =================##
310 + ner01 = listElem[3]
311 + features['-2:ner']= ner01
312 +
313 + ## is not the penultimate word
314 + if i < len(sent) - 2:
315 + ## Split CoreNLP output by columns
316 + listElem = sent[i + 2].split('|')
317 + ## Saving CoreNLP annotations
318 + ner02 = listElem[3]
319 + ## ============= 2 Posterior ner =================##
320 + features['+2:ner']= ner02
321 +
322 + return features
323 +
324 +def sent2features(sent, S1, S2, S3, S4, v):
325 + ## Iterate over the sentence and extract features for each word
326 + return [word2features(sent, i, S1, S2, S3, S4, v) for i in range(len(sent))]
327 +
328 +def sent2labels(sent):
329 + ## The label is the last pipe-separated field of each token
330 + return [elem.split('|')[-1] for elem in sent]
331 +
332 +def sent2tokens(sent):
333 + return [elem.split('|')[0] for elem in sent]
334 +
335 +def print_transitions(trans_features, f):
336 + for (label_from, label_to), weight in trans_features:
337 + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
338 +
339 +def print_state_features(state_features, f):
340 + for (attr, label), weight in state_features:
341 + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
342 +
343 +
344 +__author__ = 'egaytan'
345 +
346 +##################################################################
347 +# MAIN PROGRAM #
348 +##################################################################
349 +
350 +if __name__ == "__main__":
351 + ## Defining parameters
352 + parser = OptionParser()
353 + parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
354 + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
355 + parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
356 + parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
357 + parser.add_option("--reportName", dest="reportName", help="Report number run", metavar="FILE")
358 + parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
359 + parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
360 + parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
361 + parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
362 + parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
363 + parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
364 + parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
365 + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
366 +
367 + (options, args) = parser.parse_args()
368 + if len(args) > 0:
369 + parser.error("Unexpected positional arguments given.")
370 + sys.exit(1)
371 +
372 + print('-------------------------------- PARAMETERS --------------------------------')
373 + print("Path of test and training data sets: " + options.inputPath)
374 + print("Path of outputs: " + options.outputPath)
375 + print("File with training data set: " + str(options.trainingFile))
376 + print("File with test data set: " + str(options.testFile))
377 + print("reportName: " + str(options.reportName))
378 + print("Exclude stop words: " + str(options.excludeStopWords))
379 + print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
380 + print("Run variant: " + str(options.variant))
381 + print("Number of rules on report file: " + str(options.nrules))
382 +
383 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
384 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
385 + print("Exclude symbols: " + str(options.excludeSymbols))
386 +
387 + print('-------------------------------- PROCESSING --------------------------------')
388 + print('Reading corpus...')
389 + t0 = time()
390 +
391 + sentencesTrainingData = []
392 + sentencesTestData = []
393 +
394 + stopwords = [word for word in stopwords.words('english')]
395 +
396 + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
397 + for line in iFile.readlines():
398 + listLine = []
399 + line = line.strip('\n')
400 + for token in line.split():
401 + if options.excludeStopWords:
402 + listToken = token.split('|')
403 + lemma = listToken[1]
404 + if lemma in stopwords:
405 + continue
406 + if options.excludeSymbols:
407 + listToken = token.split('|')
408 + lemma = listToken[1]
409 + if lemma in symbols:
410 + continue
411 + listLine.append(token)
412 + sentencesTrainingData.append(listLine)
413 + print(" Sentences training data: " + str(len(sentencesTrainingData)))
414 +
415 + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
416 + for line in iFile.readlines():
417 + listLine = []
418 + line = line.strip('\n')
419 + for token in line.split():
420 + if options.excludeStopWords:
421 + listToken = token.split('|')
422 + lemma = listToken[1]
423 + if lemma in stopwords:
424 + continue
425 + if options.excludeSymbols:
426 + listToken = token.split('|')
427 + lemma = listToken[1]
428 + if lemma in symbols:
429 + continue
430 + listLine.append(token)
431 + sentencesTestData.append(listLine)
432 + print(" Sentences test data: " + str(len(sentencesTestData)))
433 +
434 + print("Reading corpus done in: %fs" % (time() - t0))
435 +
436 + print('-------------------------------- FEATURES --------------------------------')
437 +
438 + Dtraining = sent2features(sentencesTrainingData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
439 + Dtest = sent2features(sentencesTestData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
440 + print('--------------------------Features Training ---------------------------')
441 + print(DF(list(Dtraining.items())))
442 + print('--------------------------- FeaturesTest -----------------------------')
443 + print(DF(list(Dtest.items())))
444 +
445 + t0 = time()
446 +
447 + X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesTrainingData]
448 + y_train = [sent2labels(s) for s in sentencesTrainingData]
449 +
450 + X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesTestData]
451 + # print X_test
452 + y_test = [sent2labels(s) for s in sentencesTestData]
453 +
454 + '''
455 + Fixed parameters
456 + crf = sklearn_crfsuite.CRF(
457 + algorithm='lbfgs',
458 + c1=0.1,
459 + c2=0.1,
460 + max_iterations=100,
461 + all_possible_transitions=True
462 + )
463 + '''
464 + # Hyperparameter Optimization
465 + crf = sklearn_crfsuite.CRF(
466 + algorithm='lbfgs',
467 + max_iterations=100,
468 + all_possible_transitions=True
469 + )
470 + params_space = {
471 + 'c1': scipy.stats.expon(scale=0.5),
472 + 'c2': scipy.stats.expon(scale=0.05),
473 + }
474 +
475 + # Original: labels = list(crf.classes_)
476 + # Original: labels.remove('O')
477 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
478 +
479 + # use the same metric for evaluation
480 + f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
481 +
482 + # search
483 + rs = RandomizedSearchCV(crf, params_space,
484 + cv=5,
485 + verbose=3,
486 + n_jobs=-1,
487 + n_iter=100,
488 + scoring=f1_scorer,
489 + random_state=42)
490 +
491 + rs.fit(X_train, y_train)
492 +
493 + # Fixed parameters
494 + # crf.fit(X_train, y_train)
495 +
496 + # Best hyperparameters
497 + # crf = rs.best_estimator_
498 +
499 + nameReport = str(options.reportName) + '_v'+ str(options.variant) + '.txt'
500 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
501 + oFile.write("********** TRAINING AND TESTING REPORT **********\n")
502 + oFile.write("Training file: " + options.trainingFile + '\n')
503 + oFile.write('\n')
504 + oFile.write('best params:' + str(rs.best_params_) + '\n')
505 + oFile.write('best CV score:' + str(rs.best_score_) + '\n')
506 + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
507 +
508 + print("Training done in: %fs" % (time() - t0))
509 + t0 = time()
510 +
511 + # Update best crf
512 + crf = rs.best_estimator_
513 +
514 + # Saving model
515 + print(" Saving training model...")
516 + t1 = time()
517 + nameModel = 'model_' + str(options.reportName) + '_v'+ str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) +'.mod'
518 + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
519 + print(" Saving training model done in: %fs" % (time() - t1))
520 +
521 + # Evaluation against test data
522 + y_pred = crf.predict(X_test)
523 + print("*********************************")
524 + print("Prediction done in: %fs" % (time() - t0))
525 +
526 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
527 + oFile.write('\n')
528 + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
529 + oFile.write('\n')
530 + # labels = list(crf.classes_)
531 + sorted_labels = sorted(
532 + labels,
533 + key=lambda name: (name[1:], name[0])
534 + )
535 + oFile.write(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3))
536 + oFile.write('\n')
537 +
538 + oFile.write("\nTop likely transitions:\n")
539 + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
540 + oFile.write('\n')
541 +
542 + oFile.write("\nTop unlikely transitions:\n")
543 + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
544 + oFile.write('\n')
545 +
546 + oFile.write("\nTop positive:\n")
547 + print_state_features(Counter(crf.state_features_).most_common(options.nrules), oFile)
548 + oFile.write('\n')
549 +
550 + oFile.write("\nTop negative:\n")
551 + print_state_features(Counter(crf.state_features_).most_common()[-options.nrules:], oFile)
552 + oFile.write('\n')
553 +
This diff could not be displayed because it is too large.
1 +-------------------------------- PARAMETERS --------------------------------
2 +Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 +Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
4 +Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
5 +Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
6 +Filtering stop words: False
7 +Levels: S1: FalseS2: FalseS3: FalseS4: False
8 +Run variant: None
9 +Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
10 +-------------------------------- PROCESSING --------------------------------
11 +Reading CRF model...
12 +Reading CRF model done in: 0.008342s
13 +Processing corpus...
14 +Preprocessing file...annot-input_bg_v3.txt
15 +Sentences input data: 14716
16 +Predicting tags with model
17 +Prediction done in: 0.983480s
18 +Tagging file
19 + 0 1
20 +0 <Gtype> antibody : Flag <Gtype/> Gtype
21 +1 <Gversion> ChIP-Seq <Gversion/> Gversion
22 +2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
23 +3 <Gtype> developmental stage : mixed population... Gtype
24 +4 DNA was isolated using the Qiagen Cell Lysis a...
25 +5 Escherichia coli
26 +6 Escherichia coli AB1157
27 +7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
28 +8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
29 +9 Genome _ build : NC _ 000913.3
30 +10 Genome _ build : NC _ 011916.1
31 +11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
32 +12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
33 +13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
34 +14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
35 +15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
36 +16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
37 +17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
38 +18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
39 +19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
40 +20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
41 +21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
42 +22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
43 +23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
44 +24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
45 +25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
46 +26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
47 +27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
48 +28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
49 +29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
50 +... ... ...
51 +14686 <Phase> ESBL019 Coliform <Phase/> Phase
52 +14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
53 +14688 ESBL019 Reverted
54 +14689 <Phase> ESBL019 Transition <Phase/> Phase
55 +14690 Escherichia coli
56 +14691 Four morphologic states of ESBL019 were used d...
57 +14692 <Gtype> morphology : Coliform <Gtype/> Gtype
58 +14693 <Gtype> morphology : Filamented <Gtype/> Gtype
59 +14694 morphology : Reverted -LRB- reverted back from...
60 +14695 morphology : Transition -LRB- from Coli into F...
61 +14696 RNA isolation was performed using an RNeasy mi...
62 +14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
63 +14698 The E. coli isolate ESBL019 was originally iso...
64 +14699 Escherichia coli
65 +14700 lexA 10 ' after UV vs. 0 ' , MG1655
66 +14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
67 +14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
68 +14703 lexA 20 ' after UV vs. 0 ' , MG1655
69 +14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
70 +14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
71 +14706 lexA 40 ' after UV vs. 0 ' , MG1655
72 +14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
73 +14708 lexA 5 ' after UV vs. 0 ' , MG1655
74 +14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
75 +14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
76 +14711 lexA 60 ' after UV vs. 0 ' , MG1655
77 +14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
78 +14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
79 +14714 lexA vs. wt , before UV treatment , MG1655
80 +14715 untreated cells , 25 ug total RNA
81 +
82 +[14716 rows x 2 columns]