Showing 5 changed files with 1157 additions and 0 deletions
.idea/vcs.xml
0 → 100644
prepare-abstracts.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +from time import time | ||
7 | +import re | ||
8 | + | ||
9 | +__author__ = 'CMendezC' | ||
10 | + | ||
11 | +# Objective: Take text-annotated-abstracts-original.txt as input, | ||
12 | +# split the abstracts into separate files without tags, and collect a dictionary of genes | ||
13 | +# for tagging after the NLP pipeline. | ||
14 | + | ||
15 | +# Parameters: | ||
16 | +# 1) --inputPath Input path. | ||
17 | +# 2) --inputFile Input file. | ||
18 | +# 3) --outputPath Output path | ||
19 | + | ||
20 | +# Execution: | ||
21 | +#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original | ||
22 | + | ||
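A minimal sketch of the input line format this script assumes, inferred from the regular expressions used below; the PMID and abstract text here are invented for illustration:

import re
rePmid = re.compile(r'(\d+)\|a\|')
reGene = re.compile(r'<g>([^<]+)</g>')
# Hypothetical input line from text-annotated-abstracts-original.txt
line = "12345678|a|The <g>fhlA</g> gene encodes a transcriptional activator."
print(rePmid.match(line).group(1))  # -> 12345678
print(reGene.findall(line))         # -> ['fhlA']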
23 | +if __name__ == "__main__": | ||
24 | + # Parameter definition | ||
25 | + parser = OptionParser() | ||
26 | + parser.add_option("--inputPath", dest="inputPath", | ||
27 | + help="Input path", metavar="PATH") | ||
28 | + parser.add_option("--inputFile", dest="inputFile", | ||
29 | + help="Input file", metavar="FILE") | ||
30 | + parser.add_option("--outputPath", dest="outputPath", | ||
31 | + help="Output path", metavar="PATH") | ||
32 | + | ||
33 | + (options, args) = parser.parse_args() | ||
34 | + if len(args) > 0: | ||
35 | + parser.error("None parameters indicated.") | ||
36 | + sys.exit(1) | ||
37 | + | ||
38 | + # Printing parameter values | ||
39 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
40 | + print("Input path: " + str(options.inputPath)) | ||
41 | + print("Input file", str(options.inputFile)) | ||
42 | + print("Output path: " + str(options.outputPath)) | ||
43 | + | ||
44 | + filesWritten = 0 | ||
45 | + t0 = time() | ||
46 | + hashGenes = {} | ||
47 | + | ||
48 | + rePmid = re.compile(r'(\d+)\|a\|') | ||
49 | + reGene = re.compile(r'<g>([^<]+)</g>') | ||
50 | + with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: | ||
51 | + print("Reading file..." + options.inputFile) | ||
52 | + for line in iFile: | ||
53 | + line = line.strip('\n') | ||
54 | + for gene in reGene.findall(line): | ||
55 | + print("genes: {}".format(gene)) | ||
56 | + hashGenes[gene] = hashGenes.get(gene, 0) + 1 | ||
57 | + result = rePmid.match(line) | ||
58 | + if result: | ||
59 | + with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: | ||
60 | + # Write the abstract without the <g>...</g> tags | ||
61 | + oFile.write(reGene.sub(r'\1', line)) | ||
62 | + | ||
preparing-training-validation-test.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +from time import time | ||
7 | +import json | ||
8 | +from nltk.corpus import stopwords | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | +# Objective: Take a transformed file with format word|lemma|tag, | ||
13 | +# for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP | ||
14 | +# and create a file with an additional tag for CRF training. For example: | ||
15 | +# the|the|DT N-terminal|N-terminal|NN domain|domain|NN --> | ||
16 | +# the|the|DT|O N-terminal|N-terminal|NN|DPOS domain|domain|NN|O | ||
17 | +# Additionally, we tag frequent words belonging to each aspect with the corresponding aspect tag (DOM or RP). | ||
18 | +# We expect these words to receive one aspect tag in some contexts and a different one in others. | ||
19 | +# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here. | ||
20 | +# In the output file we keep the word, the lemma, the tag, and the aspect tag. | ||
21 | +# This additional tagging gives us clues for aspect classification. | ||
22 | + | ||
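A minimal sketch of the per-token tagging rule implemented in the main loop below; the helper name and the dictionary contents are hypothetical, and hashFreqWords stands for the frequent-word set of the current class c:

def aspect_tag(word, lemma, tag, c, hashTagAspect, hashFreqWords):
    # A term tag is replaced by its aspect tag only when that aspect matches the class c
    if tag in hashTagAspect:
        return hashTagAspect[tag] if hashTagAspect[tag] == c else 'O'
    # Otherwise, frequent words of the class are tagged with the class itself
    if word.lower() in hashFreqWords or lemma in hashFreqWords:
        return c
    return 'O'

# Example with invented dictionaries:
# aspect_tag('domain', 'domain', 'NN', 'DOM', {'DPOS': 'DOM'}, {'domain'})  -> 'DOM'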
23 | +# Parameters: | ||
24 | +# 1) --inputPath Path to read files. | ||
25 | +# 2) --trainingFile File name with training data. | ||
26 | +# 3) --testFile File name with test data. | ||
27 | +# 4) --outputPath Path to write files. File names are concatenated with feature name. | ||
28 | +# 5) ELIMINATED --feature Type of feature to extract and create file: lemma | ||
29 | +# 6) --termPath Path to read term files | ||
30 | +# 7) --termFiles JSON file with terms files and tags | ||
31 | +# 8) --termPath Path to read JSON file with information about frequent words files | ||
32 | +# 9) --inputFileFreq JSON file with information about frequent words | ||
33 | +# 10) --skip=N Skip N words to form skip mentions | ||
34 | +# 11) --filterStopWords Filter stop words | ||
35 | +# 12) --filterPunctMarks Filter punctuation marks | ||
36 | + | ||
37 | +# Output: | ||
38 | +# 1) Files created. Name of feature is concatenated | ||
39 | + | ||
40 | +# Execution: | ||
41 | +# ASPECTS | ||
42 | +# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
43 | + | ||
44 | +# SENTENCES | ||
45 | +# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
46 | + | ||
47 | +# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json | ||
48 | +# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords | ||
49 | +# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks | ||
50 | +# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks | ||
51 | +# TODO: add an execution example with --skip | ||
52 | + | ||
53 | +def getSkipMentions(aList, aSkip): | ||
54 | + hashTemp = {} | ||
55 | + for j in range(0, aSkip): | ||
56 | + listTemp = [] | ||
57 | + for i in range(0, len(aList) - j, aSkip+1): | ||
58 | + listTemp.append(aList[i + j]) | ||
59 | + hashTemp[j] = listTemp | ||
60 | + return hashTemp | ||
61 | + | ||
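A minimal usage sketch of getSkipMentions with invented tokens:

tokens = ['t0', 't1', 't2', 't3', 't4', 't5']
print(getSkipMentions(tokens, 2))
# -> {0: ['t0', 't3'], 1: ['t1', 't4']}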
62 | +########################################################### | ||
63 | +# MAIN PROGRAM # | ||
64 | +########################################################### | ||
65 | + | ||
66 | +if __name__ == "__main__": | ||
67 | + # Parameter definition | ||
68 | + parser = OptionParser() | ||
69 | + parser.add_option("--inputPath", dest="inputPath", | ||
70 | + help="Path to read files", metavar="PATH") | ||
71 | + parser.add_option("--trainingFile", dest="trainingFile", | ||
72 | + help="File with training examples", metavar="FILE") | ||
73 | + parser.add_option("--testFile", dest="testFile", | ||
74 | + help="File with test examples", metavar="FILE") | ||
75 | + parser.add_option("--trainingClassesFile", dest="trainingClassesFile", | ||
76 | + help="File with training classes", metavar="FILE") | ||
77 | + parser.add_option("--testClassesFile", dest="testClassesFile", | ||
78 | + help="File with test classes", metavar="FILE") | ||
79 | + parser.add_option("--outputPath", dest="outputPath", | ||
80 | + help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH") | ||
81 | + parser.add_option("--termPath", dest="termPath", | ||
82 | + help="Path to read term files", metavar="PATH") | ||
83 | + parser.add_option("--termFiles", dest="termFiles", | ||
84 | + help="JSON file with terms files and tags", metavar="PATH") | ||
85 | + parser.add_option("--inputFileFreq", dest="inputFileFreq", | ||
86 | + help="JSON file with information about frequent words", metavar="PATH") | ||
87 | + parser.add_option("--skip", type="int", | ||
88 | + dest="skip", default=0, | ||
89 | + help="Skip mentions", metavar="N") | ||
90 | + parser.add_option("--filterStopWords", default=False, | ||
91 | + action="store_true", dest="filterStopWords", | ||
92 | + help="Filtering stop words") | ||
93 | + parser.add_option("--filterPunctMarks", default=False, | ||
94 | + action="store_true", dest="filterPunctMarks", | ||
95 | + help="Filtering punctuation marks") | ||
96 | + | ||
97 | + (options, args) = parser.parse_args() | ||
98 | + if len(args) > 0: | ||
99 | + parser.error("None parameters indicated.") | ||
100 | + sys.exit(1) | ||
101 | + | ||
102 | + # Printing parameter values | ||
103 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
104 | + print("Path to read files: " + str(options.inputPath)) | ||
105 | + print("File with training examples", str(options.trainingFile)) | ||
106 | + print("File with test examples", str(options.testFile)) | ||
107 | + print("File with training classes", str(options.trainingClassesFile)) | ||
108 | + print("File with test classes", str(options.testClassesFile)) | ||
109 | + print("File with training classes", str(options.trainingClassesFile)) | ||
110 | + print("File with test classes", str(options.testClassesFile)) | ||
111 | + print("Path to write output files: " + str(options.outputPath)) | ||
112 | + print("JSON file with information about frequent words: " + str(options.inputFileFreq)) | ||
113 | + print("Skip mentions: " + str(options.skip)) | ||
114 | + print("Filtering stop words: " + str(options.stopWords)) | ||
115 | + punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"'] | ||
116 | + print("Filtering puntuation marks " + str(punctMarks) + ': '+ str(options.filterPunctMarks)) | ||
117 | + | ||
118 | + filesRead = 0 | ||
119 | + t0 = time() | ||
120 | + | ||
121 | + print('Loading biological term files...') | ||
122 | + with open(os.path.join(options.termPath, options.termFiles)) as data_file: | ||
123 | + hashes = json.load(data_file) | ||
124 | + print(' Loading biological term files... done') | ||
125 | + | ||
126 | + hashTagAspect = hashes["hashTagAspect"] | ||
127 | + | ||
128 | + print('Loading frequent words...') | ||
129 | + with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file: | ||
130 | + hashAspectFreqWords = json.load(data_file) | ||
131 | + print(' Loading frequent words... done') | ||
132 | + | ||
133 | + listFiles = [options.trainingFile, options.testFile] | ||
134 | + listClassesFiles = [options.trainingClassesFile, options.testClassesFile] | ||
135 | + | ||
136 | + for iFile, cFile in zip(listFiles, listClassesFiles): | ||
137 | + with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile: | ||
138 | + print("Reading file..." + iFile) | ||
139 | + lines = [l.strip('\n') for l in tFile.readlines()] | ||
140 | + filesRead += 1 | ||
141 | + with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile: | ||
142 | + print("Reading file..." + cFile) | ||
143 | + classes = [c.strip('\n') for c in clFile.readlines()] | ||
144 | + listLines = [] | ||
145 | + print("Processing files... ") | ||
146 | + for line, c in zip(lines, classes): | ||
147 | + # print("class: ", c) | ||
148 | + listTokenLine = [] | ||
149 | + # listLemmaLine = [] | ||
150 | + for tok in line.split(): | ||
151 | + tokList = tok.split("|") | ||
152 | + word = tokList[0] | ||
153 | + lemma = tokList[1] | ||
154 | + tag = tokList[2] | ||
155 | + # Filtering stopwords | ||
156 | + if options.filterStopWords: | ||
157 | + if lemma in stopwords.words('english'): | ||
158 | + continue | ||
159 | + if options.filterPunctMarks: | ||
160 | + if lemma in punctMarks: | ||
161 | + continue | ||
162 | + # if tag in hashTagAspect: | ||
163 | +# We change the term tag to the aspect tag only when the aspect tag coincides with the class. | ||
164 | +# We want the CRF to learn when to change a term tag to an aspect tag in the correct context. | ||
165 | + if tag in hashTagAspect: | ||
166 | + if hashTagAspect[tag] == c: | ||
167 | + aTag = hashTagAspect[tag] | ||
168 | + else: | ||
169 | + aTag = 'O' | ||
170 | + else: | ||
171 | + if c in hashAspectFreqWords: | ||
172 | + # print("class: ", c) | ||
173 | + hashFreqWords = hashAspectFreqWords[c] | ||
174 | + # We verify if word or lemma is in frequent words. | ||
175 | + # These frequent words are word-forms (tokens) | ||
176 | + if word.lower() in hashFreqWords or lemma in hashFreqWords: | ||
177 | + aTag = c | ||
178 | + else: | ||
179 | + aTag = 'O' | ||
180 | + else: | ||
181 | + aTag = 'O' | ||
182 | + listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag) | ||
183 | + # if feature == "word": | ||
184 | + listLines.append(listTokenLine) | ||
185 | + # if feature == "lemma": | ||
186 | + # listLines = listLemmaLine.strip() + '\n' | ||
187 | + if options.skip > 0: | ||
188 | + t0 = time() | ||
189 | + skipTemp = options.skip | ||
190 | + for i in range(1, options.skip): | ||
191 | + hashTemp = getSkipMentions(listLines, skipTemp) | ||
192 | + # skipTemp -= 1 | ||
193 | + for key in hashTemp: | ||
194 | + listLines = hashTemp[key] | ||
195 | + with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str( | ||
196 | + options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(skipTemp) + '.txt')), "w", | ||
197 | + encoding="utf-8") as oFile: | ||
198 | + for line in listLines: | ||
199 | + oFile.write(' '.join(line) + '\n') | ||
200 | + print("Skip mention done in: %fs" % (time() - t0)) | ||
201 | + else: | ||
202 | + with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str( | ||
203 | + options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '.txt')), "w", | ||
204 | + encoding="utf-8") as oFile: | ||
205 | + for line in listLines: | ||
206 | + for token in line: | ||
207 | + oFile.write(token + ' ') | ||
208 | + oFile.write('\n') | ||
209 | + | ||
210 | + print("Files processed: " + str(filesRead)) | ||
\ No newline at end of file
tagging_Sklearn_crfsuite.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from itertools import chain | ||
5 | +from optparse import OptionParser | ||
6 | +from time import time | ||
7 | +from collections import Counter | ||
8 | + | ||
9 | +import nltk | ||
10 | +import sklearn | ||
11 | +import scipy.stats | ||
12 | +import sys | ||
13 | + | ||
14 | +from sklearn.externals import joblib | ||
15 | +from sklearn.metrics import make_scorer | ||
16 | +from sklearn.cross_validation import cross_val_score | ||
17 | +from sklearn.grid_search import RandomizedSearchCV | ||
18 | + | ||
19 | +import sklearn_crfsuite | ||
20 | +from sklearn_crfsuite import scorers | ||
21 | +from sklearn_crfsuite import metrics | ||
22 | + | ||
23 | +from nltk.corpus import stopwords | ||
24 | +from trainingTesting_Sklearn_crfsuite import word2features | ||
25 | +from trainingTesting_Sklearn_crfsuite import sent2features | ||
26 | +# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum | ||
27 | +# from trainingTesting_Sklearn_crfsuite import hasDigit | ||
28 | + | ||
29 | +# Objective | ||
30 | +# Tagging transformed file with CRF model with sklearn-crfsuite. | ||
31 | +# | ||
32 | +# Input parameters | ||
33 | +# --inputPath=PATH Path of transformed files x|y|z | ||
34 | +# --modelPath Path to CRF model | ||
35 | +# --modelName Model name | ||
36 | +# --outputPath=PATH Output path to place output files | ||
37 | +# --filterStopWords Filter stop words | ||
38 | +# --filterSymbols Filtering punctuation marks | ||
39 | + | ||
40 | +# Output | ||
41 | +# 1) Tagged files in transformed format | ||
42 | + | ||
43 | +# Examples | ||
44 | +# Sentences | ||
45 | +# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt | ||
46 | +# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt | ||
47 | + | ||
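A minimal sketch of the tagging flow this script implements; the model file name, path, and sample sentence are hypothetical, and sent2features is the function imported from the training script:

from sklearn.externals import joblib
from trainingTesting_Sklearn_crfsuite import sent2features

crf = joblib.load('models/aspectsTraining.fStopWords_False.fSymbols_True.mod')
# One transformed sentence, one word|lemma|postag entry per token (invented example)
sentence = ['FhlA|FhlA|NN', 'activates|activate|VBZ', 'transcription|transcription|NN']
y_pred = crf.predict([sent2features(sentence)])
print(y_pred[0])  # one predicted tag per token, e.g. ['TF', 'O', 'O']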
48 | +################################# | ||
49 | +# FUNCTIONS # | ||
50 | +################################# | ||
51 | +# def hasDigit(text): | ||
52 | +# has = False | ||
53 | +# if len(text) < 3: | ||
54 | +# return False | ||
55 | +# myRegex = nltk.re.compile('[0-9]') | ||
56 | +# if myRegex.search(text) != None: | ||
57 | +# has = True | ||
58 | +# return has | ||
59 | +# | ||
60 | +# | ||
61 | +# def hasNonAlphaNum(text): | ||
62 | +# has = False | ||
63 | +# if len(text) < 3: | ||
64 | +# return False | ||
65 | +# myRegex = nltk.re.compile('\W') | ||
66 | +# if myRegex.search(text) != None: | ||
67 | +# has = True | ||
68 | +# return has | ||
69 | + | ||
70 | +# IMPORTED FROM TRAINING SCRIPT | ||
71 | +# def word2features(sent, i): | ||
72 | +# # print "i: " + str(i) | ||
73 | +# # print "sent[i]" + sent[i] | ||
74 | +# listElem = sent[i].split('|') | ||
75 | +# word = listElem[0] | ||
76 | +# lemma = listElem[1] | ||
77 | +# postag = listElem[2] | ||
78 | +# | ||
79 | +# features = { | ||
80 | +# # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(), | ||
81 | +# # Suffixes | ||
82 | +# 'word[-3:]': word[-3:], | ||
83 | +# 'word[-2:]': word[-2:], | ||
84 | +# 'word[-1:]': word[-1:], | ||
85 | +# 'word.isupper()': word.isupper(), | ||
86 | +# 'word.istitle()': word.istitle(), | ||
87 | +# 'word.hasDigit()': hasDigit(word), | ||
88 | +# 'word.hasNonAlphaNum': hasNonAlphaNum(word), | ||
89 | +# # 'word.isdigit()': word.isdigit(), | ||
90 | +# 'word': word, | ||
91 | +# 'lemma': lemma, | ||
92 | +# 'lemma[-3:]': lemma[-3:], | ||
93 | +# 'lemma[-2:]': lemma[-2:], | ||
94 | +# 'lemma[-1:]': lemma[-1:], | ||
95 | +# 'postag': postag, | ||
96 | +# # Prefixes | ||
97 | +# 'postag[:2]': postag[:2], | ||
98 | +# 'postag[:1]': postag[:1], | ||
99 | +# } | ||
100 | +# if i > 0: | ||
101 | +# listElem = sent[i - 1].split('|') | ||
102 | +# word1 = listElem[0] | ||
103 | +# lemma1 = listElem[1] | ||
104 | +# postag1 = listElem[2] | ||
105 | +# features.update({ | ||
106 | +# '-1:word.lower()': word1.lower(), | ||
107 | +# '-1:word.istitle()': word1.istitle(), | ||
108 | +# '-1:word.isupper()': word1.isupper(), | ||
109 | +# '-1:word.hasDigit()': hasDigit(word1), | ||
110 | +# '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
111 | +# '-1:word': word1, | ||
112 | +# '-1:lemma': lemma1, | ||
113 | +# '-1:postag': postag1, | ||
114 | +# '-1:postag[:2]': postag1[:2], | ||
115 | +# '-1:postag[:1]': postag1[:1], | ||
116 | +# }) | ||
117 | +# # else: | ||
118 | +# # features['BOS'] = True | ||
119 | +# | ||
120 | +# if i < len(sent) - 1: | ||
121 | +# listElem = sent[i + 1].split('|') | ||
122 | +# word1 = listElem[0] | ||
123 | +# lemma1 = listElem[1] | ||
124 | +# postag1 = listElem[2] | ||
125 | +# features.update({ | ||
126 | +# '+1:word.lower()': word1.lower(), | ||
127 | +# '+1:word.istitle()': word1.istitle(), | ||
128 | +# '+1:word.isupper()': word1.isupper(), | ||
129 | +# '+1:word.hasDigit()': hasDigit(word1), | ||
130 | +# '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
131 | +# '+1:word': word1, | ||
132 | +# '+1:lemma': lemma1, | ||
133 | +# '+1:postag': postag1, | ||
134 | +# '+1:postag[:2]': postag1[:2], | ||
135 | +# '+1:postag[:1]': postag1[:1], | ||
136 | +# }) | ||
137 | +# # else: | ||
138 | +# # features['EOS'] = True | ||
139 | +# if i > 1: | ||
140 | +# listElem = sent[i - 2].split('|') | ||
141 | +# word2 = listElem[0] | ||
142 | +# lemma2 = listElem[1] | ||
143 | +# postag2 = listElem[2] | ||
144 | +# features.update({ | ||
145 | +# '-2:word.lower()': word2.lower(), | ||
146 | +# '-2:word.istitle()': word2.istitle(), | ||
147 | +# '-2:word.isupper()': word2.isupper(), | ||
148 | +# '-2:word.hasDigit()': hasDigit(word2), | ||
149 | +# '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
150 | +# '-2:word': word2, | ||
151 | +# '-2:lemma': lemma2, | ||
152 | +# '-2:postag': postag2, | ||
153 | +# '-2:postag[:2]': postag2[:2], | ||
154 | +# '-2:postag[:1]': postag2[:1], | ||
155 | +# }) | ||
156 | +# | ||
157 | +# if i < len(sent) - 2: | ||
158 | +# listElem = sent[i + 2].split('|') | ||
159 | +# word2 = listElem[0] | ||
160 | +# lemma2 = listElem[1] | ||
161 | +# postag2 = listElem[2] | ||
162 | +# features.update({ | ||
163 | +# '+2:word.lower()': word2.lower(), | ||
164 | +# '+2:word.istitle()': word2.istitle(), | ||
165 | +# '+2:word.isupper()': word2.isupper(), | ||
166 | +# '+2:word.hasDigit()': hasDigit(word2), | ||
167 | +# '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
168 | +# '+2:word': word2, | ||
169 | +# '+2:lemma': lemma2, | ||
170 | +# '+2:postag': postag2, | ||
171 | +# '+2:postag[:2]': postag2[:2], | ||
172 | +# '+2:postag[:1]': postag2[:1], | ||
173 | +# }) | ||
174 | +# | ||
175 | +# trigrams = False | ||
176 | +# if trigrams: | ||
177 | +# if i > 2: | ||
178 | +# listElem = sent[i - 3].split('|') | ||
179 | +# word3 = listElem[0] | ||
180 | +# lemma3 = listElem[1] | ||
181 | +# postag3 = listElem[2] | ||
182 | +# features.update({ | ||
183 | +# '-3:word.lower()': word3.lower(), | ||
184 | +# '-3:word.istitle()': word3.istitle(), | ||
185 | +# '-3:word.isupper()': word3.isupper(), | ||
186 | +# '-3:word.hasDigit()': hasDigit(word3), | ||
187 | +# '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
188 | +# '-3:word': word3, | ||
189 | +# '-3:lemma': lemma3, | ||
190 | +# '-3:postag': postag3, | ||
191 | +# '-3:postag[:2]': postag3[:2], | ||
192 | +# '-3:postag[:1]': postag3[:1], | ||
193 | +# }) | ||
194 | +# | ||
195 | +# if i < len(sent) - 3: | ||
196 | +# listElem = sent[i + 3].split('|') | ||
197 | +# word3 = listElem[0] | ||
198 | +# lemma3 = listElem[1] | ||
199 | +# postag3 = listElem[2] | ||
200 | +# features.update({ | ||
201 | +# '+3:word.lower()': word3.lower(), | ||
202 | +# '+3:word.istitle()': word3.istitle(), | ||
203 | +# '+3:word.isupper()': word3.isupper(), | ||
204 | +# '+3:word.hasDigit()': hasDigit(word3), | ||
205 | +# '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
206 | +# '+3:word': word3, | ||
207 | +# '+3:lemma': lemma3, | ||
208 | +# '+3:postag': postag3, | ||
209 | +# '+3:postag[:2]': postag3[:2], | ||
210 | +# '+3:postag[:1]': postag3[:1], | ||
211 | +# }) | ||
212 | +# | ||
213 | +# return features | ||
214 | + | ||
215 | + | ||
216 | +# def sent2features(sent): | ||
217 | +# return [word2features(sent, i) for i in range(len(sent))] | ||
218 | + | ||
219 | + | ||
220 | +__author__ = 'CMendezC' | ||
221 | + | ||
222 | +########################################## | ||
223 | +# MAIN PROGRAM # | ||
224 | +########################################## | ||
225 | + | ||
226 | +if __name__ == "__main__": | ||
227 | + # Defining parameters | ||
228 | + parser = OptionParser() | ||
229 | + parser.add_option("--inputPath", dest="inputPath", | ||
230 | + help="Path of training data set", metavar="PATH") | ||
231 | + parser.add_option("--outputPath", dest="outputPath", | ||
232 | + help="Output path to place output files", | ||
233 | + metavar="PATH") | ||
234 | + parser.add_option("--modelPath", dest="modelPath", | ||
235 | + help="Path to read CRF model", | ||
236 | + metavar="PATH") | ||
237 | + parser.add_option("--modelName", dest="modelName", | ||
238 | + help="Model name", metavar="TEXT") | ||
239 | + parser.add_option("--filterStopWords", default=False, | ||
240 | + action="store_true", dest="filterStopWords", | ||
241 | + help="Filtering stop words") | ||
242 | + parser.add_option("--filterSymbols", default=False, | ||
243 | + action="store_true", dest="filterSymbols", | ||
244 | + help="Filtering punctuation marks") | ||
245 | + | ||
246 | + (options, args) = parser.parse_args() | ||
247 | + if len(args) > 0: | ||
248 | + parser.error("Any parameter given.") | ||
249 | + sys.exit(1) | ||
250 | + | ||
251 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
252 | + print("Path to read input files: " + options.inputPath) | ||
253 | + print("Mode name: " + str(options.modelName)) | ||
254 | + print("Model path: " + options.modelPath) | ||
255 | + print("Path to place output files: " + options.outputPath) | ||
256 | + print("Filtering stop words: " + str(options.filterStopWords)) | ||
257 | + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
258 | + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | ||
259 | + # symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
260 | + # '}', '[', ']', '*', '%', '$', '#', '&', '°']] | ||
261 | + # symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{', | ||
262 | + # u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`'] | ||
263 | + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | ||
264 | + | ||
265 | + print('-------------------------------- PROCESSING --------------------------------') | ||
266 | + | ||
267 | + stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | ||
268 | + | ||
269 | + # Read CRF model | ||
270 | + t0 = time() | ||
271 | + print('Reading CRF model...') | ||
272 | + crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod')) | ||
273 | + print("Reading CRF model done in: %fs" % (time() - t0)) | ||
274 | + | ||
275 | + print('Processing corpus...') | ||
276 | + t0 = time() | ||
277 | + labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']) | ||
278 | + # Walk directory to read files | ||
279 | + for path, dirs, files in os.walk(options.inputPath): | ||
280 | + # For each file in dir | ||
281 | + for file in files: | ||
282 | + print(" Preprocessing file..." + str(file)) | ||
283 | + sentencesInputData = [] | ||
284 | + sentencesOutputData = [] | ||
285 | + with open(os.path.join(options.inputPath, file), "r") as iFile: | ||
286 | + lines = iFile.readlines() | ||
287 | + for line in lines: | ||
288 | + listLine = [] | ||
289 | + # line = line.decode("utf-8") | ||
290 | + for token in line.strip('\n').split(): | ||
291 | + if options.filterStopWords: | ||
292 | + listToken = token.split('|') | ||
293 | + lemma = listToken[1] | ||
294 | + # Original if lemma in stopwords.words('english'): | ||
295 | + if lemma in stopwords: | ||
296 | + continue | ||
297 | + if options.filterSymbols: | ||
298 | + listToken = token.split('|') | ||
299 | + lemma = listToken[1] | ||
300 | + if lemma in symbols: | ||
301 | + if lemma == ',': | ||
302 | + print "Coma , identificada" | ||
303 | + continue | ||
304 | + listLine.append(token) | ||
305 | + sentencesInputData.append(listLine) | ||
306 | + print " Sentences input data: " + str(len(sentencesInputData)) | ||
307 | + # print sentencesInputData[0] | ||
308 | + # print(sent2features(sentencesInputData[0])[0]) | ||
309 | + # print(sent2labels(sentencesInputData[0])) | ||
310 | + X_input = [sent2features(s) for s in sentencesInputData] | ||
311 | + print(sent2features(sentencesInputData[0])[0]) | ||
312 | + # y_test = [sent2labels(s) for s in sentencesInputData] | ||
313 | + # Predicting tags | ||
314 | + t1 = time() | ||
315 | + print " Predicting tags with model" | ||
316 | + y_pred = crf.predict(X_input) | ||
317 | + print y_pred[0] | ||
318 | + print(" Prediction done in: %fs" % (time() - t1)) | ||
319 | + # Tagging with CRF model | ||
320 | + print " Tagging file" | ||
321 | + for line, tagLine in zip(lines, y_pred): | ||
322 | + outputLine = '' | ||
323 | + idx_tagLine = 0 | ||
324 | + line = line.strip('\n') | ||
325 | + print "\nLine: " + str(line) | ||
326 | + print "CRF tagged line: " + str(tagLine) | ||
327 | + for token in line.split(): | ||
328 | + listToken = token.split('|') | ||
329 | + word = listToken[0] | ||
330 | + lemma = listToken[1] | ||
331 | + tag = listToken[2] | ||
332 | + if options.filterStopWords: | ||
333 | + if lemma in stopwords: | ||
334 | + outputLine += token + ' ' | ||
335 | + continue | ||
336 | + if options.filterSymbols: | ||
337 | + if lemma in symbols: | ||
338 | + if lemma == ',': | ||
339 | + print "Coma , identificada" | ||
340 | + outputLine += token + ' ' | ||
341 | + continue | ||
342 | + CRFtag = tagLine[idx_tagLine] | ||
343 | + if (tag not in labels) and (CRFtag != 'O'): | ||
344 | + print "*** CRF change token {} to {}".format(token, CRFtag) | ||
345 | + outputLine += word + '|' + lemma + '|' + CRFtag + ' ' | ||
346 | + else: | ||
347 | + outputLine += word + '|' + lemma + '|' + tag + ' ' | ||
348 | + idx_tagLine += 1 | ||
349 | + sentencesOutputData.append(outputLine.rstrip()) | ||
350 | + with open(os.path.join(options.outputPath, file), "w") as oFile: | ||
351 | + for line in sentencesOutputData: | ||
352 | + oFile.write(line + '\n') | ||
353 | + | ||
354 | + print("Processing corpus done in: %fs" % (time() - t0)) |
training-validation.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from itertools import chain | ||
5 | +from optparse import OptionParser | ||
6 | +from time import time | ||
7 | +from collections import Counter | ||
8 | + | ||
9 | +import nltk | ||
10 | +import sklearn | ||
11 | +import scipy.stats | ||
12 | +import sys | ||
13 | + | ||
14 | +from sklearn.externals import joblib | ||
15 | +from sklearn.metrics import make_scorer | ||
16 | +from sklearn.cross_validation import cross_val_score | ||
17 | +from sklearn.grid_search import RandomizedSearchCV | ||
18 | + | ||
19 | +import sklearn_crfsuite | ||
20 | +from sklearn_crfsuite import scorers | ||
21 | +from sklearn_crfsuite import metrics | ||
22 | + | ||
23 | +from nltk.corpus import stopwords | ||
24 | + | ||
25 | + | ||
26 | +# Objective | ||
27 | +# Training and evaluation of CRFs with sklearn-crfsuite. | ||
28 | +# | ||
29 | +# Input parameters | ||
30 | +# --inputPath=PATH Path of training and test data set | ||
31 | +# --trainingFile File with training data set | ||
32 | +# --testFile File with test data set | ||
33 | +# --outputPath=PATH Output path to place output files | ||
34 | +# --filterStopWords Filter stop words | ||
35 | +# --filterSymbols Filtering punctuation marks | ||
36 | + | ||
37 | +# Output | ||
38 | +# 1) Best model | ||
39 | + | ||
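Each token in the training and test files is expected as word|lemma|postag|label, where the fourth field is the CRF label; a minimal sketch with invented tokens of how sent2features and sent2labels, defined below, split a sentence:

sentence = ['FhlA|FhlA|NN|TF', 'activates|activate|VBZ|O', 'transcription|transcription|NN|O']
# X = sent2features(sentence)  -> one feature dict per token
# y = sent2labels(sentence)    -> ['TF', 'O', 'O']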
40 | +# Examples | ||
41 | +# Sentences | ||
42 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt | ||
43 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt | ||
44 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt | ||
45 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt | ||
46 | + | ||
47 | +# Aspects | ||
48 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt | ||
49 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt | ||
50 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt | ||
51 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt | ||
52 | + | ||
53 | +################################# | ||
54 | +# FUNCTIONS # | ||
55 | +################################# | ||
56 | + | ||
57 | +def wordSize(text): | ||
58 | + lWord = len(text) | ||
59 | + if lWord == 1: | ||
60 | + return '1' | ||
61 | + elif lWord == 2: | ||
62 | + return '2' | ||
63 | + elif lWord == 3: | ||
64 | + return '3' | ||
65 | + elif lWord == 4: | ||
66 | + return '4' | ||
67 | + elif lWord == 5: | ||
68 | + return '5' | ||
69 | + elif 6 <= lWord <= 10: | ||
70 | + return '6-10' | ||
71 | + elif 11 <= lWord <= 15: | ||
72 | + return '11-15' | ||
73 | + elif 16 <= lWord <= 20: | ||
74 | + return '16-20' | ||
75 | + elif 21 <= lWord <= 30: | ||
76 | + return '21-30' | ||
77 | + else: | ||
78 | + return '>30' | ||
79 | + | ||
80 | +def hasUpperLower(text): | ||
81 | + has = False | ||
82 | + if len(text) < 3: | ||
83 | + return False | ||
84 | + regexUp = nltk.re.compile('[A-Z]') | ||
85 | + regexLo = nltk.re.compile('[a-z]') | ||
86 | + if (regexUp.search(text) != None) and (regexLo.search(text) != None): | ||
87 | + has = True | ||
88 | + return has | ||
89 | + | ||
90 | +def hasDigit(text): | ||
91 | + has = False | ||
92 | + if len(text) < 3: | ||
93 | + return False | ||
94 | + myRegex = nltk.re.compile('[0-9]') | ||
95 | + if myRegex.search(text) != None: | ||
96 | + has = True | ||
97 | + return has | ||
98 | + | ||
99 | + | ||
100 | +def hasNonAlphaNum(text): | ||
101 | + has = False | ||
102 | + if len(text) < 3: | ||
103 | + return False | ||
104 | + myRegex = nltk.re.compile('\W') | ||
105 | + if myRegex.search(text) != None: | ||
106 | + has = True | ||
107 | + return has | ||
108 | + | ||
109 | +def word2features(sent, i): | ||
110 | + # print "i: " + str(i) | ||
111 | + # print "sent[i]" + sent[i] | ||
112 | + listElem = sent[i].split('|') | ||
113 | + word = listElem[0] | ||
114 | + lemma = listElem[1] | ||
115 | + postag = listElem[2] | ||
116 | + | ||
117 | + features = { | ||
118 | + # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(), | ||
119 | + # Suffixes | ||
120 | + 'word[-3:]': word[-3:], | ||
121 | + 'word[-2:]': word[-2:], | ||
122 | + 'word[-1:]': word[-1:], | ||
123 | + 'word.isupper()': word.isupper(), | ||
124 | + 'word.istitle()': word.istitle(), | ||
125 | + 'word.hasDigit()': hasDigit(word), | ||
126 | + 'word.hasNonAlphaNum': hasNonAlphaNum(word), | ||
127 | + # 'word.hasUpperLower': hasUpperLower(word), | ||
128 | + #'wordSize': wordSize(word), | ||
129 | + # 'word.isdigit()': word.isdigit(), | ||
130 | + 'word': word, | ||
131 | + 'lemma': lemma, | ||
132 | + 'lemma[-3:]': lemma[-3:], | ||
133 | + 'lemma[-2:]': lemma[-2:], | ||
134 | + 'lemma[-1:]': lemma[-1:], | ||
135 | + 'postag': postag, | ||
136 | + # Prefixes | ||
137 | + 'postag[:2]': postag[:2], | ||
138 | + 'postag[:1]': postag[:1], | ||
139 | + } | ||
140 | + if i > 0: | ||
141 | + listElem = sent[i - 1].split('|') | ||
142 | + word1 = listElem[0] | ||
143 | + lemma1 = listElem[1] | ||
144 | + postag1 = listElem[2] | ||
145 | + features.update({ | ||
146 | + '-1:word.lower()': word1.lower(), | ||
147 | + '-1:word.istitle()': word1.istitle(), | ||
148 | + '-1:word.isupper()': word1.isupper(), | ||
149 | + '-1:word.hasDigit()': hasDigit(word1), | ||
150 | + '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
151 | + # '-1:word.hasUpperLower': hasUpperLower(word1), | ||
152 | + '-1:word': word1, | ||
153 | + '-1:lemma': lemma1, | ||
154 | + '-1:postag': postag1, | ||
155 | + '-1:postag[:2]': postag1[:2], | ||
156 | + '-1:postag[:1]': postag1[:1], | ||
157 | + }) | ||
158 | + # else: | ||
159 | + # features['BOS'] = True | ||
160 | + | ||
161 | + if i < len(sent) - 1: | ||
162 | + listElem = sent[i + 1].split('|') | ||
163 | + word1 = listElem[0] | ||
164 | + lemma1 = listElem[1] | ||
165 | + postag1 = listElem[2] | ||
166 | + features.update({ | ||
167 | + '+1:word.lower()': word1.lower(), | ||
168 | + '+1:word.istitle()': word1.istitle(), | ||
169 | + '+1:word.isupper()': word1.isupper(), | ||
170 | + '+1:word.hasDigit()': hasDigit(word1), | ||
171 | + '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
172 | + # '+1:word.hasUpperLower': hasUpperLower(word1), | ||
173 | + '+1:word': word1, | ||
174 | + '+1:lemma': lemma1, | ||
175 | + '+1:postag': postag1, | ||
176 | + '+1:postag[:2]': postag1[:2], | ||
177 | + '+1:postag[:1]': postag1[:1], | ||
178 | + }) | ||
179 | + # else: | ||
180 | + # features['EOS'] = True | ||
181 | + if i > 1: | ||
182 | + listElem = sent[i - 2].split('|') | ||
183 | + word2 = listElem[0] | ||
184 | + lemma2 = listElem[1] | ||
185 | + postag2 = listElem[2] | ||
186 | + features.update({ | ||
187 | + '-2:word.lower()': word2.lower(), | ||
188 | + '-2:word.istitle()': word2.istitle(), | ||
189 | + '-2:word.isupper()': word2.isupper(), | ||
190 | + '-2:word.hasDigit()': hasDigit(word2), | ||
191 | + '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
192 | + # '-2:word.hasUpperLower': hasUpperLower(word2), | ||
193 | + '-2:word': word2, | ||
194 | + '-2:lemma': lemma2, | ||
195 | + '-2:postag': postag2, | ||
196 | + '-2:postag[:2]': postag2[:2], | ||
197 | + '-2:postag[:1]': postag2[:1], | ||
198 | + }) | ||
199 | + | ||
200 | + if i < len(sent) - 2: | ||
201 | + listElem = sent[i + 2].split('|') | ||
202 | + word2 = listElem[0] | ||
203 | + lemma2 = listElem[1] | ||
204 | + postag2 = listElem[2] | ||
205 | + features.update({ | ||
206 | + '+2:word.lower()': word2.lower(), | ||
207 | + '+2:word.istitle()': word2.istitle(), | ||
208 | + '+2:word.isupper()': word2.isupper(), | ||
209 | + '+2:word.hasDigit()': hasDigit(word2), | ||
210 | + '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
211 | + # '+2:word.hasUpperLower': hasUpperLower(word2), | ||
212 | + '+2:word': word2, | ||
213 | + '+2:lemma': lemma2, | ||
214 | + '+2:postag': postag2, | ||
215 | + '+2:postag[:2]': postag2[:2], | ||
216 | + '+2:postag[:1]': postag2[:1], | ||
217 | + }) | ||
218 | + | ||
219 | + trigrams = False | ||
220 | + if trigrams: | ||
221 | + if i > 2: | ||
222 | + listElem = sent[i - 3].split('|') | ||
223 | + word3 = listElem[0] | ||
224 | + lemma3 = listElem[1] | ||
225 | + postag3 = listElem[2] | ||
226 | + features.update({ | ||
227 | + '-3:word.lower()': word3.lower(), | ||
228 | + '-3:word.istitle()': word3.istitle(), | ||
229 | + '-3:word.isupper()': word3.isupper(), | ||
230 | + '-3:word.hasDigit()': hasDigit(word3), | ||
231 | + '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
232 | + # '-3:word.hasUpperLower': hasUpperLower(word3), | ||
233 | + '-3:word': word3, | ||
234 | + '-3:lemma': lemma3, | ||
235 | + '-3:postag': postag3, | ||
236 | + '-3:postag[:2]': postag3[:2], | ||
237 | + '-3:postag[:1]': postag3[:1], | ||
238 | + }) | ||
239 | + | ||
240 | + if i < len(sent) - 3: | ||
241 | + listElem = sent[i + 3].split('|') | ||
242 | + word3 = listElem[0] | ||
243 | + lemma3 = listElem[1] | ||
244 | + postag3 = listElem[2] | ||
245 | + features.update({ | ||
246 | + '+3:word.lower()': word3.lower(), | ||
247 | + '+3:word.istitle()': word3.istitle(), | ||
248 | + '+3:word.isupper()': word3.isupper(), | ||
249 | + '+3:word.hasDigit()': hasDigit(word3), | ||
250 | + '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
251 | + # '+3:word.hasUpperLower': hasUpperLower(word3), | ||
252 | + '+3:word': word3, | ||
253 | + '+3:lemma': lemma3, | ||
254 | + '+3:postag': postag3, | ||
255 | + '+3:postag[:2]': postag3[:2], | ||
256 | + '+3:postag[:1]': postag3[:1], | ||
257 | + }) | ||
258 | + | ||
259 | + return features | ||
260 | + | ||
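For instance, a minimal sketch of part of the feature dictionary word2features produces for a hypothetical one-token sentence (only a few keys shown):

# word2features(['FhlA|FhlA|NN'], 0) includes, among others:
# {'word': 'FhlA', 'lemma': 'FhlA', 'postag': 'NN',
#  'word[-3:]': 'hlA', 'postag[:1]': 'N', 'word.isupper()': False, ...}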
261 | + | ||
262 | +def sent2features(sent): | ||
263 | + return [word2features(sent, i) for i in range(len(sent))] | ||
264 | + | ||
265 | + | ||
266 | +def sent2labels(sent): | ||
267 | + return [elem.split('|')[3] for elem in sent] | ||
268 | + # return [label for token, postag, label in sent] | ||
269 | + | ||
270 | + | ||
271 | +def sent2tokens(sent): | ||
272 | + return [elem.split('|')[0] for elem in sent] | ||
273 | + | ||
274 | + | ||
275 | +def print_transitions(trans_features, f): | ||
276 | + for (label_from, label_to), weight in trans_features: | ||
277 | + # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight)) | ||
278 | + # f.write("label_from :" + label_from) | ||
279 | + # f.write("label_to :" + label_to) | ||
280 | + # f.write("label_weight :" + weight) | ||
281 | + # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight)) | ||
282 | + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) | ||
283 | + | ||
284 | + | ||
285 | +def print_state_features(state_features, f): | ||
286 | + for (attr, label), weight in state_features: | ||
287 | + # f.write("%0.6f %-8s %s\n" % (weight, label, attr)) | ||
288 | + # f.write(attr.encode("utf-8")) | ||
289 | + # '{:06.2f}'.format(3.141592653589793) | ||
290 | + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) | ||
291 | + | ||
292 | + | ||
293 | +__author__ = 'CMendezC' | ||
294 | + | ||
295 | +########################################## | ||
296 | +# MAIN PROGRAM # | ||
297 | +########################################## | ||
298 | + | ||
299 | +if __name__ == "__main__": | ||
300 | + # Defining parameters | ||
301 | + parser = OptionParser() | ||
302 | + parser.add_option("--inputPath", dest="inputPath", | ||
303 | + help="Path of training data set", metavar="PATH") | ||
304 | + parser.add_option("--outputPath", dest="outputPath", | ||
305 | + help="Output path to place output files", | ||
306 | + metavar="PATH") | ||
307 | + parser.add_option("--trainingFile", dest="trainingFile", | ||
308 | + help="File with training data set", metavar="FILE") | ||
309 | + parser.add_option("--testFile", dest="testFile", | ||
310 | + help="File with test data set", metavar="FILE") | ||
311 | + parser.add_option("--filterStopWords", default=False, | ||
312 | + action="store_true", dest="filterStopWords", | ||
313 | + help="Filtering stop words") | ||
314 | + parser.add_option("--filterSymbols", default=False, | ||
315 | + action="store_true", dest="filterSymbols", | ||
316 | + help="Filtering punctuation marks") | ||
317 | + | ||
318 | + (options, args) = parser.parse_args() | ||
319 | + if len(args) > 0: | ||
320 | + parser.error("Any parameter given.") | ||
321 | + sys.exit(1) | ||
322 | + | ||
323 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
324 | + print("Path of training data set: " + options.inputPath) | ||
325 | + print("File with training data set: " + str(options.trainingFile)) | ||
326 | + print("Path of test data set: " + options.inputPath) | ||
327 | + print("File with test data set: " + str(options.testFile)) | ||
328 | + print("Filtering stop words: " + str(options.filterStopWords)) | ||
329 | + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
330 | + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | ||
331 | + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | ||
332 | + | ||
333 | + print('-------------------------------- PROCESSING --------------------------------') | ||
334 | + print('Reading corpus...') | ||
335 | + t0 = time() | ||
336 | + | ||
337 | + sentencesTrainingData = [] | ||
338 | + sentencesTestData = [] | ||
339 | + | ||
340 | + stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | ||
341 | + | ||
342 | + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: | ||
343 | + # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
344 | + for line in iFile.readlines(): | ||
345 | + listLine = [] | ||
346 | + line = line.decode("utf-8") | ||
347 | + for token in line.strip('\n').split(): | ||
348 | + if options.filterStopWords: | ||
349 | + listToken = token.split('|') | ||
350 | + lemma = listToken[1] | ||
351 | + # Original: if lemma in stopwords.words('english'): | ||
352 | + # trainingTesting_Sklearn_crfsuite.py:269: | ||
353 | + # UnicodeWarning: Unicode equal comparison failed to | ||
354 | + # convert both arguments to Unicode - | ||
355 | + # interpreting them as being unequal | ||
356 | + if lemma in stopwords: | ||
357 | + continue | ||
358 | + if options.filterSymbols: | ||
359 | + listToken = token.split('|') | ||
360 | + lemma = listToken[1] | ||
361 | + if lemma in symbols: | ||
362 | + # if lemma == ',': | ||
363 | + # print "Coma , identificada" | ||
364 | + continue | ||
365 | + listLine.append(token) | ||
366 | + sentencesTrainingData.append(listLine) | ||
367 | + print " Sentences training data: " + str(len(sentencesTrainingData)) | ||
368 | + # print sentencesTrainingData[0] | ||
369 | + | ||
370 | + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: | ||
371 | + # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
372 | + for line in iFile.readlines(): | ||
373 | + listLine = [] | ||
374 | + line = line.decode("utf-8") | ||
375 | + for token in line.strip('\n').split(): | ||
376 | + if options.filterStopWords: | ||
377 | + listToken = token.split('|') | ||
378 | + lemma = listToken[1] | ||
379 | + # Original if lemma in stopwords.words('english'): | ||
380 | + if lemma in stopwords: | ||
381 | + continue | ||
382 | + if options.filterSymbols: | ||
383 | + listToken = token.split('|') | ||
384 | + lemma = listToken[1] | ||
385 | + if lemma in symbols: | ||
386 | + # if lemma == ',': | ||
387 | + # print "Coma , identificada" | ||
388 | + continue | ||
389 | + listLine.append(token) | ||
390 | + sentencesTestData.append(listLine) | ||
391 | + print " Sentences test data: " + str(len(sentencesTestData)) | ||
392 | + # print sentencesTestData[0] | ||
393 | + | ||
394 | + print("Reading corpus done in: %fs" % (time() - t0)) | ||
395 | + | ||
396 | + print(sent2features(sentencesTrainingData[0])[0]) | ||
397 | + print(sent2features(sentencesTestData[0])[0]) | ||
398 | + # print(sent2labels(sentencesTrainingData[0])) | ||
399 | + # print(sent2labels(sentencesTestData[0])) | ||
400 | + t0 = time() | ||
401 | + | ||
402 | + X_train = [sent2features(s) for s in sentencesTrainingData] | ||
403 | + y_train = [sent2labels(s) for s in sentencesTrainingData] | ||
404 | + | ||
405 | + X_test = [sent2features(s) for s in sentencesTestData] | ||
406 | + # print X_test | ||
407 | + y_test = [sent2labels(s) for s in sentencesTestData] | ||
408 | + | ||
409 | + # Fixed parameters | ||
410 | + # crf = sklearn_crfsuite.CRF( | ||
411 | + # algorithm='lbfgs', | ||
412 | + # c1=0.1, | ||
413 | + # c2=0.1, | ||
414 | + # max_iterations=100, | ||
415 | + # all_possible_transitions=True | ||
416 | + # ) | ||
417 | + | ||
418 | + # Hyperparameter Optimization | ||
419 | + crf = sklearn_crfsuite.CRF( | ||
420 | + algorithm='lbfgs', | ||
421 | + max_iterations=100, | ||
422 | + all_possible_transitions=True | ||
423 | + ) | ||
424 | + params_space = { | ||
425 | + 'c1': scipy.stats.expon(scale=0.5), | ||
426 | + 'c2': scipy.stats.expon(scale=0.05), | ||
427 | + } | ||
428 | + | ||
429 | + # Original: labels = list(crf.classes_) | ||
430 | + # Original: labels.remove('O') | ||
431 | + labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']) | ||
432 | + | ||
433 | + # use the same metric for evaluation | ||
434 | + f1_scorer = make_scorer(metrics.flat_f1_score, | ||
435 | + average='weighted', labels=labels) | ||
436 | + | ||
437 | + # search | ||
438 | + rs = RandomizedSearchCV(crf, params_space, | ||
439 | + cv=3, | ||
440 | + verbose=3, | ||
441 | + n_jobs=-1, | ||
442 | + n_iter=20, | ||
443 | + # n_iter=50, | ||
444 | + scoring=f1_scorer) | ||
445 | + rs.fit(X_train, y_train) | ||
446 | + | ||
447 | + # Fixed parameters | ||
448 | + # crf.fit(X_train, y_train) | ||
449 | + | ||
450 | + # Best hyperparameters | ||
451 | + # crf = rs.best_estimator_ | ||
452 | + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
453 | + options.filterSymbols) + '.txt') | ||
454 | + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: | ||
455 | + oFile.write("********** TRAINING AND TESTING REPORT **********\n") | ||
456 | + oFile.write("Training file: " + options.trainingFile + '\n') | ||
457 | + oFile.write('\n') | ||
458 | + oFile.write('best params:' + str(rs.best_params_) + '\n') | ||
459 | + oFile.write('best CV score:' + str(rs.best_score_) + '\n') | ||
460 | + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000)) | ||
461 | + | ||
462 | + print("Training done in: %fs" % (time() - t0)) | ||
463 | + t0 = time() | ||
464 | + | ||
465 | + # Update best crf | ||
466 | + crf = rs.best_estimator_ | ||
467 | + | ||
468 | + # Saving model | ||
469 | + print(" Saving training model...") | ||
470 | + t1 = time() | ||
471 | + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
472 | + options.filterSymbols) + '.mod') | ||
473 | + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) | ||
474 | + print(" Saving training model done in: %fs" % (time() - t1)) | ||
475 | + | ||
476 | + # Evaluation against test data | ||
477 | + y_pred = crf.predict(X_test) | ||
478 | + print("*********************************") | ||
479 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
480 | + options.filterSymbols) + '.txt') | ||
481 | + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile: | ||
482 | + for y in y_pred: | ||
483 | + oFile.write(str(y) + '\n') | ||
484 | + | ||
485 | + print("*********************************") | ||
486 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
487 | + options.filterSymbols) + '.txt') | ||
488 | + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile: | ||
489 | + for y in y_test: | ||
490 | + oFile.write(str(y) + '\n') | ||
491 | + | ||
492 | + print("Prediction done in: %fs" % (time() - t0)) | ||
493 | + | ||
494 | + # labels = list(crf.classes_) | ||
495 | + # labels.remove('O') | ||
496 | + | ||
497 | + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile: | ||
498 | + oFile.write('\n') | ||
499 | + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))) | ||
500 | + oFile.write('\n') | ||
501 | + # labels = list(crf.classes_) | ||
502 | + sorted_labels = sorted( | ||
503 | + labels, | ||
504 | + key=lambda name: (name[1:], name[0]) | ||
505 | + ) | ||
506 | + oFile.write(metrics.flat_classification_report( | ||
507 | + y_test, y_pred, labels=sorted_labels, digits=3 | ||
508 | + )) | ||
509 | + oFile.write('\n') | ||
510 | + | ||
511 | + oFile.write("\nTop likely transitions:\n") | ||
512 | + print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | ||
513 | + oFile.write('\n') | ||
514 | + | ||
515 | + oFile.write("\nTop unlikely transitions:\n") | ||
516 | + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | ||
517 | + oFile.write('\n') | ||
518 | + | ||
519 | + oFile.write("\nTop positive:\n") | ||
520 | + print_state_features(Counter(crf.state_features_).most_common(200), oFile) | ||
521 | + oFile.write('\n') | ||
522 | + | ||
523 | + oFile.write("\nTop negative:\n") | ||
524 | + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile) | ||
525 | + oFile.write('\n') |