Carlos-Francisco Méndez-Cruz

Setting up project

1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="VcsDirectoryMappings">
4 + <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 + </component>
6 +</project>
\ No newline at end of file
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +import re
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: take text-annotated-abstracts-original.txt as input,
12 +# split the abstracts into separate files without tags, and collect a dictionary of genes
13 +# for tagging after the NLP pipeline (see the usage sketch after this file).
14 +
15 +# Parameters:
16 +# 1) --inputPath Input path.
17 +# 2) --inputFile Input file.
18 +# 3) --outputPath Output path
19 +
20 +# Execution:
21 +#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
22 +
23 +if __name__ == "__main__":
24 + # Parameter definition
25 + parser = OptionParser()
26 + parser.add_option("--inputPath", dest="inputPath",
27 + help="Input path", metavar="PATH")
28 + parser.add_option("--inputFile", dest="inputFile",
29 + help="Input file", metavar="FILE")
30 + parser.add_option("--outputPath", dest="outputPath",
31 + help="Output path", metavar="PATH")
32 +
33 + (options, args) = parser.parse_args()
34 + if len(args) > 0:
35 + parser.error("Unexpected positional arguments.")
36 + sys.exit(1)
37 +
38 + # Printing parameter values
39 + print('-------------------------------- PARAMETERS --------------------------------')
40 + print("Input path: " + str(options.inputPath))
41 + print("Input file", str(options.inputFile))
42 + print("Output path: " + str(options.outputPath))
43 +
44 + filesWritten = 0
45 + t0 = time()
46 + hashGenes = {}
47 +
48 + rePmid = re.compile(r'(\d+)\|a\|')
49 + reGene = re.compile(r'<g>([^<]+)</g>')
50 + with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
51 + print("Reading file..." + options.inputFile)
52 + for line in iFile:
53 + line = line.strip('\n')
54 + for gene in reGene.findall(line):
55 + print("genes: {}".format(gene))
56 + hashGenes[gene] = hashGenes.get(gene, 0) + 1
57 + result = rePmid.match(line)
58 + if result:
59 + with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
60 + # Write the abstract text without the <g></g> tags
61 + oFile.write(reGene.sub(r'\1', line))
62 + filesWritten += 1
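A minimal runnable sketch of how the two patterns above behave on an assumed pmid|a|abstract input line; the PMID and gene names below are hypothetical and only illustrate the extraction:

import re

rePmid = re.compile(r'(\d+)\|a\|')
reGene = re.compile(r'<g>([^<]+)</g>')

line = "9606123|a|Expression of <g>fhlA</g> and <g>hycA</g> is induced by formate."
print(rePmid.match(line).group(1))  # '9606123' -> abstract written to 9606123.txt
print(reGene.findall(line))         # ['fhlA', 'hycA'] -> candidate entries for the gene dictionary
print(reGene.sub(r'\1', line))      # the same line with the <g></g> tags stripped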
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +import json
8 +from nltk.corpus import stopwords
9 +
10 +__author__ = 'CMendezC'
11 +
12 +# Objective: take a transformed file with the format word|lemma|tag,
13 +# for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
14 +# and create a file with an additional aspect tag for CRF training. For example:
15 +# the|the|dt N-terminal|N-terminal|DPOS domain|domain|NN -->
16 +# the|the|dt|O N-terminal|N-terminal|DPOS|DOM domain|domain|NN|O
17 +# Additionally, we tag frequent words belonging to each aspect with the corresponding aspect tag (DOM or RP).
18 +# We expect these words to have one aspect tag in some contexts and a different one in others.
19 +# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
20 +# The output file keeps the word, the lemma, the term/POS tag, and the aspect tag (see the sketch after this file).
21 +# This additional tagging gives us clues for aspect classification.
22 +
23 +# Parameters:
24 +# 1) --inputPath Path to read files.
25 +# 2) --trainingFile File name with training data.
26 +# 3) --testFile File name with test data.
27 +# 4) --outputPath Path to write files. File names are concatenated with feature name.
28 +# 5) ELIMINATED --feature Type of feature to extract and create file: lemma
29 +# 6) --termPath Path to read term files
30 +# 7) --termFiles JSON file with term files and tags
31 +# 8) --termPath Path to read the JSON file with information about frequent words
32 +# 9) --inputFileFreq JSON file with information about frequent words
33 +# 10) --skip=N Skip N words to form skip mentions
34 +# 11) --stopWords Filtering stop words
35 +# 12) --filterPunctMarks Filtering punctuation marks
36 +
37 +# Output:
38 +# 1) Files created. The filtering and skip settings are concatenated to the file names.
39 +
40 +# Execution:
41 +# ASPECTS
42 +# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
43 +
44 +# SENTENCES
45 +# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
46 +
47 +# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
48 +# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
49 +# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
50 +# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
51 +# TODO: example execution with --skip
52 +
53 +def getSkipMentions(aList, aSkip):
54 + hashTemp = {}
55 + for j in range(0, aSkip):
56 + listTemp = []
57 + for i in range(0, len(aList) - j, aSkip + 1):  # avoid indexing past the end of aList with i + j
58 + listTemp.append(aList[i + j])
59 + hashTemp[j] = listTemp
60 + return hashTemp
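# Illustrative example (hypothetical list, skip = 2): entry j collects every
# (skip + 1)-th element of the list starting at offset j, e.g.
#   getSkipMentions(['a', 'b', 'c', 'd', 'e', 'f'], 2)
#   -> {0: ['a', 'd'], 1: ['b', 'e']}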
61 +
62 +###########################################################
63 +# MAIN PROGRAM #
64 +###########################################################
65 +
66 +if __name__ == "__main__":
67 + # Parameter definition
68 + parser = OptionParser()
69 + parser.add_option("--inputPath", dest="inputPath",
70 + help="Path to read files", metavar="PATH")
71 + parser.add_option("--trainingFile", dest="trainingFile",
72 + help="File with training examples", metavar="FILE")
73 + parser.add_option("--testFile", dest="testFile",
74 + help="File with test examples", metavar="FILE")
75 + parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
76 + help="File with training classes", metavar="FILE")
77 + parser.add_option("--testClassesFile", dest="testClassesFile",
78 + help="File with test classes", metavar="FILE")
79 + parser.add_option("--outputPath", dest="outputPath",
80 + help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
81 + parser.add_option("--termPath", dest="termPath",
82 + help="Path to read term files", metavar="PATH")
83 + parser.add_option("--termFiles", dest="termFiles",
84 + help="JSON file with terms files and tags", metavar="PATH")
85 + parser.add_option("--inputFileFreq", dest="inputFileFreq",
86 + help="JSON file with information about frequent words", metavar="PATH")
87 + parser.add_option("--skip", type="int",
88 + dest="skip", default=0,
89 + help="Skip mentions", metavar="N")
90 + parser.add_option("--filterStopWords", default=False,
91 + action="store_true", dest="filterStopWords",
92 + help="Filtering stop words")
93 + parser.add_option("--filterPunctMarks", default=False,
94 + action="store_true", dest="filterPunctMarks",
95 + help="Filtering punctuation marks")
96 +
97 + (options, args) = parser.parse_args()
98 + if len(args) > 0:
99 + parser.error("Unexpected positional arguments.")
100 + sys.exit(1)
101 +
102 + # Printing parameter values
103 + print('-------------------------------- PARAMETERS --------------------------------')
104 + print("Path to read files: " + str(options.inputPath))
105 + print("File with training examples", str(options.trainingFile))
106 + print("File with test examples", str(options.testFile))
107 + print("File with training classes", str(options.trainingClassesFile))
108 + print("File with test classes", str(options.testClassesFile))
109 + print("File with training classes", str(options.trainingClassesFile))
110 + print("File with test classes", str(options.testClassesFile))
111 + print("Path to write output files: " + str(options.outputPath))
112 + print("JSON file with information about frequent words: " + str(options.inputFileFreq))
113 + print("Skip mentions: " + str(options.skip))
114 + print("Filtering stop words: " + str(options.stopWords))
115 + punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
116 + print("Filtering puntuation marks " + str(punctMarks) + ': '+ str(options.filterPunctMarks))
117 +
118 + filesRead = 0
119 + t0 = time()
120 +
121 + print('Loading biological term files...')
122 + with open(os.path.join(options.termPath, options.termFiles)) as data_file:
123 + hashes = json.load(data_file)
124 + print(' Loading biological term files... done')
125 +
126 + hashTagAspect = hashes["hashTagAspect"]
127 +
128 + print('Loading frequent words...')
129 + with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
130 + hashAspectFreqWords = json.load(data_file)
131 + print(' Loading frequent words... done')
132 +
133 + listFiles = [options.trainingFile, options.testFile]
134 + listClassesFiles = [options.trainingClassesFile, options.testClassesFile]
135 +
136 + for iFile, cFile in zip(listFiles, listClassesFiles):
137 + with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
138 + print("Reading file..." + iFile)
139 + lines = [l.strip('\n') for l in tFile.readlines()]
140 + filesRead += 1
141 + with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
142 + print("Reading file..." + cFile)
143 + classes = [c.strip('\n') for c in clFile.readlines()]
144 + listLines = []
145 + print("Processing files... ")
146 + for line, c in zip(lines, classes):
147 + # print("class: ", c)
148 + listTokenLine = []
149 + # listLemmaLine = []
150 + for tok in line.split():
151 + tokList = tok.split("|")
152 + word = tokList[0]
153 + lemma = tokList[1]
154 + tag = tokList[2]
155 + # Filtering stopwords
156 + if options.stopWords:
157 + if lemma in stopwords.words('english'):
158 + continue
159 + if options.filterPunctMarks:
160 + if lemma in punctMarks:
161 + continue
162 + # We change the term tag to the aspect tag only in the case that the aspect tag
163 + # coincides with the class. We want the CRF to learn when to change a term tag
164 + # to an aspect tag in the correct context.
165 + if tag in hashTagAspect:
166 + if hashTagAspect[tag] == c:
167 + aTag = hashTagAspect[tag]
168 + else:
169 + aTag = 'O'
170 + else:
171 + if c in hashAspectFreqWords:
172 + # print("class: ", c)
173 + hashFreqWords = hashAspectFreqWords[c]
174 + # We verify if word or lemma is in frequent words.
175 + # These frequent words are word-forms (tokens)
176 + if word.lower() in hashFreqWords or lemma in hashFreqWords:
177 + aTag = c
178 + else:
179 + aTag = 'O'
180 + else:
181 + aTag = 'O'
182 + listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
183 + # if feature == "word":
184 + listLines.append(listTokenLine)
185 + # if feature == "lemma":
186 + # listLines = listLemmaLine.strip() + '\n'
187 + if options.skip > 0:
188 + t0 = time()
189 + skipTemp = options.skip
190 + for i in range(1, options.skip):
191 + hashTemp = getSkipMentions(listLines, skipTemp)
192 + # skipTemp -= 1
193 + for key in hashTemp:
194 + listLinesSkip = hashTemp[key]
195 + with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
196 + options.stopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(skipTemp) + '.txt')), "w",
197 + encoding="utf-8") as oFile:
198 + for line in listLinesSkip:
199 + oFile.write(' '.join(line) + '\n')
200 + print("Skip mention done in: %fs" % (time() - t0))
201 + else:
202 + with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
203 + options.stopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '.txt')), "w",
204 + encoding="utf-8") as oFile:
205 + for line in listLines:
206 + for token in line:
207 + oFile.write(token + ' ')
208 + oFile.write('\n')
209 +
210 + print("Files processed: " + str(filesRead))
\ No newline at end of file
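A runnable sketch of the aspect-tagging rule implemented above. The term-tag-to-aspect mapping and the frequent-word lists are hypothetical stand-ins for the contents of termFilesTag_TFSummarization.json and freqWords_Aspect.json:

# Hypothetical resources; the real ones are loaded from the JSON files.
hashTagAspect = {'DPOS': 'DOM', 'DMOT': 'DOM'}
hashAspectFreqWords = {'DOM': {'domain', 'terminal'}, 'RP': {'activate'}}

def aspectTag(word, lemma, tag, c):
    # The term tag wins only when its aspect coincides with the sentence class c.
    if tag in hashTagAspect:
        return hashTagAspect[tag] if hashTagAspect[tag] == c else 'O'
    # Otherwise fall back to the frequent words of the class.
    freqWords = hashAspectFreqWords.get(c, set())
    return c if (word.lower() in freqWords or lemma in freqWords) else 'O'

for token in "the|the|dt N-terminal|N-terminal|DPOS domain|domain|NN".split():
    word, lemma, tag = token.split('|')
    print(word + '|' + lemma + '|' + tag + '|' + aspectTag(word, lemma, tag, 'DOM'))
# Prints the|the|dt|O, N-terminal|N-terminal|DPOS|DOM, domain|domain|NN|DOM (one token per line)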
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from itertools import chain
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +
9 +import nltk
10 +import sklearn
11 +import scipy.stats
12 +import sys
13 +
14 +from sklearn.externals import joblib
15 +from sklearn.metrics import make_scorer
16 +from sklearn.cross_validation import cross_val_score
17 +from sklearn.grid_search import RandomizedSearchCV
18 +
19 +import sklearn_crfsuite
20 +from sklearn_crfsuite import scorers
21 +from sklearn_crfsuite import metrics
22 +
23 +from nltk.corpus import stopwords
24 +from trainingTesting_Sklearn_crfsuite import word2features
25 +from trainingTesting_Sklearn_crfsuite import sent2features
26 +# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
27 +# from trainingTesting_Sklearn_crfsuite import hasDigit
28 +
29 +# Objective
30 +# Tagging transformed file with CRF model with sklearn-crfsuite.
31 +#
32 +# Input parameters
33 +# --inputPath=PATH Path of transformed files x|y|z
34 +# --modelPath Path to CRF model
35 +# --modelName Model name
36 +# --outputPath=PATH Output path to place output files
37 +# --filterStopWords Filtering stop words
38 +# --filterSymbols Filtering punctuation marks
39 +
40 +# Output
41 +# 1) Tagged files in transformed format
42 +
43 +# Examples
44 +# Sentences
45 +# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
46 +# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
47 +
48 +#################################
49 +# FUNCTIONS #
50 +#################################
51 +# def hasDigit(text):
52 +# has = False
53 +# if len(text) < 3:
54 +# return False
55 +# myRegex = nltk.re.compile('[0-9]')
56 +# if myRegex.search(text) != None:
57 +# has = True
58 +# return has
59 +#
60 +#
61 +# def hasNonAlphaNum(text):
62 +# has = False
63 +# if len(text) < 3:
64 +# return False
65 +# myRegex = nltk.re.compile('\W')
66 +# if myRegex.search(text) != None:
67 +# has = True
68 +# return has
69 +
70 +# IMPORTED FROM TRAINING SCRIPT
71 +# def word2features(sent, i):
72 +# # print "i: " + str(i)
73 +# # print "sent[i]" + sent[i]
74 +# listElem = sent[i].split('|')
75 +# word = listElem[0]
76 +# lemma = listElem[1]
77 +# postag = listElem[2]
78 +#
79 +# features = {
80 +# # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
81 +# # Suffixes
82 +# 'word[-3:]': word[-3:],
83 +# 'word[-2:]': word[-2:],
84 +# 'word[-1:]': word[-1:],
85 +# 'word.isupper()': word.isupper(),
86 +# 'word.istitle()': word.istitle(),
87 +# 'word.hasDigit()': hasDigit(word),
88 +# 'word.hasNonAlphaNum': hasNonAlphaNum(word),
89 +# # 'word.isdigit()': word.isdigit(),
90 +# 'word': word,
91 +# 'lemma': lemma,
92 +# 'lemma[-3:]': lemma[-3:],
93 +# 'lemma[-2:]': lemma[-2:],
94 +# 'lemma[-1:]': lemma[-1:],
95 +# 'postag': postag,
96 +# # Prefixes
97 +# 'postag[:2]': postag[:2],
98 +# 'postag[:1]': postag[:1],
99 +# }
100 +# if i > 0:
101 +# listElem = sent[i - 1].split('|')
102 +# word1 = listElem[0]
103 +# lemma1 = listElem[1]
104 +# postag1 = listElem[2]
105 +# features.update({
106 +# '-1:word.lower()': word1.lower(),
107 +# '-1:word.istitle()': word1.istitle(),
108 +# '-1:word.isupper()': word1.isupper(),
109 +# '-1:word.hasDigit()': hasDigit(word1),
110 +# '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
111 +# '-1:word': word1,
112 +# '-1:lemma': lemma1,
113 +# '-1:postag': postag1,
114 +# '-1:postag[:2]': postag1[:2],
115 +# '-1:postag[:1]': postag1[:1],
116 +# })
117 +# # else:
118 +# # features['BOS'] = True
119 +#
120 +# if i < len(sent) - 1:
121 +# listElem = sent[i + 1].split('|')
122 +# word1 = listElem[0]
123 +# lemma1 = listElem[1]
124 +# postag1 = listElem[2]
125 +# features.update({
126 +# '+1:word.lower()': word1.lower(),
127 +# '+1:word.istitle()': word1.istitle(),
128 +# '+1:word.isupper()': word1.isupper(),
129 +# '+1:word.hasDigit()': hasDigit(word1),
130 +# '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
131 +# '+1:word': word1,
132 +# '+1:lemma': lemma1,
133 +# '+1:postag': postag1,
134 +# '+1:postag[:2]': postag1[:2],
135 +# '+1:postag[:1]': postag1[:1],
136 +# })
137 +# # else:
138 +# # features['EOS'] = True
139 +# if i > 1:
140 +# listElem = sent[i - 2].split('|')
141 +# word2 = listElem[0]
142 +# lemma2 = listElem[1]
143 +# postag2 = listElem[2]
144 +# features.update({
145 +# '-2:word.lower()': word2.lower(),
146 +# '-2:word.istitle()': word2.istitle(),
147 +# '-2:word.isupper()': word2.isupper(),
148 +# '-2:word.hasDigit()': hasDigit(word2),
149 +# '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
150 +# '-2:word': word2,
151 +# '-2:lemma': lemma2,
152 +# '-2:postag': postag2,
153 +# '-2:postag[:2]': postag2[:2],
154 +# '-2:postag[:1]': postag2[:1],
155 +# })
156 +#
157 +# if i < len(sent) - 2:
158 +# listElem = sent[i + 2].split('|')
159 +# word2 = listElem[0]
160 +# lemma2 = listElem[1]
161 +# postag2 = listElem[2]
162 +# features.update({
163 +# '+2:word.lower()': word2.lower(),
164 +# '+2:word.istitle()': word2.istitle(),
165 +# '+2:word.isupper()': word2.isupper(),
166 +# '+2:word.hasDigit()': hasDigit(word2),
167 +# '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
168 +# '+2:word': word2,
169 +# '+2:lemma': lemma2,
170 +# '+2:postag': postag2,
171 +# '+2:postag[:2]': postag2[:2],
172 +# '+2:postag[:1]': postag2[:1],
173 +# })
174 +#
175 +# trigrams = False
176 +# if trigrams:
177 +# if i > 2:
178 +# listElem = sent[i - 3].split('|')
179 +# word3 = listElem[0]
180 +# lemma3 = listElem[1]
181 +# postag3 = listElem[2]
182 +# features.update({
183 +# '-3:word.lower()': word3.lower(),
184 +# '-3:word.istitle()': word3.istitle(),
185 +# '-3:word.isupper()': word3.isupper(),
186 +# '-3:word.hasDigit()': hasDigit(word3),
187 +# '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
188 +# '-3:word': word3,
189 +# '-3:lemma': lemma3,
190 +# '-3:postag': postag3,
191 +# '-3:postag[:2]': postag3[:2],
192 +# '-3:postag[:1]': postag3[:1],
193 +# })
194 +#
195 +# if i < len(sent) - 3:
196 +# listElem = sent[i + 3].split('|')
197 +# word3 = listElem[0]
198 +# lemma3 = listElem[1]
199 +# postag3 = listElem[2]
200 +# features.update({
201 +# '+3:word.lower()': word3.lower(),
202 +# '+3:word.istitle()': word3.istitle(),
203 +# '+3:word.isupper()': word3.isupper(),
204 +# '+3:word.hasDigit()': hasDigit(word3),
205 +# '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
206 +# '+3:word': word3,
207 +# '+3:lemma': lemma3,
208 +# '+3:postag': postag3,
209 +# '+3:postag[:2]': postag3[:2],
210 +# '+3:postag[:1]': postag3[:1],
211 +# })
212 +#
213 +# return features
214 +
215 +
216 +# def sent2features(sent):
217 +# return [word2features(sent, i) for i in range(len(sent))]
218 +
219 +
220 +__author__ = 'CMendezC'
221 +
222 +##########################################
223 +# MAIN PROGRAM #
224 +##########################################
225 +
226 +if __name__ == "__main__":
227 + # Defining parameters
228 + parser = OptionParser()
229 + parser.add_option("--inputPath", dest="inputPath",
230 + help="Path of training data set", metavar="PATH")
231 + parser.add_option("--outputPath", dest="outputPath",
232 + help="Output path to place output files",
233 + metavar="PATH")
234 + parser.add_option("--modelPath", dest="modelPath",
235 + help="Path to read CRF model",
236 + metavar="PATH")
237 + parser.add_option("--modelName", dest="modelName",
238 + help="Model name", metavar="TEXT")
239 + parser.add_option("--filterStopWords", default=False,
240 + action="store_true", dest="filterStopWords",
241 + help="Filtering stop words")
242 + parser.add_option("--filterSymbols", default=False,
243 + action="store_true", dest="filterSymbols",
244 + help="Filtering punctuation marks")
245 +
246 + (options, args) = parser.parse_args()
247 + if len(args) > 0:
248 + parser.error("Unexpected positional arguments.")
249 + sys.exit(1)
250 +
251 + print('-------------------------------- PARAMETERS --------------------------------')
252 + print("Path to read input files: " + options.inputPath)
253 + print("Mode name: " + str(options.modelName))
254 + print("Model path: " + options.modelPath)
255 + print("Path to place output files: " + options.outputPath)
256 + print("Filtering stop words: " + str(options.filterStopWords))
257 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
258 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
259 + # symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
260 + # '}', '[', ']', '*', '%', '$', '#', '&', '°']]
261 + # symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{',
262 + # u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`']
263 + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
264 +
265 + print('-------------------------------- PROCESSING --------------------------------')
266 +
267 + stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
268 +
269 + # Read CRF model
270 + t0 = time()
271 + print('Reading CRF model...')
272 + crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
273 + print("Reading CRF model done in: %fs" % (time() - t0))
274 +
275 + print('Processing corpus...')
276 + t0 = time()
277 + labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
278 + # Walk directory to read files
279 + for path, dirs, files in os.walk(options.inputPath):
280 + # For each file in dir
281 + for file in files:
282 + print(" Preprocessing file..." + str(file))
283 + sentencesInputData = []
284 + sentencesOutputData = []
285 + with open(os.path.join(options.inputPath, file), "r") as iFile:
286 + lines = iFile.readlines()
287 + for line in lines:
288 + listLine = []
289 + # line = line.decode("utf-8")
290 + for token in line.strip('\n').split():
291 + if options.filterStopWords:
292 + listToken = token.split('|')
293 + lemma = listToken[1]
294 + # Original if lemma in stopwords.words('english'):
295 + if lemma in stopwords:
296 + continue
297 + if options.filterSymbols:
298 + listToken = token.split('|')
299 + lemma = listToken[1]
300 + if lemma in symbols:
301 + if lemma == ',':
302 + print "Coma , identificada"
303 + continue
304 + listLine.append(token)
305 + sentencesInputData.append(listLine)
306 + print " Sentences input data: " + str(len(sentencesInputData))
307 + # print sentencesInputData[0]
308 + # print(sent2features(sentencesInputData[0])[0])
309 + # print(sent2labels(sentencesInputData[0]))
310 + X_input = [sent2features(s) for s in sentencesInputData]
311 + print(sent2features(sentencesInputData[0])[0])
312 + # y_test = [sent2labels(s) for s in sentencesInputData]
313 + # Predicting tags
314 + t1 = time()
315 + print " Predicting tags with model"
316 + y_pred = crf.predict(X_input)
317 + print y_pred[0]
318 + print(" Prediction done in: %fs" % (time() - t1))
319 + # Tagging with CRF model
320 + print " Tagging file"
321 + for line, tagLine in zip(lines, y_pred):
322 + outputLine = ''
323 + idx_tagLine = 0
324 + line = line.strip('\n')
325 + print "\nLine: " + str(line)
326 + print "CRF tagged line: " + str(tagLine)
327 + for token in line.split():
328 + listToken = token.split('|')
329 + word = listToken[0]
330 + lemma = listToken[1]
331 + tag = listToken[2]
332 + if options.filterStopWords:
333 + if lemma in stopwords:
334 + outputLine += token + ' '
335 + continue
336 + if options.filterSymbols:
337 + if lemma in symbols:
338 + if lemma == ',':
339 + print "Coma , identificada"
340 + outputLine += token + ' '
341 + continue
342 + CRFtag = tagLine[idx_tagLine]
343 + if (tag not in labels) and (CRFtag != 'O'):
344 + print "*** CRF change token {} to {}".format(token, CRFtag)
345 + outputLine += word + '|' + lemma + '|' + CRFtag + ' '
346 + else:
347 + outputLine += word + '|' + lemma + '|' + tag + ' '
348 + idx_tagLine += 1
349 + sentencesOutputData.append(outputLine.rstrip())
350 + with open(os.path.join(options.outputPath, file), "w") as oFile:
351 + for line in sentencesOutputData:
352 + oFile.write(line + '\n')
353 +
354 + print("Processing corpus done in: %fs" % (time() - t0))
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from itertools import chain
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +
9 +import nltk
10 +import sklearn
11 +import scipy.stats
12 +import sys
13 +
14 +from sklearn.externals import joblib
15 +from sklearn.metrics import make_scorer
16 +from sklearn.cross_validation import cross_val_score
17 +from sklearn.grid_search import RandomizedSearchCV
18 +
19 +import sklearn_crfsuite
20 +from sklearn_crfsuite import scorers
21 +from sklearn_crfsuite import metrics
22 +
23 +from nltk.corpus import stopwords
24 +
25 +
26 +# Objective
27 +# Training and evaluation of CRFs with sklearn-crfsuite.
28 +#
29 +# Input parameters
30 +# --inputPath=PATH Path of training and test data set
31 +# --trainingFile File with training data set
32 +# --testFile File with test data set
33 +# --outputPath=PATH Output path to place output files
34 +# --filterStopWords Filtering stop words
35 +# --filterSymbols Filtering punctuation marks
36 +
37 +# Output
38 +# 1) Best model
39 +
40 +# Examples
41 +# Sentences
42 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
43 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
44 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
45 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
46 +
47 +# Aspects
48 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
49 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
50 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
51 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt
52 +
53 +#################################
54 +# FUNCTIONS #
55 +#################################
56 +
57 +def wordSize(text):
58 + lWord = len(text)
59 + if lWord == 1:
60 + return '1'
61 + elif lWord == 2:
62 + return '2'
63 + elif lWord == 3:
64 + return '3'
65 + elif lWord == 4:
66 + return '4'
67 + elif lWord == 5:
68 + return '5'
69 + elif 6 <= lWord <= 10:
70 + return '6-10'
71 + elif 11 <= lWord <= 15:
72 + return '11-15'
73 + elif 16 <= lWord <= 20:
74 + return '16-20'
75 + elif 21 <= lWord <= 30:
76 + return '21-30'
77 + else:
78 + return '>30'
79 +
80 +def hasUpperLower(text):
81 + has = False
82 + if len(text) < 3:
83 + return False
84 + regexUp = nltk.re.compile('[A-Z]')
85 + regexLo = nltk.re.compile('[a-z]')
86 + if (regexUp.search(text) != None) and (regexLo.search(text) != None):
87 + has = True
88 + return has
89 +
90 +def hasDigit(text):
91 + has = False
92 + if len(text) < 3:
93 + return False
94 + myRegex = nltk.re.compile('[0-9]')
95 + if myRegex.search(text) != None:
96 + has = True
97 + return has
98 +
99 +
100 +def hasNonAlphaNum(text):
101 + has = False
102 + if len(text) < 3:
103 + return False
104 + myRegex = nltk.re.compile('\W')
105 + if myRegex.search(text) != None:
106 + has = True
107 + return has
108 +
109 +def word2features(sent, i):
110 + # print "i: " + str(i)
111 + # print "sent[i]" + sent[i]
112 + listElem = sent[i].split('|')
113 + word = listElem[0]
114 + lemma = listElem[1]
115 + postag = listElem[2]
116 +
117 + features = {
118 + # TF and gene names vary between upper and lower case, so 'word.lower()': word.lower() is omitted
119 + # Suffixes
120 + 'word[-3:]': word[-3:],
121 + 'word[-2:]': word[-2:],
122 + 'word[-1:]': word[-1:],
123 + 'word.isupper()': word.isupper(),
124 + 'word.istitle()': word.istitle(),
125 + 'word.hasDigit()': hasDigit(word),
126 + 'word.hasNonAlphaNum': hasNonAlphaNum(word),
127 + # 'word.hasUpperLower': hasUpperLower(word),
128 + #'wordSize': wordSize(word),
129 + # 'word.isdigit()': word.isdigit(),
130 + 'word': word,
131 + 'lemma': lemma,
132 + 'lemma[-3:]': lemma[-3:],
133 + 'lemma[-2:]': lemma[-2:],
134 + 'lemma[-1:]': lemma[-1:],
135 + 'postag': postag,
136 + # Prefixes
137 + 'postag[:2]': postag[:2],
138 + 'postag[:1]': postag[:1],
139 + }
140 + if i > 0:
141 + listElem = sent[i - 1].split('|')
142 + word1 = listElem[0]
143 + lemma1 = listElem[1]
144 + postag1 = listElem[2]
145 + features.update({
146 + '-1:word.lower()': word1.lower(),
147 + '-1:word.istitle()': word1.istitle(),
148 + '-1:word.isupper()': word1.isupper(),
149 + '-1:word.hasDigit()': hasDigit(word1),
150 + '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
151 + # '-1:word.hasUpperLower': hasUpperLower(word1),
152 + '-1:word': word1,
153 + '-1:lemma': lemma1,
154 + '-1:postag': postag1,
155 + '-1:postag[:2]': postag1[:2],
156 + '-1:postag[:1]': postag1[:1],
157 + })
158 + # else:
159 + # features['BOS'] = True
160 +
161 + if i < len(sent) - 1:
162 + listElem = sent[i + 1].split('|')
163 + word1 = listElem[0]
164 + lemma1 = listElem[1]
165 + postag1 = listElem[2]
166 + features.update({
167 + '+1:word.lower()': word1.lower(),
168 + '+1:word.istitle()': word1.istitle(),
169 + '+1:word.isupper()': word1.isupper(),
170 + '+1:word.hasDigit()': hasDigit(word1),
171 + '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
172 + # '+1:word.hasUpperLower': hasUpperLower(word1),
173 + '+1:word': word1,
174 + '+1:lemma': lemma1,
175 + '+1:postag': postag1,
176 + '+1:postag[:2]': postag1[:2],
177 + '+1:postag[:1]': postag1[:1],
178 + })
179 + # else:
180 + # features['EOS'] = True
181 + if i > 1:
182 + listElem = sent[i - 2].split('|')
183 + word2 = listElem[0]
184 + lemma2 = listElem[1]
185 + postag2 = listElem[2]
186 + features.update({
187 + '-2:word.lower()': word2.lower(),
188 + '-2:word.istitle()': word2.istitle(),
189 + '-2:word.isupper()': word2.isupper(),
190 + '-2:word.hasDigit()': hasDigit(word2),
191 + '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
192 + # '-2:word.hasUpperLower': hasUpperLower(word2),
193 + '-2:word': word2,
194 + '-2:lemma': lemma2,
195 + '-2:postag': postag2,
196 + '-2:postag[:2]': postag2[:2],
197 + '-2:postag[:1]': postag2[:1],
198 + })
199 +
200 + if i < len(sent) - 2:
201 + listElem = sent[i + 2].split('|')
202 + word2 = listElem[0]
203 + lemma2 = listElem[1]
204 + postag2 = listElem[2]
205 + features.update({
206 + '+2:word.lower()': word2.lower(),
207 + '+2:word.istitle()': word2.istitle(),
208 + '+2:word.isupper()': word2.isupper(),
209 + '+2:word.hasDigit()': hasDigit(word2),
210 + '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
211 + # '+2:word.hasUpperLower': hasUpperLower(word2),
212 + '+2:word': word2,
213 + '+2:lemma': lemma2,
214 + '+2:postag': postag2,
215 + '+2:postag[:2]': postag2[:2],
216 + '+2:postag[:1]': postag2[:1],
217 + })
218 +
219 + trigrams = False
220 + if trigrams:
221 + if i > 2:
222 + listElem = sent[i - 3].split('|')
223 + word3 = listElem[0]
224 + lemma3 = listElem[1]
225 + postag3 = listElem[2]
226 + features.update({
227 + '-3:word.lower()': word3.lower(),
228 + '-3:word.istitle()': word3.istitle(),
229 + '-3:word.isupper()': word3.isupper(),
230 + '-3:word.hasDigit()': hasDigit(word3),
231 + '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
232 + # '-3:word.hasUpperLower': hasUpperLower(word3),
233 + '-3:word': word3,
234 + '-3:lemma': lemma3,
235 + '-3:postag': postag3,
236 + '-3:postag[:2]': postag3[:2],
237 + '-3:postag[:1]': postag3[:1],
238 + })
239 +
240 + if i < len(sent) - 3:
241 + listElem = sent[i + 3].split('|')
242 + word3 = listElem[0]
243 + lemma3 = listElem[1]
244 + postag3 = listElem[2]
245 + features.update({
246 + '+3:word.lower()': word3.lower(),
247 + '+3:word.istitle()': word3.istitle(),
248 + '+3:word.isupper()': word3.isupper(),
249 + '+3:word.hasDigit()': hasDigit(word3),
250 + '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
251 + # '+3:word.hasUpperLower': hasUpperLower(word3),
252 + '+3:word': word3,
253 + '+3:lemma': lemma3,
254 + '+3:postag': postag3,
255 + '+3:postag[:2]': postag3[:2],
256 + '+3:postag[:1]': postag3[:1],
257 + })
258 +
259 + return features
260 +
261 +
262 +def sent2features(sent):
263 + return [word2features(sent, i) for i in range(len(sent))]
264 +
265 +
266 +def sent2labels(sent):
267 + return [elem.split('|')[3] for elem in sent]
268 + # return [label for token, postag, label in sent]
269 +
270 +
271 +def sent2tokens(sent):
272 + return [elem.split('|')[0] for elem in sent]  # tokens are word|lemma|postag|label strings
273 +
274 +
275 +def print_transitions(trans_features, f):
276 + for (label_from, label_to), weight in trans_features:
277 + # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
278 + # f.write("label_from :" + label_from)
279 + # f.write("label_to :" + label_to)
280 + # f.write("label_weight :" + weight)
281 + # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
282 + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
283 +
284 +
285 +def print_state_features(state_features, f):
286 + for (attr, label), weight in state_features:
287 + # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
288 + # f.write(attr.encode("utf-8"))
289 + # '{:06.2f}'.format(3.141592653589793)
290 + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
291 +
292 +
293 +__author__ = 'CMendezC'
294 +
295 +##########################################
296 +# MAIN PROGRAM #
297 +##########################################
298 +
299 +if __name__ == "__main__":
300 + # Defining parameters
301 + parser = OptionParser()
302 + parser.add_option("--inputPath", dest="inputPath",
303 + help="Path of training data set", metavar="PATH")
304 + parser.add_option("--outputPath", dest="outputPath",
305 + help="Output path to place output files",
306 + metavar="PATH")
307 + parser.add_option("--trainingFile", dest="trainingFile",
308 + help="File with training data set", metavar="FILE")
309 + parser.add_option("--testFile", dest="testFile",
310 + help="File with test data set", metavar="FILE")
311 + parser.add_option("--filterStopWords", default=False,
312 + action="store_true", dest="filterStopWords",
313 + help="Filtering stop words")
314 + parser.add_option("--filterSymbols", default=False,
315 + action="store_true", dest="filterSymbols",
316 + help="Filtering punctuation marks")
317 +
318 + (options, args) = parser.parse_args()
319 + if len(args) > 0:
320 + parser.error("Unexpected positional arguments.")
321 + sys.exit(1)
322 +
323 + print('-------------------------------- PARAMETERS --------------------------------')
324 + print("Path of training data set: " + options.inputPath)
325 + print("File with training data set: " + str(options.trainingFile))
326 + print("Path of test data set: " + options.inputPath)
327 + print("File with test data set: " + str(options.testFile))
328 + print("Filtering stop words: " + str(options.filterStopWords))
329 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
330 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
331 + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
332 +
333 + print('-------------------------------- PROCESSING --------------------------------')
334 + print('Reading corpus...')
335 + t0 = time()
336 +
337 + sentencesTrainingData = []
338 + sentencesTestData = []
339 +
340 + stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
341 +
342 + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
343 + # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
344 + for line in iFile.readlines():
345 + listLine = []
346 + line = line.decode("utf-8")
347 + for token in line.strip('\n').split():
348 + if options.filterStopWords:
349 + listToken = token.split('|')
350 + lemma = listToken[1]
351 + # Original: if lemma in stopwords.words('english'):
352 + # trainingTesting_Sklearn_crfsuite.py:269:
353 + # UnicodeWarning: Unicode equal comparison failed to
354 + # convert both arguments to Unicode -
355 + # interpreting them as being unequal
356 + if lemma in stopwords:
357 + continue
358 + if options.filterSymbols:
359 + listToken = token.split('|')
360 + lemma = listToken[1]
361 + if lemma in symbols:
362 + # if lemma == ',':
363 + # print "Coma , identificada"
364 + continue
365 + listLine.append(token)
366 + sentencesTrainingData.append(listLine)
367 + print " Sentences training data: " + str(len(sentencesTrainingData))
368 + # print sentencesTrainingData[0]
369 +
370 + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
371 + # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
372 + for line in iFile.readlines():
373 + listLine = []
374 + line = line.decode("utf-8")
375 + for token in line.strip('\n').split():
376 + if options.filterStopWords:
377 + listToken = token.split('|')
378 + lemma = listToken[1]
379 + # Original if lemma in stopwords.words('english'):
380 + if lemma in stopwords:
381 + continue
382 + if options.filterSymbols:
383 + listToken = token.split('|')
384 + lemma = listToken[1]
385 + if lemma in symbols:
386 + # if lemma == ',':
387 + # print "Coma , identificada"
388 + continue
389 + listLine.append(token)
390 + sentencesTestData.append(listLine)
391 + print " Sentences test data: " + str(len(sentencesTestData))
392 + # print sentencesTestData[0]
393 +
394 + print("Reading corpus done in: %fs" % (time() - t0))
395 +
396 + print(sent2features(sentencesTrainingData[0])[0])
397 + print(sent2features(sentencesTestData[0])[0])
398 + # print(sent2labels(sentencesTrainingData[0]))
399 + # print(sent2labels(sentencesTestData[0]))
400 + t0 = time()
401 +
402 + X_train = [sent2features(s) for s in sentencesTrainingData]
403 + y_train = [sent2labels(s) for s in sentencesTrainingData]
404 +
405 + X_test = [sent2features(s) for s in sentencesTestData]
406 + # print X_test
407 + y_test = [sent2labels(s) for s in sentencesTestData]
408 +
409 + # Fixed parameters
410 + # crf = sklearn_crfsuite.CRF(
411 + # algorithm='lbfgs',
412 + # c1=0.1,
413 + # c2=0.1,
414 + # max_iterations=100,
415 + # all_possible_transitions=True
416 + # )
417 +
418 + # Hyperparameter Optimization
419 + crf = sklearn_crfsuite.CRF(
420 + algorithm='lbfgs',
421 + max_iterations=100,
422 + all_possible_transitions=True
423 + )
424 + params_space = {
425 + 'c1': scipy.stats.expon(scale=0.5),
426 + 'c2': scipy.stats.expon(scale=0.05),
427 + }
428 +
429 + # Original: labels = list(crf.classes_)
430 + # Original: labels.remove('O')
431 + labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
432 +
433 + # use the same metric for evaluation
434 + f1_scorer = make_scorer(metrics.flat_f1_score,
435 + average='weighted', labels=labels)
436 +
437 + # search
438 + rs = RandomizedSearchCV(crf, params_space,
439 + cv=3,
440 + verbose=3,
441 + n_jobs=-1,
442 + n_iter=20,
443 + # n_iter=50,
444 + scoring=f1_scorer)
445 + rs.fit(X_train, y_train)
446 +
447 + # Fixed parameters
448 + # crf.fit(X_train, y_train)
449 +
450 + # Best hyperparameters
451 + # crf = rs.best_estimator_
452 + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
453 + options.filterSymbols) + '.txt')
454 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
455 + oFile.write("********** TRAINING AND TESTING REPORT **********\n")
456 + oFile.write("Training file: " + options.trainingFile + '\n')
457 + oFile.write('\n')
458 + oFile.write('best params:' + str(rs.best_params_) + '\n')
459 + oFile.write('best CV score:' + str(rs.best_score_) + '\n')
460 + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
461 +
462 + print("Training done in: %fs" % (time() - t0))
463 + t0 = time()
464 +
465 + # Update best crf
466 + crf = rs.best_estimator_
467 +
468 + # Saving model
469 + print(" Saving training model...")
470 + t1 = time()
471 + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
472 + options.filterSymbols) + '.mod')
473 + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
474 + print(" Saving training model done in: %fs" % (time() - t1))
475 +
476 + # Evaluation against test data
477 + y_pred = crf.predict(X_test)
478 + print("*********************************")
479 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
480 + options.filterSymbols) + '.txt')
481 + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
482 + for y in y_pred:
483 + oFile.write(str(y) + '\n')
484 +
485 + print("*********************************")
486 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
487 + options.filterSymbols) + '.txt')
488 + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
489 + for y in y_test:
490 + oFile.write(str(y) + '\n')
491 +
492 + print("Prediction done in: %fs" % (time() - t0))
493 +
494 + # labels = list(crf.classes_)
495 + # labels.remove('O')
496 +
497 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
498 + oFile.write('\n')
499 + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
500 + oFile.write('\n')
501 + # labels = list(crf.classes_)
502 + sorted_labels = sorted(
503 + labels,
504 + key=lambda name: (name[1:], name[0])
505 + )
506 + oFile.write(metrics.flat_classification_report(
507 + y_test, y_pred, labels=sorted_labels, digits=3
508 + ))
509 + oFile.write('\n')
510 +
511 + oFile.write("\nTop likely transitions:\n")
512 + print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
513 + oFile.write('\n')
514 +
515 + oFile.write("\nTop unlikely transitions:\n")
516 + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
517 + oFile.write('\n')
518 +
519 + oFile.write("\nTop positive:\n")
520 + print_state_features(Counter(crf.state_features_).most_common(200), oFile)
521 + oFile.write('\n')
522 +
523 + oFile.write("\nTop negative:\n")
524 + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
525 + oFile.write('\n')
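A small sketch of the training-data format consumed above: each token is word|lemma|postag|label, sent2features builds one feature dict per token, and sent2labels takes the fourth field as the CRF target. The example sentence is hypothetical:

from trainingTesting_Sklearn_crfsuite import sent2features, sent2labels

sentence = "the|the|DT|O N-terminal|N-terminal|NN|DPOS domain|domain|NN|O".split()

X = sent2features(sentence)  # per-token feature dicts (suffixes, case, POS, +/-1 and +/-2 context)
y = sent2labels(sentence)    # ['O', 'DPOS', 'O']

print(X[1]['word'] + ' ' + X[1]['postag'])  # N-terminal NN
print(y)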