Showing 5 changed files with 1157 additions and 0 deletions
.idea/vcs.xml
0 → 100644
prepare-abstracts.py
0 → 100644
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import re

__author__ = 'CMendezC'

# Objective: Take text-annotated-abstracts-original.txt as input
# to obtain abstracts separated into files without tags, and to collect a dictionary of genes
# for tagging after the NLP pipeline.
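# Illustrative input line (hypothetical PMID; format assumed from the regular
# expressions below): each abstract line starts with "PMID|a|" and gene mentions
# are wrapped in <g>...</g> tags, e.g.:
#   9247049|a|The <g>FhlA</g> protein activates transcription of the <g>hyp</g> operon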

# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path.

# Execution:
# C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
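# Illustrative command (hypothetical paths, assuming the options defined below):
# python prepare-abstracts.py --inputPath data-sets\original --inputFile text-annotated-abstracts-original.txt --outputPath data-sets\abstracts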

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits
        parser.error("Unexpected positional arguments.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    filesWritten = 0
    t0 = time()
    hashGenes = {}

    # (\d+) captures the whole PMID; the original ([\d])+ kept only the last digit
    rePmid = re.compile(r'(\d+)\|a\|')
    reGene = re.compile(r'<g>([^<]+)</g>')
    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file..." + options.inputFile)
        for line in iFile:
            line = line.strip('\n')
            # Collect the dictionary of genes announced in the objective
            for gene in reGene.findall(line):
                print("genes: {}".format(gene))
                hashGenes[gene] = hashGenes.get(gene, 0) + 1
            result = rePmid.match(line)
            if result:
                # Write one file per PMID; strip the <g>...</g> tags, keeping the gene text
                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
                    oFile.write(reGene.sub(r'\1', line))
                filesWritten += 1
    print("Files written: " + str(filesWritten))
preparing-training-validation-test.py
0 → 100644
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import json
from nltk.corpus import stopwords

__author__ = 'CMendezC'

# Objective: Take a transformed file with format word|lemma|tag,
# for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
# and create a file with an additional tag (aTag) for CRF training. For example:
# the|the|DT N-terminal|N-terminal|NN domain|domain|NN -->
# the|the|DT|O N-terminal|N-terminal|NN|DPOS domain|domain|NN|O
# Additionally, we tag frequent words belonging to each aspect with the corresponding aspect aTag (DOM or RP).
# We expect these words to have one aTag in some contexts and a different one in others.
# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
# In the output file we only keep the lemma and the tag, or the word and the tag.
# This additional tagging gives us clues for aspect classification.

# Parameters:
# 1) --inputPath Path to read files.
# 2) --trainingFile File name with training data.
# 3) --testFile File name with test data.
# 4) --trainingClassesFile File name with training classes.
# 5) --testClassesFile File name with test classes.
# 6) --outputPath Path to write files. File names are concatenated with the feature name.
# 7) ELIMINATED --feature Type of feature to extract and create file: lemma
# 8) --termPath Path to read term files and the frequent-words JSON file
# 9) --termFiles JSON file with term files and tags
# 10) --inputFileFreq JSON file with information about frequent words
# 11) --skip=N Skip N words to form skip mentions
# 12) --filterStopWords Filter stop words
# 13) --filterPunctMarks Filter punctuation marks

# Output:
# 1) Files created. Name of feature is concatenated
# Execution:
# ASPECTS
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# SENTENCES
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
# ?? --SKIP

def getSkipMentions(aList, aSkip):
    hashTemp = {}
    for j in range(0, aSkip):
        listTemp = []
        for i in range(0, len(aList), aSkip + 1):
            # Guard against running past the end of the list
            if i + j < len(aList):
                listTemp.append(aList[i + j])
        hashTemp[j] = listTemp
    return hashTemp
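
# Worked example (illustrative): with aList = [t0, t1, t2, t3, t4, t5] and aSkip = 2,
# the stride is aSkip + 1 = 3 and offsets run from 0 to aSkip - 1, so the result is
#   {0: [t0, t3], 1: [t1, t4]}
# i.e. one channel of elements per skip offset (t2 and t5 fall on the skipped offset).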

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test examples", metavar="FILE")
    parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
                      help="File with training classes", metavar="FILE")
    parser.add_option("--testClassesFile", dest="testClassesFile",
                      help="File with test classes", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with terms files and tags", metavar="PATH")
    parser.add_option("--inputFileFreq", dest="inputFileFreq",
                      help="JSON file with information about frequent words", metavar="PATH")
    parser.add_option("--skip", type="int",
                      dest="skip", default=0,
                      help="Skip mentions", metavar="N")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filtering stop words")
    parser.add_option("--filterPunctMarks", default=False,
                      action="store_true", dest="filterPunctMarks",
                      help="Filtering punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits
        parser.error("Unexpected positional arguments.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File with training examples: " + str(options.trainingFile))
    print("File with test examples: " + str(options.testFile))
    print("File with training classes: " + str(options.trainingClassesFile))
    print("File with test classes: " + str(options.testClassesFile))
    print("Path to write output files: " + str(options.outputPath))
    print("JSON file with information about frequent words: " + str(options.inputFileFreq))
    print("Skip mentions: " + str(options.skip))
    # The option is defined as --filterStopWords; the original referenced a non-existent options.stopWords
    print("Filtering stop words: " + str(options.filterStopWords))
    punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
    print("Filtering punctuation marks " + str(punctMarks) + ': ' + str(options.filterPunctMarks))

    filesRead = 0
    t0 = time()

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        hashes = json.load(data_file)
    print('   Loading biological term files... done')

    hashTagAspect = hashes["hashTagAspect"]

    print('Loading frequent words...')
    with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
        hashAspectFreqWords = json.load(data_file)
    print('   Loading frequent words... done')
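
    # Illustrative JSON shapes (assumptions inferred from how these hashes are used below):
    # termFilesTag_TFSummarization.json: {"hashTagAspect": {"DPOS": "DOM", "DMOT": "DOM", ...}}
    #   maps a term tag to its aspect tag (DOM or RP).
    # freqWords_Aspect.json: {"DOM": {"domain": 1, "terminal": 1, ...}, "RP": {...}}
    #   maps each aspect class to its frequent word-forms/lemmas.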

    listFiles = [options.trainingFile, options.testFile]
    listClassesFiles = [options.trainingClassesFile, options.testClassesFile]

    for iFile, cFile in zip(listFiles, listClassesFiles):
        with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
            print("Reading file..." + iFile)
            lines = [l.strip('\n') for l in tFile.readlines()]
            filesRead += 1
        with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
            print("Reading file..." + cFile)
            classes = [c.strip('\n') for c in clFile.readlines()]
        listLines = []
        print("Processing files... ")
        for line, c in zip(lines, classes):
            # print("class: ", c)
            listTokenLine = []
            for tok in line.split():
                tokList = tok.split("|")
                word = tokList[0]
                lemma = tokList[1]
                tag = tokList[2]
                # Filtering stop words (option is --filterStopWords)
                if options.filterStopWords:
                    if lemma in stopwords.words('english'):
                        continue
                if options.filterPunctMarks:
                    if lemma in punctMarks:
                        continue
                # We change the term tag to the aspect tag only when the aspect tag coincides with the class.
                # We want the CRF to learn when to change a term tag to an aspect tag in the correct context.
                if tag in hashTagAspect:
                    if hashTagAspect[tag] == c:
                        aTag = hashTagAspect[tag]
                    else:
                        aTag = 'O'
                else:
                    if c in hashAspectFreqWords:
                        # print("class: ", c)
                        hashFreqWords = hashAspectFreqWords[c]
                        # We verify if the word or the lemma is in frequent words.
                        # These frequent words are word-forms (tokens).
                        if word.lower() in hashFreqWords or lemma in hashFreqWords:
                            aTag = c
                        else:
                            aTag = 'O'
                    else:
                        aTag = 'O'
                listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
            listLines.append(listTokenLine)
        if options.skip > 0:
            t0 = time()
            # Write one file per skip offset (suffix _<offset> is new here); the
            # original overwrote the same file for every offset and wrote Python
            # lists instead of tokens.
            hashTemp = getSkipMentions(listLines, options.skip)
            for key in hashTemp:
                listSkipLines = hashTemp[key]
                with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
                        options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(
                        options.skip) + '_' + str(key) + '.txt')), "w", encoding="utf-8") as oFile:
                    for line in listSkipLines:
                        for token in line:
                            oFile.write(token + ' ')
                        oFile.write('\n')
            print("Skip mention done in: %fs" % (time() - t0))
        else:
            with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
                    options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(
                    options.skip) + '.txt')), "w", encoding="utf-8") as oFile:
                for line in listLines:
                    for token in line:
                        oFile.write(token + ' ')
                    oFile.write('\n')

    print("Files processed: " + str(filesRead))
tagging_Sklearn_crfsuite.py
0 → 100644
# -*- coding: UTF-8 -*-

import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import word2features
from trainingTesting_Sklearn_crfsuite import sent2features
# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
# from trainingTesting_Sklearn_crfsuite import hasDigit

# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH      Path of transformed files x|y|z
# --modelPath           Path to CRF model
# --modelName           Model name
# --outputPath=PATH     Output path to place output files
# --filterStopWords     Filtering stop words
# --filterSymbols       Filtering punctuation marks

# Output
# 1) Tagged files in transformed format

# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
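#
# Illustrative line before/after tagging (hypothetical tokens, assuming x|y|z =
# word|lemma|postag and that the CRF proposes the TF label, one of the labels listed below):
#   input:  the|the|DT FhlA|fhla|NN protein|protein|NN
#   output: the|the|DT FhlA|fhla|TF protein|protein|NN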

#################################
#           FUNCTIONS           #
#################################
# def hasDigit(text):
#     has = False
#     if len(text) < 3:
#         return False
#     myRegex = nltk.re.compile('[0-9]')
#     if myRegex.search(text) != None:
#         has = True
#     return has
#
#
# def hasNonAlphaNum(text):
#     has = False
#     if len(text) < 3:
#         return False
#     myRegex = nltk.re.compile('\W')
#     if myRegex.search(text) != None:
#         has = True
#     return has

# IMPORTED FROM TRAINING SCRIPT: word2features and sent2features come from
# trainingTesting_Sklearn_crfsuite (imported above); see training-validation.py
# for their definitions.

__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of transformed files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath",
                      help="Path to read CRF model",
                      metavar="PATH")
    parser.add_option("--modelName", dest="modelName",
                      help="Model name", metavar="TEXT")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filtering punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits
        parser.error("Unexpected positional arguments.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Model name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # Python 2: decode stop words once instead of comparing bytes to unicode on every token
    stopwords = [word.decode('utf-8') for word in stopwords.words('english')]

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    print('Processing corpus...')
    t0 = time()
    labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("   Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
            for line in lines:
                listLine = []
                for token in line.strip('\n').split():
                    if options.filterStopWords:
                        listToken = token.split('|')
                        lemma = listToken[1]
                        if lemma in stopwords:
                            continue
                    if options.filterSymbols:
                        listToken = token.split('|')
                        lemma = listToken[1]
                        if lemma in symbols:
                            if lemma == ',':
                                print("Comma , identified")
                            continue
                    listLine.append(token)
                sentencesInputData.append(listLine)
            print("   Sentences input data: " + str(len(sentencesInputData)))
            X_input = [sent2features(s) for s in sentencesInputData]
            print(sent2features(sentencesInputData[0])[0])
            # Predicting tags
            t1 = time()
            print("   Predicting tags with model")
            y_pred = crf.predict(X_input)
            print(y_pred[0])
            print("   Prediction done in: %fs" % (time() - t1))
            # Tagging with CRF model
            print("   Tagging file")
            for line, tagLine in zip(lines, y_pred):
                outputLine = ''
                idx_tagLine = 0
                line = line.strip('\n')
                print("\nLine: " + str(line))
                print("CRF tagged line: " + str(tagLine))
                for token in line.split():
                    listToken = token.split('|')
                    word = listToken[0]
                    lemma = listToken[1]
                    tag = listToken[2]
                    # Filtered tokens keep their original tag and do not consume a prediction
                    if options.filterStopWords:
                        if lemma in stopwords:
                            outputLine += token + ' '
                            continue
                    if options.filterSymbols:
                        if lemma in symbols:
                            if lemma == ',':
                                print("Comma , identified")
                            outputLine += token + ' '
                            continue
                    CRFtag = tagLine[idx_tagLine]
                    # Replace the term tag only when the CRF proposes a non-O label
                    if (tag not in labels) and (CRFtag != 'O'):
                        print("*** CRF change token {} to {}".format(token, CRFtag))
                        outputLine += word + '|' + lemma + '|' + CRFtag + ' '
                    else:
                        outputLine += word + '|' + lemma + '|' + tag + ' '
                    idx_tagLine += 1
                sentencesOutputData.append(outputLine.rstrip())
            with open(os.path.join(options.outputPath, file), "w") as oFile:
                for line in sentencesOutputData:
                    oFile.write(line + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))
training-validation.py
0 → 100644
# -*- coding: UTF-8 -*-

import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords

# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH      Path of training and test data set
# --trainingFile        File with training data set
# --testFile            File with test data set
# --outputPath=PATH     Output path to place output files
# --filterStopWords     Filtering stop words
# --filterSymbols       Filtering punctuation marks

# Output
# 1) Best model

# Examples
# Sentences
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt

# Aspects
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt

#################################
#           FUNCTIONS           #
#################################

def wordSize(text):
    lWord = len(text)
    if lWord == 1:
        return '1'
    elif lWord == 2:
        return '2'
    elif lWord == 3:
        return '3'
    elif lWord == 4:
        return '4'
    elif lWord == 5:
        return '5'
    elif 6 <= lWord <= 10:
        return '6-10'
    elif 11 <= lWord <= 15:
        return '11-15'
    elif 16 <= lWord <= 20:
        return '16-20'
    elif 21 <= lWord <= 30:
        return '21-30'
    else:
        return '>30'

def hasUpperLower(text):
    has = False
    if len(text) < 3:
        return False
    regexUp = nltk.re.compile('[A-Z]')
    regexLo = nltk.re.compile('[a-z]')
    if (regexUp.search(text) != None) and (regexLo.search(text) != None):
        has = True
    return has

def hasDigit(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('[0-9]')
    if myRegex.search(text) != None:
        has = True
    return has


def hasNonAlphaNum(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile(r'\W')
    if myRegex.search(text) != None:
        has = True
    return has

def word2features(sent, i):
    # print "i: " + str(i)
    # print "sent[i]" + sent[i]
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]

    features = {
        # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.hasDigit()': hasDigit(word),
        'word.hasNonAlphaNum': hasNonAlphaNum(word),
        # 'word.hasUpperLower': hasUpperLower(word),
        # 'wordSize': wordSize(word),
        # 'word.isdigit()': word.isdigit(),
        'word': word,
        'lemma': lemma,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'postag': postag,
        # Prefixes
        'postag[:2]': postag[:2],
        'postag[:1]': postag[:1],
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.hasDigit()': hasDigit(word1),
            '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '-1:word.hasUpperLower': hasUpperLower(word1),
            '-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['BOS'] = True

    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.hasDigit()': hasDigit(word1),
            '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '+1:word.hasUpperLower': hasUpperLower(word1),
            '+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['EOS'] = True
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.hasDigit()': hasDigit(word2),
            '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '-2:word.hasUpperLower': hasUpperLower(word2),
            '-2:word': word2,
            '-2:lemma': lemma2,
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-2:postag[:1]': postag2[:1],
        })

    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.hasDigit()': hasDigit(word2),
            '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '+2:word.hasUpperLower': hasUpperLower(word2),
            '+2:word': word2,
            '+2:lemma': lemma2,
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+2:postag[:1]': postag2[:1],
        })

    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word.lower()': word3.lower(),
                '-3:word.istitle()': word3.istitle(),
                '-3:word.isupper()': word3.isupper(),
                '-3:word.hasDigit()': hasDigit(word3),
                '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
                # '-3:word.hasUpperLower': hasUpperLower(word3),
                '-3:word': word3,
                '-3:lemma': lemma3,
                '-3:postag': postag3,
                '-3:postag[:2]': postag3[:2],
                '-3:postag[:1]': postag3[:1],
            })

        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word.lower()': word3.lower(),
                '+3:word.istitle()': word3.istitle(),
                '+3:word.isupper()': word3.isupper(),
                '+3:word.hasDigit()': hasDigit(word3),
                '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
                # '+3:word.hasUpperLower': hasUpperLower(word3),
                '+3:word': word3,
                '+3:lemma': lemma3,
                '+3:postag': postag3,
                '+3:postag[:2]': postag3[:2],
                '+3:postag[:1]': postag3[:1],
            })

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
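

# Illustrative usage (hypothetical tokens): each element of a sentence is a
# "word|lemma|postag[|aTag]" string, so
#   sent2features(["FhlA|fhla|NN", "binds|bind|VBZ"])
# returns one feature dict per token; the first dict includes
# 'word': 'FhlA', 'lemma': 'fhla', 'postag': 'NN', 'word[-3:]': 'hlA', ...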


def sent2labels(sent):
    return [elem.split('|')[3] for elem in sent]
    # return [label for token, postag, label in sent]


def sent2tokens(sent):
    # Tokens are "word|lemma|postag|aTag" strings, so take the word field;
    # the original unpacked (token, postag, label) tuples that never occur here
    return [elem.split('|')[0] for elem in sent]


def print_transitions(trans_features, f):
    for (label_from, label_to), weight in trans_features:
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))


def print_state_features(state_features, f):
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))


__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filtering punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits
        parser.error("Unexpected positional arguments.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    sentencesTrainingData = []
    sentencesTestData = []

    # Python 2: decode stop words once. Comparing bytes from stopwords.words('english')
    # to unicode lemmas raised "UnicodeWarning: Unicode equal comparison failed to
    # convert both arguments to Unicode - interpreting them as being unequal"
    stopwords = [word.decode('utf-8') for word in stopwords.words('english')]

    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.decode("utf-8")
            for token in line.strip('\n').split():
                if options.filterStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        continue
                listLine.append(token)
            sentencesTrainingData.append(listLine)
    print("   Sentences training data: " + str(len(sentencesTrainingData)))

    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.decode("utf-8")
            for token in line.strip('\n').split():
                if options.filterStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        continue
                listLine.append(token)
            sentencesTestData.append(listLine)
    print("   Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    print(sent2features(sentencesTrainingData[0])[0])
    print(sent2features(sentencesTestData[0])[0])
    # print(sent2labels(sentencesTrainingData[0]))
    # print(sent2labels(sentencesTestData[0]))
    t0 = time()

    X_train = [sent2features(s) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s) for s in sentencesTestData]
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter Optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
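
    # Note: c1 and c2 are the L1/L2 regularization coefficients of the 'lbfgs'
    # trainer; sampling them from exponential priors (scales 0.5 and 0.05) follows
    # the sklearn-crfsuite tutorial recipe for randomized search.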

    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
    labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])

    # Use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # Search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # Fixed parameters
    # crf.fit(X_train, y_train)

    # Best hyperparameters
    # crf = rs.best_estimator_
    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
        options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print("   Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
        options.filterSymbols) + '.mod')
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("   Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
        options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )
        oFile.write(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted_labels, digits=3
        ))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
        oFile.write('\n')

        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(Counter(crf.state_features_).most_common(200), oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
        oFile.write('\n')