Carlos-Francisco Méndez-Cruz / conditional-random-fields
Authored by Carlos-Francisco Méndez-Cruz, 2018-03-08 01:20:04 -0600
Commit e9aa86d0384ab69bee3d127a384cbe30a1db3c81 (1 parent 919edf2e)
Obtaining training and test data sets
Showing 4 changed files with 610 additions and 212 deletions
prepare-training-test.py
preparing-training-validation-test.py
training-validation-v1.py
training-validation.py
prepare-training-test.py 0 → 100644
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time

__author__ = 'CMendezC'

# Objective: Join transformed files for obtaining training and test data sets
# Parameters:
#   1) --inputPath      Path to read files.
#   2) --trainingFile   File name for training data.
#   3) --testFile       File name for test data.
#   4) --outputPath     Path to write files.
# Output:
#   1) Files created.
# Execution:
# python prepare-training-test.py
#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
#   --trainingFile training-data-set-70.txt
#   --testFile test-data-set-30.txt
#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# python prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File for training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File for test examples", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.",
                      metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File for training examples", str(options.trainingFile))
    print("File for test examples", str(options.testFile))
    print("Path to write output files: " + str(options.outputPath))

    t0 = time()
    trainingDataset = []
    testDataset = []
    counter = 1
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            if counter <= 70:
                print("   Joining file {} to training data set".format(file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        trainingDataset.append(line)
            if counter > 70 and counter <= 100:
                print("   Joining file {} to test data set".format(file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        testDataset.append(line)
            # Move to the next file: the first 70 files feed training, the remaining files feed test
            counter += 1

    # Write the joined lines to the training and test output files
    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in trainingDataset:
            oFile.write("{}\n".format(line))
    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in testDataset:
            oFile.write("{}\n".format(line))
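A quick way to sanity-check the split produced by prepare-training-test.py is to count the joined lines in each output file. The snippet below is a minimal sketch, not part of the commit; the directory and file names are taken from the example invocation above and are assumptions.

# Hypothetical sanity check after running prepare-training-test.py with the
# example arguments shown above (paths and file names are assumed).
import os

dataPath = "/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets"
for name in ("training-data-set-70.txt", "test-data-set-30.txt"):
    with open(os.path.join(dataPath, name), encoding="utf-8") as f:
        # Each line is one transformed sentence joined into the data set
        print(name, sum(1 for _ in f), "lines")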
preparing-training-validation-test.py deleted 100644 → 0
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import json
from nltk.corpus import stopwords

__author__ = 'CMendezC'

# Objective: Take transformed file with format word|lemma|tag,
#   for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
#   and create a file with an additional tagging for CRF training. For example:
#   the|the|dt N-terminal|N-terminal|NN domain|domain|NN -->
#   the|the|dt|O N-terminal|N-terminal|DPOS domain|NN|O
# Additionally, we aTag frequent words belonging to each aspect with the corresponding aspect aTag (DOM or RP).
# We expect these words to have one aTag in some contexts and a different one in others.
# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
# In the output file we only keep the lemma and the tag, or the word and the tag.
# This additional tagging gives us clues for aspect classification.
# Parameters:
#   1) --inputPath          Path to read files.
#   2) --trainingFile       File name with training data.
#   3) --testFile           File name with test data.
#   4) --outputPath         Path to write files. File names are concatenated with feature name.
#   5) ELIMINATED --feature Type of feature to extract and create file: lemma
#   6) --termPath           Path to read term files.
#   7) --termFiles          JSON file with term files and tags.
#   8) --termPath           Path to read JSON file with information about frequent words files.
#   9) --inputFileFreq      JSON file with information about frequent words.
#  10) --skip=N             Skip N words to form skip mentions.
#  11) --filterStopWords    Filtering stop words.
#  12) --filterPunctMarks   Filtering punctuation marks.
# Output:
#   1) Files created. Name of feature is concatenated.
# Execution:
# ASPECTS
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# SENTENCES
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
# ¿? --SKIP

def getSkipMentions(aList, aSkip):
    hashTemp = {}
    for j in range(0, aSkip):
        listTemp = []
        for i in range(0, len(aList), aSkip + 1):
            listTemp.append(aList[i + j])
        hashTemp[j] = listTemp
    return hashTemp

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test examples", metavar="FILE")
    parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
                      help="File with training classes", metavar="FILE")
    parser.add_option("--testClassesFile", dest="testClassesFile",
                      help="File with test classes", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.",
                      metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with terms files and tags", metavar="PATH")
    parser.add_option("--inputFileFreq", dest="inputFileFreq",
                      help="JSON file with information about frequent words", metavar="PATH")
    parser.add_option("--skip", type="int", dest="skip", default=0,
                      help="Skip mentions", metavar="N")
    parser.add_option("--filterStopWords", default=False, action="store_true",
                      dest="filterStopWords", help="Filtering stop words")
    parser.add_option("--filterPunctMarks", default=False, action="store_true",
                      dest="filterPunctMarks", help="Filtering punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File with training examples", str(options.trainingFile))
    print("File with test examples", str(options.testFile))
    print("File with training classes", str(options.trainingClassesFile))
    print("File with test classes", str(options.testClassesFile))
    print("Path to write output files: " + str(options.outputPath))
    print("JSON file with information about frequent words: " + str(options.inputFileFreq))
    print("Skip mentions: " + str(options.skip))
    print("Filtering stop words: " + str(options.filterStopWords))
    punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
    print("Filtering punctuation marks " + str(punctMarks) + ': ' + str(options.filterPunctMarks))

    filesRead = 0
    t0 = time()

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        hashes = json.load(data_file)
    print('   Loading biological term files... done')
    hashTagAspect = hashes["hashTagAspect"]

    print('Loading frequent words...')
    with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
        hashAspectFreqWords = json.load(data_file)
    print('   Loading frequent words... done')

    listFiles = [options.trainingFile, options.testFile]
    listClassesFiles = [options.trainingClassesFile, options.testClassesFile]
    for iFile, cFile in zip(listFiles, listClassesFiles):
        with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
            print("Reading file..." + iFile)
            lines = [l.strip('\n') for l in tFile.readlines()]
        filesRead += 1
        with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
            print("Reading file..." + cFile)
            classes = [c.strip('\n') for c in clFile.readlines()]

        listLines = []
        print("Processing files... ")
        for line, c in zip(lines, classes):
            # print("class: ", c)
            listTokenLine = []
            # listLemmaLine = []
            for tok in line.split():
                tokList = tok.split("|")
                word = tokList[0]
                lemma = tokList[1]
                tag = tokList[2]
                # Filtering stopwords
                if options.filterStopWords:
                    if lemma in stopwords.words('english'):
                        continue
                if options.filterPunctMarks:
                    if lemma in punctMarks:
                        continue
                # if tag in hashTagAspect:
                # We change the tag for the aspect tag only when the aspect tag coincides with the class.
                # We want the CRF to learn when to change a term tag to an aspect tag in the correct context.
                if tag in hashTagAspect:
                    if hashTagAspect[tag] == c:
                        aTag = hashTagAspect[tag]
                    else:
                        aTag = 'O'
                else:
                    if c in hashAspectFreqWords:
                        # print("class: ", c)
                        hashFreqWords = hashAspectFreqWords[c]
                        # We verify whether the word or lemma is in the frequent words.
                        # These frequent words are word-forms (tokens).
                        if word.lower() in hashFreqWords or lemma in hashFreqWords:
                            aTag = c
                        else:
                            aTag = 'O'
                    else:
                        aTag = 'O'
                listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
            # if feature == "word":
            listLines.append(listTokenLine)
            # if feature == "lemma":
            #     listLines = listLemmaLine.strip() + '\n'

        if options.skip > 0:
            t0 = time()
            skipTemp = options.skip
            for i in range(1, options.skip):
                hashTemp = getSkipMentions(listLines, skipTemp)
                # skipTemp -= 1
                for key in hashTemp:
                    listLines = hashTemp[key]
                    with open(os.path.join(options.outputPath,
                                           iFile.replace('.txt', '.StopWords_' + str(options.filterStopWords) +
                                                         '.FilterPunctMarks_' + str(options.filterPunctMarks) +
                                                         '.Skip_' + str(skipTemp) + '.txt')),
                              "w", encoding="utf-8") as oFile:
                        for line in listLines:
                            oFile.write(line)
            print("Skip mention done in: %fs" % (time() - t0))
        else:
            with open(os.path.join(options.outputPath,
                                   iFile.replace('.txt', '.StopWords_' + str(options.filterStopWords) +
                                                 '.FilterPunctMarks_' + str(options.filterPunctMarks) +
                                                 '.Skip_' + str(options.skip) + '.txt')),
                      "w", encoding="utf-8") as oFile:
                for line in listLines:
                    for token in line:
                        oFile.write(token + ' ')
                    oFile.write('\n')

    print("Files processed: " + str(filesRead))
\ No newline at end of file
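To see the aspect-tagging rule of preparing-training-validation-test.py in isolation, here is a small self-contained sketch that mimics the core loop on one made-up sentence. hashTagAspect, hashAspectFreqWords, the sample tokens, and the sentence class are hypothetical stand-ins for the JSON resources the script loads; this is not code from the commit.

# Hypothetical miniature of the aspect-tagging rule (all data below is invented):
# a token gets aTag 'O' unless its term tag maps to the sentence class, or the
# word/lemma appears among that class's frequent words.
hashTagAspect = {"DPOS": "DOM"}                        # term tag -> aspect tag (assumed)
hashAspectFreqWords = {"DOM": {"domain", "terminal"}}  # class -> frequent words (assumed)

sentence = "the|the|dt N-terminal|N-terminal|NN domain|domain|DPOS".split()
sentenceClass = "DOM"
freq = hashAspectFreqWords.get(sentenceClass, set())

tagged = []
for tok in sentence:
    word, lemma, tag = tok.split("|")
    if tag in hashTagAspect:
        # Keep the aspect tag only when it coincides with the sentence class
        aTag = hashTagAspect[tag] if hashTagAspect[tag] == sentenceClass else "O"
    elif word.lower() in freq or lemma in freq:
        aTag = sentenceClass
    else:
        aTag = "O"
    tagged.append("|".join((word, lemma, tag, aTag)))

print(" ".join(tagged))
# the|the|dt|O N-terminal|N-terminal|NN|O domain|domain|DPOS|DOM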
training-validation-v1.py 0 → 100644
# -*- coding: UTF-8 -*-

import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords

# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH       Path of training and test data set
# --trainingFile         File with training data set
# --testFile             File with test data set
# --outputPath=PATH      Output path to place output files
# --filterStopWords      Filtering stop words
# --filterSymbols        Filtering punctuation marks
# Output
# 1) Best model
# Examples
# Sentences
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
# Aspects
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt

#################################
#           FUNCTIONS           #
#################################

def wordSize(text):
    lWord = len(text)
    if lWord == 1:
        return '1'
    elif lWord == 2:
        return '2'
    elif lWord == 3:
        return '3'
    elif lWord == 4:
        return '4'
    elif lWord == 5:
        return '5'
    elif 6 <= lWord <= 10:
        return '6-10'
    elif 11 <= lWord <= 15:
        return '11-15'
    elif 16 <= lWord <= 20:
        return '16-20'
    elif 21 <= lWord <= 30:
        return '21-30'
    else:
        return '>30'

def hasUpperLower(text):
    has = False
    if len(text) < 3:
        return False
    regexUp = nltk.re.compile('[A-Z]')
    regexLo = nltk.re.compile('[a-z]')
    if (regexUp.search(text) != None) and (regexLo.search(text) != None):
        has = True
    return has

def hasDigit(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('[0-9]')
    if myRegex.search(text) != None:
        has = True
    return has

def hasNonAlphaNum(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('\W')
    if myRegex.search(text) != None:
        has = True
    return has

def word2features(sent, i):
    # print "i: " + str(i)
    # print "sent[i]" + sent[i]
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]
    features = {
        # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.hasDigit()': hasDigit(word),
        'word.hasNonAlphaNum': hasNonAlphaNum(word),
        # 'word.hasUpperLower': hasUpperLower(word),
        # 'wordSize': wordSize(word),
        # 'word.isdigit()': word.isdigit(),
        'word': word,
        'lemma': lemma,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'postag': postag,
        # Prefixes
        'postag[:2]': postag[:2],
        'postag[:1]': postag[:1],
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.hasDigit()': hasDigit(word1),
            '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '-1:word.hasUpperLower': hasUpperLower(word1),
            '-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['BOS'] = True
    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.hasDigit()': hasDigit(word1),
            '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '+1:word.hasUpperLower': hasUpperLower(word1),
            '+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['EOS'] = True
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.hasDigit()': hasDigit(word2),
            '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '-2:word.hasUpperLower': hasUpperLower(word2),
            '-2:word': word2,
            '-2:lemma': lemma2,
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-2:postag[:1]': postag2[:1],
        })
    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.hasDigit()': hasDigit(word2),
            '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '+2:word.hasUpperLower': hasUpperLower(word2),
            '+2:word': word2,
            '+2:lemma': lemma2,
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+2:postag[:1]': postag2[:1],
        })
    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word.lower()': word3.lower(),
                '-3:word.istitle()': word3.istitle(),
                '-3:word.isupper()': word3.isupper(),
                '-3:word.hasDigit()': hasDigit(word3),
                '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
                # '-3:word.hasUpperLower': hasUpperLower(word3),
                '-3:word': word3,
                '-3:lemma': lemma3,
                '-3:postag': postag3,
                '-3:postag[:2]': postag3[:2],
                '-3:postag[:1]': postag3[:1],
            })
        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word.lower()': word3.lower(),
                '+3:word.istitle()': word3.istitle(),
                '+3:word.isupper()': word3.isupper(),
                '+3:word.hasDigit()': hasDigit(word3),
                '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
                # '+3:word.hasUpperLower': hasUpperLower(word3),
                '+3:word': word3,
                '+3:lemma': lemma3,
                '+3:postag': postag3,
                '+3:postag[:2]': postag3[:2],
                '+3:postag[:1]': postag3[:1],
            })
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [elem.split('|')[3] for elem in sent]
    # return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

def print_transitions(trans_features, f):
    for (label_from, label_to), weight in trans_features:
        # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
        # f.write("label_from :" + label_from)
        # f.write("label_to :" + label_to)
        # f.write("label_weight :" + weight)
        # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))

def print_state_features(state_features, f):
    for (attr, label), weight in state_features:
        # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
        # f.write(attr.encode("utf-8"))
        # '{:06.2f}'.format(3.141592653589793)
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))

__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--filterStopWords", default=False, action="store_true",
                      dest="filterStopWords", help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False, action="store_true",
                      dest="filterSymbols", help="Filtering punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No arguments expected.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\',
               '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    sentencesTrainingData = []
    sentencesTestData = []

    stopwords = [word.decode('utf-8') for word in stopwords.words('english')]

    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
        # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.decode("utf-8")
            for token in line.strip('\n').split():
                if options.filterStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    # Original: if lemma in stopwords.words('english'):
                    #   trainingTesting_Sklearn_crfsuite.py:269:
                    #   UnicodeWarning: Unicode equal comparison failed to
                    #   convert both arguments to Unicode -
                    #   interpreting them as being unequal
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        # if lemma == ',':
                        #     print "Comma , identified"
                        continue
                listLine.append(token)
            sentencesTrainingData.append(listLine)
    print "   Sentences training data: " + str(len(sentencesTrainingData))
    # print sentencesTrainingData[0]

    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
        # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.decode("utf-8")
            for token in line.strip('\n').split():
                if options.filterStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    # Original: if lemma in stopwords.words('english'):
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        # if lemma == ',':
                        #     print "Comma , identified"
                        continue
                listLine.append(token)
            sentencesTestData.append(listLine)
    print "   Sentences test data: " + str(len(sentencesTestData))
    # print sentencesTestData[0]

    print("Reading corpus done in: %fs" % (time() - t0))

    print(sent2features(sentencesTrainingData[0])[0])
    print(sent2features(sentencesTestData[0])[0])
    # print(sent2labels(sentencesTrainingData[0]))
    # print(sent2labels(sentencesTestData[0]))

    t0 = time()

    X_train = [sent2features(s) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s) for s in sentencesTestData]
    # print X_test
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter Optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
    labels = list(['GENE'])

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # Fixed parameters
    # crf.fit(X_train, y_train)

    # Best hyperparameters
    # crf = rs.best_estimator_
    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                              '.fSymbols_' + str(options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print("   Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                             '.fSymbols_' + str(options.filterSymbols) + '.mod')
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("   Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                        '.fSymbols_' + str(options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                        '.fSymbols_' + str(options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    # labels = list(crf.classes_)
    # labels.remove('O')

    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        # labels = list(crf.classes_)
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        oFile.write(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
        oFile.write('\n')
        oFile.write("\nTop likely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
        oFile.write('\n')
        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
        oFile.write('\n')
        oFile.write("\nTop positive:\n")
        print_state_features(Counter(crf.state_features_).most_common(200), oFile)
        oFile.write('\n')
        oFile.write("\nTop negative:\n")
        print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
        oFile.write('\n')
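The pipeline in training-validation-v1.py boils down to: turn every word|lemma|postag|label token into a feature dictionary, build X/y per sentence, then fit a CRF. The following is a stripped-down sketch of that flow on two invented sentences with label set {'GENE', 'O'}; it uses fixed c1/c2 values instead of the randomized search above, and the tokens, feature names, and data are assumptions made for illustration only.

# Toy end-to-end sketch of the CRF training flow (invented data, fixed
# hyperparameters instead of RandomizedSearchCV). Requires sklearn-crfsuite.
import sklearn_crfsuite
from sklearn_crfsuite import metrics

def token2features(sent, i):
    # Minimal subset of the features built by word2features above
    word, lemma, postag, _ = sent[i].split('|')
    feats = {'word': word, 'lemma': lemma, 'postag': postag,
             'word[-3:]': word[-3:], 'word.isupper()': word.isupper()}
    if i > 0:
        pword, plemma, ppostag, _ = sent[i - 1].split('|')
        feats.update({'-1:word': pword, '-1:postag': ppostag})
    return feats

# Two invented sentences in the word|lemma|postag|label format the scripts expect
sents = [
    "araC|arac|NN|GENE regulates|regulate|VBZ|O transcription|transcription|NN|O".split(),
    "the|the|DT|O marR|marr|NN|GENE repressor|repressor|NN|O".split(),
]
X = [[token2features(s, i) for i in range(len(s))] for s in sents]
y = [[tok.split('|')[3] for tok in s] for s in sents]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=50, all_possible_transitions=True)
crf.fit(X, y)
# Evaluate on the training toy data just to show the metric call
print(metrics.flat_f1_score(y, crf.predict(X), average='weighted', labels=['GENE']))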
training-validation.py
...
@@ -428,7 +428,7 @@ if __name__ == "__main__":
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-    labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
+    labels = list(['GENE'])
     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
...
@@ -436,7 +436,7 @@ if __name__ == "__main__":
     # search
     rs = RandomizedSearchCV(crf, params_space,
-                            cv=3,
+                            cv=10,
                             verbose=3,
                             n_jobs=-1,
                             n_iter=20,
...