Showing 5 changed files with 1157 additions and 0 deletions
.idea/vcs.xml
0 → 100644
prepare-abstracts.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +from time import time | ||
7 | +import re | ||
8 | + | ||
9 | +__author__ = 'CMendezC' | ||
10 | + | ||
11 | +# Objective: Take text-annotated-abstracts-original.txt as input, | ||
12 | +# split the abstracts into separate files without tags, and collect a dictionary of genes | ||
13 | +# for tagging after the NLP pipeline. | ||
14 | + | ||
15 | +# Parameters: | ||
16 | +# 1) --inputPath Input path. | ||
17 | +# 2) --inputFile Input file. | ||
18 | +# 3) --outputPath Output path | ||
19 | + | ||
20 | +# Execution: | ||
21 | +#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original | ||
22 | + | ||
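A minimal sketch of the input line format this script assumes, inferred from the regular expressions used below; the PMID and abstract text here are invented for illustration:

import re
rePmid = re.compile(r'(\d+)\|a\|')
reGene = re.compile(r'<g>([^<]+)</g>')
# Hypothetical input line from text-annotated-abstracts-original.txt
line = "12345678|a|The <g>fhlA</g> gene encodes a transcriptional activator."
print(rePmid.match(line).group(1))  # -> 12345678
print(reGene.findall(line))         # -> ['fhlA']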
23 | +if __name__ == "__main__": | ||
24 | + # Parameter definition | ||
25 | + parser = OptionParser() | ||
26 | + parser.add_option("--inputPath", dest="inputPath", | ||
27 | + help="Input path", metavar="PATH") | ||
28 | + parser.add_option("--inputFile", dest="inputFile", | ||
29 | + help="Input file", metavar="FILE") | ||
30 | + parser.add_option("--outputPath", dest="outputPath", | ||
31 | + help="Output path", metavar="PATH") | ||
32 | + | ||
33 | + (options, args) = parser.parse_args() | ||
34 | + if len(args) > 0: | ||
35 | + parser.error("None parameters indicated.") | ||
36 | + sys.exit(1) | ||
37 | + | ||
38 | + # Printing parameter values | ||
39 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
40 | + print("Input path: " + str(options.inputPath)) | ||
41 | + print("Input file", str(options.inputFile)) | ||
42 | + print("Output path: " + str(options.outputPath)) | ||
43 | + | ||
44 | + filesWritten = 0 | ||
45 | + t0 = time() | ||
46 | + hashGenes = {} | ||
47 | + | ||
48 | + rePmid = re.compile(r'(\d+)\|a\|') | ||
49 | + reGene = re.compile(r'<g>([^<]+)</g>') | ||
50 | + with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: | ||
51 | + print("Reading file..." + options.inputFile) | ||
52 | + for line in iFile: | ||
53 | + line = line.strip('\n') | ||
54 | + for gene in reGene.findall(line): | ||
55 | + print("genes: {}".format(gene)) | ||
56 | + hashGenes[gene] = hashGenes.get(gene, 0) + 1 | ||
57 | + result = rePmid.match(line) | ||
58 | + if result: | ||
59 | + with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: | ||
60 | + # Write the abstract without the <g>...</g> tags | ||
61 | + oFile.write(reGene.sub(r'\1', line)) | ||
62 | + | ||
preparing-training-validation-test.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +from time import time | ||
7 | +import json | ||
8 | +from nltk.corpus import stopwords | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | +# Objective: Take a transformed file with format word|lemma|tag, | ||
13 | +# for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP | ||
14 | +# and create a file with an additional tag for CRF training. For example: | ||
15 | +# the|the|DT N-terminal|N-terminal|NN domain|domain|NN --> | ||
16 | +# the|the|DT|O N-terminal|N-terminal|NN|DPOS domain|domain|NN|O | ||
17 | +# Additionally, we tag frequent words belonging to each aspect with the corresponding aspect tag (DOM or RP). | ||
18 | +# We expect these words to receive one aspect tag in some contexts and a different one in others. | ||
19 | +# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here. | ||
20 | +# In the output file we keep the word, the lemma, the tag, and the aspect tag. | ||
21 | +# This additional tagging gives us clues for aspect classification. | ||
22 | + | ||
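A minimal sketch of the per-token tagging rule implemented in the main loop below; the helper name and the dictionary contents are hypothetical, and hashFreqWords stands for the frequent-word set of the current class c:

def aspect_tag(word, lemma, tag, c, hashTagAspect, hashFreqWords):
    # A term tag is replaced by its aspect tag only when that aspect matches the class c
    if tag in hashTagAspect:
        return hashTagAspect[tag] if hashTagAspect[tag] == c else 'O'
    # Otherwise, frequent words of the class are tagged with the class itself
    if word.lower() in hashFreqWords or lemma in hashFreqWords:
        return c
    return 'O'

# Example with invented dictionaries:
# aspect_tag('domain', 'domain', 'NN', 'DOM', {'DPOS': 'DOM'}, {'domain'})  -> 'DOM'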
23 | +# Parameters: | ||
24 | +# 1) --inputPath Path to read files. | ||
25 | +# 2) --trainingFile File name with training data. | ||
26 | +# 3) --testFile File name with test data. | ||
27 | +# 4) --outputPath Path to write files. File names are concatenated with feature name. | ||
28 | +# 5) ELIMINATED --feature Type of feature to extract and create file: lemma | ||
29 | +# 6) --termPath Path to read term files | ||
30 | +# 7) --termFiles JSON file with terms files and tags | ||
31 | +# 8) --termPath Path to read JSON file with information about frequent words files | ||
32 | +# 9) --inputFileFreq JSON file with information about frequent words | ||
33 | +# 10) --skip=N Skip N words to form skip mentions | ||
34 | +# 11) --filterStopWords Filter stop words | ||
35 | +# 12) --filterPunctMarks Filter punctuation marks | ||
36 | + | ||
37 | +# Output: | ||
38 | +# 1) Files created. Name of feature is concatenated | ||
39 | + | ||
40 | +# Execution: | ||
41 | +# ASPECTS | ||
42 | +# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
43 | + | ||
44 | +# SENTENCES | ||
45 | +# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
46 | + | ||
47 | +# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json | ||
48 | +# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords | ||
49 | +# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks | ||
50 | +# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks | ||
51 | +# TODO: add an execution example with --skip | ||
52 | + | ||
53 | +def getSkipMentions(aList, aSkip): | ||
54 | + hashTemp = {} | ||
55 | + for j in range(0, aSkip): | ||
56 | + listTemp = [] | ||
57 | + for i in range(0, len(aList) - j, aSkip+1): | ||
58 | + listTemp.append(aList[i + j]) | ||
59 | + hashTemp[j] = listTemp | ||
60 | + return hashTemp | ||
61 | + | ||
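A minimal usage sketch of getSkipMentions with invented tokens:

tokens = ['t0', 't1', 't2', 't3', 't4', 't5']
print(getSkipMentions(tokens, 2))
# -> {0: ['t0', 't3'], 1: ['t1', 't4']}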
62 | +########################################################### | ||
63 | +# MAIN PROGRAM # | ||
64 | +########################################################### | ||
65 | + | ||
66 | +if __name__ == "__main__": | ||
67 | + # Parameter definition | ||
68 | + parser = OptionParser() | ||
69 | + parser.add_option("--inputPath", dest="inputPath", | ||
70 | + help="Path to read files", metavar="PATH") | ||
71 | + parser.add_option("--trainingFile", dest="trainingFile", | ||
72 | + help="File with training examples", metavar="FILE") | ||
73 | + parser.add_option("--testFile", dest="testFile", | ||
74 | + help="File with test examples", metavar="FILE") | ||
75 | + parser.add_option("--trainingClassesFile", dest="trainingClassesFile", | ||
76 | + help="File with training classes", metavar="FILE") | ||
77 | + parser.add_option("--testClassesFile", dest="testClassesFile", | ||
78 | + help="File with test classes", metavar="FILE") | ||
79 | + parser.add_option("--outputPath", dest="outputPath", | ||
80 | + help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH") | ||
81 | + parser.add_option("--termPath", dest="termPath", | ||
82 | + help="Path to read term files", metavar="PATH") | ||
83 | + parser.add_option("--termFiles", dest="termFiles", | ||
84 | + help="JSON file with terms files and tags", metavar="PATH") | ||
85 | + parser.add_option("--inputFileFreq", dest="inputFileFreq", | ||
86 | + help="JSON file with information about frequent words", metavar="PATH") | ||
87 | + parser.add_option("--skip", type="int", | ||
88 | + dest="skip", default=0, | ||
89 | + help="Skip mentions", metavar="N") | ||
90 | + parser.add_option("--filterStopWords", default=False, | ||
91 | + action="store_true", dest="filterStopWords", | ||
92 | + help="Filtering stop words") | ||
93 | + parser.add_option("--filterPunctMarks", default=False, | ||
94 | + action="store_true", dest="filterPunctMarks", | ||
95 | + help="Filtering punctuation marks") | ||
96 | + | ||
97 | + (options, args) = parser.parse_args() | ||
98 | + if len(args) > 0: | ||
99 | + parser.error("None parameters indicated.") | ||
100 | + sys.exit(1) | ||
101 | + | ||
102 | + # Printing parameter values | ||
103 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
104 | + print("Path to read files: " + str(options.inputPath)) | ||
105 | + print("File with training examples", str(options.trainingFile)) | ||
106 | + print("File with test examples", str(options.testFile)) | ||
107 | + print("File with training classes", str(options.trainingClassesFile)) | ||
108 | + print("File with test classes", str(options.testClassesFile)) | ||
109 | + print("File with training classes", str(options.trainingClassesFile)) | ||
110 | + print("File with test classes", str(options.testClassesFile)) | ||
111 | + print("Path to write output files: " + str(options.outputPath)) | ||
112 | + print("JSON file with information about frequent words: " + str(options.inputFileFreq)) | ||
113 | + print("Skip mentions: " + str(options.skip)) | ||
114 | + print("Filtering stop words: " + str(options.stopWords)) | ||
115 | + punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"'] | ||
116 | + print("Filtering puntuation marks " + str(punctMarks) + ': '+ str(options.filterPunctMarks)) | ||
117 | + | ||
118 | + filesRead = 0 | ||
119 | + t0 = time() | ||
120 | + | ||
121 | + print('Loading biological term files...') | ||
122 | + with open(os.path.join(options.termPath, options.termFiles)) as data_file: | ||
123 | + hashes = json.load(data_file) | ||
124 | + print(' Loading biological term files... done') | ||
125 | + | ||
126 | + hashTagAspect = hashes["hashTagAspect"] | ||
127 | + | ||
128 | + print('Loading frequent words...') | ||
129 | + with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file: | ||
130 | + hashAspectFreqWords = json.load(data_file) | ||
131 | + print(' Loading frequent words... done') | ||
132 | + | ||
133 | + listFiles = [options.trainingFile, options.testFile] | ||
134 | + listClassesFiles = [options.trainingClassesFile, options.testClassesFile] | ||
135 | + | ||
136 | + for iFile, cFile in zip(listFiles, listClassesFiles): | ||
137 | + with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile: | ||
138 | + print("Reading file..." + iFile) | ||
139 | + lines = [l.strip('\n') for l in tFile.readlines()] | ||
140 | + filesRead += 1 | ||
141 | + with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile: | ||
142 | + print("Reading file..." + cFile) | ||
143 | + classes = [c.strip('\n') for c in clFile.readlines()] | ||
144 | + listLines = [] | ||
145 | + print("Processing files... ") | ||
146 | + for line, c in zip(lines, classes): | ||
147 | + # print("class: ", c) | ||
148 | + listTokenLine = [] | ||
149 | + # listLemmaLine = [] | ||
150 | + for tok in line.split(): | ||
151 | + tokList = tok.split("|") | ||
152 | + word = tokList[0] | ||
153 | + lemma = tokList[1] | ||
154 | + tag = tokList[2] | ||
155 | + # Filtering stopwords | ||
156 | + if options.filterStopWords: | ||
157 | + if lemma in stopwords.words('english'): | ||
158 | + continue | ||
159 | + if options.filterPunctMarks: | ||
160 | + if lemma in punctMarks: | ||
161 | + continue | ||
162 | + # if tag in hashTagAspect: | ||
163 | +# We change the term tag to the aspect tag only when the aspect tag coincides with the class. | ||
164 | +# We want the CRF to learn when to change a term tag to an aspect tag in the correct context. | ||
165 | + if tag in hashTagAspect: | ||
166 | + if hashTagAspect[tag] == c: | ||
167 | + aTag = hashTagAspect[tag] | ||
168 | + else: | ||
169 | + aTag = 'O' | ||
170 | + else: | ||
171 | + if c in hashAspectFreqWords: | ||
172 | + # print("class: ", c) | ||
173 | + hashFreqWords = hashAspectFreqWords[c] | ||
174 | + # We verify if word or lemma is in frequent words. | ||
175 | + # These frequent words are word-forms (tokens) | ||
176 | + if word.lower() in hashFreqWords or lemma in hashFreqWords: | ||
177 | + aTag = c | ||
178 | + else: | ||
179 | + aTag = 'O' | ||
180 | + else: | ||
181 | + aTag = 'O' | ||
182 | + listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag) | ||
183 | + # if feature == "word": | ||
184 | + listLines.append(listTokenLine) | ||
185 | + # if feature == "lemma": | ||
186 | + # listLines = listLemmaLine.strip() + '\n' | ||
187 | + if options.skip > 0: | ||
188 | + t0 = time() | ||
189 | + skipTemp = options.skip | ||
190 | + for i in range(1, options.skip): | ||
191 | + hashTemp = getSkipMentions(listLines, skipTemp) | ||
192 | + # skipTemp -= 1 | ||
193 | + for key in hashTemp: | ||
194 | + listLines = hashTemp[key] | ||
195 | + with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str( | ||
196 | + options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(skipTemp) + '.txt')), "w", | ||
197 | + encoding="utf-8") as oFile: | ||
198 | + for line in listLines: | ||
199 | + oFile.write(' '.join(line) + '\n') | ||
200 | + print("Skip mention done in: %fs" % (time() - t0)) | ||
201 | + else: | ||
202 | + with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str( | ||
203 | + options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '.txt')), "w", | ||
204 | + encoding="utf-8") as oFile: | ||
205 | + for line in listLines: | ||
206 | + for token in line: | ||
207 | + oFile.write(token + ' ') | ||
208 | + oFile.write('\n') | ||
209 | + | ||
210 | + print("Files processed: " + str(filesRead)) | ||
\ No newline at end of file
tagging_Sklearn_crfsuite.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from itertools import chain | ||
5 | +from optparse import OptionParser | ||
6 | +from time import time | ||
7 | +from collections import Counter | ||
8 | + | ||
9 | +import nltk | ||
10 | +import sklearn | ||
11 | +import scipy.stats | ||
12 | +import sys | ||
13 | + | ||
14 | +from sklearn.externals import joblib | ||
15 | +from sklearn.metrics import make_scorer | ||
16 | +from sklearn.cross_validation import cross_val_score | ||
17 | +from sklearn.grid_search import RandomizedSearchCV | ||
18 | + | ||
19 | +import sklearn_crfsuite | ||
20 | +from sklearn_crfsuite import scorers | ||
21 | +from sklearn_crfsuite import metrics | ||
22 | + | ||
23 | +from nltk.corpus import stopwords | ||
24 | +from trainingTesting_Sklearn_crfsuite import word2features | ||
25 | +from trainingTesting_Sklearn_crfsuite import sent2features | ||
26 | +# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum | ||
27 | +# from trainingTesting_Sklearn_crfsuite import hasDigit | ||
28 | + | ||
29 | +# Objective | ||
30 | +# Tagging transformed file with CRF model with sklearn-crfsuite. | ||
31 | +# | ||
32 | +# Input parameters | ||
33 | +# --inputPath=PATH Path of transformed files x|y|z | ||
34 | +# --modelPath Path to CRF model | ||
35 | +# --modelName Model name | ||
36 | +# --outputPath=PATH Output path to place output files | ||
37 | +# --filterStopWords Filter stop words | ||
38 | +# --filterSymbols Filtering punctuation marks | ||
39 | + | ||
40 | +# Output | ||
41 | +# 1) Tagged files in transformed format | ||
42 | + | ||
43 | +# Examples | ||
44 | +# Sentences | ||
45 | +# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt | ||
46 | +# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt | ||
47 | + | ||
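A minimal sketch of the tagging flow this script implements; the model file name, path, and sample sentence are hypothetical, and sent2features is the function imported from the training script:

from sklearn.externals import joblib
from trainingTesting_Sklearn_crfsuite import sent2features

crf = joblib.load('models/aspectsTraining.fStopWords_False.fSymbols_True.mod')
# One transformed sentence, one word|lemma|postag entry per token (invented example)
sentence = ['FhlA|FhlA|NN', 'activates|activate|VBZ', 'transcription|transcription|NN']
y_pred = crf.predict([sent2features(sentence)])
print(y_pred[0])  # one predicted tag per token, e.g. ['TF', 'O', 'O']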
48 | +################################# | ||
49 | +# FUNCTIONS # | ||
50 | +################################# | ||
51 | +# def hasDigit(text): | ||
52 | +# has = False | ||
53 | +# if len(text) < 3: | ||
54 | +# return False | ||
55 | +# myRegex = nltk.re.compile('[0-9]') | ||
56 | +# if myRegex.search(text) != None: | ||
57 | +# has = True | ||
58 | +# return has | ||
59 | +# | ||
60 | +# | ||
61 | +# def hasNonAlphaNum(text): | ||
62 | +# has = False | ||
63 | +# if len(text) < 3: | ||
64 | +# return False | ||
65 | +# myRegex = nltk.re.compile('\W') | ||
66 | +# if myRegex.search(text) != None: | ||
67 | +# has = True | ||
68 | +# return has | ||
69 | + | ||
70 | +# IMPORTED FROM TRAINING SCRIPT | ||
71 | +# def word2features(sent, i): | ||
72 | +# # print "i: " + str(i) | ||
73 | +# # print "sent[i]" + sent[i] | ||
74 | +# listElem = sent[i].split('|') | ||
75 | +# word = listElem[0] | ||
76 | +# lemma = listElem[1] | ||
77 | +# postag = listElem[2] | ||
78 | +# | ||
79 | +# features = { | ||
80 | +# # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(), | ||
81 | +# # Suffixes | ||
82 | +# 'word[-3:]': word[-3:], | ||
83 | +# 'word[-2:]': word[-2:], | ||
84 | +# 'word[-1:]': word[-1:], | ||
85 | +# 'word.isupper()': word.isupper(), | ||
86 | +# 'word.istitle()': word.istitle(), | ||
87 | +# 'word.hasDigit()': hasDigit(word), | ||
88 | +# 'word.hasNonAlphaNum': hasNonAlphaNum(word), | ||
89 | +# # 'word.isdigit()': word.isdigit(), | ||
90 | +# 'word': word, | ||
91 | +# 'lemma': lemma, | ||
92 | +# 'lemma[-3:]': lemma[-3:], | ||
93 | +# 'lemma[-2:]': lemma[-2:], | ||
94 | +# 'lemma[-1:]': lemma[-1:], | ||
95 | +# 'postag': postag, | ||
96 | +# # Prefixes | ||
97 | +# 'postag[:2]': postag[:2], | ||
98 | +# 'postag[:1]': postag[:1], | ||
99 | +# } | ||
100 | +# if i > 0: | ||
101 | +# listElem = sent[i - 1].split('|') | ||
102 | +# word1 = listElem[0] | ||
103 | +# lemma1 = listElem[1] | ||
104 | +# postag1 = listElem[2] | ||
105 | +# features.update({ | ||
106 | +# '-1:word.lower()': word1.lower(), | ||
107 | +# '-1:word.istitle()': word1.istitle(), | ||
108 | +# '-1:word.isupper()': word1.isupper(), | ||
109 | +# '-1:word.hasDigit()': hasDigit(word1), | ||
110 | +# '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
111 | +# '-1:word': word1, | ||
112 | +# '-1:lemma': lemma1, | ||
113 | +# '-1:postag': postag1, | ||
114 | +# '-1:postag[:2]': postag1[:2], | ||
115 | +# '-1:postag[:1]': postag1[:1], | ||
116 | +# }) | ||
117 | +# # else: | ||
118 | +# # features['BOS'] = True | ||
119 | +# | ||
120 | +# if i < len(sent) - 1: | ||
121 | +# listElem = sent[i + 1].split('|') | ||
122 | +# word1 = listElem[0] | ||
123 | +# lemma1 = listElem[1] | ||
124 | +# postag1 = listElem[2] | ||
125 | +# features.update({ | ||
126 | +# '+1:word.lower()': word1.lower(), | ||
127 | +# '+1:word.istitle()': word1.istitle(), | ||
128 | +# '+1:word.isupper()': word1.isupper(), | ||
129 | +# '+1:word.hasDigit()': hasDigit(word1), | ||
130 | +# '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
131 | +# '+1:word': word1, | ||
132 | +# '+1:lemma': lemma1, | ||
133 | +# '+1:postag': postag1, | ||
134 | +# '+1:postag[:2]': postag1[:2], | ||
135 | +# '+1:postag[:1]': postag1[:1], | ||
136 | +# }) | ||
137 | +# # else: | ||
138 | +# # features['EOS'] = True | ||
139 | +# if i > 1: | ||
140 | +# listElem = sent[i - 2].split('|') | ||
141 | +# word2 = listElem[0] | ||
142 | +# lemma2 = listElem[1] | ||
143 | +# postag2 = listElem[2] | ||
144 | +# features.update({ | ||
145 | +# '-2:word.lower()': word2.lower(), | ||
146 | +# '-2:word.istitle()': word2.istitle(), | ||
147 | +# '-2:word.isupper()': word2.isupper(), | ||
148 | +# '-2:word.hasDigit()': hasDigit(word2), | ||
149 | +# '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
150 | +# '-2:word': word2, | ||
151 | +# '-2:lemma': lemma2, | ||
152 | +# '-2:postag': postag2, | ||
153 | +# '-2:postag[:2]': postag2[:2], | ||
154 | +# '-2:postag[:1]': postag2[:1], | ||
155 | +# }) | ||
156 | +# | ||
157 | +# if i < len(sent) - 2: | ||
158 | +# listElem = sent[i + 2].split('|') | ||
159 | +# word2 = listElem[0] | ||
160 | +# lemma2 = listElem[1] | ||
161 | +# postag2 = listElem[2] | ||
162 | +# features.update({ | ||
163 | +# '+2:word.lower()': word2.lower(), | ||
164 | +# '+2:word.istitle()': word2.istitle(), | ||
165 | +# '+2:word.isupper()': word2.isupper(), | ||
166 | +# '+2:word.hasDigit()': hasDigit(word2), | ||
167 | +# '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
168 | +# '+2:word': word2, | ||
169 | +# '+2:lemma': lemma2, | ||
170 | +# '+2:postag': postag2, | ||
171 | +# '+2:postag[:2]': postag2[:2], | ||
172 | +# '+2:postag[:1]': postag2[:1], | ||
173 | +# }) | ||
174 | +# | ||
175 | +# trigrams = False | ||
176 | +# if trigrams: | ||
177 | +# if i > 2: | ||
178 | +# listElem = sent[i - 3].split('|') | ||
179 | +# word3 = listElem[0] | ||
180 | +# lemma3 = listElem[1] | ||
181 | +# postag3 = listElem[2] | ||
182 | +# features.update({ | ||
183 | +# '-3:word.lower()': word3.lower(), | ||
184 | +# '-3:word.istitle()': word3.istitle(), | ||
185 | +# '-3:word.isupper()': word3.isupper(), | ||
186 | +# '-3:word.hasDigit()': hasDigit(word3), | ||
187 | +# '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
188 | +# '-3:word': word3, | ||
189 | +# '-3:lemma': lemma3, | ||
190 | +# '-3:postag': postag3, | ||
191 | +# '-3:postag[:2]': postag3[:2], | ||
192 | +# '-3:postag[:1]': postag3[:1], | ||
193 | +# }) | ||
194 | +# | ||
195 | +# if i < len(sent) - 3: | ||
196 | +# listElem = sent[i + 3].split('|') | ||
197 | +# word3 = listElem[0] | ||
198 | +# lemma3 = listElem[1] | ||
199 | +# postag3 = listElem[2] | ||
200 | +# features.update({ | ||
201 | +# '+3:word.lower()': word3.lower(), | ||
202 | +# '+3:word.istitle()': word3.istitle(), | ||
203 | +# '+3:word.isupper()': word3.isupper(), | ||
204 | +# '+3:word.hasDigit()': hasDigit(word3), | ||
205 | +# '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
206 | +# '+3:word': word3, | ||
207 | +# '+3:lemma': lemma3, | ||
208 | +# '+3:postag': postag3, | ||
209 | +# '+3:postag[:2]': postag3[:2], | ||
210 | +# '+3:postag[:1]': postag3[:1], | ||
211 | +# }) | ||
212 | +# | ||
213 | +# return features | ||
214 | + | ||
215 | + | ||
216 | +# def sent2features(sent): | ||
217 | +# return [word2features(sent, i) for i in range(len(sent))] | ||
218 | + | ||
219 | + | ||
220 | +__author__ = 'CMendezC' | ||
221 | + | ||
222 | +########################################## | ||
223 | +# MAIN PROGRAM # | ||
224 | +########################################## | ||
225 | + | ||
226 | +if __name__ == "__main__": | ||
227 | + # Defining parameters | ||
228 | + parser = OptionParser() | ||
229 | + parser.add_option("--inputPath", dest="inputPath", | ||
230 | + help="Path of training data set", metavar="PATH") | ||
231 | + parser.add_option("--outputPath", dest="outputPath", | ||
232 | + help="Output path to place output files", | ||
233 | + metavar="PATH") | ||
234 | + parser.add_option("--modelPath", dest="modelPath", | ||
235 | + help="Path to read CRF model", | ||
236 | + metavar="PATH") | ||
237 | + parser.add_option("--modelName", dest="modelName", | ||
238 | + help="Model name", metavar="TEXT") | ||
239 | + parser.add_option("--filterStopWords", default=False, | ||
240 | + action="store_true", dest="filterStopWords", | ||
241 | + help="Filtering stop words") | ||
242 | + parser.add_option("--filterSymbols", default=False, | ||
243 | + action="store_true", dest="filterSymbols", | ||
244 | + help="Filtering punctuation marks") | ||
245 | + | ||
246 | + (options, args) = parser.parse_args() | ||
247 | + if len(args) > 0: | ||
248 | + parser.error("Any parameter given.") | ||
249 | + sys.exit(1) | ||
250 | + | ||
251 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
252 | + print("Path to read input files: " + options.inputPath) | ||
253 | + print("Mode name: " + str(options.modelName)) | ||
254 | + print("Model path: " + options.modelPath) | ||
255 | + print("Path to place output files: " + options.outputPath) | ||
256 | + print("Filtering stop words: " + str(options.filterStopWords)) | ||
257 | + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
258 | + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | ||
259 | + # symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
260 | + # '}', '[', ']', '*', '%', '$', '#', '&', '°']] | ||
261 | + # symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{', | ||
262 | + # u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`'] | ||
263 | + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | ||
264 | + | ||
265 | + print('-------------------------------- PROCESSING --------------------------------') | ||
266 | + | ||
267 | + stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | ||
268 | + | ||
269 | + # Read CRF model | ||
270 | + t0 = time() | ||
271 | + print('Reading CRF model...') | ||
272 | + crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod')) | ||
273 | + print("Reading CRF model done in: %fs" % (time() - t0)) | ||
274 | + | ||
275 | + print('Processing corpus...') | ||
276 | + t0 = time() | ||
277 | + labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']) | ||
278 | + # Walk directory to read files | ||
279 | + for path, dirs, files in os.walk(options.inputPath): | ||
280 | + # For each file in dir | ||
281 | + for file in files: | ||
282 | + print(" Preprocessing file..." + str(file)) | ||
283 | + sentencesInputData = [] | ||
284 | + sentencesOutputData = [] | ||
285 | + with open(os.path.join(options.inputPath, file), "r") as iFile: | ||
286 | + lines = iFile.readlines() | ||
287 | + for line in lines: | ||
288 | + listLine = [] | ||
289 | + # line = line.decode("utf-8") | ||
290 | + for token in line.strip('\n').split(): | ||
291 | + if options.filterStopWords: | ||
292 | + listToken = token.split('|') | ||
293 | + lemma = listToken[1] | ||
294 | + # Original if lemma in stopwords.words('english'): | ||
295 | + if lemma in stopwords: | ||
296 | + continue | ||
297 | + if options.filterSymbols: | ||
298 | + listToken = token.split('|') | ||
299 | + lemma = listToken[1] | ||
300 | + if lemma in symbols: | ||
301 | + if lemma == ',': | ||
302 | + print "Coma , identificada" | ||
303 | + continue | ||
304 | + listLine.append(token) | ||
305 | + sentencesInputData.append(listLine) | ||
306 | + print " Sentences input data: " + str(len(sentencesInputData)) | ||
307 | + # print sentencesInputData[0] | ||
308 | + # print(sent2features(sentencesInputData[0])[0]) | ||
309 | + # print(sent2labels(sentencesInputData[0])) | ||
310 | + X_input = [sent2features(s) for s in sentencesInputData] | ||
311 | + print(sent2features(sentencesInputData[0])[0]) | ||
312 | + # y_test = [sent2labels(s) for s in sentencesInputData] | ||
313 | + # Predicting tags | ||
314 | + t1 = time() | ||
315 | + print " Predicting tags with model" | ||
316 | + y_pred = crf.predict(X_input) | ||
317 | + print y_pred[0] | ||
318 | + print(" Prediction done in: %fs" % (time() - t1)) | ||
319 | + # Tagging with CRF model | ||
320 | + print " Tagging file" | ||
321 | + for line, tagLine in zip(lines, y_pred): | ||
322 | + outputLine = '' | ||
323 | + idx_tagLine = 0 | ||
324 | + line = line.strip('\n') | ||
325 | + print "\nLine: " + str(line) | ||
326 | + print "CRF tagged line: " + str(tagLine) | ||
327 | + for token in line.split(): | ||
328 | + listToken = token.split('|') | ||
329 | + word = listToken[0] | ||
330 | + lemma = listToken[1] | ||
331 | + tag = listToken[2] | ||
332 | + if options.filterStopWords: | ||
333 | + if lemma in stopwords: | ||
334 | + outputLine += token + ' ' | ||
335 | + continue | ||
336 | + if options.filterSymbols: | ||
337 | + if lemma in symbols: | ||
338 | + if lemma == ',': | ||
339 | + print "Coma , identificada" | ||
340 | + outputLine += token + ' ' | ||
341 | + continue | ||
342 | + CRFtag = tagLine[idx_tagLine] | ||
343 | + if (tag not in labels) and (CRFtag != 'O'): | ||
344 | + print "*** CRF change token {} to {}".format(token, CRFtag) | ||
345 | + outputLine += word + '|' + lemma + '|' + CRFtag + ' ' | ||
346 | + else: | ||
347 | + outputLine += word + '|' + lemma + '|' + tag + ' ' | ||
348 | + idx_tagLine += 1 | ||
349 | + sentencesOutputData.append(outputLine.rstrip()) | ||
350 | + with open(os.path.join(options.outputPath, file), "w") as oFile: | ||
351 | + for line in sentencesOutputData: | ||
352 | + oFile.write(line + '\n') | ||
353 | + | ||
354 | + print("Processing corpus done in: %fs" % (time() - t0)) |
training-validation.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from itertools import chain | ||
5 | +from optparse import OptionParser | ||
6 | +from time import time | ||
7 | +from collections import Counter | ||
8 | + | ||
9 | +import nltk | ||
10 | +import sklearn | ||
11 | +import scipy.stats | ||
12 | +import sys | ||
13 | + | ||
14 | +from sklearn.externals import joblib | ||
15 | +from sklearn.metrics import make_scorer | ||
16 | +from sklearn.cross_validation import cross_val_score | ||
17 | +from sklearn.grid_search import RandomizedSearchCV | ||
18 | + | ||
19 | +import sklearn_crfsuite | ||
20 | +from sklearn_crfsuite import scorers | ||
21 | +from sklearn_crfsuite import metrics | ||
22 | + | ||
23 | +from nltk.corpus import stopwords | ||
24 | + | ||
25 | + | ||
26 | +# Objective | ||
27 | +# Training and evaluation of CRFs with sklearn-crfsuite. | ||
28 | +# | ||
29 | +# Input parameters | ||
30 | +# --inputPath=PATH Path of training and test data set | ||
31 | +# --trainingFile File with training data set | ||
32 | +# --testFile File with test data set | ||
33 | +# --outputPath=PATH Output path to place output files | ||
34 | +# --filterStopWords Filter stop words | ||
35 | +# --filterSymbols Filtering punctuation marks | ||
36 | + | ||
37 | +# Output | ||
38 | +# 1) Best model | ||
39 | + | ||
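Each token in the training and test files is expected as word|lemma|postag|label, where the fourth field is the CRF label; a minimal sketch with invented tokens of how sent2features and sent2labels, defined below, split a sentence:

sentence = ['FhlA|FhlA|NN|TF', 'activates|activate|VBZ|O', 'transcription|transcription|NN|O']
# X = sent2features(sentence)  -> one feature dict per token
# y = sent2labels(sentence)    -> ['TF', 'O', 'O']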
40 | +# Examples | ||
41 | +# Sentences | ||
42 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt | ||
43 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt | ||
44 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt | ||
45 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt | ||
46 | + | ||
47 | +# Aspects | ||
48 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt | ||
49 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt | ||
50 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt | ||
51 | +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt | ||
52 | + | ||
53 | +################################# | ||
54 | +# FUNCTIONS # | ||
55 | +################################# | ||
56 | + | ||
57 | +def wordSize(text): | ||
58 | + lWord = len(text) | ||
59 | + if lWord == 1: | ||
60 | + return '1' | ||
61 | + elif lWord == 2: | ||
62 | + return '2' | ||
63 | + elif lWord == 3: | ||
64 | + return '3' | ||
65 | + elif lWord == 4: | ||
66 | + return '4' | ||
67 | + elif lWord == 5: | ||
68 | + return '5' | ||
69 | + elif 6 <= lWord <= 10: | ||
70 | + return '6-10' | ||
71 | + elif 11 <= lWord <= 15: | ||
72 | + return '11-15' | ||
73 | + elif 16 <= lWord <= 20: | ||
74 | + return '16-20' | ||
75 | + elif 21 <= lWord <= 30: | ||
76 | + return '21-30' | ||
77 | + else: | ||
78 | + return '>30' | ||
79 | + | ||
80 | +def hasUpperLower(text): | ||
81 | + has = False | ||
82 | + if len(text) < 3: | ||
83 | + return False | ||
84 | + regexUp = nltk.re.compile('[A-Z]') | ||
85 | + regexLo = nltk.re.compile('[a-z]') | ||
86 | + if (regexUp.search(text) != None) and (regexLo.search(text) != None): | ||
87 | + has = True | ||
88 | + return has | ||
89 | + | ||
90 | +def hasDigit(text): | ||
91 | + has = False | ||
92 | + if len(text) < 3: | ||
93 | + return False | ||
94 | + myRegex = nltk.re.compile('[0-9]') | ||
95 | + if myRegex.search(text) != None: | ||
96 | + has = True | ||
97 | + return has | ||
98 | + | ||
99 | + | ||
100 | +def hasNonAlphaNum(text): | ||
101 | + has = False | ||
102 | + if len(text) < 3: | ||
103 | + return False | ||
104 | + myRegex = nltk.re.compile('\W') | ||
105 | + if myRegex.search(text) != None: | ||
106 | + has = True | ||
107 | + return has | ||
108 | + | ||
109 | +def word2features(sent, i): | ||
110 | + # print "i: " + str(i) | ||
111 | + # print "sent[i]" + sent[i] | ||
112 | + listElem = sent[i].split('|') | ||
113 | + word = listElem[0] | ||
114 | + lemma = listElem[1] | ||
115 | + postag = listElem[2] | ||
116 | + | ||
117 | + features = { | ||
118 | + # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(), | ||
119 | + # Suffixes | ||
120 | + 'word[-3:]': word[-3:], | ||
121 | + 'word[-2:]': word[-2:], | ||
122 | + 'word[-1:]': word[-1:], | ||
123 | + 'word.isupper()': word.isupper(), | ||
124 | + 'word.istitle()': word.istitle(), | ||
125 | + 'word.hasDigit()': hasDigit(word), | ||
126 | + 'word.hasNonAlphaNum': hasNonAlphaNum(word), | ||
127 | + # 'word.hasUpperLower': hasUpperLower(word), | ||
128 | + #'wordSize': wordSize(word), | ||
129 | + # 'word.isdigit()': word.isdigit(), | ||
130 | + 'word': word, | ||
131 | + 'lemma': lemma, | ||
132 | + 'lemma[-3:]': lemma[-3:], | ||
133 | + 'lemma[-2:]': lemma[-2:], | ||
134 | + 'lemma[-1:]': lemma[-1:], | ||
135 | + 'postag': postag, | ||
136 | + # Prefixes | ||
137 | + 'postag[:2]': postag[:2], | ||
138 | + 'postag[:1]': postag[:1], | ||
139 | + } | ||
140 | + if i > 0: | ||
141 | + listElem = sent[i - 1].split('|') | ||
142 | + word1 = listElem[0] | ||
143 | + lemma1 = listElem[1] | ||
144 | + postag1 = listElem[2] | ||
145 | + features.update({ | ||
146 | + '-1:word.lower()': word1.lower(), | ||
147 | + '-1:word.istitle()': word1.istitle(), | ||
148 | + '-1:word.isupper()': word1.isupper(), | ||
149 | + '-1:word.hasDigit()': hasDigit(word1), | ||
150 | + '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
151 | + # '-1:word.hasUpperLower': hasUpperLower(word1), | ||
152 | + '-1:word': word1, | ||
153 | + '-1:lemma': lemma1, | ||
154 | + '-1:postag': postag1, | ||
155 | + '-1:postag[:2]': postag1[:2], | ||
156 | + '-1:postag[:1]': postag1[:1], | ||
157 | + }) | ||
158 | + # else: | ||
159 | + # features['BOS'] = True | ||
160 | + | ||
161 | + if i < len(sent) - 1: | ||
162 | + listElem = sent[i + 1].split('|') | ||
163 | + word1 = listElem[0] | ||
164 | + lemma1 = listElem[1] | ||
165 | + postag1 = listElem[2] | ||
166 | + features.update({ | ||
167 | + '+1:word.lower()': word1.lower(), | ||
168 | + '+1:word.istitle()': word1.istitle(), | ||
169 | + '+1:word.isupper()': word1.isupper(), | ||
170 | + '+1:word.hasDigit()': hasDigit(word1), | ||
171 | + '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
172 | + # '+1:word.hasUpperLower': hasUpperLower(word1), | ||
173 | + '+1:word': word1, | ||
174 | + '+1:lemma': lemma1, | ||
175 | + '+1:postag': postag1, | ||
176 | + '+1:postag[:2]': postag1[:2], | ||
177 | + '+1:postag[:1]': postag1[:1], | ||
178 | + }) | ||
179 | + # else: | ||
180 | + # features['EOS'] = True | ||
181 | + if i > 1: | ||
182 | + listElem = sent[i - 2].split('|') | ||
183 | + word2 = listElem[0] | ||
184 | + lemma2 = listElem[1] | ||
185 | + postag2 = listElem[2] | ||
186 | + features.update({ | ||
187 | + '-2:word.lower()': word2.lower(), | ||
188 | + '-2:word.istitle()': word2.istitle(), | ||
189 | + '-2:word.isupper()': word2.isupper(), | ||
190 | + '-2:word.hasDigit()': hasDigit(word2), | ||
191 | + '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
192 | + # '-2:word.hasUpperLower': hasUpperLower(word2), | ||
193 | + '-2:word': word2, | ||
194 | + '-2:lemma': lemma2, | ||
195 | + '-2:postag': postag2, | ||
196 | + '-2:postag[:2]': postag2[:2], | ||
197 | + '-2:postag[:1]': postag2[:1], | ||
198 | + }) | ||
199 | + | ||
200 | + if i < len(sent) - 2: | ||
201 | + listElem = sent[i + 2].split('|') | ||
202 | + word2 = listElem[0] | ||
203 | + lemma2 = listElem[1] | ||
204 | + postag2 = listElem[2] | ||
205 | + features.update({ | ||
206 | + '+2:word.lower()': word2.lower(), | ||
207 | + '+2:word.istitle()': word2.istitle(), | ||
208 | + '+2:word.isupper()': word2.isupper(), | ||
209 | + '+2:word.hasDigit()': hasDigit(word2), | ||
210 | + '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
211 | + # '+2:word.hasUpperLower': hasUpperLower(word2), | ||
212 | + '+2:word': word2, | ||
213 | + '+2:lemma': lemma2, | ||
214 | + '+2:postag': postag2, | ||
215 | + '+2:postag[:2]': postag2[:2], | ||
216 | + '+2:postag[:1]': postag2[:1], | ||
217 | + }) | ||
218 | + | ||
219 | + trigrams = False | ||
220 | + if trigrams: | ||
221 | + if i > 2: | ||
222 | + listElem = sent[i - 3].split('|') | ||
223 | + word3 = listElem[0] | ||
224 | + lemma3 = listElem[1] | ||
225 | + postag3 = listElem[2] | ||
226 | + features.update({ | ||
227 | + '-3:word.lower()': word3.lower(), | ||
228 | + '-3:word.istitle()': word3.istitle(), | ||
229 | + '-3:word.isupper()': word3.isupper(), | ||
230 | + '-3:word.hasDigit()': hasDigit(word3), | ||
231 | + '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
232 | + # '-3:word.hasUpperLower': hasUpperLower(word3), | ||
233 | + '-3:word': word3, | ||
234 | + '-3:lemma': lemma3, | ||
235 | + '-3:postag': postag3, | ||
236 | + '-3:postag[:2]': postag3[:2], | ||
237 | + '-3:postag[:1]': postag3[:1], | ||
238 | + }) | ||
239 | + | ||
240 | + if i < len(sent) - 3: | ||
241 | + listElem = sent[i + 3].split('|') | ||
242 | + word3 = listElem[0] | ||
243 | + lemma3 = listElem[1] | ||
244 | + postag3 = listElem[2] | ||
245 | + features.update({ | ||
246 | + '+3:word.lower()': word3.lower(), | ||
247 | + '+3:word.istitle()': word3.istitle(), | ||
248 | + '+3:word.isupper()': word3.isupper(), | ||
249 | + '+3:word.hasDigit()': hasDigit(word3), | ||
250 | + '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
251 | + # '+3:word.hasUpperLower': hasUpperLower(word3), | ||
252 | + '+3:word': word3, | ||
253 | + '+3:lemma': lemma3, | ||
254 | + '+3:postag': postag3, | ||
255 | + '+3:postag[:2]': postag3[:2], | ||
256 | + '+3:postag[:1]': postag3[:1], | ||
257 | + }) | ||
258 | + | ||
259 | + return features | ||
260 | + | ||
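For instance, a minimal sketch of part of the feature dictionary word2features produces for a hypothetical one-token sentence (only a few keys shown):

# word2features(['FhlA|FhlA|NN'], 0) includes, among others:
# {'word': 'FhlA', 'lemma': 'FhlA', 'postag': 'NN',
#  'word[-3:]': 'hlA', 'postag[:1]': 'N', 'word.isupper()': False, ...}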
261 | + | ||
262 | +def sent2features(sent): | ||
263 | + return [word2features(sent, i) for i in range(len(sent))] | ||
264 | + | ||
265 | + | ||
266 | +def sent2labels(sent): | ||
267 | + return [elem.split('|')[3] for elem in sent] | ||
268 | + # return [label for token, postag, label in sent] | ||
269 | + | ||
270 | + | ||
271 | +def sent2tokens(sent): | ||
272 | + return [elem.split('|')[0] for elem in sent] | ||
273 | + | ||
274 | + | ||
275 | +def print_transitions(trans_features, f): | ||
276 | + for (label_from, label_to), weight in trans_features: | ||
277 | + # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight)) | ||
278 | + # f.write("label_from :" + label_from) | ||
279 | + # f.write("label_to :" + label_to) | ||
280 | + # f.write("label_weight :" + weight) | ||
281 | + # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight)) | ||
282 | + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) | ||
283 | + | ||
284 | + | ||
285 | +def print_state_features(state_features, f): | ||
286 | + for (attr, label), weight in state_features: | ||
287 | + # f.write("%0.6f %-8s %s\n" % (weight, label, attr)) | ||
288 | + # f.write(attr.encode("utf-8")) | ||
289 | + # '{:06.2f}'.format(3.141592653589793) | ||
290 | + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) | ||
291 | + | ||
292 | + | ||
293 | +__author__ = 'CMendezC' | ||
294 | + | ||
295 | +########################################## | ||
296 | +# MAIN PROGRAM # | ||
297 | +########################################## | ||
298 | + | ||
299 | +if __name__ == "__main__": | ||
300 | + # Defining parameters | ||
301 | + parser = OptionParser() | ||
302 | + parser.add_option("--inputPath", dest="inputPath", | ||
303 | + help="Path of training data set", metavar="PATH") | ||
304 | + parser.add_option("--outputPath", dest="outputPath", | ||
305 | + help="Output path to place output files", | ||
306 | + metavar="PATH") | ||
307 | + parser.add_option("--trainingFile", dest="trainingFile", | ||
308 | + help="File with training data set", metavar="FILE") | ||
309 | + parser.add_option("--testFile", dest="testFile", | ||
310 | + help="File with test data set", metavar="FILE") | ||
311 | + parser.add_option("--filterStopWords", default=False, | ||
312 | + action="store_true", dest="filterStopWords", | ||
313 | + help="Filtering stop words") | ||
314 | + parser.add_option("--filterSymbols", default=False, | ||
315 | + action="store_true", dest="filterSymbols", | ||
316 | + help="Filtering punctuation marks") | ||
317 | + | ||
318 | + (options, args) = parser.parse_args() | ||
319 | + if len(args) > 0: | ||
320 | + parser.error("Any parameter given.") | ||
321 | + sys.exit(1) | ||
322 | + | ||
323 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
324 | + print("Path of training data set: " + options.inputPath) | ||
325 | + print("File with training data set: " + str(options.trainingFile)) | ||
326 | + print("Path of test data set: " + options.inputPath) | ||
327 | + print("File with test data set: " + str(options.testFile)) | ||
328 | + print("Filtering stop words: " + str(options.filterStopWords)) | ||
329 | + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
330 | + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | ||
331 | + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | ||
332 | + | ||
333 | + print('-------------------------------- PROCESSING --------------------------------') | ||
334 | + print('Reading corpus...') | ||
335 | + t0 = time() | ||
336 | + | ||
337 | + sentencesTrainingData = [] | ||
338 | + sentencesTestData = [] | ||
339 | + | ||
340 | + stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | ||
341 | + | ||
342 | + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: | ||
343 | + # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
344 | + for line in iFile.readlines(): | ||
345 | + listLine = [] | ||
346 | + line = line.decode("utf-8") | ||
347 | + for token in line.strip('\n').split(): | ||
348 | + if options.filterStopWords: | ||
349 | + listToken = token.split('|') | ||
350 | + lemma = listToken[1] | ||
351 | + # Original: if lemma in stopwords.words('english'): | ||
352 | + # trainingTesting_Sklearn_crfsuite.py:269: | ||
353 | + # UnicodeWarning: Unicode equal comparison failed to | ||
354 | + # convert both arguments to Unicode - | ||
355 | + # interpreting them as being unequal | ||
356 | + if lemma in stopwords: | ||
357 | + continue | ||
358 | + if options.filterSymbols: | ||
359 | + listToken = token.split('|') | ||
360 | + lemma = listToken[1] | ||
361 | + if lemma in symbols: | ||
362 | + # if lemma == ',': | ||
363 | + # print "Coma , identificada" | ||
364 | + continue | ||
365 | + listLine.append(token) | ||
366 | + sentencesTrainingData.append(listLine) | ||
367 | + print " Sentences training data: " + str(len(sentencesTrainingData)) | ||
368 | + # print sentencesTrainingData[0] | ||
369 | + | ||
370 | + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: | ||
371 | + # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
372 | + for line in iFile.readlines(): | ||
373 | + listLine = [] | ||
374 | + line = line.decode("utf-8") | ||
375 | + for token in line.strip('\n').split(): | ||
376 | + if options.filterStopWords: | ||
377 | + listToken = token.split('|') | ||
378 | + lemma = listToken[1] | ||
379 | + # Original if lemma in stopwords.words('english'): | ||
380 | + if lemma in stopwords: | ||
381 | + continue | ||
382 | + if options.filterSymbols: | ||
383 | + listToken = token.split('|') | ||
384 | + lemma = listToken[1] | ||
385 | + if lemma in symbols: | ||
386 | + # if lemma == ',': | ||
387 | + # print "Coma , identificada" | ||
388 | + continue | ||
389 | + listLine.append(token) | ||
390 | + sentencesTestData.append(listLine) | ||
391 | + print " Sentences test data: " + str(len(sentencesTestData)) | ||
392 | + # print sentencesTestData[0] | ||
393 | + | ||
394 | + print("Reading corpus done in: %fs" % (time() - t0)) | ||
395 | + | ||
396 | + print(sent2features(sentencesTrainingData[0])[0]) | ||
397 | + print(sent2features(sentencesTestData[0])[0]) | ||
398 | + # print(sent2labels(sentencesTrainingData[0])) | ||
399 | + # print(sent2labels(sentencesTestData[0])) | ||
400 | + t0 = time() | ||
401 | + | ||
402 | + X_train = [sent2features(s) for s in sentencesTrainingData] | ||
403 | + y_train = [sent2labels(s) for s in sentencesTrainingData] | ||
404 | + | ||
405 | + X_test = [sent2features(s) for s in sentencesTestData] | ||
406 | + # print X_test | ||
407 | + y_test = [sent2labels(s) for s in sentencesTestData] | ||
408 | + | ||
409 | + # Fixed parameters | ||
410 | + # crf = sklearn_crfsuite.CRF( | ||
411 | + # algorithm='lbfgs', | ||
412 | + # c1=0.1, | ||
413 | + # c2=0.1, | ||
414 | + # max_iterations=100, | ||
415 | + # all_possible_transitions=True | ||
416 | + # ) | ||
417 | + | ||
418 | + # Hyperparameter Optimization | ||
419 | + crf = sklearn_crfsuite.CRF( | ||
420 | + algorithm='lbfgs', | ||
421 | + max_iterations=100, | ||
422 | + all_possible_transitions=True | ||
423 | + ) | ||
424 | + params_space = { | ||
425 | + 'c1': scipy.stats.expon(scale=0.5), | ||
426 | + 'c2': scipy.stats.expon(scale=0.05), | ||
427 | + } | ||
428 | + | ||
429 | + # Original: labels = list(crf.classes_) | ||
430 | + # Original: labels.remove('O') | ||
431 | + labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']) | ||
432 | + | ||
433 | + # use the same metric for evaluation | ||
434 | + f1_scorer = make_scorer(metrics.flat_f1_score, | ||
435 | + average='weighted', labels=labels) | ||
436 | + | ||
437 | + # search | ||
438 | + rs = RandomizedSearchCV(crf, params_space, | ||
439 | + cv=3, | ||
440 | + verbose=3, | ||
441 | + n_jobs=-1, | ||
442 | + n_iter=20, | ||
443 | + # n_iter=50, | ||
444 | + scoring=f1_scorer) | ||
445 | + rs.fit(X_train, y_train) | ||
446 | + | ||
447 | + # Fixed parameters | ||
448 | + # crf.fit(X_train, y_train) | ||
449 | + | ||
450 | + # Best hyperparameters | ||
451 | + # crf = rs.best_estimator_ | ||
452 | + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
453 | + options.filterSymbols) + '.txt') | ||
454 | + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: | ||
455 | + oFile.write("********** TRAINING AND TESTING REPORT **********\n") | ||
456 | + oFile.write("Training file: " + options.trainingFile + '\n') | ||
457 | + oFile.write('\n') | ||
458 | + oFile.write('best params:' + str(rs.best_params_) + '\n') | ||
459 | + oFile.write('best CV score:' + str(rs.best_score_) + '\n') | ||
460 | + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000)) | ||
461 | + | ||
462 | + print("Training done in: %fs" % (time() - t0)) | ||
463 | + t0 = time() | ||
464 | + | ||
465 | + # Update best crf | ||
466 | + crf = rs.best_estimator_ | ||
467 | + | ||
468 | + # Saving model | ||
469 | + print(" Saving training model...") | ||
470 | + t1 = time() | ||
471 | + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
472 | + options.filterSymbols) + '.mod') | ||
473 | + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) | ||
474 | + print(" Saving training model done in: %fs" % (time() - t1)) | ||
475 | + | ||
476 | + # Evaluation against test data | ||
477 | + y_pred = crf.predict(X_test) | ||
478 | + print("*********************************") | ||
479 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
480 | + options.filterSymbols) + '.txt') | ||
481 | + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile: | ||
482 | + for y in y_pred: | ||
483 | + oFile.write(str(y) + '\n') | ||
484 | + | ||
485 | + print("*********************************") | ||
486 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | ||
487 | + options.filterSymbols) + '.txt') | ||
488 | + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile: | ||
489 | + for y in y_test: | ||
490 | + oFile.write(str(y) + '\n') | ||
491 | + | ||
492 | + print("Prediction done in: %fs" % (time() - t0)) | ||
493 | + | ||
494 | + # labels = list(crf.classes_) | ||
495 | + # labels.remove('O') | ||
496 | + | ||
497 | + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile: | ||
498 | + oFile.write('\n') | ||
499 | + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))) | ||
500 | + oFile.write('\n') | ||
501 | + # labels = list(crf.classes_) | ||
502 | + sorted_labels = sorted( | ||
503 | + labels, | ||
504 | + key=lambda name: (name[1:], name[0]) | ||
505 | + ) | ||
506 | + oFile.write(metrics.flat_classification_report( | ||
507 | + y_test, y_pred, labels=sorted_labels, digits=3 | ||
508 | + )) | ||
509 | + oFile.write('\n') | ||
510 | + | ||
511 | + oFile.write("\nTop likely transitions:\n") | ||
512 | + print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | ||
513 | + oFile.write('\n') | ||
514 | + | ||
515 | + oFile.write("\nTop unlikely transitions:\n") | ||
516 | + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | ||
517 | + oFile.write('\n') | ||
518 | + | ||
519 | + oFile.write("\nTop positive:\n") | ||
520 | + print_state_features(Counter(crf.state_features_).most_common(200), oFile) | ||
521 | + oFile.write('\n') | ||
522 | + | ||
523 | + oFile.write("\nTop negative:\n") | ||
524 | + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile) | ||
525 | + oFile.write('\n') |