# training_validation_v14.0.1.py

# -*- coding: UTF-8 -*-

import os                           # Access operative sistem
#from itertools import chain        # No se ocupa
from optparse import OptionParser   # Number of transitions
from time import time               # Return the time in seconds since the epoch as a float
from collections import Counter     # Dict subclass for counting hashable objects
#import re                          # No se ocupa

import nltk                         # Natural Language Toolkit platform to work with human language data
import sklearn                      # Free software machine learning
import scipy.stats                  # library of statistical functions
import sys                          # to exit from Python.

from sklearn.externals import joblib                    # provide lightweight pipelining
from sklearn.metrics import make_scorer                 # Make a scorer from a performance metric or loss function
from sklearn.cross_validation import cross_val_score    # Evaluate a score by cross-validation
from sklearn.grid_search import RandomizedSearchCV      # Randomized search on hyper parameters

import sklearn_crfsuite                                 # Thin CRFsuite
from sklearn_crfsuite import scorers                    # Added scorers.sequence_accuracy
from sklearn_crfsuite import metrics                    # Add flat recall score to metrics

from pandas import DataFrame as DF                      # Contruct dataframe object
from nltk.corpus import stopwords                       # To exclude top words

#-------------------------------------------------------------------------------
# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# (1)   --inputPath                   Path of training and test data set
# (2)   --outputPath                  Output path to place output files
# (3)   --trainingFile                File with training data set
# (4)   --testFile                    File with test data set
# (5)   --reportName                  Number of run
# (6)   --variant                     Variant (10-13) of the S1 prefix features
# (7)   --nrules                      Number of crf transitions
# (8)   --S1                          Inner word features set
# (9)   --S2                          Complete word features
# (10)  --S3                          Extended context features
# (11)  --S4                          Semantic features
# (12)  --excludeStopWords
# (13)  --excludeSymbols

# Output
# 1) Best model
# 2) Report

# Examples
# python3 training_validation_v14.0.1.py
# --inputPath     /home/egaytan/automatic-extraction-growth-conditions/CRF/input/
# --trainingFile  training-data-set-70-NER.txt
# --testFile      test-data-set-30-NER.txt
# --outputPath    /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --nrules        500
# --reportName    Run1
# --variant       11
# --S1
# --S2
# --S3
# --S4

# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 >                 ../../outputs/enero/Run1_v10.txt

##################################################################
#                             FEATURES                           #
##################################################################

#================== COMPLETE WORD FEATURES ======================#

def isGreek(word):
    """Tell whether ``word`` is exactly one Greek letter.

    The complete token is compared against the upper- and lower-case Greek
    alphabet (final sigma included), so a longer token never matches even
    when it contains Greek letters.
    """
    upper_case = ('Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω')
    lower_case = ('α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω')
    return word in upper_case + lower_case

#================ INNER OF THE WORD FEATURES ====================#

def hGreek(word):
    """Tell whether ``word`` contains at least one Greek letter.

    Each character is looked up in the upper- and lower-case Greek
    alphabet; an empty word yields False.
    """
    greek_letters = (
        'Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
        'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω',
    )
    for char in word:
        ## One hit is enough
        if char in greek_letters:
            return True
    return False

def hNumber(word):
    """Tell whether ``word`` contains at least one digit character."""
    return any(char.isdigit() for char in word)

def hUpper(word):
    """Tell whether ``word`` contains at least one upper-case character."""
    return any(char.isupper() for char in word)

def hLower(word):
    """Tell whether ``word`` contains at least one lower-case character."""
    return any(char.islower() for char in word)

#============================FEATURES===========================#

def word2features(sent, i, S1, S2, S3, S4, v):
    """Build the CRF feature dictionary for the token at position ``i``.

    Every element of ``sent`` is a pipe-delimited annotation string whose
    first columns are read as word, lemma, POS tag and NER tag (columns
    0-3). The lemma and POS tag of the token and of its direct neighbours
    are always included; the remaining feature groups are switched on by
    the ``S*`` flags.

    Parameters:
        sent: list of annotated token strings forming one sentence.
        i:    index of the token to describe.
        S1:   add inner-word features (hUpper, hLower, hGreek, 'symb') and
              the prefix features selected by ``v``.
        S2:   add complete-word features (the word itself, case / Greek /
              digit checks, previous and next word).
        S3:   add lemma and POS tag of the tokens two positions away.
        S4:   add the NER tag of the tokens one and two positions away.
              Column 3 is only read when this flag is set.
        v:    S1 variant (10, 11, 12 or 13); an int or a numeric string.
              Any other value adds no prefix features.

    Returns:
        dict mapping feature names to values.
    """
    def columns(offset):
        ## Annotation columns of the token `offset` positions away from i,
        ## or None when that position falls outside the sentence
        j = i + offset
        if 0 <= j < len(sent):
            return sent[j].split('|')
        return None

    def add_prefixes(name, text):
        ## First character always; first two only when the text is longer
        features[name + '[:1]'] = text[:1]
        if len(text) > 1:
            features[name + '[:2]'] = text[:2]

    ## The command line delivers the variant as text: without this
    ## normalisation '10' would never equal 10 and the variant features
    ## would silently be skipped
    try:
        v = int(v)
    except (TypeError, ValueError):
        pass

    current = sent[i].split('|')
    word, lemma, postag = current[0], current[1], current[2]
    prev1, next1 = columns(-1), columns(1)

    #=========================== G =============================#
    ## General features: lemma and POS tag of the token and its neighbours
    features = {
        'lemma': lemma,
        'postag': postag
        }
    if prev1 is not None:
        features['-1:lemma'] = prev1[1]
        features['-1:postag'] = prev1[2]
    if next1 is not None:
        features['+1:lemma'] = next1[1]
        features['+1:postag'] = next1[2]

    #=========================== S1 =============================#
    ## Inner word features
    if S1:
        features.update({
            'hUpper' :  hUpper(word),
            'hLower' :  hLower(word),
            'hGreek' :  hGreek(word),
            ## NOTE: despite the key name, the value is True for purely
            ## alphanumeric words
            'symb'   :  word.isalnum()
        })
        #========== Variants of inner words features ============#
        if v == 10:
            add_prefixes('word', word)
        elif v == 11:
            ## Written out to keep the original insertion order of the keys
            features['lemma[:1]'] = lemma[:1]
            features['postag[:1]'] = postag[:1]
            if len(lemma) > 1:
                features['lemma[:2]'] = lemma[:2]
            if len(postag) > 1:
                features['postag[:2]'] = postag[:2]
        elif v == 12:
            add_prefixes('word', word)
            add_prefixes('postag', postag)
        elif v == 13:
            add_prefixes('lemma', lemma)

    #=========================== S2 =============================#
    ## Complete word features
    if S2:
        features.update({
            'word'      :  word,
            'isUpper'   :  word.isupper(),
            'isLower'   :  word.islower(),
            'isGreek'   :  isGreek(word),
            'isNumber'  :  word.isdigit()
        })
        if prev1 is not None:
            features['-1:word'] = prev1[0]
        if next1 is not None:
            features['+1:word'] = next1[0]

    ## Tokens two positions away are only needed by S3 and S4
    prev2 = next2 = None
    if S3 or S4:
        prev2, next2 = columns(-2), columns(2)

    #=========================== S3 =============================#
    ## Extended context features
    if S3:
        if prev2 is not None:
            features['-2:lemma'] = prev2[1]
            features['-2:postag'] = prev2[2]
        if next2 is not None:
            features['+2:lemma'] = next2[1]
            features['+2:postag'] = next2[2]

    #=========================== S4 =============================#
    ## NER features of the surrounding tokens
    if S4:
        if prev1 is not None:
            ## Fix: this used to store the NER tag of the current token
            features['-1:ner'] = prev1[3]
        if next1 is not None:
            features['+1:ner'] = next1[3]
        if prev2 is not None:
            features['-2:ner'] = prev2[3]
        if next2 is not None:
            features['+2:ner'] = next2[3]

    return features

def sent2features(sent, S1, S2, S3, S4, v):
    """Return one feature dictionary per token of ``sent``, in token order.

    The flags ``S1``-``S4`` and the variant ``v`` are handed unchanged to
    word2features for every position of the sentence.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position, S1, S2, S3, S4, v))
    return per_token

def sent2labels(sent):
    """Return the label of every token: the text after its last '|'.

    A token without any '|' is returned unchanged.
    """
    labels = []
    for annotated in sent:
        _, _, tag = annotated.rpartition('|')
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the surface form (first column) of every token of ``sent``.

    Tokens may be pipe-delimited annotation strings, the format the other
    helpers of this file consume, or (token, postag, label) sequences, the
    only form the previous implementation could unpack.
    """
    return [
        elem.split('|')[0] if isinstance(elem, str) else elem[0]
        for elem in sent
    ]

def print_transitions(trans_features, f):
    """Write one "<from> -> <to> <weight>" line per transition to ``f``.

    ``trans_features`` yields ((label_from, label_to), weight) pairs; the
    labels are padded to 6 and 7 characters and the weight is printed with
    six decimals.
    """
    row = "{:6} -> {:7} {:0.6f}\n"
    f.writelines(
        row.format(source, target, score)
        for (source, target), score in trans_features
    )

def print_state_features(state_features, f):
    """Write one "<weight> <label> <attribute>" line per state feature.

    ``state_features`` yields ((attribute, label), weight) pairs. The
    attribute is written as text: encoding it to bytes first (a Python 2
    habit) makes Python 3 print the ``b'...'`` representation with escaped
    non-ASCII characters. ``f`` must therefore be a text stream able to
    encode the attributes.
    """
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr))


__author__ = 'egaytan'

##################################################################
#                            MAIN PROGRAM                        #
##################################################################

def _read_sentences(path, excluded_lemmas):
    """Read a data set with one sentence per line.

    Each line is split on whitespace into annotated tokens. A token is
    dropped when its lemma (second pipe-delimited column) belongs to
    ``excluded_lemmas``. Lines left without tokens are kept as empty lists.
    """
    sentences = []
    with open(path, "r", encoding="utf-8") as iFile:
        for line in iFile:
            tokens = line.strip('\n').split()
            if excluded_lemmas:
                tokens = [token for token in tokens
                          if token.split('|')[1] not in excluded_lemmas]
            sentences.append(tokens)
    return sentences


def _sample_features(sentences, S1, S2, S3, S4, variant):
    """Return the features of one token of the first sentence, for display.

    The third token is used when available, otherwise the last one; None is
    returned when there is no first sentence or it has no tokens.
    """
    if not sentences or not sentences[0]:
        return None
    first = sent2features(sentences[0], S1, S2, S3, S4, variant)
    return first[min(2, len(first) - 1)]


if __name__ == "__main__":
    ## Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath",        dest="inputPath",       help="Path of training data set",           metavar="PATH")
    parser.add_option("--outputPath",       dest="outputPath",      help="Output path to place output files",   metavar="PATH")
    parser.add_option("--trainingFile",     dest="trainingFile",    help="File with training data set",         metavar="FILE")
    parser.add_option("--testFile",         dest="testFile",        help="File with test data set",             metavar="FILE")
    parser.add_option("--reportName",       dest="reportName",      help="Report number run",                   metavar="NAME")
    parser.add_option("--variant",          dest="variant",         help="Variant (10-13) of the S1 prefix features", metavar="INT")
    parser.add_option("--S1",               dest="S1",              help="Inner word features",                 action="store_true", default=False)
    parser.add_option("--S2",               dest="S2",              help="Complete word features",              action="store_true", default=False)
    parser.add_option("--S3",               dest="S3",              help="Extended context features",           action="store_true", default=False)
    parser.add_option("--S4",               dest="S4",              help="Semantic features",                   action="store_true", default=False)
    parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words",                  action="store_true", default=False)
    parser.add_option("--excludeSymbols",   dest="excludeSymbols",  help="Exclude punctuation marks",           action="store_true", default=False)
    parser.add_option("--nrules",           dest="nrules",          help="Number of crf rules on report",       type="int")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        ## parser.error() prints the message and terminates the program itself
        parser.error("Any parameter given.")

    ## Fail before the long training run instead of after it
    required = ("inputPath", "outputPath", "trainingFile", "testFile", "variant", "nrules")
    missing = ["--" + name for name in required if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): " + ", ".join(missing))
    if options.nrules < 0:
        parser.error("--nrules must not be negative")

    ## The same integer variant is used for the preview and for training;
    ## passing the raw option string disabled every variant feature
    try:
        variant = int(options.variant)
    except ValueError:
        parser.error("--variant must be an integer, got: " + str(options.variant))

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of test and training data sets: " + options.inputPath)
    print("Path of outputs: "  + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("File with test data set: " + str(options.testFile))
    print("reportName: " + str(options.reportName))
    print("Exclude stop words: " + str(options.excludeStopWords))
    print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
    print("Run variant: " + str(options.variant))
    print("Number of rules on report file: " + str(options.nrules))

    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Exclude symbols: " + str(options.excludeSymbols))

    ## Output folders are created up front so a missing directory cannot
    ## discard the result of the hyper-parameter search
    reportsDir = os.path.join(options.outputPath, "reports")
    modelsDir = os.path.join(options.outputPath, "models")
    os.makedirs(reportsDir, exist_ok=True)
    os.makedirs(modelsDir, exist_ok=True)

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    ## Lemmas to drop while reading; a set gives constant-time lookups and
    ## the stop-word corpus is only loaded when it is actually requested
    excludedLemmas = set()
    if options.excludeStopWords:
        excludedLemmas.update(stopwords.words('english'))
    if options.excludeSymbols:
        excludedLemmas.update(symbols)

    sentencesTrainingData = _read_sentences(os.path.join(options.inputPath, options.trainingFile), excludedLemmas)
    print("   Sentences training data: " + str(len(sentencesTrainingData)))

    sentencesTestData = _read_sentences(os.path.join(options.inputPath, options.testFile), excludedLemmas)
    print("   Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    print('-------------------------------- FEATURES --------------------------------')

    ## Show the features of one sample token of each data set
    Dtraning = _sample_features(sentencesTrainingData, options.S1, options.S2, options.S3, options.S4, variant)
    Dtest = _sample_features(sentencesTestData, options.S1, options.S2, options.S3, options.S4, variant)
    print('--------------------------Features Training ---------------------------')
    if Dtraning is None:
        print('   (no token available for preview)')
    else:
        print(DF(list(Dtraning.items())))
    print('--------------------------- FeaturesTest -----------------------------')
    if Dtest is None:
        print('   (no token available for preview)')
    else:
        print(DF(list(Dtest.items())))

    t0 = time()

    X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, variant) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, variant) for s in sentencesTestData]
    y_test = [sent2labels(s) for s in sentencesTestData]

    ## Alternative with fixed parameters (kept for reference):
    ##   crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
    ##                              max_iterations=100,
    ##                              all_possible_transitions=True)
    ##   crf.fit(X_train, y_train)

    # Hyperparameter Optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    ## Labels that are scored; 'O' is deliberately not part of the list
    ## (originally derived from crf.classes_ with 'O' removed)
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=5,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=100,
                            scoring=f1_scorer,
                            random_state=42)

    rs.fit(X_train, y_train)

    nameReport = str(options.reportName) + '_v'+ str(options.variant) + '.txt'
    reportPath = os.path.join(reportsDir, "report_" + nameReport)
    ## The search summary is written first so it survives a failure in the
    ## later steps; explicit UTF-8 because features may hold non-ASCII text
    with open(reportPath, mode="w", encoding="utf-8") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print("     Saving training model...")
    t1 = time()
    nameModel = 'model_' + str(options.reportName) + '_v'+ str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) +'.mod'
    joblib.dump(crf, os.path.join(modelsDir, nameModel))
    print("        Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    print("Prediction done in: %fs" % (time() - t0))

    ## Rank every transition / state feature once, from the highest to the
    ## lowest weight, and slice both ends of the ranking
    nrules = options.nrules
    rankedTransitions = Counter(crf.transition_features_).most_common()
    rankedStates = Counter(crf.state_features_).most_common()

    with open(reportPath, mode="a", encoding="utf-8") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        ## Row order of the classification report (ordering kept as it was)
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )
        oFile.write(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(rankedTransitions[:nrules], oFile)
        oFile.write('\n')

        ## A plain [-0:] slice would return the whole ranking, hence the guard
        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(rankedTransitions[-nrules:] if nrules > 0 else [], oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(rankedStates[:nrules], oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(rankedStates[-nrules:] if nrules > 0 else [], oFile)
        oFile.write('\n')