# -*- coding: UTF-8 -*-

import os
from optparse import OptionParser
from time import time

import joblib
from pandas import DataFrame as DF

# sklearn_crfsuite defines the CRF class stored in the pickled model
import sklearn_crfsuite

from nltk.corpus import stopwords

import training_validation_v14 as training

#-------------------------------------------------------------------------------
# Objective
# Tag transformed files with a trained CRF model using sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH      Path of transformed files x|y|z
# --modelPath           Path to CRF model
# --modelName           Model name
# --outputPath=PATH     Output path to place output files
# --filterStopWords     Filter out stop words
# --filterSymbols       Filter out punctuation marks

# Output
# 1) Tagged files in transformed format
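#    Example of a tagged line (illustrative values; the tagging loop below
#    wraps entity spans as <Tag> ... <Tag/> and pairs each line with the
#    comma-separated set of labels found in it):
#    growth of cells in <Med> LB broth <Med/> at <Temp> 37 C <Temp/>  Med, Temp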

# Examples
# python3 tagging.py
# --inputPath           /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --modelName           model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --modelPath           /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
# --outputPath          /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --filterSymbols
# Note: the .mod extension is appended to --modelName when the model is loaded.

# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
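
# Expected input format (inferred from the token parsing below): one sentence
# per line, whitespace-separated tokens of the form x|y|z, where x is the
# surface word and y is the lemma; the third field is presumably consumed by
# the feature extraction in training_validation_v14.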

__author__ = 'egaytan'

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath",       dest="inputPath",       help="Path of training data set",         metavar="PATH")
    parser.add_option("--outputPath",      dest="outputPath",      help="Output path to place output files", metavar="PATH")
    parser.add_option("--modelPath",       dest="modelPath",       help="Path to read CRF model",            metavar="PATH")
    parser.add_option("--modelName",       dest="modelName",       help="Model name",                        metavar="TEXT")
    parser.add_option("--variant",         dest="variant",         help="Report file",                       metavar="FILE")
    parser.add_option("--S1",              dest="S1",              help="General features",                  action="store_true", default=False)
    parser.add_option("--S2",              dest="S2",              help="Inner/Complete word features",      action="store_true", default=False)
    parser.add_option("--S3",              dest="S3",              help="Extended context features",         action="store_true", default=False)
    parser.add_option("--S4",              dest="S4",              help="Semantic features",                 action="store_true", default=False)   
    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words",              action="store_true", default=False)
    parser.add_option("--filterSymbols",   dest="filterSymbols",   help="Filtering punctuation marks",       action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments: " + str(args))

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Mode name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
    print("Run variant: " + str(options.variant))
    
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']

    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # English stop-word list from NLTK (requires the 'stopwords' corpus)
    stopWords = set(stopwords.words('english'))

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')    
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    # Reading sentences
    print('Processing corpus...')
    t0 = time()
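    # Entity classes the CRF can assign; non-entity tokens carry the 'O' tag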
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(path, file), "r") as iFile:
                lines = iFile.readlines()
                for line in lines:
                    listLine = []
                    for token in line.strip('\n').split():
                        if options.filterStopWords or options.filterSymbols:
                            lemma = token.split('|')[1]
                            # Skip stop words, matched by lemma
                            if options.filterStopWords and lemma in stopWords:
                                continue
                            # Skip punctuation marks, matched by lemma
                            if options.filterSymbols and lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
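                # Feature extraction must match the training-time features,
                # hence sent2features shared via training_validation_v14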
                X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
                print("Sentences input data: " + str(len(sentencesInputData)))
                
                
                # Predicting tags
                t1 = time()               
                print("Predicting tags with model")
                y_pred = crf.predict(X_input)                
                print("Prediction done in: %fs" % (time() - t1))
                
                
                # Rebuild each sentence with inline entity tags
                print("Tagging file")
                for tokens, tagLine in zip(sentencesInputData, y_pred):
                    Ltags = set(labels).intersection(set(tagLine))
                    outputLine = ''
                    # tb holds the label of the currently open tag ('O' = none)
                    tb = 'O'
                    if len(tagLine) == 1:
                        if tagLine[0] in labels:
                            start = '<' + tagLine[0] + '> '
                            end   = '<' + tagLine[0] + '/>'
                            word  = tokens[0].split('|')[0] + ' '
                            outputLine = start + word + end
                        else:
                            outputLine = tokens[0].split('|')[0]
                        sentencesOutputData.append([outputLine, ', '.join(Ltags)])
                        continue
                        
                    for word, tag in zip(tokens, tagLine):
                        surface = word.split('|')[0]
                        # Open a tag when a labeled span starts
                        if tag in labels and tb == 'O':
                            outputLine += '<' + tag + '> ' + surface + ' '
                            tb = tag
                            continue
                        # Close the open span when its label ends or changes
                        if tb in labels and tag != tb:
                            outputLine += '<' + tb + '/> '
                            tb = 'O'
                            if tag in labels:
                                # A new span starts right after the old one
                                outputLine += '<' + tag + '> '
                                tb = tag
                        # Plain word, or word inside the open span
                        outputLine += surface + ' '
                    # Close a span that runs to the end of the sentence
                    if tb in labels:
                        outputLine += '<' + tb + '/>'
                    sentencesOutputData.append([outputLine, ', '.join(Ltags)])
                    
            print(DF(sentencesOutputData))

            # Save tags
            with open(os.path.join(options.outputPath, file), "w") as oFile:
                for outputLine, tags in sentencesOutputData:
                    oFile.write(outputLine + '\t' + tags + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))