# tagging.py (14.3 KB)
# -*- coding: UTF-8 -*-

import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords

import training_validation_v14 as training

#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH          Path of transformed files x|y|z
# --outputPath              Output path to place output files
# --outputFileI             Output tagged file I
# --outputFileII            Output tagged file II
# --modelPath               Path to CRF model
# --modelName               Model name
# --infoPath                Path of GSE-GSM index file	
# --infoFile	            GSE-GSM index file
# --variant	                Part of S2 variant
# --S1                      Inner word features set
# --S2                      Complete word features
# --S3                      Extended context features
# --S4                      Semantic features
# --filterStopWords        Filtering stop words
# --filterSymbols           Filtering punctuation marks

# Output
# 1) Tagged files in transformed format

# Examples
# --inputPath		/home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath	    /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI     annot-input_bg_outputI.txt
# --outputFileII    annot-input_bg_outputII.txt
# --modelPath		/home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName		model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath		/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile		bg_sentences_midx.txt
# --variant		    13 

#python3 tagging.py  --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/  --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/  --outputFileI  annot-input_bg_outputI.txt  --outputFileII  annot-input_bg_outputII.txt  --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models  --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10  --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping  --infoFile bg_sentences_midx.txt  --variant 13   --S4   --S1 > ../../reports/output_tagging_report.txt
#python3 predict-annot/bin/tagging/tagging.py  --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/  --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/  --outputFileI  annot-input_bg_outputI_v4.txt  --outputFileII  annot-input_bg_outputII_v4  --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models  --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10  --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping  --infoFile bg_sentences_midx_v4.txt  --variant 13   --S4   --S1 > predict-annot/reports/output_tagging_report_v4.txt

__author__ = 'egaytan'

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # -------------------------------------------------------------------------
    # Tag transformed corpus files (one token per "word|lemma|POS" triple) with
    # a trained sklearn-crfsuite CRF model.  Two outputs are produced per run:
    #   I  : each sentence with inline <Tag> ... </Tag/> markup
    #   II : one extracted entity per line (GSE-GSM index, surface text, tag)
    # -------------------------------------------------------------------------

    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath",       dest="inputPath",       help="Path of training data set",         metavar="PATH")
    parser.add_option("--outputPath",      dest="outputPath",      help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI",     dest="outFileI",        help="Output tagged file I",              metavar="FILE")
    parser.add_option("--outputFileII",    dest="outFileII",       help="Output tagged file II",             metavar="FILE")
    parser.add_option("--modelPath",       dest="modelPath",       help="Path to read CRF model",            metavar="PATH")
    parser.add_option("--modelName",       dest="modelName",       help="Model name",                        metavar="TEXT")
    parser.add_option("--infoPath",        dest="infoPath",        help="Path of GSE-GSM index file",        metavar="PATH")
    parser.add_option("--infoFile",        dest="idx",             help="GSE-GSM index file",                metavar="FILE")
    parser.add_option("--variant",         dest="variant",         help="Run variant",                       metavar="FILE")
    parser.add_option("--S1",              dest="S1",              help="General features",                  action="store_true", default=False)
    parser.add_option("--S2",              dest="S2",              help="Inner/Complete word features",      action="store_true", default=False)
    parser.add_option("--S3",              dest="S3",              help="Extended context features",         action="store_true", default=False)
    parser.add_option("--S4",              dest="S4",              help="Semantic features",                 action="store_true", default=False)
    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words",              action="store_true", default=False)
    parser.add_option("--filterSymbols",   dest="filterSymbols",   help="Filtering punctuation marks",       action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints to stderr and exits with status 2, so no
        # explicit sys.exit() is required after it.
        parser.error("Unexpected positional arguments: " + str(args))

    print('-------------------------------- PARAMETERS --------------------------------')

    print("--inputPath          Path of training data set        : " + str(options.inputPath        ))
    print("--outputPath         Output path to place output files: " + str(options.outputPath       ))
    print("--outputFileI        Output tagged file I             : " + str(options.outFileI         ))
    print("--outputFileII       Output tagged file II            : " + str(options.outFileII        ))
    print("--modelPath          Path to read CRF model           : " + str(options.modelPath        ))
    print("--modelName          Model name                       : " + str(options.modelName        ))
    print("--infoPath           Path of GSE-GSM index file       : " + str(options.infoPath         ))
    print("--infoFile	        GSE-GSM index file               : " + str(options.idx              ))
    print("--variant	        Run variant                      : " + str(options.variant          ))
    print("--S1                 General features                 : " + str(options.S1               ))
    print("--S2                 Inner/Complete word features     : " + str(options.S2               ))
    print("--S3                 Extended context features        : " + str(options.S3               ))
    print("--S4                 Semantic features                : " + str(options.S4               ))
    # Fixed: the echoed flag name now matches the actual option (--filterStopWords).
    print("--filterStopWords    Filtering stop words             : " + str(options.filterStopWords  ))
    print("--filterSymbols      Filtering punctuation marks      : " + str(options.filterSymbols    ))

    # Punctuation tokens optionally dropped before feature extraction.
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']

    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # Use a set for O(1) membership tests, and a distinct name so the imported
    # nltk `stopwords` corpus module is not shadowed.
    stop_words = set(stopwords.words('english'))

    # Read the GSE-GSM index: one entry per sentence, aligned by line number.
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    # Reading sentences
    print('Processing corpus...')
    t0 = time()
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData  = []
            sentencesOutputDataI = []
            sentencesOutputDataII = []
            # Join with the walked `path` (not the root inputPath) so files in
            # subdirectories are opened correctly.
            with open(os.path.join(path, file), "r") as iFile:
                lines = iFile.readlines()
                for line in lines:
                    listLine = []
                    for token in line.strip('\n').split():
                        # Each token has the form word|lemma|POS.
                        if options.filterStopWords:
                            lemma = token.split('|')[1]
                            if lemma in stop_words:
                                continue
                        if options.filterSymbols:
                            lemma = token.split('|')[1]
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Coma , identificada")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
                X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
                print("Sentences input data: " + str(len(sentencesInputData)))

                # Predicting tags
                t1 = time()
                print("Predicting tags with model")
                y_pred = crf.predict(X_input)
                print("Prediction done in: %fs" % (time() - t1))

                # Tagging with CRF model
                # NOTE(review): the tagging pass below splits the RAW line,
                # while predictions were computed on the FILTERED tokens; the
                # two only stay aligned when neither filter option is enabled.
                # Confirm intended usage before running with filters on.
                print("Tagging file")
                lidx = 0
                for line, tagLine in zip(lines, y_pred):
                    Ltags = set(labels).intersection(set(tagLine))
                    outputLine = ''
                    line = line.strip('\n')

                    tb = 'O'   # tag currently open ('O' means none)
                    i = 0
                    # Single-token sentence: handle inline and move on.
                    if len(tagLine) == 1:
                        word = line.split('|')[0]
                        tag = tagLine[0]
                        if tag in labels:
                            outputLine = '<' + tag + '> ' + word + ' ' + '</' + tag + '/>'
                            # Only actually-tagged tokens belong in output II
                            # (previously this branch referenced undefined/stale
                            # `word` and `tag` when the token was untagged).
                            sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word + '\t' + tag)
                        else:
                            outputLine = line.split(' ')[0]
                        sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + ', '.join(Ltags))
                        lidx += 1   # bug fix: the index must advance on this path too
                        continue

                    sentence = ''
                    sb = False
                    for word, tag in zip(line.split(' '), tagLine):
                        # start tagging: open a new entity span
                        if tag in labels and tb != tag:
                            outputLine += '<' + tag + '> '
                            sb = True
                            sentence = word.split('|')[0] + ' '
                            tb = tag
                            outputLine += word.split('|')[0] + ' '
                            i += 1
                            continue
                        # end tagging: an entity span is currently open
                        elif tb in labels:
                            if i + 1 == len(tagLine):
                                # end of sentence closes the span; close with the
                                # OPENED tag (tb), not the current word's tag,
                                # which may be 'O'.
                                outputLine += word.split('|')[0] + ' '
                                outputLine += '</' + tb + '/> '
                                sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' + tb)
                                sb = False
                                tb = 'O'
                                i += 1
                                continue
                            elif tag != tagLine[i + 1]:
                                # next token starts a different tag: close the
                                # open span (again with tb, the opened tag).
                                outputLine += word.split('|')[0] + ' '
                                outputLine += '</' + tb + '/> '
                                sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' + tb)
                                sb = False
                                tb = 'O'
                                i += 1
                                continue
                        # plain word (inside or outside a span)
                        outputLine += word.split('|')[0] + ' '
                        i += 1
                        if sb:
                            sentence += word.split('|')[0] + ' '
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + ', '.join(Ltags))
                    lidx += 1

            # Save tags
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFile:
                for line in sentencesOutputDataII:
                    oFile.write(line + '\n')
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    # Debug trace: echo only lines that contain a closed tag.
                    if re.findall('</', line):
                        print(line)
                    # Bug fix: the write was commented out, leaving file I empty.
                    oFileI.write(line + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))