# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import json
from nltk.corpus import stopwords

__author__ = 'CMendezC'

# Objective: take a transformed file with the format word|lemma|tag,
#   for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
#   and create a file with an additional aspect tag (aTag) for CRF training. For example:
#   the|the|DT N-terminal|N-terminal|DPOS domain|domain|NN -->
#   the|the|DT|O N-terminal|N-terminal|DPOS|DOM domain|domain|NN|O
#   Additionally, we aTag frequent words belonging to each aspect with the corresponding aspect aTag (DOM or RP).
#   We expect these words to have one aTag in some contexts and a different one in others.
#   The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
#   The output file keeps the word, the lemma, the tag, and the aspect tag of each token.
#   This additional tagging gives us clues for aspect classification.

# Parameters:
#   1) --inputPath            Path to read files.
#   2) --trainingFile         File name with training data.
#   3) --testFile             File name with test data.
#   4) --trainingClassesFile  File name with training classes.
#   5) --testClassesFile      File name with test classes.
#   6) --outputPath           Path to write files. Filtering options are concatenated to file names.
#   7) --termPath             Path to read the term files and both JSON files.
#   8) --termFiles            JSON file with term files and tags.
#   9) --inputFileFreq        JSON file with information about frequent words.
#   10) --skip=N              Skip N words to form skip mentions.
#   11) --stopWords           Filter stop words.
#   12) --filterPunctMarks    Filter punctuation marks.
#   (The --feature parameter was eliminated.)

# Output:
#   1) Tagged files. Filtering options and skip value are concatenated to each file name.
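#   Example file name (hypothetical input sentencesTraining.txt, default options):
#      sentencesTraining.StopWords_False.FilterPunctMarks_False.Skip_0.txt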

# Execution:
# ASPECTS
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# SENTENCES
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
# TODO: add an execution example using --skip.

def getSkipMentions(aList, aSkip):
    """Split aList into aSkip interleaved windows: window j holds the elements
    at positions i + j for i = 0, aSkip + 1, 2 * (aSkip + 1), ..."""
    hashTemp = {}
    for j in range(0, aSkip):
        listTemp = []
        for i in range(0, len(aList), aSkip + 1):
            # Guard against stepping past the end of the list
            if i + j < len(aList):
                listTemp.append(aList[i + j])
        hashTemp[j] = listTemp
    return hashTemp
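
# Worked example (hypothetical input) showing the windowing behaviour:
#   getSkipMentions(['a', 'b', 'c', 'd', 'e'], 2)
#   -> {0: ['a', 'd'], 1: ['b', 'e']}   (offsets 0 and 1 over a stride of 3)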

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test examples", metavar="FILE")
    parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
                      help="File with training classes", metavar="FILE")
    parser.add_option("--testClassesFile", dest="testClassesFile",
                      help="File with test classes", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                  help="JSON file with terms files and tags", metavar="PATH")
    parser.add_option("--inputFileFreq", dest="inputFileFreq",
                  help="JSON file with information about frequent words", metavar="PATH")
    parser.add_option("--skip", type="int",
                      dest="skip", default=0,
                      help="Skip mentions", metavar="N")
    parser.add_option("--filterStopWords", default=False,
                  action="store_true", dest="filterStopWords",
                  help="Filtering stop words")
    parser.add_option("--filterPunctMarks", default=False,
                      action="store_true", dest="filterPunctMarks",
                      help="Filtering punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits
        parser.error("No positional arguments expected: " + str(args))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File with training examples: " + str(options.trainingFile))
    print("File with test examples: " + str(options.testFile))
    print("File with training classes: " + str(options.trainingClassesFile))
    print("File with test classes: " + str(options.testClassesFile))
    print("Path to write output files: " + str(options.outputPath))
    print("Path to read term files: " + str(options.termPath))
    print("JSON file with term files and tags: " + str(options.termFiles))
    print("JSON file with information about frequent words: " + str(options.inputFileFreq))
    print("Skip mentions: " + str(options.skip))
    print("Filtering stop words: " + str(options.stopWords))
    punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
    print("Filtering punctuation marks " + str(punctMarks) + ': ' + str(options.filterPunctMarks))

    filesRead = 0
    t0 = time()

    # Build the English stop word set once; testing membership in a set is much
    # faster than calling stopwords.words('english') for every token.
    englishStopWords = set(stopwords.words('english'))

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        hashes = json.load(data_file)
    print('   Loading biological term files... done')

    hashTagAspect = hashes["hashTagAspect"]
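    # Expected shape of hashTagAspect (hypothetical entry for illustration):
    # {"DPOS": "DOM", ...}, i.e., a map from a term tag to the aspect tag
    # (DOM or RP) that it signals.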

    print('Loading frequent words...')
    with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
        hashAspectFreqWords = json.load(data_file)
    print('   Loading frequent words... done')
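    # Expected shape of hashAspectFreqWords (hypothetical words; the file is
    # produced by corpusAnalysis): a map from aspect class to its frequent
    # word forms, e.g. {"DOM": ["domain", "terminal"], "RP": ["regulator"]}.
    # The membership tests below work with either a list or a dict of words.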

    listFiles = [options.trainingFile, options.testFile]
    listClassesFiles = [options.trainingClassesFile, options.testClassesFile]

    for iFile, cFile in zip(listFiles, listClassesFiles):
        with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
            print("Reading file..." + iFile)
            lines = [l.strip('\n') for l in tFile.readlines()]
            filesRead += 1
        with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
            print("Reading file..." + cFile)
            classes = [c.strip('\n') for c in clFile.readlines()]
        listLines = []
        print("Processing files... ")
        for line, c in zip(lines, classes):
            # print("class: ", c)
            listTokenLine = []
            # listLemmaLine = []
            for tok in line.split():
                tokList = tok.split("|")
                word = tokList[0]
                lemma = tokList[1]
                tag = tokList[2]
                # Optionally filter stop words and punctuation marks
                if options.stopWords and lemma in englishStopWords:
                    continue
                if options.filterPunctMarks and lemma in punctMarks:
                    continue
                # Change the term tag to its aspect tag only when the aspect tag
                # coincides with the sentence class: we want the CRF to learn
                # when a term tag should become an aspect tag in the right context.
                if tag in hashTagAspect:
                    if hashTagAspect[tag] == c:
                        aTag = hashTagAspect[tag]
                    else:
                        aTag = 'O'
                else:
                    if c in hashAspectFreqWords:
                        hashFreqWords = hashAspectFreqWords[c]
                        # Check whether the word form or the lemma appears among
                        # the frequent words of this class
                        if word.lower() in hashFreqWords or lemma in hashFreqWords:
                            aTag = c
                        else:
                            aTag = 'O'
                    else:
                        aTag = 'O'
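                # Illustration (hypothetical mapping): with c == 'DOM' and
                # hashTagAspect == {'DPOS': 'DOM'}, the token
                # N-terminal|N-terminal|DPOS gets aTag DOM, while domain|domain|NN
                # gets O unless 'domain' is among the DOM frequent words.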
                listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
            # if feature == "word":
            listLines.append(listTokenLine)
            # if feature == "lemma":
                # listLines = listLemmaLine.strip() + '\n'
        # After all sentences of this file are tagged, write them out.
        if options.skip > 0:
            t1 = time()
            hashTemp = getSkipMentions(listLines, options.skip)
            for key in hashTemp:
                # One file per window offset; the offset is appended to the file
                # name so that the windows do not overwrite each other.
                outputFile = iFile.replace('.txt', '.StopWords_' + str(options.stopWords) +
                                           '.FilterPunctMarks_' + str(options.filterPunctMarks) +
                                           '.Skip_' + str(options.skip) + '.Window_' + str(key) + '.txt')
                with open(os.path.join(options.outputPath, outputFile), "w", encoding="utf-8") as oFile:
                    for tokenLine in hashTemp[key]:
                        for token in tokenLine:
                            oFile.write(token + ' ')
                        oFile.write('\n')
            print("Skip mentions done in: %fs" % (time() - t1))
        else:
            outputFile = iFile.replace('.txt', '.StopWords_' + str(options.stopWords) +
                                       '.FilterPunctMarks_' + str(options.filterPunctMarks) +
                                       '.Skip_' + str(options.skip) + '.txt')
            with open(os.path.join(options.outputPath, outputFile), "w", encoding="utf-8") as oFile:
                for tokenLine in listLines:
                    for token in tokenLine:
                        oFile.write(token + ' ')
                    oFile.write('\n')

    print("Files processed: " + str(filesRead))