preparing-training-validation-test.py
13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import json
from nltk.corpus import stopwords
__author__ = 'CMendezC'
# Objective: Take transformed file with format word|lemma|tag,
#   for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
#   and create file with an additional tagging for CRF training. For example:
#   the|the|dt N-terminal|N-terminal|NN domain|domain|NN -->
#   the|the|dt|O N-terminal|N-terminal|NN|DPOS domain|domain|NN|O
#   Additionally, we are going to tag frequent words belonging to each aspect with the corresponding aspect tag (DOM or RP).
#   We expect that these words are going to have one tag in some contexts and a different one in others.
#   The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
#   In the output file we keep the word, the lemma and the tag, followed by the aspect tag (word|lemma|tag|aspectTag).
#   This additional tagging is going to give us clues for aspect classification.
# Parameters:
#   1) --inputPath      Path to read files.
#   2) --trainingFile   File name with training data.
#   3) --testFile       File name with test data.
#   4) --outputPath     Path to write files. File names are concatenated with feature name.
#   5) ELIMINATED --feature        Type of feature to extract and create file: lemma
#   6) --termPath       Path to read term files
#   7) --termFiles      JSON file with terms files and tags
#   8) --termPath      Path to read JSON file with information about frequent words files
#   9) --inputFileFreq   JSON file with information about frequent words
#   10) --skip=N     Skip N words to form skip mentions
#   11) --stopWords   Filtering stop words
#   12) --filterPunctMarks      Filtering punctuation marks
# Output:
#   1) Files created. Name of feature is concatenated
# Execution:
# ASPECTS
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# SENTENCES
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
# ¿? --SKIP
def getSkipMentions(aList, aSkip):
    """Split aList into aSkip interleaved sub-lists sampled with stride aSkip + 1.

    For every offset j in range(aSkip) the result maps j to
    [aList[j], aList[j + aSkip + 1], aList[j + 2 * (aSkip + 1)], ...].

    Parameters:
        aList: sequence to sample (indexable and sliceable, e.g. a list).
        aSkip: number of offsets to generate; the sampling stride is aSkip + 1.

    Returns:
        dict mapping each offset j (0 <= j < aSkip) to the list of selected
        elements. The dict is empty when aSkip <= 0.
    """
    # Slicing stops at the end of the sequence, so an offset that runs past
    # the last element yields a shorter list instead of raising IndexError
    # (which indexing aList[i + j] did whenever len(aList) was not a multiple
    # of the stride).
    # NOTE(review): with stride aSkip + 1 and offsets 0..aSkip-1, elements at
    # offset aSkip are never selected - confirm whether that is intended.
    step = aSkip + 1
    return {j: list(aList[j::step]) for j in range(aSkip)}
###########################################################
#                       MAIN PROGRAM                      #
###########################################################
if __name__ == "__main__":
    # Script entry point. For the training and the test file it reads one
    # sentence per line (tokens formatted word|lemma|tag) together with the
    # parallel file holding one class per line, appends an aspect tag to every
    # kept token (word|lemma|tag|aTag) and writes the result to outputPath.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test examples", metavar="FILE")
    parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
                      help="File with training classes", metavar="FILE")
    parser.add_option("--testClassesFile", dest="testClassesFile",
                      help="File with test classes", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with terms files and tags", metavar="PATH")
    parser.add_option("--inputFileFreq", dest="inputFileFreq",
                      help="JSON file with information about frequent words", metavar="PATH")
    parser.add_option("--skip", type="int",
                      dest="skip", default=0,
                      help="Skip mentions", metavar="N")
    # The flag is read everywhere below as options.stopWords and the usage
    # examples in the file header pass --stopWords, so that is the primary
    # spelling; --filterStopWords is accepted as an alias for the same flag.
    parser.add_option("--stopWords", "--filterStopWords", default=False,
                      action="store_true", dest="stopWords",
                      help="Filtering stop words")
    parser.add_option("--filterPunctMarks", default=False,
                      action="store_true", dest="filterPunctMarks",
                      help="Filtering punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the process itself,
        # so nothing after it would run.
        parser.error("None parameters indicated.")

    # Each of these options is joined into a file path further down; report
    # the missing ones as a usage error instead of failing in os.path.join.
    requiredOptions = ["inputPath", "trainingFile", "testFile",
                       "trainingClassesFile", "testClassesFile", "outputPath",
                       "termPath", "termFiles", "inputFileFreq"]
    missingOptions = ["--" + name for name in requiredOptions
                      if getattr(options, name) is None]
    if missingOptions:
        parser.error("Missing required options: " + ", ".join(missingOptions))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File with training examples", str(options.trainingFile))
    print("File with test examples", str(options.testFile))
    print("File with training classes", str(options.trainingClassesFile))
    print("File with test classes", str(options.testClassesFile))
    print("Path to write output files: " + str(options.outputPath))
    print("Path to read term files: " + str(options.termPath))
    print("JSON file with terms files and tags: " + str(options.termFiles))
    print("JSON file with information about frequent words: " + str(options.inputFileFreq))
    print("Skip mentions: " + str(options.skip))
    print("Filtering stop words: " + str(options.stopWords))
    punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
    print("Filtering puntuation marks " + str(punctMarks) + ': '+ str(options.filterPunctMarks))

    # Loaded once into a set so the per-token membership test does not call
    # stopwords.words() again for every token; skipped when not filtering.
    stopWordSet = set(stopwords.words('english')) if options.stopWords else set()

    filesRead = 0

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        hashes = json.load(data_file)
    print('   Loading biological term files... done')
    # Maps a token tag to the aspect tag it may be replaced with.
    hashTagAspect = hashes["hashTagAspect"]

    print('Loading frequent words...')
    with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
        # Maps a class to the collection of its frequent words.
        hashAspectFreqWords = json.load(data_file)
    print('   Loading frequent words... done')

    listFiles = [options.trainingFile, options.testFile]
    listClassesFiles = [options.trainingClassesFile, options.testClassesFile]
    for iFile, cFile in zip(listFiles, listClassesFiles):
        with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
            print("Reading file..." + iFile)
            lines = [l.strip('\n') for l in tFile]
            filesRead += 1
        with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
            print("Reading file..." + cFile)
            classes = [c.strip('\n') for c in clFile]
        if len(lines) != len(classes):
            # zip() below stops at the shorter of the two, so say so rather
            # than dropping sentences or classes silently.
            print("WARNING: " + iFile + " has " + str(len(lines)) + " lines but " + cFile
                  + " has " + str(len(classes)) + " classes; the extra entries are ignored.")

        listLines = []
        print("Processing files... ")
        for line, c in zip(lines, classes):
            listTokenLine = []
            for tok in line.split():
                tokList = tok.split("|")
                word = tokList[0]
                lemma = tokList[1]
                tag = tokList[2]
                # Filtering stopwords
                if options.stopWords and lemma in stopWordSet:
                    continue
                # Filtering punctuation marks
                if options.filterPunctMarks and lemma in punctMarks:
                    continue
                # We change tag for aspect tag only in the case of aspect tag coincide with class.
                # We want that CRF learn when to change term tag to aspect tag in correct context
                aTag = 'O'
                if tag in hashTagAspect:
                    if hashTagAspect[tag] == c:
                        aTag = hashTagAspect[tag]
                elif c in hashAspectFreqWords:
                    # We verify if word or lemma is in frequent words.
                    # These frequent words are word-forms (tokens)
                    hashFreqWords = hashAspectFreqWords[c]
                    if word.lower() in hashFreqWords or lemma in hashFreqWords:
                        aTag = c
                listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
            listLines.append(listTokenLine)

        # The output is written once per input file, after every line has been
        # tagged, instead of being rewritten after each processed line.
        outputFile = os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
            options.stopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '.txt'))
        if options.skip > 0:
            t0 = time()
            # NOTE(review): the skip-mention export looks unfinished and its
            # loop structure is kept as found: range(1, skip) performs no pass
            # for --skip 1 (so no file is written), every key is written to
            # the same file name (only the last subset survives), and
            # listLines is replaced by a subset on each pass. Only the write
            # itself was repaired: token lists are written the same way as in
            # the non-skip branch. Confirm the intended semantics before
            # relying on --skip.
            for _ in range(1, options.skip):
                hashTemp = getSkipMentions(listLines, options.skip)
                for key in hashTemp:
                    listLines = hashTemp[key]
                    with open(outputFile, "w", encoding="utf-8") as oFile:
                        for tokenLine in listLines:
                            oFile.write(''.join(token + ' ' for token in tokenLine) + '\n')
            print("Skip mention done in: %fs" % (time() - t0))
        else:
            with open(outputFile, "w", encoding="utf-8") as oFile:
                for tokenLine in listLines:
                    # Every token is followed by a space, then the line is closed.
                    oFile.write(''.join(token + ' ' for token in tokenLine) + '\n')
    print("Files processed: " + str(filesRead))