Carlos-Francisco Méndez-Cruz

Obtaining training and test data sets

@@ -32,7 +32,7 @@ from nltk.corpus import stopwords
# --testFile File with test data set
# --outputPath=PATH Output path to place output files
# --filteringStopWords Filtering stop words
-# --filterSymbols Filtering punctuation marks
+# --excludeSymbols Filtering punctuation marks
# Output
# 1) Best model
@@ -42,116 +42,44 @@ from nltk.corpus import stopwords
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
-# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports
-# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports
+# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields
+# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields
#################################
# FUNCTIONS #
#################################
def wordSize(text):
lWord = len(text)
if lWord == 1:
return '1'
elif lWord == 2:
return '2'
elif lWord == 3:
return '3'
elif lWord == 4:
return '4'
elif lWord == 5:
return '5'
elif 6 <= lWord <= 10:
return '6-10'
elif 11 <= lWord <= 15:
return '11-15'
elif 16 <= lWord <= 20:
return '16-20'
elif 21 <= lWord <= 30:
return '21-30'
else:
return '>30'
def hasUpperLower(text):
has = False
if len(text) < 3:
return False
regexUp = nltk.re.compile('[A-Z]')
regexLo = nltk.re.compile('[a-z]')
if (regexUp.search(text) != None) and (regexLo.search(text) != None):
has = True
return has
def hasDigit(text):
has = False
if len(text) < 3:
return False
myRegex = nltk.re.compile('[0-9]')
if myRegex.search(text) != None:
has = True
return has
def hasNonAlphaNum(text):
has = False
if len(text) < 3:
return False
myRegex = nltk.re.compile(r'\W')
if myRegex.search(text) != None:
has = True
return has
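# Illustrative values for the orthographic helpers above (derived from the code;
# the tokens are hypothetical): wordSize('RNA') -> '3', wordSize('polymerase') -> '6-10';
# hasUpperLower('ArcA') -> True, hasUpperLower('to') -> False (shorter than 3 chars);
# hasDigit('p53') -> True; hasNonAlphaNum('sigma-70') -> True.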
def word2features(sent, i):
# print "i: " + str(i)
# print "sent[i]" + sent[i]
listElem = sent[i].split('|')
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
features = {
# Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
# Suffixes
'word[-3:]': word[-3:],
'word[-2:]': word[-2:],
'word[-1:]': word[-1:],
'word.isupper()': word.isupper(),
'word.istitle()': word.istitle(),
'word.hasDigit()': hasDigit(word),
'word.hasNonAlphaNum': hasNonAlphaNum(word),
# 'word.hasUpperLower': hasUpperLower(word),
#'wordSize': wordSize(word),
# 'word.isdigit()': word.isdigit(),
#'word.isupper()': word.isupper(),
'word': word,
'lemma': lemma,
'lemma[-3:]': lemma[-3:],
'lemma[-2:]': lemma[-2:],
'lemma[-1:]': lemma[-1:],
'postag': postag,
# Prefixes
'postag[:2]': postag[:2],
'postag[:1]': postag[:1],
'word[:3]': word[:3],
'word[:2]': word[:2],
'word[:1]': word[:1],
}
'''
if i > 0:
listElem = sent[i - 1].split('|')
word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
'-1:word.lower()': word1.lower(),
'-1:word.istitle()': word1.istitle(),
'-1:word.isupper()': word1.isupper(),
'-1:word.hasDigit()': hasDigit(word1),
'-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
# '-1:word.hasUpperLower': hasUpperLower(word1),
'-1:word': word1,
'-1:lemma': lemma1,
'-1:postag': postag1,
'-1:postag[:2]': postag1[:2],
'-1:postag[:1]': postag1[:1],
})
# else:
# features['BOS'] = True
if i < len(sent) - 1:
listElem = sent[i + 1].split('|')
@@ -159,37 +87,18 @@ def word2features(sent, i):
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
'+1:word.lower()': word1.lower(),
'+1:word.istitle()': word1.istitle(),
'+1:word.isupper()': word1.isupper(),
'+1:word.hasDigit()': hasDigit(word1),
'+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
# '+1:word.hasUpperLower': hasUpperLower(word1),
'+1:word': word1,
'+1:lemma': lemma1,
'+1:postag': postag1,
'+1:postag[:2]': postag1[:2],
'+1:postag[:1]': postag1[:1],
})
# else:
# features['EOS'] = True
if i > 1:
listElem = sent[i - 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'-2:word.lower()': word2.lower(),
'-2:word.istitle()': word2.istitle(),
'-2:word.isupper()': word2.isupper(),
'-2:word.hasDigit()': hasDigit(word2),
'-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
# '-2:word.hasUpperLower': hasUpperLower(word2),
'-2:word': word2,
'-2:lemma': lemma2,
'-2:postag': postag2,
'-2:postag[:2]': postag2[:2],
'-2:postag[:1]': postag2[:1],
})
if i < len(sent) - 2:
@@ -198,17 +107,8 @@ def word2features(sent, i):
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'+2:word.lower()': word2.lower(),
'+2:word.istitle()': word2.istitle(),
'+2:word.isupper()': word2.isupper(),
'+2:word.hasDigit()': hasDigit(word2),
'+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
# '+2:word.hasUpperLower': hasUpperLower(word2),
'+2:word': word2,
'+2:lemma': lemma2,
'+2:postag': postag2,
'+2:postag[:2]': postag2[:2],
'+2:postag[:1]': postag2[:1],
})
trigrams = False
@@ -219,17 +119,8 @@ def word2features(sent, i):
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'-3:word.lower()': word3.lower(),
'-3:word.istitle()': word3.istitle(),
'-3:word.isupper()': word3.isupper(),
'-3:word.hasDigit()': hasDigit(word3),
'-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
# '-3:word.hasUpperLower': hasUpperLower(word3),
'-3:word': word3,
'-3:lemma': lemma3,
'-3:postag': postag3,
'-3:postag[:2]': postag3[:2],
'-3:postag[:1]': postag3[:1],
})
if i < len(sent) - 3:
@@ -238,19 +129,10 @@ def word2features(sent, i):
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'+3:word.lower()': word3.lower(),
'+3:word.istitle()': word3.istitle(),
'+3:word.isupper()': word3.isupper(),
'+3:word.hasDigit()': hasDigit(word3),
'+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
# '+3:word.hasUpperLower': hasUpperLower(word3),
'+3:word': word3,
'+3:lemma': lemma3,
'+3:postag': postag3,
'+3:postag[:2]': postag3[:2],
'+3:postag[:1]': postag3[:1],
})
'''
return features
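# For a hypothetical input token 'Salmonella|salmonella|NNP' (word|lemma|postag),
# word2features would produce, among others: 'word': 'Salmonella', 'word[-3:]': 'lla',
# 'word.istitle()': True, 'word.isupper()': False, 'lemma': 'salmonella',
# 'postag': 'NNP', 'postag[:2]': 'NN', 'word[:3]': 'Sal'. The block quoted out with
# triple quotes above (context features for neighboring tokens) is currently disabled.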
@@ -260,7 +142,6 @@ def sent2features(sent):
def sent2labels(sent):
return [elem.split('|')[3] for elem in sent]
# return [label for token, postag, label in sent]
def sent2tokens(sent):
@@ -269,19 +150,11 @@ def sent2tokens(sent):
def print_transitions(trans_features, f):
for (label_from, label_to), weight in trans_features:
# f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
# f.write("label_from :" + label_from)
# f.write("label_to :" + label_to)
# f.write("label_weight :" + weight)
# f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
def print_state_features(state_features, f):
for (attr, label), weight in state_features:
# f.write("%0.6f %-8s %s\n" % (weight, label, attr))
# f.write(attr.encode("utf-8"))
# '{:06.2f}'.format(3.141592653589793)
f.write("{:0.6f} {:8} {}\n".format(weight, label, attr))
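# Typical usage of the two printers (a sketch, assuming crf is a fitted sklearn_crfsuite.CRF
# and Counter is imported from collections; not necessarily the exact calls in this file):
# print_transitions(Counter(crf.transition_features_).most_common(20), oFile)
# print_state_features(Counter(crf.state_features_).most_common(30), oFile)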
@@ -303,12 +176,12 @@ if __name__ == "__main__":
help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile",
help="File with test data set", metavar="FILE")
-parser.add_option("--filterStopWords", default=False,
-action="store_true", dest="filterStopWords",
-help="Filtering stop words")
-parser.add_option("--filterSymbols", default=False,
-action="store_true", dest="filterSymbols",
-help="Filtering punctuation marks")
+parser.add_option("--excludeStopWords", default=False,
+action="store_true", dest="excludeStopWords",
+help="Exclude stop words")
+parser.add_option("--excludeSymbols", default=False,
+action="store_true", dest="excludeSymbols",
+help="Exclude punctuation marks")
(options, args) = parser.parse_args()
if len(args) > 0:
@@ -320,10 +193,10 @@ if __name__ == "__main__":
print("File with training data set: " + str(options.trainingFile))
print("Path of test data set: " + options.inputPath)
print("File with test data set: " + str(options.testFile))
-print("Filtering stop words: " + str(options.filterStopWords))
+print("Exclude stop words: " + str(options.excludeStopWords))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
+print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
@@ -332,67 +205,50 @@ if __name__ == "__main__":
sentencesTrainingData = []
sentencesTestData = []
# Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
stopwords = [word for word in stopwords.words('english')]
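# Each input line is assumed to hold one sentence of whitespace-separated tokens in
# word|lemma|postag|label format (indices 0-3 used by word2features / sent2labels).
# A hypothetical line: 'The|the|DT|O FhlA|fhla|NN|TF protein|protein|NN|O'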
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
-if options.filterStopWords:
+if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original: if lemma in stopwords.words('english'):
# trainingTesting_Sklearn_crfsuite.py:269:
# UnicodeWarning: Unicode equal comparison failed to
# convert both arguments to Unicode -
# interpreting them as being unequal
if lemma in stopwords:
continue
-if options.filterSymbols:
+if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
# if lemma == ',':
#     print("Comma , identified")
continue
listLine.append(token)
sentencesTrainingData.append(listLine)
print(" Sentences training data: " + str(len(sentencesTrainingData)))
# print sentencesTrainingData[0]
with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
-if options.filterStopWords:
+if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original if lemma in stopwords.words('english'):
if lemma in stopwords:
continue
-if options.filterSymbols:
+if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
# if lemma == ',':
#     print("Comma , identified")
continue
listLine.append(token)
sentencesTestData.append(listLine)
print(" Sentences test data: " + str(len(sentencesTestData)))
# print sentencesTestData[0]
print("Reading corpus done in: %fs" % (time() - t0))
print(sent2features(sentencesTrainingData[0])[0])
print(sent2features(sentencesTestData[0])[0])
# print(sent2labels(sentencesTrainingData[0]))
# print(sent2labels(sentencesTestData[0]))
t0 = time()
X_train = [sent2features(s) for s in sentencesTrainingData]
@@ -445,8 +301,8 @@ if __name__ == "__main__":
# Best hyperparameters
# crf = rs.best_estimator_
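# Note: rs above presumably refers to a randomized hyperparameter search (e.g. sklearn's
# RandomizedSearchCV) whose best_estimator_ would be the selected CRF. As a rough sketch of
# how a sklearn-crfsuite model is usually fitted (not necessarily the exact settings in the
# elided code above):
# crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
#                            max_iterations=100, all_possible_transitions=True)
# crf.fit(X_train, y_train)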
-nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.txt')
+nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
oFile.write("********** TRAINING AND TESTING REPORT **********\n")
oFile.write("Training file: " + options.trainingFile + '\n')
@@ -464,23 +320,23 @@ if __name__ == "__main__":
# Saving model
print(" Saving training model...")
t1 = time()
-nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.mod')
+nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.mod')
joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
print(" Saving training model done in: %fs" % (time() - t1))
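# The persisted model can later be reloaded for tagging new text, e.g. (illustrative only):
# crf = joblib.load(os.path.join(options.outputPath, "models", nameModel))
# y_new = crf.predict(X_new)  # X_new: list of sent2features(...) outputs for new sentences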
# Evaluation against test data
y_pred = crf.predict(X_test)
print("*********************************")
-name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.txt')
+name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
for y in y_pred:
oFile.write(str(y) + '\n')
print("*********************************")
-name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.txt')
+name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
for y in y_test:
oFile.write(str(y) + '\n')
......