Carlos-Francisco Méndez-Cruz

Obtaining training and test data sets

@@ -32,7 +32,7 @@ from nltk.corpus import stopwords
# --testFile File with test data set
# --outputPath=PATH Output path to place output files
# --filteringStopWords Filtering stop words
-# --filterSymbols Filtering punctuation marks
+# --excludeSymbols Filtering punctuation marks
# Output
# 1) Best model
@@ -42,116 +42,44 @@ from nltk.corpus import stopwords
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
-# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports
-# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports
+# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields
+# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields
#################################
# FUNCTIONS #
#################################
def wordSize(text):
lWord = len(text)
if lWord == 1:
return '1'
elif lWord == 2:
return '2'
elif lWord == 3:
return '3'
elif lWord == 4:
return '4'
elif lWord == 5:
return '5'
elif 6 <= lWord <= 10:
return '6-10'
elif 11 <= lWord <= 15:
return '11-15'
elif 16 <= lWord <= 20:
return '16-20'
elif 21 <= lWord <= 30:
return '21-30'
else:
return '>30'
def hasUpperLower(text):
has = False
if len(text) < 3:
return False
regexUp = nltk.re.compile('[A-Z]')
regexLo = nltk.re.compile('[a-z]')
if (regexUp.search(text) != None) and (regexLo.search(text) != None):
has = True
return has
def hasDigit(text):
has = False
if len(text) < 3:
return False
myRegex = nltk.re.compile('[0-9]')
if myRegex.search(text) != None:
has = True
return has
def hasNonAlphaNum(text):
has = False
if len(text) < 3:
return False
myRegex = nltk.re.compile(r'\W')
if myRegex.search(text) != None:
has = True
return has
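# Illustrative values for the orthographic helpers above (derived from the code;
# the tokens are hypothetical): wordSize('RNA') -> '3', wordSize('polymerase') -> '6-10';
# hasUpperLower('ArcA') -> True, hasUpperLower('to') -> False (shorter than 3 chars);
# hasDigit('p53') -> True; hasNonAlphaNum('sigma-70') -> True.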
def word2features(sent, i):
# print "i: " + str(i)
# print "sent[i]" + sent[i]
listElem = sent[i].split('|')
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
features = {
# Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
# Suffixes
'word[-3:]': word[-3:],
'word[-2:]': word[-2:],
'word[-1:]': word[-1:],
'word.isupper()': word.isupper(),
'word.istitle()': word.istitle(),
'word.hasDigit()': hasDigit(word),
'word.hasNonAlphaNum': hasNonAlphaNum(word),
# 'word.hasUpperLower': hasUpperLower(word),
#'wordSize': wordSize(word),
# 'word.isdigit()': word.isdigit(),
#'word.isupper()': word.isupper(),
'word': word,
'lemma': lemma,
'lemma[-3:]': lemma[-3:],
'lemma[-2:]': lemma[-2:],
'lemma[-1:]': lemma[-1:],
'postag': postag,
# Prefixes
'postag[:2]': postag[:2],
'postag[:1]': postag[:1],
'word[:3]': word[:3],
'word[:2]': word[:2],
'word[:1]': word[:1],
}
'''
if i > 0:
listElem = sent[i - 1].split('|')
word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
'-1:word.lower()': word1.lower(),
'-1:word.istitle()': word1.istitle(),
'-1:word.isupper()': word1.isupper(),
'-1:word.hasDigit()': hasDigit(word1),
'-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
# '-1:word.hasUpperLower': hasUpperLower(word1),
'-1:word': word1,
'-1:lemma': lemma1,
'-1:postag': postag1,
'-1:postag[:2]': postag1[:2],
'-1:postag[:1]': postag1[:1],
})
# else:
# features['BOS'] = True
if i < len(sent) - 1:
listElem = sent[i + 1].split('|')
@@ -159,37 +87,18 @@ def word2features(sent, i):
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
'+1:word.lower()': word1.lower(),
'+1:word.istitle()': word1.istitle(),
'+1:word.isupper()': word1.isupper(),
'+1:word.hasDigit()': hasDigit(word1),
'+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
# '+1:word.hasUpperLower': hasUpperLower(word1),
'+1:word': word1,
'+1:lemma': lemma1,
'+1:postag': postag1,
'+1:postag[:2]': postag1[:2],
'+1:postag[:1]': postag1[:1],
})
# else:
# features['EOS'] = True
if i > 1:
listElem = sent[i - 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'-2:word.lower()': word2.lower(),
'-2:word.istitle()': word2.istitle(),
'-2:word.isupper()': word2.isupper(),
'-2:word.hasDigit()': hasDigit(word2),
'-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
# '-2:word.hasUpperLower': hasUpperLower(word2),
'-2:word': word2,
'-2:lemma': lemma2,
'-2:postag': postag2,
'-2:postag[:2]': postag2[:2],
'-2:postag[:1]': postag2[:1],
})
if i < len(sent) - 2:
@@ -198,17 +107,8 @@ def word2features(sent, i):
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'+2:word.lower()': word2.lower(),
'+2:word.istitle()': word2.istitle(),
'+2:word.isupper()': word2.isupper(),
'+2:word.hasDigit()': hasDigit(word2),
'+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
# '+2:word.hasUpperLower': hasUpperLower(word2),
'+2:word': word2,
'+2:lemma': lemma2,
'+2:postag': postag2,
'+2:postag[:2]': postag2[:2],
'+2:postag[:1]': postag2[:1],
})
trigrams = False
@@ -219,17 +119,8 @@ def word2features(sent, i):
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'-3:word.lower()': word3.lower(),
'-3:word.istitle()': word3.istitle(),
'-3:word.isupper()': word3.isupper(),
'-3:word.hasDigit()': hasDigit(word3),
'-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
# '-3:word.hasUpperLower': hasUpperLower(word3),
'-3:word': word3,
'-3:lemma': lemma3,
'-3:postag': postag3,
'-3:postag[:2]': postag3[:2],
'-3:postag[:1]': postag3[:1],
})
if i < len(sent) - 3:
@@ -238,19 +129,10 @@ def word2features(sent, i):
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'+3:word.lower()': word3.lower(),
'+3:word.istitle()': word3.istitle(),
'+3:word.isupper()': word3.isupper(),
'+3:word.hasDigit()': hasDigit(word3),
'+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
# '+3:word.hasUpperLower': hasUpperLower(word3),
'+3:word': word3,
'+3:lemma': lemma3,
'+3:postag': postag3,
'+3:postag[:2]': postag3[:2],
'+3:postag[:1]': postag3[:1],
})
'''
return features
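# For a hypothetical input token 'Salmonella|salmonella|NNP' (word|lemma|postag),
# word2features would produce, among others: 'word': 'Salmonella', 'word[-3:]': 'lla',
# 'word.istitle()': True, 'word.isupper()': False, 'lemma': 'salmonella',
# 'postag': 'NNP', 'postag[:2]': 'NN', 'word[:3]': 'Sal'. The block quoted out with
# triple quotes above (context features for neighboring tokens) is currently disabled.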
@@ -260,7 +142,6 @@ def sent2features(sent):
def sent2labels(sent):
return [elem.split('|')[3] for elem in sent]
# return [label for token, postag, label in sent]
def sent2tokens(sent):
@@ -269,19 +150,11 @@ def sent2tokens(sent):
def print_transitions(trans_features, f):
for (label_from, label_to), weight in trans_features:
# f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
# f.write("label_from :" + label_from)
# f.write("label_to :" + label_to)
# f.write("label_weight :" + weight)
# f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
def print_state_features(state_features, f):
for (attr, label), weight in state_features:
# f.write("%0.6f %-8s %s\n" % (weight, label, attr))
# f.write(attr.encode("utf-8"))
# '{:06.2f}'.format(3.141592653589793)
f.write("{:0.6f} {:8} {}\n".format(weight, label, attr))
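# Typical usage of the two printers (a sketch, assuming crf is a fitted sklearn_crfsuite.CRF
# and Counter is imported from collections; not necessarily the exact calls in this file):
# print_transitions(Counter(crf.transition_features_).most_common(20), oFile)
# print_state_features(Counter(crf.state_features_).most_common(30), oFile)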
@@ -303,12 +176,12 @@ if __name__ == "__main__":
help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile",
help="File with test data set", metavar="FILE")
-parser.add_option("--filterStopWords", default=False,
-action="store_true", dest="filterStopWords",
-help="Filtering stop words")
-parser.add_option("--filterSymbols", default=False,
-action="store_true", dest="filterSymbols",
-help="Filtering punctuation marks")
+parser.add_option("--excludeStopWords", default=False,
+action="store_true", dest="excludeStopWords",
+help="Exclude stop words")
+parser.add_option("--excludeSymbols", default=False,
+action="store_true", dest="excludeSymbols",
+help="Exclude punctuation marks")
(options, args) = parser.parse_args()
if len(args) > 0:
@@ -320,10 +193,10 @@ if __name__ == "__main__":
print("File with training data set: " + str(options.trainingFile))
print("Path of test data set: " + options.inputPath)
print("File with test data set: " + str(options.testFile))
-print("Filtering stop words: " + str(options.filterStopWords))
+print("Exclude stop words: " + str(options.excludeStopWords))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
+print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
@@ -332,67 +205,50 @@ if __name__ == "__main__":
sentencesTrainingData = []
sentencesTestData = []
# Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
stopwords = [word for word in stopwords.words('english')]
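# Each input line is assumed to hold one sentence of whitespace-separated tokens in
# word|lemma|postag|label format (indices 0-3 used by word2features / sent2labels).
# A hypothetical line: 'The|the|DT|O FhlA|fhla|NN|TF protein|protein|NN|O'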
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
-if options.filterStopWords:
+if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original: if lemma in stopwords.words('english'):
# trainingTesting_Sklearn_crfsuite.py:269:
# UnicodeWarning: Unicode equal comparison failed to
# convert both arguments to Unicode -
# interpreting them as being unequal
if lemma in stopwords:
continue
-if options.filterSymbols:
+if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
# if lemma == ',':
#     print("Comma , identified")
continue
listLine.append(token)
sentencesTrainingData.append(listLine)
print(" Sentences training data: " + str(len(sentencesTrainingData)))
# print sentencesTrainingData[0]
with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
-if options.filterStopWords:
+if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original if lemma in stopwords.words('english'):
if lemma in stopwords:
continue
-if options.filterSymbols:
+if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
# if lemma == ',':
#     print("Comma , identified")
continue
listLine.append(token)
sentencesTestData.append(listLine)
print(" Sentences test data: " + str(len(sentencesTestData)))
# print sentencesTestData[0]
print("Reading corpus done in: %fs" % (time() - t0))
print(sent2features(sentencesTrainingData[0])[0])
print(sent2features(sentencesTestData[0])[0])
# print(sent2labels(sentencesTrainingData[0]))
# print(sent2labels(sentencesTestData[0]))
t0 = time()
X_train = [sent2features(s) for s in sentencesTrainingData]
@@ -445,8 +301,8 @@ if __name__ == "__main__":
# Best hyperparameters
# crf = rs.best_estimator_
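# Note: rs above presumably refers to a randomized hyperparameter search (e.g. sklearn's
# RandomizedSearchCV) whose best_estimator_ would be the selected CRF. As a rough sketch of
# how a sklearn-crfsuite model is usually fitted (not necessarily the exact settings in the
# elided code above):
# crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
#                            max_iterations=100, all_possible_transitions=True)
# crf.fit(X_train, y_train)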
-nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.txt')
+nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
oFile.write("********** TRAINING AND TESTING REPORT **********\n")
oFile.write("Training file: " + options.trainingFile + '\n')
@@ -464,23 +320,23 @@ if __name__ == "__main__":
# Saving model
print(" Saving training model...")
t1 = time()
-nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.mod')
+nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.mod')
joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
print(" Saving training model done in: %fs" % (time() - t1))
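# The persisted model can later be reloaded for tagging new text, e.g. (illustrative only):
# crf = joblib.load(os.path.join(options.outputPath, "models", nameModel))
# y_new = crf.predict(X_new)  # X_new: list of sent2features(...) outputs for new sentences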
# Evaluation against test data
y_pred = crf.predict(X_test)
print("*********************************")
-name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.txt')
+name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
for y in y_pred:
oFile.write(str(y) + '\n')
print("*********************************")
-name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-options.filterSymbols) + '.txt')
+name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+options.excludeSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
for y in y_test:
oFile.write(str(y) + '\n')
......