Carlos-Francisco Méndez-Cruz

Training validation script

......@@ -5,6 +5,7 @@ from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import re
import nltk
import sklearn
......@@ -48,6 +49,12 @@ from nltk.corpus import stopwords
#################################
# FUNCTIONS #
#################################
# Compiled once at import time instead of on every call: endsConLow() is
# used as a per-word feature in word2features().
_ENDS_CON_LOW_RE = re.compile(r'[^aeiouA-Z0-9]$')


def endsConLow(word):
    """Tell whether *word* ends in a "lowercase consonant"-like character.

    The check is a negative character class: the last character must NOT be
    a lowercase ASCII vowel (a, e, i, o, u), an uppercase ASCII letter
    (A-Z) or a digit (0-9).  Besides lowercase consonants this therefore
    also accepts punctuation, symbols, whitespace and non-ASCII letters.
    The pattern is kept exactly as it was so that feature values stay
    stable for models trained with it.

    Because ``$`` also matches just before a trailing newline, a word
    such as ``"gen\\n"`` is tested on the ``"n"``.

    :param word: token text to inspect (a ``str``).
    :return: ``True`` if the final character matches the class described
        above, ``False`` otherwise (including for the empty string).
    """
    return _ENDS_CON_LOW_RE.search(word) is not None
def word2features(sent, i):
listElem = sent[i].split('|')
......@@ -69,6 +76,7 @@ def word2features(sent, i):
'word[:3]': word[:3],
'word[:2]': word[:2],
'word[:1]': word[:1],
'endsConLow()': endsConLow(word),
}
'''
if i > 0:
......@@ -196,7 +204,8 @@ if __name__ == "__main__":
print("Exclude stop words: " + str(options.excludeStopWords))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
#print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
print("Exclude symbols: " + str(options.excludeSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
......