Carlos-Francisco Méndez-Cruz

Training validation script

...@@ -5,6 +5,7 @@ from itertools import chain ...@@ -5,6 +5,7 @@ from itertools import chain
5 from optparse import OptionParser 5 from optparse import OptionParser
6 from time import time 6 from time import time
7 from collections import Counter 7 from collections import Counter
8 +import re
8 9
9 import nltk 10 import nltk
10 import sklearn 11 import sklearn
...@@ -48,6 +49,12 @@ from nltk.corpus import stopwords ...@@ -48,6 +49,12 @@ from nltk.corpus import stopwords
48 ################################# 49 #################################
49 # FUNCTIONS # 50 # FUNCTIONS #
50 ################################# 51 #################################
# Compiled once at import time; endsConLow() is called from word2features()
# while building features, so the pattern should not be rebuilt on every call.
_ENDS_CON_LOW_RE = re.compile(r'[^aeiouA-Z0-9]$')


def endsConLow(word):
    """Tell whether the last character of *word* passes the ending pattern.

    The pattern accepts any final character that is NOT a lowercase ASCII
    vowel (a, e, i, o, u), NOT an uppercase ASCII letter (A-Z) and NOT an
    ASCII digit (0-9). Lowercase ASCII consonants therefore match, but so do
    punctuation, whitespace and non-ASCII letters such as accented vowels.

    NOTE(review): the name suggests "ends in a lowercase consonant"; confirm
    whether punctuation and accented characters are meant to count as well.

    Args:
        word: The token text to inspect (a ``str``).

    Returns:
        True if the final character matches the pattern, False otherwise.
        An empty string returns False because there is no final character.
    """
    return _ENDS_CON_LOW_RE.search(word) is not None
51 58
52 def word2features(sent, i): 59 def word2features(sent, i):
53 listElem = sent[i].split('|') 60 listElem = sent[i].split('|')
...@@ -69,6 +76,7 @@ def word2features(sent, i): ...@@ -69,6 +76,7 @@ def word2features(sent, i):
69 'word[:3]': word[:3], 76 'word[:3]': word[:3],
70 'word[:2]': word[:2], 77 'word[:2]': word[:2],
71 'word[:1]': word[:1], 78 'word[:1]': word[:1],
79 + 'endsConLow()': endsConLow(word),
72 } 80 }
73 ''' 81 '''
74 if i > 0: 82 if i > 0:
...@@ -196,7 +204,8 @@ if __name__ == "__main__": ...@@ -196,7 +204,8 @@ if __name__ == "__main__":
196 print("Exclude stop words: " + str(options.excludeStopWords)) 204 print("Exclude stop words: " + str(options.excludeStopWords))
197 symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', 205 symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
198 '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] 206 '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
199 - print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) 207 + #print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
208 + print("Exclude symbols: " + str(options.excludeSymbols))
200 209
201 print('-------------------------------- PROCESSING --------------------------------') 210 print('-------------------------------- PROCESSING --------------------------------')
202 print('Reading corpus...') 211 print('Reading corpus...')
......