Showing
1 changed file
with
10 additions
and
1 deletion
... | @@ -5,6 +5,7 @@ from itertools import chain | ... | @@ -5,6 +5,7 @@ from itertools import chain |
5 | from optparse import OptionParser | 5 | from optparse import OptionParser |
6 | from time import time | 6 | from time import time |
7 | from collections import Counter | 7 | from collections import Counter |
8 | +import re | ||
8 | 9 | ||
9 | import nltk | 10 | import nltk |
10 | import sklearn | 11 | import sklearn |
... | @@ -48,6 +49,12 @@ from nltk.corpus import stopwords | ... | @@ -48,6 +49,12 @@ from nltk.corpus import stopwords |
48 | ################################# | 49 | ################################# |
49 | # FUNCTIONS # | 50 | # FUNCTIONS # |
50 | ################################# | 51 | ################################# |
52 | +def endsConLow(word): | ||
53 | + miregex = re.compile(r'[^aeiouA-Z0-9]$') | ||
54 | + if miregex.search(word): | ||
55 | + return True | ||
56 | + else: | ||
57 | + return False | ||
51 | 58 | ||
52 | def word2features(sent, i): | 59 | def word2features(sent, i): |
53 | listElem = sent[i].split('|') | 60 | listElem = sent[i].split('|') |
... | @@ -69,6 +76,7 @@ def word2features(sent, i): | ... | @@ -69,6 +76,7 @@ def word2features(sent, i): |
69 | 'word[:3]': word[:3], | 76 | 'word[:3]': word[:3], |
70 | 'word[:2]': word[:2], | 77 | 'word[:2]': word[:2], |
71 | 'word[:1]': word[:1], | 78 | 'word[:1]': word[:1], |
79 | + 'endsConLow()': endsConLow(word), | ||
72 | } | 80 | } |
73 | ''' | 81 | ''' |
74 | if i > 0: | 82 | if i > 0: |
... | @@ -196,7 +204,8 @@ if __name__ == "__main__": | ... | @@ -196,7 +204,8 @@ if __name__ == "__main__": |
196 | print("Exclude stop words: " + str(options.excludeStopWords)) | 204 | print("Exclude stop words: " + str(options.excludeStopWords)) |
197 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | 205 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', |
198 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | 206 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] |
199 | - print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) | 207 | + #print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) |
208 | + print("Exclude symbols: " + str(options.excludeSymbols)) | ||
200 | 209 | ||
201 | print('-------------------------------- PROCESSING --------------------------------') | 210 | print('-------------------------------- PROCESSING --------------------------------') |
202 | print('Reading corpus...') | 211 | print('Reading corpus...') | ... | ... |
-
Please register or log in to post a comment