Showing
1 changed file
with
10 additions
and
1 deletion
| ... | @@ -5,6 +5,7 @@ from itertools import chain | ... | @@ -5,6 +5,7 @@ from itertools import chain |
| 5 | from optparse import OptionParser | 5 | from optparse import OptionParser |
| 6 | from time import time | 6 | from time import time |
| 7 | from collections import Counter | 7 | from collections import Counter |
| 8 | +import re | ||
| 8 | 9 | ||
| 9 | import nltk | 10 | import nltk |
| 10 | import sklearn | 11 | import sklearn |
| ... | @@ -48,6 +49,12 @@ from nltk.corpus import stopwords | ... | @@ -48,6 +49,12 @@ from nltk.corpus import stopwords |
| 48 | ################################# | 49 | ################################# |
| 49 | # FUNCTIONS # | 50 | # FUNCTIONS # |
| 50 | ################################# | 51 | ################################# |
| 52 | +def endsConLow(word): | ||
| 53 | + miregex = re.compile(r'[^aeiouA-Z0-9]$') | ||
| 54 | + if miregex.search(word): | ||
| 55 | + return True | ||
| 56 | + else: | ||
| 57 | + return False | ||
| 51 | 58 | ||
| 52 | def word2features(sent, i): | 59 | def word2features(sent, i): |
| 53 | listElem = sent[i].split('|') | 60 | listElem = sent[i].split('|') |
| ... | @@ -69,6 +76,7 @@ def word2features(sent, i): | ... | @@ -69,6 +76,7 @@ def word2features(sent, i): |
| 69 | 'word[:3]': word[:3], | 76 | 'word[:3]': word[:3], |
| 70 | 'word[:2]': word[:2], | 77 | 'word[:2]': word[:2], |
| 71 | 'word[:1]': word[:1], | 78 | 'word[:1]': word[:1], |
| 79 | + 'endsConLow()': endsConLow(word), | ||
| 72 | } | 80 | } |
| 73 | ''' | 81 | ''' |
| 74 | if i > 0: | 82 | if i > 0: |
| ... | @@ -196,7 +204,8 @@ if __name__ == "__main__": | ... | @@ -196,7 +204,8 @@ if __name__ == "__main__": |
| 196 | print("Exclude stop words: " + str(options.excludeStopWords)) | 204 | print("Exclude stop words: " + str(options.excludeStopWords)) |
| 197 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | 205 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', |
| 198 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | 206 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] |
| 199 | - print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) | 207 | + #print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) |
| 208 | + print("Exclude symbols: " + str(options.excludeSymbols)) | ||
| 200 | 209 | ||
| 201 | print('-------------------------------- PROCESSING --------------------------------') | 210 | print('-------------------------------- PROCESSING --------------------------------') |
| 202 | print('Reading corpus...') | 211 | print('Reading corpus...') | ... | ... |
-
Please register or log in to post a comment