Carlos-Francisco Méndez-Cruz

Conditional Random Fields

This diff is collapsed. Click to expand it.
...@@ -49,42 +49,21 @@ from nltk.corpus import stopwords ...@@ -49,42 +49,21 @@ from nltk.corpus import stopwords
49 ################################# 49 #################################
50 # FUNCTIONS # 50 # FUNCTIONS #
51 ################################# 51 #################################
52 -def endsConLow(word):
53 - miregex = re.compile(r'[^aeiouA-Z0-9]$')
54 - if miregex.search(word):
55 - return True
56 - else:
57 - return False
58 -
59 def word2features(sent, i): 52 def word2features(sent, i):
60 listElem = sent[i].split('|') 53 listElem = sent[i].split('|')
61 word = listElem[0] 54 word = listElem[0]
55 + #print("word: {}".format(word))
62 lemma = listElem[1] 56 lemma = listElem[1]
63 postag = listElem[2] 57 postag = listElem[2]
64 58
65 features = { 59 features = {
66 - # Suffixes
67 - #'word[-3:]': word[-3:],
68 - #'word[-2:]': word[-2:],
69 - #'word[-1:]': word[-1:],
70 - #'word.isupper()': word.isupper(),
71 #'word': word, 60 #'word': word,
72 - #'lemma': lemma, 61 + 'lemma': lemma,
73 - #'postag': postag, 62 + 'postag': postag,
74 - 'lemma[-3:]': lemma[-3:],
75 - 'lemma[-2:]': lemma[-2:],
76 - 'lemma[-1:]': lemma[-1:],
77 - 'lemma[+3:]': lemma[:3],
78 - 'lemma[+2:]': lemma[:2],
79 - 'lemma[+1:]': lemma[:1],
80 - #'word[:3]': word[:3],
81 - #'word[:2]': word[:2],
82 - #'word[:1]': word[:1],
83 - #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
84 } 63 }
85 if i > 0: 64 if i > 0:
86 listElem = sent[i - 1].split('|') 65 listElem = sent[i - 1].split('|')
87 - word1 = listElem[0] 66 + #word1 = listElem[0]
88 lemma1 = listElem[1] 67 lemma1 = listElem[1]
89 postag1 = listElem[2] 68 postag1 = listElem[2]
90 features.update({ 69 features.update({
...@@ -95,7 +74,7 @@ def word2features(sent, i): ...@@ -95,7 +74,7 @@ def word2features(sent, i):
95 74
96 if i < len(sent) - 1: 75 if i < len(sent) - 1:
97 listElem = sent[i + 1].split('|') 76 listElem = sent[i + 1].split('|')
98 - word1 = listElem[0] 77 + #word1 = listElem[0]
99 lemma1 = listElem[1] 78 lemma1 = listElem[1]
100 postag1 = listElem[2] 79 postag1 = listElem[2]
101 features.update({ 80 features.update({
...@@ -103,53 +82,8 @@ def word2features(sent, i): ...@@ -103,53 +82,8 @@ def word2features(sent, i):
103 '+1:lemma': lemma1, 82 '+1:lemma': lemma1,
104 '+1:postag': postag1, 83 '+1:postag': postag1,
105 }) 84 })
106 -
107 - '''
108 - if i > 1:
109 - listElem = sent[i - 2].split('|')
110 - word2 = listElem[0]
111 - lemma2 = listElem[1]
112 - postag2 = listElem[2]
113 - features.update({
114 - '-2:word': word2,
115 - '-2:lemma': lemma2,
116 - })
117 -
118 - if i < len(sent) - 2:
119 - listElem = sent[i + 2].split('|')
120 - word2 = listElem[0]
121 - lemma2 = listElem[1]
122 - postag2 = listElem[2]
123 - features.update({
124 - '+2:word': word2,
125 - '+2:lemma': lemma2,
126 - })
127 -
128 - trigrams = False
129 - if trigrams:
130 - if i > 2:
131 - listElem = sent[i - 3].split('|')
132 - word3 = listElem[0]
133 - lemma3 = listElem[1]
134 - postag3 = listElem[2]
135 - features.update({
136 - '-3:word': word3,
137 - '-3:lemma': lemma3,
138 - })
139 -
140 - if i < len(sent) - 3:
141 - listElem = sent[i + 3].split('|')
142 - word3 = listElem[0]
143 - lemma3 = listElem[1]
144 - postag3 = listElem[2]
145 - features.update({
146 - '+3:word': word3,
147 - '+3:lemma': lemma3,
148 - })
149 - '''
150 return features 85 return features
151 86
152 -
153 def sent2features(sent): 87 def sent2features(sent):
154 return [word2features(sent, i) for i in range(len(sent))] 88 return [word2features(sent, i) for i in range(len(sent))]
155 89
......
This diff is collapsed. Click to expand it.