Carlos-Francisco Méndez-Cruz

Conditional Random Fields

This diff is collapsed. Click to expand it.
......@@ -49,42 +49,21 @@ from nltk.corpus import stopwords
#################################
# FUNCTIONS #
#################################
def endsConLow(word):
    """Tell whether the final character of ``word`` lies outside a fixed set.

    The set is the lowercase vowels (a, e, i, o, u), the uppercase ASCII
    letters A-Z and the digits 0-9.

    Returns:
        True when ``word`` ends with a character outside that set,
        False otherwise (an empty string yields False).
    """
    tail_pattern = r'[^aeiouA-Z0-9]$'
    return re.search(tail_pattern, word) is not None
def word2features(sent, i):
listElem = sent[i].split('|')
word = listElem[0]
#print("word: {}".format(word))
lemma = listElem[1]
postag = listElem[2]
features = {
# Suffixes
#'word[-3:]': word[-3:],
#'word[-2:]': word[-2:],
#'word[-1:]': word[-1:],
#'word.isupper()': word.isupper(),
#'word': word,
#'lemma': lemma,
#'postag': postag,
'lemma[-3:]': lemma[-3:],
'lemma[-2:]': lemma[-2:],
'lemma[-1:]': lemma[-1:],
'lemma[+3:]': lemma[:3],
'lemma[+2:]': lemma[:2],
'lemma[+1:]': lemma[:1],
#'word[:3]': word[:3],
#'word[:2]': word[:2],
#'word[:1]': word[:1],
#'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
'lemma': lemma,
'postag': postag,
}
if i > 0:
listElem = sent[i - 1].split('|')
word1 = listElem[0]
#word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
......@@ -95,7 +74,7 @@ def word2features(sent, i):
if i < len(sent) - 1:
listElem = sent[i + 1].split('|')
word1 = listElem[0]
#word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
......@@ -103,53 +82,8 @@ def word2features(sent, i):
'+1:lemma': lemma1,
'+1:postag': postag1,
})
'''
if i > 1:
listElem = sent[i - 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'-2:word': word2,
'-2:lemma': lemma2,
})
if i < len(sent) - 2:
listElem = sent[i + 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'+2:word': word2,
'+2:lemma': lemma2,
})
trigrams = False
if trigrams:
if i > 2:
listElem = sent[i - 3].split('|')
word3 = listElem[0]
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'-3:word': word3,
'-3:lemma': lemma3,
})
if i < len(sent) - 3:
listElem = sent[i + 3].split('|')
word3 = listElem[0]
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'+3:word': word3,
'+3:lemma': lemma3,
})
'''
return features
def sent2features(sent):
    """Return a list with the ``word2features`` result for each position in ``sent``.

    Positions are visited in order from 0 to ``len(sent) - 1``; an empty
    ``sent`` yields an empty list.
    """
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts
......
This diff is collapsed. Click to expand it.