Showing
3 changed files
with
5 additions
and
71 deletions
training_validation_v1-1.py
0 → 100644
This diff is collapsed. Click to expand it.
| ... | @@ -49,42 +49,21 @@ from nltk.corpus import stopwords | ... | @@ -49,42 +49,21 @@ from nltk.corpus import stopwords |
| 49 | ################################# | 49 | ################################# |
| 50 | # FUNCTIONS # | 50 | # FUNCTIONS # |
| 51 | ################################# | 51 | ################################# |
| 52 | -def endsConLow(word): | ||
| 53 | - miregex = re.compile(r'[^aeiouA-Z0-9]$') | ||
| 54 | - if miregex.search(word): | ||
| 55 | - return True | ||
| 56 | - else: | ||
| 57 | - return False | ||
| 58 | - | ||
| 59 | def word2features(sent, i): | 52 | def word2features(sent, i): |
| 60 | listElem = sent[i].split('|') | 53 | listElem = sent[i].split('|') |
| 61 | word = listElem[0] | 54 | word = listElem[0] |
| 55 | + #print("word: {}".format(word)) | ||
| 62 | lemma = listElem[1] | 56 | lemma = listElem[1] |
| 63 | postag = listElem[2] | 57 | postag = listElem[2] |
| 64 | 58 | ||
| 65 | features = { | 59 | features = { |
| 66 | - # Suffixes | ||
| 67 | - #'word[-3:]': word[-3:], | ||
| 68 | - #'word[-2:]': word[-2:], | ||
| 69 | - #'word[-1:]': word[-1:], | ||
| 70 | - #'word.isupper()': word.isupper(), | ||
| 71 | #'word': word, | 60 | #'word': word, |
| 72 | - #'lemma': lemma, | 61 | + 'lemma': lemma, |
| 73 | - #'postag': postag, | 62 | + 'postag': postag, |
| 74 | - 'lemma[-3:]': lemma[-3:], | ||
| 75 | - 'lemma[-2:]': lemma[-2:], | ||
| 76 | - 'lemma[-1:]': lemma[-1:], | ||
| 77 | - 'lemma[+3:]': lemma[:3], | ||
| 78 | - 'lemma[+2:]': lemma[:2], | ||
| 79 | - 'lemma[+1:]': lemma[:1], | ||
| 80 | - #'word[:3]': word[:3], | ||
| 81 | - #'word[:2]': word[:2], | ||
| 82 | - #'word[:1]': word[:1], | ||
| 83 | - #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word), | ||
| 84 | } | 63 | } |
| 85 | if i > 0: | 64 | if i > 0: |
| 86 | listElem = sent[i - 1].split('|') | 65 | listElem = sent[i - 1].split('|') |
| 87 | - word1 = listElem[0] | 66 | + #word1 = listElem[0] |
| 88 | lemma1 = listElem[1] | 67 | lemma1 = listElem[1] |
| 89 | postag1 = listElem[2] | 68 | postag1 = listElem[2] |
| 90 | features.update({ | 69 | features.update({ |
| ... | @@ -95,7 +74,7 @@ def word2features(sent, i): | ... | @@ -95,7 +74,7 @@ def word2features(sent, i): |
| 95 | 74 | ||
| 96 | if i < len(sent) - 1: | 75 | if i < len(sent) - 1: |
| 97 | listElem = sent[i + 1].split('|') | 76 | listElem = sent[i + 1].split('|') |
| 98 | - word1 = listElem[0] | 77 | + #word1 = listElem[0] |
| 99 | lemma1 = listElem[1] | 78 | lemma1 = listElem[1] |
| 100 | postag1 = listElem[2] | 79 | postag1 = listElem[2] |
| 101 | features.update({ | 80 | features.update({ |
| ... | @@ -103,53 +82,8 @@ def word2features(sent, i): | ... | @@ -103,53 +82,8 @@ def word2features(sent, i): |
| 103 | '+1:lemma': lemma1, | 82 | '+1:lemma': lemma1, |
| 104 | '+1:postag': postag1, | 83 | '+1:postag': postag1, |
| 105 | }) | 84 | }) |
| 106 | - | ||
| 107 | - ''' | ||
| 108 | - if i > 1: | ||
| 109 | - listElem = sent[i - 2].split('|') | ||
| 110 | - word2 = listElem[0] | ||
| 111 | - lemma2 = listElem[1] | ||
| 112 | - postag2 = listElem[2] | ||
| 113 | - features.update({ | ||
| 114 | - '-2:word': word2, | ||
| 115 | - '-2:lemma': lemma2, | ||
| 116 | - }) | ||
| 117 | - | ||
| 118 | - if i < len(sent) - 2: | ||
| 119 | - listElem = sent[i + 2].split('|') | ||
| 120 | - word2 = listElem[0] | ||
| 121 | - lemma2 = listElem[1] | ||
| 122 | - postag2 = listElem[2] | ||
| 123 | - features.update({ | ||
| 124 | - '+2:word': word2, | ||
| 125 | - '+2:lemma': lemma2, | ||
| 126 | - }) | ||
| 127 | - | ||
| 128 | - trigrams = False | ||
| 129 | - if trigrams: | ||
| 130 | - if i > 2: | ||
| 131 | - listElem = sent[i - 3].split('|') | ||
| 132 | - word3 = listElem[0] | ||
| 133 | - lemma3 = listElem[1] | ||
| 134 | - postag3 = listElem[2] | ||
| 135 | - features.update({ | ||
| 136 | - '-3:word': word3, | ||
| 137 | - '-3:lemma': lemma3, | ||
| 138 | - }) | ||
| 139 | - | ||
| 140 | - if i < len(sent) - 3: | ||
| 141 | - listElem = sent[i + 3].split('|') | ||
| 142 | - word3 = listElem[0] | ||
| 143 | - lemma3 = listElem[1] | ||
| 144 | - postag3 = listElem[2] | ||
| 145 | - features.update({ | ||
| 146 | - '+3:word': word3, | ||
| 147 | - '+3:lemma': lemma3, | ||
| 148 | - }) | ||
| 149 | - ''' | ||
| 150 | return features | 85 | return features |
| 151 | 86 | ||
| 152 | - | ||
| 153 | def sent2features(sent): | 87 | def sent2features(sent): |
| 154 | return [word2features(sent, i) for i in range(len(sent))] | 88 | return [word2features(sent, i) for i in range(len(sent))] |
| 155 | 89 | ... | ... |
training_validation_v3.py
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment