Showing
3 changed files
with
5 additions
and
71 deletions
training_validation_v1-1.py
0 → 100644
This diff is collapsed. Click to expand it.
... | @@ -49,42 +49,21 @@ from nltk.corpus import stopwords | ... | @@ -49,42 +49,21 @@ from nltk.corpus import stopwords |
49 | ################################# | 49 | ################################# |
50 | # FUNCTIONS # | 50 | # FUNCTIONS # |
51 | ################################# | 51 | ################################# |
52 | -def endsConLow(word): | ||
53 | - miregex = re.compile(r'[^aeiouA-Z0-9]$') | ||
54 | - if miregex.search(word): | ||
55 | - return True | ||
56 | - else: | ||
57 | - return False | ||
58 | - | ||
59 | def word2features(sent, i): | 52 | def word2features(sent, i): |
60 | listElem = sent[i].split('|') | 53 | listElem = sent[i].split('|') |
61 | word = listElem[0] | 54 | word = listElem[0] |
55 | + #print("word: {}".format(word)) | ||
62 | lemma = listElem[1] | 56 | lemma = listElem[1] |
63 | postag = listElem[2] | 57 | postag = listElem[2] |
64 | 58 | ||
65 | features = { | 59 | features = { |
66 | - # Suffixes | ||
67 | - #'word[-3:]': word[-3:], | ||
68 | - #'word[-2:]': word[-2:], | ||
69 | - #'word[-1:]': word[-1:], | ||
70 | - #'word.isupper()': word.isupper(), | ||
71 | #'word': word, | 60 | #'word': word, |
72 | - #'lemma': lemma, | 61 | + 'lemma': lemma, |
73 | - #'postag': postag, | 62 | + 'postag': postag, |
74 | - 'lemma[-3:]': lemma[-3:], | ||
75 | - 'lemma[-2:]': lemma[-2:], | ||
76 | - 'lemma[-1:]': lemma[-1:], | ||
77 | - 'lemma[+3:]': lemma[:3], | ||
78 | - 'lemma[+2:]': lemma[:2], | ||
79 | - 'lemma[+1:]': lemma[:1], | ||
80 | - #'word[:3]': word[:3], | ||
81 | - #'word[:2]': word[:2], | ||
82 | - #'word[:1]': word[:1], | ||
83 | - #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word), | ||
84 | } | 63 | } |
85 | if i > 0: | 64 | if i > 0: |
86 | listElem = sent[i - 1].split('|') | 65 | listElem = sent[i - 1].split('|') |
87 | - word1 = listElem[0] | 66 | + #word1 = listElem[0] |
88 | lemma1 = listElem[1] | 67 | lemma1 = listElem[1] |
89 | postag1 = listElem[2] | 68 | postag1 = listElem[2] |
90 | features.update({ | 69 | features.update({ |
... | @@ -95,7 +74,7 @@ def word2features(sent, i): | ... | @@ -95,7 +74,7 @@ def word2features(sent, i): |
95 | 74 | ||
96 | if i < len(sent) - 1: | 75 | if i < len(sent) - 1: |
97 | listElem = sent[i + 1].split('|') | 76 | listElem = sent[i + 1].split('|') |
98 | - word1 = listElem[0] | 77 | + #word1 = listElem[0] |
99 | lemma1 = listElem[1] | 78 | lemma1 = listElem[1] |
100 | postag1 = listElem[2] | 79 | postag1 = listElem[2] |
101 | features.update({ | 80 | features.update({ |
... | @@ -103,53 +82,8 @@ def word2features(sent, i): | ... | @@ -103,53 +82,8 @@ def word2features(sent, i): |
103 | '+1:lemma': lemma1, | 82 | '+1:lemma': lemma1, |
104 | '+1:postag': postag1, | 83 | '+1:postag': postag1, |
105 | }) | 84 | }) |
106 | - | ||
107 | - ''' | ||
108 | - if i > 1: | ||
109 | - listElem = sent[i - 2].split('|') | ||
110 | - word2 = listElem[0] | ||
111 | - lemma2 = listElem[1] | ||
112 | - postag2 = listElem[2] | ||
113 | - features.update({ | ||
114 | - '-2:word': word2, | ||
115 | - '-2:lemma': lemma2, | ||
116 | - }) | ||
117 | - | ||
118 | - if i < len(sent) - 2: | ||
119 | - listElem = sent[i + 2].split('|') | ||
120 | - word2 = listElem[0] | ||
121 | - lemma2 = listElem[1] | ||
122 | - postag2 = listElem[2] | ||
123 | - features.update({ | ||
124 | - '+2:word': word2, | ||
125 | - '+2:lemma': lemma2, | ||
126 | - }) | ||
127 | - | ||
128 | - trigrams = False | ||
129 | - if trigrams: | ||
130 | - if i > 2: | ||
131 | - listElem = sent[i - 3].split('|') | ||
132 | - word3 = listElem[0] | ||
133 | - lemma3 = listElem[1] | ||
134 | - postag3 = listElem[2] | ||
135 | - features.update({ | ||
136 | - '-3:word': word3, | ||
137 | - '-3:lemma': lemma3, | ||
138 | - }) | ||
139 | - | ||
140 | - if i < len(sent) - 3: | ||
141 | - listElem = sent[i + 3].split('|') | ||
142 | - word3 = listElem[0] | ||
143 | - lemma3 = listElem[1] | ||
144 | - postag3 = listElem[2] | ||
145 | - features.update({ | ||
146 | - '+3:word': word3, | ||
147 | - '+3:lemma': lemma3, | ||
148 | - }) | ||
149 | - ''' | ||
150 | return features | 85 | return features |
151 | 86 | ||
152 | - | ||
153 | def sent2features(sent): | 87 | def sent2features(sent): |
154 | return [word2features(sent, i) for i in range(len(sent))] | 88 | return [word2features(sent, i) for i in range(len(sent))] |
155 | 89 | ... | ... |
training_validation_v3.py
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment