Showing 33 changed files with 138 additions and 837 deletions
-#!/bin/python3
-import os
-from itertools import chain
-from optparse import OptionParser
-from time import time
-from collections import Counter
-import re
-
-import nltk
-import sklearn
-import scipy.stats
-import sys
-
-from sklearn.externals import joblib
-from sklearn.metrics import make_scorer
-from sklearn.cross_validation import cross_val_score
-from sklearn.grid_search import RandomizedSearchCV
-
-import sklearn_crfsuite
-from sklearn_crfsuite import scorers
-from sklearn_crfsuite import metrics
-
-from nltk.corpus import stopwords
-import random
-
-
-# Objective
-# Label tokens separated by '|' and split the sentences 70/30 into training and test files from CoreNLP tagging
-#
-# Input parameters
-# --inputPath=PATH          Path of input file
-# --outputPath=PATH         Path to place output files
-# --trainingFile=FILE       Output training data set
-# --testFile=FILE           Output test data set
-#
-# Output
-# training and test data set
-#
-# Examples
-# python label-split_training_test_v1.py
-# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
-# --inputFile sentences.tsv_pakal_.conll
-# --trainingFile training-data-set-70.txt
-# --testFile test-data-set-30.txt
-# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
-#
-#
-# python label-split_training_test_v1.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile sentences.tsv_pakal_.conll --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
-
-
-##########################################
-#               MAIN PROGRAM             #
-##########################################
-
-if __name__ == "__main__":
-    # Defining parameters
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path of output from CoreNLP", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path to place output files",
-                      metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile",
-                      help="File with CoreNLP-tagging sentences", metavar="FILE")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File with training data set", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File with test data set", metavar="FILE")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments are expected.")
-        sys.exit(1)
-
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path of CoreNLP output: " + options.inputPath)
-    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
-    print("Path of training data set: " + str(options.outputPath))
-    print("File with training data set: " + str(options.trainingFile))
-    print("Path of test data set: " + str(options.outputPath))
-    print("File with test data set: " + str(options.testFile))
-    print('-------------------------------- PROCESSING --------------------------------')
-    ## Begin of tagging
-    in_labels = {
-        '<Gtype>': 'Gtype',
-        '<Gversion>': 'Gversion',
-        '<Med>': 'Med',
-        '<Phase>': 'Phase',
-        '<Sample>': 'Sample',
-        '<Serie>': 'Serie',
-        '<Substrain>': 'Substrain',
-        '<Supp>': 'Supp',
-        '<Technique>': 'Technique',
-        '<Temp>': 'Temp',
-        '<OD>': 'OD',
-        '<Anti>': 'Anti',
-        '<Agit>': 'Agit',
-        '<Vess>': 'Vess'
-    }
-    ## End of tagging
-    out_labels = {
-        '</Air>': 'O',
-        '</Gtype>': 'O',
-        '</Gversion>': 'O',
-        '</Med>': 'O',
-        '</Phase>': 'O',
-        '</Sample>': 'O',
-        '</Serie>': 'O',
-        '</Strain>': 'O',
-        '<Strain>': 'O',
-        '</Substrain>': 'O',
-        '</Supp>': 'O',
-        '</Technique>': 'O',
-        '</Temp>': 'O',
-        '</OD>': 'O',
-        '</Anti>': 'O',
-        '</Agit>': 'O',
-        '<Name>': 'O',
-        '</Name>': 'O',
-        '<Orgn>': 'O',
-        '</Orgn>': 'O',
-        '</Vess>': 'O'}
-
-    # Other label
-    flag = 'O'
-    # sentences counter
-    n = 0
-    lista = []
-    # First sentence
-    sentence = ''
-    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
-        for line in input_file:
-            if len(line.split('\t')) > 1:
-                w = line.split('\t')[1]
-                if w in in_labels or w in out_labels:
-                    # Tagging
-                    if w in in_labels.keys(): flag = in_labels[w]
-                    if w in out_labels: flag = out_labels[w]
-                else:
-                    if w == "PGCGROWTHCONDITIONS":
-                        # End of sentence
-                        lista.append(sentence)
-                        # New sentence
-                        sentence = ''
-                        n = n + 1
-                    else:
-                        # Build and save the tagged sentence
-                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')
-
-    print("Number of sentences: " + str(n))
-
-    # Split 70 30 training and test sentences
-    trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
-    testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
-    print(len(trainingIndex))
-    print(len(testIndex))
-
-    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
-        Data = [lista[i] for i in trainingIndex]
-        oFile.write('\n'.join(Data))
-
-    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
-        Data = [lista[i] for i in testIndex]
-        oFile.write('\n'.join(Data))
-
-    print("==================================END===================================")
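The deleted script above emits one sentence per line, each token packed as word|lemma|postag|tag. A minimal sketch of that packing for a single, hypothetical CoNLL line; the column order (index, word, lemma, POS) is inferred from the script's line.split('\t')[1:4], and real CoreNLP output carries more columns:

    # Hypothetical tab-separated CoNLL line: index, word, lemma, POS, NER.
    line = "5\tglucose\tglucose\tNN\tO\n"
    flag = 'Supp'                                  # set earlier by an opening <Supp> tag
    word, lemma, postag = line.split('\t')[1:4]
    token = '|'.join([word, lemma, postag]) + '|' + flag
    print(token)                                   # glucose|glucose|NN|Supp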
-#!/bin/python3
-from optparse import OptionParser
-import re
-import os
-import random
-
-
-# Objective
-# Label tokens separated by '|' and split the sentences 70/30 into training and test files from CoreNLP tagging
-# Build the data sets using only sentences with at least one true tag
-#
-# Input parameters
-# --inputPath=PATH          Path of input file
-# --outputPath=PATH         Path to place output files
-# --trainingFile=FILE       Output training data set
-# --testFile=FILE           Output test data set
-#
-# Output
-# training and test data set
-#
-# Examples
-# python label-split_training_test_v2.py
-# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
-# --inputFile sentences.tsv_pakal_.conll
-# --trainingFile training-data-set-70.txt
-# --testFile test-data-set-30.txt
-# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
-#
-#
-# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
-
-
-##########################################
-#               MAIN PROGRAM             #
-##########################################
-
-if __name__ == "__main__":
-    # Defining parameters
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path of output from CoreNLP", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path to place output files",
-                      metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile",
-                      help="File with CoreNLP-tagging sentences", metavar="FILE")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File with training data set", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File with test data set", metavar="FILE")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments are expected.")
-        sys.exit(1)
-
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path of CoreNLP output: " + options.inputPath)
-    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
-    print("Path of training data set: " + options.outputPath)
-    print("File with training data set: " + str(options.trainingFile))
-    print("Path of test data set: " + options.outputPath)
-    print("File with test data set: " + str(options.testFile))
-    print('-------------------------------- PROCESSING --------------------------------')
-    ## Begin of tagging
-    in_labels = {
-        '<Gtype>': 'Gtype',
-        '<Gversion>': 'Gversion',
-        '<Med>': 'Med',
-        '<Phase>': 'Phase',
-        '<Supp>': 'Supp',
-        '<Technique>': 'Technique',
-        '<Temp>': 'Temp',
-        '<OD>': 'OD',
-        '<Anti>': 'Anti'
-    }
-    ## End of tagging
-    out_labels = {
-        '<Air>': 'O',
-        '</Air>': 'O',
-        '</Gtype>': 'O',
-        '</Gversion>': 'O',
-        '</Med>': 'O',
-        '</Phase>': 'O',
-        '<Sample>': 'O',
-        '</Sample>': 'O',
-        '<Serie>': 'O',
-        '</Serie>': 'O',
-        '<Strain>': 'O',
-        '</Strain>': 'O',
-        '<Substrain>': 'O',
-        '</Substrain>': 'O',
-        '</Supp>': 'O',
-        '</Technique>': 'O',
-        '</Temp>': 'O',
-        '</OD>': 'O',
-        '<Agit>': 'O',
-        '</Agit>': 'O',
-        '<Name>': 'O',
-        '</Name>': 'O',
-        '<Orgn>': 'O',
-        '</Orgn>': 'O',
-        '</Anti>': 'O',
-        '<Vess>': 'O',
-        '</Vess>': 'O'}
-
-    # Other label
-    flag = 'O'
-    # sentences counter
-    lista = []
-    # First sentence
-    sentence = ''
-    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
-        for line in input_file:
-            if len(line.split('\t')) > 1:
-                w = line.split('\t')[1]
-                if w in in_labels or w in out_labels:
-                    # Tagging
-                    if w in in_labels.keys(): flag = in_labels[w]
-                    if w in out_labels: flag = out_labels[w]
-                else:
-                    if w == "PGCGROWTHCONDITIONS":
-                        words = sentence.split(' ')
-                        # End of sentence
-                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
-                        # At least one true tag in the sentence
-                        if len(tags) > 0:
-                            lista.append(sentence)
-                        # New sentence
-                        sentence = ''
-                    else:
-                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')
-
-    print("Number of sentences: " + str(len(lista)))
-
-
-    # Split 70 30 training and test sentences
-    trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
-    testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
-
-    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
-        Data = [lista[i] for i in trainingIndex]
-        oFile.write('\n'.join(Data))
-
-    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
-        Data = [lista[i] for i in testIndex]
-        oFile.write('\n'.join(Data))
-
-    print("==================================END===================================")
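The filter that distinguishes this deleted v2 from v1 keeps a sentence only when at least one token carries a true tag. A small sketch of that test on one made-up sentence in the pipe format above; the three-class set here is an abbreviated stand-in for in_labels.values():

    # Keep a sentence only if some token's last '|' field is a true class.
    in_label_values = {'Med', 'Supp', 'OD'}          # abbreviated stand-in set
    sentence = 'grown|grow|VBN|O in|in|IN|O LB|LB|NN|Med'
    tags = [t for t in sentence.split() if t.split('|')[-1] in in_label_values]
    keep = len(tags) > 0                             # True: the 'Med' token qualifies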
-#!/bin/python3
 from optparse import OptionParser
 import re
 import os
@@ -7,7 +6,6 @@ import random

 # Objective
 # Label tokens separated by '|' and split the sentences 70/30 into training and test files from CoreNLP tagging
-# Build the data sets using only sentences with at least one true tag
 #
 # Input parameters
 # --inputPath=PATH          Path of input file
@@ -19,15 +17,15 @@ import random
 # training and test data set
 #
 # Examples
-# python label-split_training_test_v2.py
+# python label-split_training_test_v1.py
-# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
+# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/
-# --inputFile sentences.tsv_pakal_.conll
+# --inputFile raw-metadata-senteneces_v2.txt.conll
-# --trainingFile training-data-set-70.txt
+# --trainingFile training-data-set-70_v4.txt
-# --testFile test-data-set-30.txt
+# --testFile test-data-set-30_v4.txt
-# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
 #
 #
-# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets


 ##########################################
@@ -67,78 +65,79 @@ if __name__ == "__main__":
         '<Gtype>': 'Gtype',
         '<Gversion>': 'Gversion',
         '<Med>': 'Med',
         '<Phase>': 'Phase',
+        '<Substrain>': 'Substrain',
         '<Supp>': 'Supp',
+        '<Strain>': 'Strain',
         '<Technique>': 'Technique',
         '<Temp>': 'Temp',
         '<OD>': 'OD',
         '<Anti>': 'Anti',
         '<Agit>': 'Agit',
-        '<Vess>': 'Vess'
+        '<Air>': 'Air',
+        '<Vess>': 'Vess',
+        '<pH>': 'pH'
     }
     ## End of tagging
     out_labels = {
-        '<Air>': 'O',
-        '</Air>': 'O',
         '</Gtype>': 'O',
         '</Gversion>': 'O',
         '</Med>': 'O',
         '</Phase>': 'O',
-        '<Sample>': 'O',
-        '</Sample>': 'O',
-        '<Serie>': 'O',
-        '</Serie>': 'O',
-        '<Strain>': 'O',
-        '</Strain>': 'O',
-        '<Substrain>': 'O',
         '</Substrain>': 'O',
         '</Supp>': 'O',
+        '</Strain>': 'O',
         '</Technique>': 'O',
         '</Temp>': 'O',
         '</OD>': 'O',
         '</Anti>': 'O',
         '</Agit>': 'O',
-        '<Name>': 'O',
-        '</Name>': 'O',
+        '</Air>': 'O',
+        '</Vess>': 'O',
+        '</pH>': 'O'}
+    old_labels = {
         '<Orgn>': 'O',
-        '</Orgn>': 'O',
-        '</Vess>': 'O'}
+        '</Orgn>': 'O'
+    }

     # Other label
     flag = 'O'
-    # sentences counter
-    n=0
     lista = []
     # First sentence
     sentence = ''
+    n = 0
     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
         for line in input_file:
             if len(line.split('\t')) > 1:
                 w = line.split('\t')[1]
                 if w in in_labels or w in out_labels:
                     # Tagging
                     if w in in_labels.keys(): flag = in_labels[w]
                     if w in out_labels: flag = out_labels[w]
                 else:
                     if w == "PGCGROWTHCONDITIONS":
-                        words = sentence.split(' ')
-                        tags = [tag for tag in words if word.split('|')[-1] in in_labels.values()]
-                        # At least one true tag in the sentence
-                        if len(tags) > 0:
-                            lista.append(sentence)
-                        # New sentence
-                        sentence = ''
-                        n=n+1
-                    else:
-                        # Build and save the tagged sentence
+                        n = n + 1
+                        words = sentence.split(' ')
+                        # End of sentence
+                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
+                        # At least one true tag in the sentence
+                        if len(tags) > 0:
+                            lista.append(sentence)
+                        # New sentence
+                        sentence = ''
+                    elif w not in old_labels.keys():
+                        # Build and save the tagged sentence
                         sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')

-    print("Number of sentences: " + str(n) + str(len(lista)+1))
+    print("Number of sentences with at least one tag: " + str(len(lista)))
+    print("Number of sentences from CoreNLP: " + str(n))


     # Split 70 30 training and test sentences
-    trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
+    trainingIndex = random.sample(range(len(lista)), int(len(lista)*.70))
-    testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
+    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
+    print("Number of sentences for training: " + str(len(trainingIndex)))
+    print("Number of sentences for test: " + str(len(testIndex)))

     with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
         Data = [lista[i] for i in trainingIndex]
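The switch from range(len(lista)-1) to range(len(lista)) in this revision is easy to miss but meaningful: with N sentences, range(N-1) can never yield index N-1, so the last sentence was silently excluded from both output sets. A self-contained sketch of the corrected split, with stand-in sentences:

    import random

    lista = ['sentence %d' % i for i in range(10)]             # stand-in sentences
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [i for i in range(len(lista)) if i not in trainingIndex]
    assert len(trainingIndex) + len(testIndex) == len(lista)   # nothing dropped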
CRF/bin/params.py
deleted
100644 → 0
-#!/bin/python3
-import os
-from itertools import chain
-from optparse import OptionParser
-from time import time
-from collections import Counter
-import re
-
-import nltk
-import sklearn
-import scipy.stats
-import sys
-
-from sklearn.externals import joblib
-from sklearn.metrics import make_scorer
-from sklearn.cross_validation import cross_val_score
-from sklearn.grid_search import RandomizedSearchCV
-
-import sklearn_crfsuite
-from sklearn_crfsuite import scorers
-from sklearn_crfsuite import metrics
-
-from nltk.corpus import stopwords
-
-import random
-
-
-# Objective
-# Label tokens separated by '|' and split the sentences 70/30 into training and test files from CoreNLP tagging
-#
-# Input parameters
-# --inputPath=PATH          Path of input file
-# --outputPath=PATH         Path to place output files
-# --trainingFile=FILE       Output training data set
-# --testFile=FILE           Output test data set
-#
-# Output
-# training and test data set
-#
-# Examples
-# python label-split_training_test_v1.py
-# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
-# --inputFile sentences.tsv_pakal_.conll
-# --trainingFile training-data-set-70.txt
-# --testFile test-data-set-30.txt
-# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
-#
-#
-# python label-split_training_test_v1.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile sentences.tsv_pakal_.conll --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
-
-
-##########################################
-#               MAIN PROGRAM             #
-##########################################
-
-if __name__ == "__main__":
-    # Defining parameters
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path of output from CoreNLP", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path to place output files",
-                      metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile",
-                      help="File with CoreNLP-tagging sentences", metavar="FILE")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File with training data set", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File with test data set", metavar="FILE")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments are expected.")
-        sys.exit(1)
-
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path of CoreNLP output: " + str(options.inputPath))
-    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
-    print("Path of training data set: " + str(options.outputPath))
-    print("File with training data set: " + str(options.trainingFile))
-    print("Path of test data set: " + str(options.outputPath))
-    print("File with test data set: " + str(options.testFile))
-    print('-------------------------------- PROCESSING --------------------------------')
-    ## Begin of tagging
-    in_labels = {
-        '<Gtype>': 'Gtype',
-        '<Gversion>': 'Gversion',
-        '<Med>': 'Med',
-        '<Phase>': 'Phase',
-        '<Sample>': 'Sample',
-        '<Serie>': 'Serie',
-        '<Substrain>': 'Substrain',
-        '<Supp>': 'Supp',
-        '<Technique>': 'Technique',
-        '<Temp>': 'Temp',
-        '<OD>': 'OD',
-        '<Anti>': 'Anti',
-        '<Agit>': 'Agit',
-        '<Vess>': 'Vess'
-    }
-    ## End of tagging
-    out_labels = {
-        '</Air>': 'O',
-        '</Gtype>': 'O',
-        '</Gversion>': 'O',
-        '</Med>': 'O',
-        '</Phase>': 'O',
-        '</Sample>': 'O',
-        '</Serie>': 'O',
-        '</Strain>': 'O',
-        '<Strain>': 'O',
-        '</Substrain>': 'O',
-        '</Supp>': 'O',
-        '</Technique>': 'O',
-        '</Temp>': 'O',
-        '</OD>': 'O',
-        '</Anti>': 'O',
-        '</Agit>': 'O',
-        '<Name>': 'O',
-        '</Name>': 'O',
-        '<Orgn>': 'O',
-        '</Orgn>': 'O',
-        '</Vess>': 'O'}
-
-    # Other label
-    flag = 'O'
-    # sentences counter
-    n = 0
-    lista = []
-    # First sentence
-    sentence = ''
-    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
-        for line in input_file:
-            if len(line.split('\t')) > 1:
-                w = line.split('\t')[1]
-                if w in in_labels or w in out_labels:
-                    # Tagging
-                    if w in in_labels.keys(): flag = in_labels[w]
-                    if w in out_labels: flag = out_labels[w]
-                else:
-                    if w == "PGCGROWTHCONDITIONS":
-                        # End of sentence
-                        lista.append(sentence)
-                        # New sentence
-                        sentence = ''
-                        n = n + 1
-                    else:
-                        # Build and save the tagged sentence
-                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')
-
-    print("Number of sentences: " + str(n))
-    print('\n'.join(lista))
-    # Split 70 30 training and test sentences
-#    trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
-#    testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
-
-#    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
-#        Data = [lista[i] for i in trainingIndex]
-#        oFile.write('\n'.join(Data))
-
-#    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
-#        Data = [lista[i] for i in testIndex]
-#        oFile.write('\n'.join(Data))
-
-#    print("==================================END===================================")
-
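All three splitter variants above share the same tagging state machine: an opening tag such as <Supp> sets flag, every following token inherits it, and the matching closing tag resets it to 'O'. A compressed sketch with a toy tag inventory (words only, without the lemma and POS columns):

    # Toy run of the flag state machine over a token stream.
    in_labels = {'<Supp>': 'Supp'}
    out_labels = {'</Supp>': 'O'}
    flag = 'O'
    tagged = []
    for w in ['grown', '<Supp>', 'glucose', '</Supp>', 'medium']:
        if w in in_labels or w in out_labels:
            if w in in_labels: flag = in_labels[w]
            if w in out_labels: flag = out_labels[w]
        else:
            tagged.append(w + '|' + flag)
    print(tagged)   # ['grown|O', 'glucose|Supp', 'medium|O']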
CRF/bin/training_validation_v3.py
deleted
100644 → 0
@@ -32,7 +32,7 @@ from nltk.corpus import stopwords
 # --trainingFile            File with training data set
 # --testFile                File with test data set
 # --outputPath=PATH         Output path to place output files
-# --reportFile              Report filename
+# --version                 Report version

 # Output
 # 1) Best model
@@ -43,31 +43,54 @@ from nltk.corpus import stopwords
 # --trainingFile training-data-set-70.txt
 # --testFile test-data-set-30.txt
 # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
-# --reportFile report_1
+# --version _v2
-# python3.4 training-validation_v5.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
+# python3 training_validation_v7.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --version _v1

 #################################
 #           FUNCTIONS           #
 #################################

 def isGreek(word):
+    # all Greek letters
     alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
                 'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
     if word in alphabet:
         return True
     else:
         return False
+
+def hNumber(word):
+    for l in word:
+        if l.isdigit():
+            return True
+    return False
+
+def symb(word):
+    n = 0
+    # at least one non-alphanumeric character
+    for l in word:
+        if l.isdigit(): n = n + 1
+        if l.isalpha(): n = n + 1
+        # Greek letters do not count as symbols
+        if isGreek(l): n = n + 1
+
+    if n < len(word): return True
+    else: return False
+
 def hUpper(word):
+    # at least one uppercase letter
     for l in word:
         if l.isupper(): return True
     return False

 def hLower(word):
+    # at least one lowercase letter
     for l in word:
         if l.islower(): return True
     return False

 def hGreek(word):
+    # at least one Greek letter
     for l in word:
         if isGreek(l): return True
     return False
@@ -80,54 +103,69 @@ def word2features(sent, i, S1, S2):
     postag = listElem[2]
     ner = listElem[3]

+    #====================== G1 ======================#
+
     features = {
         # General
         'lemma': lemma,
         'postag': postag
     }

-    if S1:
-        # S1
-        features['word']: word
-        features['hUpper']: hUpper(word)
-        features['hLower']: hUpper(word)
-        features['hGreek']: hGreek(word)
-        # features['hAlfNum']: hAlfNum(word)
-
-    if S2:
-        # S2
-        features['isUpper']: word.isupper()
-        features['isLower']: word.isLower()
-        features['isGreek']: isGreek(word)
-        features['isNumber']: word.isdigit()
-
     if i > 0:
         listElem = sent[i - 1].split('|')
-        word1 = listElem[0]
         lemma1 = listElem[1]
         postag1 = listElem[2]
-        features.update({
-            # Previous word
-            '-1:word': word1,
+
+        features.update({
             # Previous lemma
             '-1:lemma': lemma1,
             # Previous POS tag
             '-1:postag': postag1,
         })

     if i < len(sent) - 1:
         listElem = sent[i + 1].split('|')
-        word1 = listElem[0]
         lemma1 = listElem[1]
         postag1 = listElem[2]
-        features.update({
-            # Next word
-            '+1:word': word1,
+
+        features.update({
             # Next lemma
             '+1:lemma': lemma1,
             # Next POS tag
             '+1:postag': postag1,
         })
+
+    #====================== S1 ======================#
+    if S1:
+        # fields of the previous token (wraps to the last token when i == 0)
+        listElem = sent[i - 1].split('|')
+        lemma1 = listElem[1]
+        postag1 = listElem[2]
+
+        features['hUpper'] = hUpper(word)
+        features['hLower'] = hLower(word)
+        features['hGreek'] = hGreek(word)
+        features['symb'] = symb(word)
+        # first character of the previous lemma
+        features['lemma1[:1]'] = lemma1[:1]
+        # leading characters of the previous POS tag and lemma
+        features['postag[:1]'] = postag1[:1]
+        features['postag[:2]'] = postag1[:2]
+        features['lemma[:2]'] = lemma1[:2]
+
+    #====================== S2 ======================#
+    if S2:
+        features['isUpper'] = word.isupper()
+        features['isLower'] = word.islower()
+        features['isGreek'] = isGreek(word)
+        features['isNumber'] = word.isdigit()
+
+    '''
+    #====================== S3 ======================#
+    if S3:
+        features['word'] = word
+    '''
     return features

@@ -153,7 +191,7 @@ def print_state_features(state_features, f):
         f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))


-__author__ = 'CMendezC'
+__author__ = 'egaytan'

 ##########################################
 #             MAIN PROGRAM               #
@@ -177,7 +215,7 @@ if __name__ == "__main__":
     parser.add_option("--excludeSymbols", default=False,
                       action="store_true", dest="excludeSymbols",
                       help="Exclude punctuation marks")
-    parser.add_option("--reportFile", dest="reportFile",
+    parser.add_option("--version", dest="version",
                       help="Report file", metavar="FILE")
     parser.add_option("--S1", default=False,
                       action="store_true", dest="S1",
@@ -198,7 +236,7 @@ if __name__ == "__main__":
     print("File with test data set: " + str(options.testFile))
     print("Exclude stop words: " + str(options.excludeStopWords))
     print("Levels: " + str(options.S1) + " " + str(options.S2))
-    print("Report file: " + str(options.reportFile))
+    print("Report file: " + str(options.version))


     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
@@ -254,19 +292,14 @@ if __name__ == "__main__":

     print("Reading corpus done in: %fs" % (time() - t0))

-    if options.S1: S1 = 0
-    else: S1 = 1
-    if options.S2: S2 = 0
-    else: S2 = 1
-
-    print(sent2features(sentencesTrainingData[0], S1, S2)[0])
-    print(sent2features(sentencesTestData[0], S1, S2)[0])
+    print(sent2features(sentencesTrainingData[0], options.S1, options.S2)[0])
+    print(sent2features(sentencesTestData[0], options.S1, options.S2)[0])
     t0 = time()

-    X_train = [sent2features(s, S1, S2) for s in sentencesTrainingData]
+    X_train = [sent2features(s, options.S1, options.S2) for s in sentencesTrainingData]
     y_train = [sent2labels(s) for s in sentencesTrainingData]

-    X_test = [sent2features(s, S1, S2) for s in sentencesTestData]
+    X_test = [sent2features(s, options.S1, options.S2) for s in sentencesTestData]
     # print X_test
     y_test = [sent2labels(s) for s in sentencesTestData]

@@ -292,7 +325,7 @@ if __name__ == "__main__":

     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
+    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])

     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
@@ -312,8 +345,10 @@ if __name__ == "__main__":
     # crf.fit(X_train, y_train)

     # Best hyperparameters
     # crf = rs.best_estimator_
-    nameReport = options.trainingFile.replace('.txt', str(options.reportFile) + '.txt')
+
+
+    nameReport = str(options.S1) + '_S2_' + str(options.S2) + str(options.version) + '.txt'
     with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
         oFile.write("********** TRAINING AND TESTING REPORT **********\n")
         oFile.write("Training file: " + options.trainingFile + '\n')
@@ -331,27 +366,13 @@ if __name__ == "__main__":
     # Saving model
     print("  Saving training model...")
     t1 = time()
-    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
-        options.excludeSymbols) + '.mod')
+    nameModel = 'model_S1_' + str(options.S1) + '_S2_' + str(options.S2) + str(options.version) + '.mod'
     joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
     print("  Saving training model done in: %fs" % (time() - t1))

     # Evaluation against test data
     y_pred = crf.predict(X_test)
     print("*********************************")
-    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
-        options.excludeSymbols) + '.txt')
-    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
-        for y in y_pred:
-            oFile.write(str(y) + '\n')
-
-    print("*********************************")
-    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
-        options.excludeSymbols) + '.txt')
-    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
-        for y in y_test:
-            oFile.write(str(y) + '\n')
-
     print("Prediction done in: %fs" % (time() - t0))

     # labels = list(crf.classes_)
@@ -387,4 +408,3 @@ if __name__ == "__main__":
         print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
         oFile.write('\n')

-
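The feature dictionaries built by word2features are exactly what sklearn_crfsuite consumes: X is a list of sentences, each sentence a list of per-token dicts, and y the parallel list of tag sequences. Below is a minimal, self-contained sketch of that pipeline on toy data (toy features and tags, not the project's real ones; it assumes sklearn_crfsuite is installed). Note the assignments use '=': a statement like features['hUpper']: hUpper(word) is parsed as an annotated expression and stores nothing, which is why the S1/S2 blocks above are written with '='.

    import sklearn_crfsuite
    from sklearn_crfsuite import metrics

    def toy_features(sent, i):
        # One dict per token; '=' stores the value (':' would be a no-op annotation).
        word = sent[i][0]
        feats = {'lemma': word.lower(),
                 'hUpper': any(c.isupper() for c in word)}
        if i > 0:
            feats['-1:lemma'] = sent[i - 1][0].lower()
        return feats

    # Two toy sentences as (word, tag) pairs.
    train = [[('grown', 'O'), ('in', 'O'), ('MOPS', 'Med')],
             [('OD450', 'OD'), ('of', 'O'), ('0.3', 'OD')]]
    X = [[toy_features(s, i) for i in range(len(s))] for s in train]
    y = [[tag for _, tag in s] for s in train]

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=20)
    crf.fit(X, y)
    print(metrics.flat_f1_score(y, crf.predict(X), average='weighted', labels=['Med', 'OD']))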
@@ -3,8 +3,6 @@ Gtype
 Gversion
 Med
 Phase
-Sample
-Serie
 Strain
 Supp
 Technique
@@ -13,4 +11,5 @@ OD
 Anti
 Agit
 Vess
-
+Substrain
+pH
CRF/data-sets/test-data-set-30.txt
deleted
100644 → 0
CRF/data-sets/test-data-set-30_v4.txt
0 → 100644
CRF/data-sets/training-data-set-70_v4.txt
0 → 100644
CRF/models/model_S1_False_S2_False_v1.mod
0 → 100644
CRF/models/model_S1_False_S2_True_v1.mod
0 → 100644
CRF/models/model_S1_True_S2_False_v1.mod
0 → 100644
CRF/models/model_S1_True_S2_True_v1.mod
0 → 100644
CRF/reports/report_False_S2_False_v1.txt
0 → 100644
CRF/reports/report_False_S2_True_v1.txt
0 → 100644
CRF/reports/report_True_S2_False_v1.txt
0 → 100644
CRF/reports/report_True_S2_True_v1.txt
0 → 100644
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Gtype']
-['O', 'O', 'O', 'O']
-['Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Anti', 'Anti', 'Anti']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Supp']
-['O', 'O', 'O', 'Gtype', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gtype', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'Gtype']
-['O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'Anti', 'Anti', 'Anti', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Anti', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'Anti', 'Anti']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD']
-['O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Anti']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'Substrain']
-['O', 'O', 'Gtype']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gtype', 'Gtype', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Med']
-['O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['Gversion', 'Gversion']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Anti', 'Anti', 'Anti', 'Anti']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['Med', 'Med', 'Med', 'Med', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Technique']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'Substrain']
-['O', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'OD', 'OD', 'OD', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype', 'Gtype', 'Gtype']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Anti', 'Anti', 'Anti']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'Supp']
-['O', 'O', 'O', 'Anti', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
-['Substrain', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'Gtype', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O', 'Anti']
-['O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'O']
-['O', 'O', 'Gtype']
-['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O']
63 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
64 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
65 | -['O', 'O', 'O'] | ||
66 | -['O', 'O', 'Gtype'] | ||
67 | -['O', 'O', 'O', 'Anti', 'Anti', 'Anti', 'O'] | ||
68 | -['O', 'O', 'O', 'O'] | ||
69 | -['O', 'O', 'O', 'O'] | ||
70 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
71 | -['O', 'O', 'O', 'Anti', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
72 | -['O', 'O', 'O'] | ||
73 | -['O', 'O', 'O'] | ||
74 | -['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype'] | ||
75 | -['Substrain', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype'] | ||
76 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
77 | -['O', 'O', 'Gtype'] | ||
78 | -['O', 'O', 'O', 'Anti', 'Anti', 'Anti', 'Anti'] | ||
79 | -['OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
80 | -['OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD'] | ||
81 | -['O', 'O', 'O', 'O', 'O', 'O'] | ||
82 | -['O', 'O', 'O', 'O', 'O'] | ||
83 | -['O', 'O', 'O', 'Anti'] | ||
84 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
85 | -['O', 'O', 'Gtype'] | ||
86 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
87 | -['O', 'O', 'Gtype'] | ||
88 | -['O', 'O', 'Supp'] | ||
89 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
90 | -['O', 'O', 'O'] | ||
91 | -['O', 'O', 'O'] | ||
92 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gtype', 'Gtype', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
93 | -['O', 'O', 'O', 'O', 'O', 'O'] | ||
94 | -['O', 'O', 'Med'] | ||
95 | -['O', 'O', 'O', 'O', 'O', 'O'] | ||
96 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
97 | -['O', 'O', 'Supp'] | ||
98 | -['O', 'O'] | ||
99 | -['O', 'O', 'O', 'O', 'O'] | ||
100 | -['O', 'O', 'O'] | ||
101 | -['O', 'O', 'O'] | ||
102 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
103 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
104 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
105 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
106 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
107 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
108 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
109 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
110 | -['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'O'] | ||
111 | -['O', 'O', 'Gtype'] | ||
112 | -['O', 'O', 'O', 'O'] | ||
113 | -['O', 'O', 'O', 'O', 'O'] | ||
114 | -['O', 'O', 'Anti', 'Anti', 'Anti', 'Anti'] | ||
115 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
116 | -['O', 'O', 'O', 'O'] | ||
117 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
118 | -['O', 'O', 'O'] | ||
119 | -['O', 'O', 'O', 'O'] | ||
120 | -['O', 'O', 'O', 'O', 'O', 'O', 'O'] | ||
121 | -['Med', 'Med', 'Med', 'Med', 'O', 'O', 'O', 'O', 'O', 'O', 'Temp', 'Temp', 'Temp', 'O', 'O', 'Agit', 'Agit', 'Agit', 'Agit', 'Agit', 'Agit', 'Agit'] |
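For context, the deleted blocks above (the line numbering restarts where a second deleted file begins) are per-sentence label sequences: one Python-style list per sentence, one tag per token, with 'O' marking tokens outside any mention and the remaining tags naming growth-condition classes (Gtype, Med, Supp, Temp, OD, Anti, Phase, Agit, Gversion, Substrain, Technique). This is the shape sklearn_crfsuite, which the repository's CRF scripts import, returns from predict(). A minimal sketch, assuming illustrative feature dicts and variable names that are not taken from the repository:

import sklearn_crfsuite

# Each sentence is a list of per-token feature dicts; each label sequence
# is a list of tags ('O', 'Gtype', 'Med', ...) of the same length.
X_train = [[{'word.lower()': 'lb'}, {'word.lower()': 'medium'}]]
y_train = [['Med', 'Med']]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=100, all_possible_transitions=True)
crf.fit(X_train, y_train)

X_test = [[{'word.lower()': 'm9'}, {'word.lower()': 'broth'}]]
# predict() returns a list of label lists, one per sentence: the same
# shape as the dumped sequences above.
print(crf.predict(X_test))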
1 | -cd /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data | 1 | + |
2 | + | ||
3 | +# Original files | ||
4 | +#cd /home/egaytan/automatic-extraction-growth-conditions/data-sets/report-manually-tagged-gcs/ | ||
5 | + | ||
6 | +# Re-tagged | ||
7 | +cd /home/egaytan/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | ||
2 | echo | 8 | echo |
3 | echo | 9 | echo |
4 | echo | 10 | echo |
... | @@ -18,9 +24,9 @@ echo | ... | @@ -18,9 +24,9 @@ echo |
18 | echo | 24 | echo |
19 | echo "Filter all paragraphs with tags..." | 25 | echo "Filter all paragraphs with tags..." |
20 | echo "Add sentence-end-tag PGCGROWTHCONDITIONS..." | 26 | echo "Add sentence-end-tag PGCGROWTHCONDITIONS..." |
21 | -grep -E "<[^<]*>" * | grep -E '!'| cut -f2 -d'='|sort|uniq|awk '{ print $_" PGCGROWTHCONDITIONS"; }' > /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt | 27 | +grep -E "<[^<]*>" * | grep -E '!'| cut -f2 -d'='|sort|uniq|awk '{ print $_" PGCGROWTHCONDITIONS"; }' > /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt |
22 | echo | 28 | echo |
23 | -echo "Number of total tag sentences: "$(wc /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt -l); | 29 | +echo "Number of total tag sentences: "$(wc /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt -l); |
24 | echo | 30 | echo |
25 | echo | 31 | echo |
26 | -echo "Saving file: /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt"; | 32 | +echo "Saving file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt"; | ... | ... |
... | @@ -4,8 +4,8 @@ echo "==============================Run CoreNLP================================= | ... | @@ -4,8 +4,8 @@ echo "==============================Run CoreNLP================================= |
4 | echo | 4 | echo |
5 | echo | 5 | echo |
6 | 6 | ||
7 | -input="/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt"; | 7 | +input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt"; |
8 | -output="/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/"; | 8 | +output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/"; |
9 | echo "input file: "$input; | 9 | echo "input file: "$input; |
10 | echo | 10 | echo |
11 | echo "output directory: "$output; | 11 | echo "output directory: "$output; | ... | ... |
CoreNLP/input/raw-metadata-senteneces_v2.txt
0 → 100644
This diff could not be displayed because it is too large.