Showing 6 changed files with 355 additions and 81 deletions
check_mco_terms_in_sentences_v1.py
0 → 100644

+import stanza
+import argparse
+import re
+import os
+import pandas as pd
+
+# Objective
+# Check whether MCO terms appear in the raw sentences extracted from soft files
+#
+# Input parameters
+# --inputPath=PATH     Path to geo_sentences_to_check_fixed.csv
+#                      /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# --inputFile=PATH     File geo_sentences_to_check_fixed.csv
+# --inputPathMco       Path to MCO term file
+#                      /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
+# --inputFileMco       File with MCO terms, GC_Terms.txt (tsv)
+# --outputPath=PATH    Path to place the MCO terms that appeared in the input file
+#
+# Output
+# Files with the MCO terms that appeared in the input file
+#
+# _v1
+# python check_mco_terms_in_sentences_v1.py
+# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# --inputFile geo_sentences_to_check_fixed.csv
+# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
+# --inputFileMco GC_Terms.txt
+# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt
+
+'''
+REVIEW RESULT:
+Only the following terms were found in the sentences extracted from the soft-file curation:
+TERM_NAME
+L broth (also annotated by the curator)
+MOPS (annotated by the curator as <Med> MOPS minimal glucose media </Med>)
+glucose (also annotated by the curator, but not as an isolated word)
+nitrate (also annotated by the curator, isolated as <Supp> nitrate </Supp> and also as part of several supplements annotated by the curator)
+M9 minimal medium (also annotated by the curator)
+OD600 of 0.3 (also annotated by the curator)
+Escherichia coli (we are not considering organisms)
+LB medium (not annotated by the curator)
+'''
+
+##########################################
+#               MAIN PROGRAM             #
+##########################################
+
+if __name__ == "__main__":
+    # Defining parameters
+    parser = argparse.ArgumentParser(
+        prog='check_mco_terms_in_sentences_v1.py',
+        description='Check whether MCO terms appear in raw sentences extracted from soft files.',
+        epilog='')
+    parser.add_argument("--inputPath", dest="inputPath",
+                        help="Path to extracted sentences from soft files", metavar="PATH")
+    parser.add_argument("--inputFile", dest="inputFile",
+                        help="Input extracted sentences from soft files", metavar="FILE")
+    parser.add_argument("--outputPath", dest="outputPath",
+                        help="Path to place MCO terms that appeared in input file", metavar="PATH")
+    parser.add_argument("--inputPathMco", dest="inputPathMco",
+                        help="Path to MCO file", metavar="PATH")
+    parser.add_argument("--inputFileMco", dest="inputFileMco",
+                        help="MCO file", metavar="FILE")
+    args = parser.parse_args()
+
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to extracted sentences from soft files: " + args.inputPath)
+    print("Input extracted sentences from soft files: " + args.inputFile)
+    print("Path to place MCO terms that appeared in input file: " + args.outputPath)
+    print("Path to MCO file: " + args.inputPathMco)
+    print("MCO file: " + args.inputFileMco)
+    print('-------------------------------- PROCESSING --------------------------------')
+
+    # Curated sentences, one row per sentence
+    df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
+    print(df_sentences_to_check.head(3))
+    print(df_sentences_to_check.shape)
+
+    # MCO terms (tab-separated, with TERM_TYPE and TERM_NAME columns)
+    df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
+    print(df_mco_terms.head(3))
+    print(df_mco_terms.shape)
+
+    df_mco_terms_found = pd.DataFrame(columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
+
+    # Rebuild the plain-text sentences from the word|lemma|xpos|tag tokens,
+    # keeping each unique sentence once
+    text_sentences = []
+    for ind in df_sentences_to_check.index:
+        line_trans = df_sentences_to_check['transformed_sentence'][ind]
+        list_line = line_trans.split()
+        list_sentence = [token.split("|")[0] for token in list_line]
+        text_sentence = " ".join(list_sentence)
+        # print(text_sentence)
+        if text_sentence not in text_sentences:
+            text_sentences.append(text_sentence)
+
+    # Tokenize each MCO term with the same tokenizer used for the sentences,
+    # so that term and sentence share the same token spacing
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
+
+    for ind in df_mco_terms.index:
+        term_type = df_mco_terms['TERM_TYPE'][ind]
+        term_name = df_mco_terms['TERM_NAME'][ind]
+        doc = nlp(term_name)
+        word_list = [w.text for w in doc.sentences[0].words]
+        term_name_new = " ".join(word_list)
+        # print(term_name_new)
+        sentences_found = [sent for sent in text_sentences if term_name_new in sent]
+        for s in sentences_found:
+            print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
+            new_row = {'TERM_TYPE': term_type,
+                       'TERM_NAME': term_name,
+                       'SENTENCE': s}
+            df_mco_terms_found = df_mco_terms_found.append(new_row, ignore_index=True)
+    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'), sep="\t")
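A side note on the `DataFrame.append` calls in this script (and in the extraction scripts below): `append` was deprecated in pandas 1.4 and removed in 2.0, so the code only runs on older pandas. A minimal sketch of an equivalent accumulation pattern on current pandas, with invented sample data — collect plain dicts and build the frame once:

import pandas as pd

# Accumulate matches in a list instead of appending to the DataFrame row by row
rows = []
for term_type, term_name, sent in [('Med', 'LB medium', 'cells were grown in LB medium')]:
    rows.append({'TERM_TYPE': term_type, 'TERM_NAME': term_name, 'SENTENCE': sent})
df_mco_terms_found = pd.DataFrame(rows, columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
df_mco_terms_found.to_csv('MCO_terms_found_in_softfiles.tsv', sep='\t')

Building the list first is also much faster than per-row appends, which copy the whole frame on every call.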
extract-sentences-from-softfiles_v2.py

@@ -15,11 +15,11 @@ import pandas as pd
 # Files with sentences obtained from XML Soft files
 #
 # Examples
-# python extract-sentences-from-softfiles.py
+# python extract-sentences-from-softfiles_v2.py
 # --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
 # --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
 #
-# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences

 ##########################################
 #               MAIN PROGRAM             #
@@ -70,7 +70,7 @@ if __name__ == "__main__":
     tags = ['Gtype', 'Med', 'Phase', 'Supp',
             'Temp', 'OD', 'Anti', 'Agit',
             'Air', 'Vess', 'pH']
-    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
+    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
     all_tags = tags + deleted_tags
     # Regex to check if line has a tag
     regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
@@ -89,9 +89,11 @@ if __name__ == "__main__":
     testing_file = "GSE54899_family_retagged-05242019_validated.xml"

     # Define stanza pipeline for sentence segmentation
-    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
+    # nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
     # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
-    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

     # Store field_name (bangline) and field_text
     field_name = ""
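For context on the pipeline switch above: dropping `tokenize_no_ssplit=True` lets stanza split each field into sentences, so the loop further down processes one `Sentence` object per detected sentence. A minimal sketch of what the new pipeline returns (standard stanza API; the sample text is invented, and the English models are assumed to be downloaded via `stanza.download('en')`):

import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
doc = nlp("Cells were grown in LB medium. Samples were taken at an OD600 of 0.3.")
for sentence in doc.sentences:
    # Each word carries the text, lemma and xpos used for the word|lemma|xpos|tag tokens
    print(sentence.text)
    print(" ".join("{}|{}|{}".format(w.text, w.lemma, w.xpos) for w in sentence.words))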
@@ -117,7 +119,7 @@ if __name__ == "__main__":
     for path, dirs, files in os.walk(args.inputPath):
         # For each file in dir
         for file in files:
-            if file == testing_file:
+            # if file == testing_file:
                 print(" Reading file..." + str(file))
                 with open(os.path.join(args.inputPath, file)) as iFile:
                     for line in iFile:
@@ -140,7 +142,7 @@ if __name__ == "__main__":
                                 hash_field_name[field_name] += 1
                             else:
                                 hash_field_name[field_name] = 1
-                            original_sentence = field_text
+                            # original_sentence = field_text
                            # delete GC tags
                             modified_sentence = regex_delete_tag.sub("", field_text)
                             modified_sentence = regex_delete_tag.sub("", modified_sentence)
@@ -173,6 +175,8 @@ if __name__ == "__main__":
                                                 gc_tag = "O"
                                     list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                                 transformed_sentence = " ".join(list_transformed_sentence)
+                                original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
+                                original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                                 new_row = {'serie': serie,
                                            'serie_pubmed_id': serie_pubmed_id,
                                            'sample': sample,
@@ -182,75 +186,3 @@ if __name__ == "__main__":
                                            'transformed_sentence': transformed_sentence}
                                 df_sentences_to_check = df_sentences_to_check.append(new_row, ignore_index=True)
     df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
-    #print(token)
-    quit()
-
-    ## End of tagging
-    out_labels = {
-        '</Gtype>': 'O',
-        '</Gversion>': 'O',
-        '</Med>': 'O',
-        '</Phase>': 'O',
-        '</Substrain>': 'O',
-        '</Supp>': 'O',
-        '</Strain>': 'O',
-        '</Technique>': 'O',
-        '</Temp>': 'O',
-        '</OD>': 'O',
-        '</Anti>': 'O',
-        '</Agit>': 'O',
-        '</Air>': 'O',
-        '</Vess>': 'O',
-        '</pH>': 'O'}
-    old_labels = {
-        '<Orgn>': 'O',
-        '</Orgn>': 'O'
-    }
-
-    # Other label
-    flag = 'O'
-    lista = []
-    # First sentence
-    sentence = ''
-    n = 0
-    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
-        for line in input_file:
-            if len(line.split('\t')) > 1:
-                w = line.split('\t')[1]
-                if w in in_labels or w in out_labels:
-                    # Tagging
-                    if w in in_labels.keys(): flag = in_labels[w]
-                    if w in out_labels: flag = out_labels[w]
-                else:
-                    if w == "PGCGROWTHCONDITIONS":
-                        n = n + 1
-                        words = sentence.split(' ')
-                        # End of sentence
-                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
-                        # At least one true-tag on sentence
-                        if len(tags) > 0:
-                            lista.append(sentence)
-                        # New setence
-                        sentence = ''
-                    elif w not in old_labels.keys():
-                        # Building and save tagging sentence
-                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
-
-    print("Number of sentences with at least one tag: " + str(len(lista)))
-    print("Number of sentences from CoreNLP: " + str(n))
-
-    # Split 70 30 training and test sentences
-    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
-    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
-    print("Number of sentences for training: " + str(len(trainingIndex)))
-    print("Number of sentences for test: " + str(len(testIndex)))
-
-    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
-        Data = [lista[i] for i in trainingIndex]
-        oFile.write('\n'.join(Data))
-
-    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
-        Data = [lista[i] for i in testIndex]
-        oFile.write('\n'.join(Data))
-
-    print("==================================END===================================")
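The two `original_sentence` lines added above close a round trip: GC tags such as `<Med>` are first rewritten to `INI_Med`/`END_Med` placeholder tokens so they survive tokenization and sentence splitting, then mapped back to tags for each segmented sentence. A minimal sketch of that round trip using the same regex patterns (the sample sentence is invented):

import re

tags = ['Gtype', 'Med', 'Phase', 'Supp']
regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')

s = "grown in <Med>LB medium</Med> to mid-log phase"
placeholders = regex_subs_ini_tag.sub(r' INI_\g<tag> ', s)
placeholders = regex_subs_end_tag.sub(r' END_\g<tag> ', placeholders)
restored = regex_gc_ini_tag.sub(r'<\g<tag>>', placeholders)
restored = regex_gc_end_tag.sub(r'</\g<tag>>', restored)
print(placeholders)  # grown in  INI_Med LB medium END_Med  to mid-log phase
print(restored)      # grown in  <Med> LB medium </Med>  to mid-log phase

The substitutions pad the placeholders with spaces, so the restored sentence keeps extra whitespace around the tags; the scripts above compare token sequences, where that does not matter.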
extract-sentences-from-softfiles_v3.py
0 → 100644

+import stanza
+import argparse
+import re
+import os
+import pandas as pd
+
+# Objective
+# Sentence extraction from XML Soft files. _v3 adds dictionary-based NER of MCO conditions
+#
+# Input parameters
+# --inputPath=PATH Path to XML Soft files
+# --outputPath=PATH Path to place output files
+#
+# Output
+# Files with sentences obtained from XML Soft files
+#
+# Examples
+# python extract-sentences-from-softfiles_v2.py
+# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
+# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+#
+# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+
+# _v3
+# python extract-sentences-from-softfiles_v3.py
+# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
+# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
+# --inputFileMco
+
+##########################################
+#               MAIN PROGRAM             #
+##########################################
+
+if __name__ == "__main__":
+    # Defining parameters
+    parser = argparse.ArgumentParser(
+        prog='extract-sentences-from-softfiles',
+        description='Sentence extraction from XML Soft files.',
+        epilog='')
+    parser.add_argument("--inputPath", dest="inputPath",
+                        help="Path to XML Soft files", metavar="PATH")
+    parser.add_argument("--outputPath", dest="outputPath",
+                        help="Path for output files", metavar="PATH")
+    parser.add_argument("--inputPathMco", dest="inputPathMco",
+                        help="Path to MCO file", metavar="PATH")
+    parser.add_argument("--inputFileMco", dest="inputFileMco",
+                        help="MCO file", metavar="FILE")
+    args = parser.parse_args()
+
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to XML Soft files: " + args.inputPath)
+    print("Path to output files: " + args.outputPath)
+    print("Path to MCO file: " + args.inputPathMco)
+    print("MCO file: " + args.inputFileMco)
+    print('-------------------------------- PROCESSING --------------------------------')
+
+    ## Tags of GCs into consideration
+    # culture medium, medium supplements, aeration, temperature,
+    # pH, agitation, growth phase, optical density, genetic background
+    tags = {
+        '<Gtype>': 'Gtype',
+        # '<Gversion>': 'Gversion',
+        '<Med>': 'Med',
+        '<Phase>': 'Phase',
+        # '<Substrain>': 'Substrain',
+        '<Supp>': 'Supp',
+        # '<Strain>': 'Strain',
+        # '<Technique>': 'Technique',
+        '<Temp>': 'Temp',
+        '<OD>': 'OD',
+        '<Anti>': 'Anti',
+        '<Agit>': 'Agit',
+        '<Air>': 'Air',
+        '<Vess>': 'Vess',
+        '<pH>': 'pH'
+    }
+    #tags = ['<Gtype>', '<Med>', '<Phase>', '<Supp>',
+    #        '<Temp>', '<OD>', '<Anti>', '<Agit>',
+    #        '<Air>', '<Vess>', '<pH>']
+    #deleted_tags = ['<Gversion>', '<Substrain>', '<Strain>', '<Technique>']
+    # Note: the dict above is superseded by the plain tag lists below
+    tags = ['Gtype', 'Med', 'Phase', 'Supp',
+            'Temp', 'OD', 'Anti', 'Agit',
+            'Air', 'Vess', 'pH']
+    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
+    all_tags = tags + deleted_tags
+    # Regex to check if line has a tag
+    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
+    # Regex to delete tags
+    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
+    # Regex to substitute tags
+    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
+    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
+    #p = re.compile(r'blue (?P<animal>dog|cat)')
+    #p.sub(r'gray \g<animal>', s)
+    # Regex to tag GCs
+    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
+    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
+
+    # Testing file: GSE54899_family_retagged-05242019_validated.xml
+    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
+
+    # Define stanza pipeline for sentence segmentation
+    # nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
+    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
+    # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
+
+    # Store field_name (bangline) and field_text
+    field_name = ""
+    field_text = ""
+
+    # Store list of unique field_name
+    hash_field_name = {}
+
+    # Store sentences from fields that contained at least one GC tag.
+    # We want to use this list for someone to check it
+    df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence', 'modified_sentence', 'transformed_sentence'])
+
+    # Store serie number
+    # ^SERIES = GSE54899
+    serie = ""
+    # Store series pubmed id
+    # !Series_pubmed_id = 25222563
+    serie_pubmed_id = ""
+    # Store sample
+    # ^SAMPLE = GSM1326335
+    sample = ""
+
+    for path, dirs, files in os.walk(args.inputPath):
+        # For each file in dir
+        for file in files:
+            # if file == testing_file:
+                print(" Reading file..." + str(file))
+                with open(os.path.join(args.inputPath, file)) as iFile:
+                    for line in iFile:
+                        line = line.rstrip('\n')
+                        if line.find(" = ") == -1:
+                            continue
+                        list_line = line.split(" = ")
+                        field_name = list_line[0]
+                        #print("field_name: {}".format(field_name))
+                        field_text = list_line[1]
+                        #print("field_text: {}".format(field_text))
+                        if field_name == "^SERIES":
+                            serie = field_text
+                        elif field_name == "!Series_pubmed_id":
+                            serie_pubmed_id = field_text
+                        elif field_name == "^SAMPLE":
+                            sample = field_text
+                        elif regex_has_tag.search(line):  # Contains GC tag
+                            if field_name in hash_field_name:
+                                hash_field_name[field_name] += 1
+                            else:
+                                hash_field_name[field_name] = 1
+                            # original_sentence = field_text
+                            # delete GC tags
+                            modified_sentence = regex_delete_tag.sub("", field_text)
+                            modified_sentence = regex_delete_tag.sub("", modified_sentence)
+                            # substitute tags
+                            # p = re.compile(r'blue (?P<animal>dog|cat)')
+                            # p.sub(r'gray \g<animal>', s)
+                            modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
+                            modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
+                            doc = nlp(modified_sentence)
+                            for i, sentence in enumerate(doc.sentences):
+                                # print(sentence.text)
+                                list_transformed_sentence = []
+                                # For GC tag
+                                gc_tag = "O"
+                                in_tag = False
+                                for word in sentence.words:
+                                    result = regex_gc_ini_tag.match(word.text)
+                                    if result:
+                                        gc_tag = result.group("tag")
+                                        in_tag = True
+                                        continue
+                                    else:
+                                        result = regex_gc_end_tag.match(word.text)
+                                        if result:
+                                            gc_tag = "O"
+                                            in_tag = False
+                                            continue
+                                        else:
+                                            if not in_tag:
+                                                gc_tag = "O"
+                                    list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
+                                transformed_sentence = " ".join(list_transformed_sentence)
+                                original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
+                                original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
+                                new_row = {'serie': serie,
+                                           'serie_pubmed_id': serie_pubmed_id,
+                                           'sample': sample,
+                                           'field_name': field_name,
+                                           'original_sentence': original_sentence,
+                                           'modified_sentence': sentence.text,
+                                           'transformed_sentence': transformed_sentence}
+                                df_sentences_to_check = df_sentences_to_check.append(new_row, ignore_index=True)
+    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
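The `transformed_sentence` column written above is a space-separated sequence of `word|lemma|xpos|gc_tag` tokens; check_mco_terms_in_sentences_v1.py recovers the plain sentence by keeping the first field of each token. A minimal sketch with an invented token string:

transformed = "grown|grow|VBN|O in|in|IN|O LB|LB|NN|Med medium|medium|NN|Med"
words = [tok.split("|")[0] for tok in transformed.split()]
gc_tags = [tok.split("|")[3] for tok in transformed.split()]
print(" ".join(words))  # grown in LB medium
print(gc_tags)          # ['O', 'O', 'Med', 'Med']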
data-sets/bin/mco_terms.csv
0 → 100644
This diff could not be displayed because it is too large.
+import pandas as pd
+import os
+
+def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
+    df_sentences_to_check = pd.read_csv(os.path.join(inputPath, inputFile))
+    df_sentences_to_check.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
+    df_sentences_to_check = df_sentences_to_check.sort_values(by=['original_sentence'])
+    print(df_sentences_to_check.head(5))
+    with open(os.path.join(outputPath, outputFile), mode='w') as ofile:
+        ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+        ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
+        for ind in df_sentences_to_check.index:
+            # Build one <row> element per curated sentence
+            line = '<row id="{}">\n'.format(df_sentences_to_check['row'][ind])
+            line = line + "\t<serie>{}</serie>\n".format(df_sentences_to_check['serie'][ind])
+            line = line + "\t<serie_pubmed_id>{}</serie_pubmed_id>\n".format(df_sentences_to_check['serie_pubmed_id'][ind])
+            line = line + "\t<sample>{}</sample>\n".format(df_sentences_to_check['sample'][ind])
+            line = line + "\t<field_name>{}</field_name>\n".format(df_sentences_to_check['field_name'][ind])
+            line = line + "\t<original_sentence>{}</original_sentence>\n".format(df_sentences_to_check['original_sentence'][ind])
+            # corrected_sentence starts as a copy of original_sentence, to be edited by curators
+            line = line + "\t<corrected_sentence>{}</corrected_sentence>\n".format(df_sentences_to_check['original_sentence'][ind])
+            line = line + "</row>\n"
+            ofile.write(line)
+        ofile.write('</gcs_to_check>\n')
+
+transform_sentence_to_check_to_XML(inputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
+                                   outputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
+                                   inputFile='geo_sentences_to_check_fixed.csv',
+                                   #inputFile='geo_sentences_to_check.csv',
+                                   outputFile='geo_sentences_to_check_fixed.xml'
+                                   )
\ No newline at end of file
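One caveat on the string-built XML above: a sentence containing `&`, `<`, or `>` (other than the intentional GC tags) would make the output file invalid XML. A hedged sketch of escaping the field text with the standard library — a suggested hardening, not part of the committed script:

from xml.sax.saxutils import escape

sentence = "supplemented with Fe(II) & grown to an OD600 < 0.4"
line = "\t<original_sentence>{}</original_sentence>\n".format(escape(sentence))
print(line)  # & becomes &amp; and < becomes &lt;

Note that escaping would also encode the curated `<Med>`-style tags, so a real fix would need to escape the text before the tags are reinserted, or whitelist the known tag names.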