cmendezc

New processing for retraining

1 +import stanza
2 +import argparse
3 +import re
4 +import os
5 +import pandas as pd
6 +
7 +# Objective
8 +# Check whether MCO terms appear in the raw sentences extracted from the soft files
9 +#
10 +# Input parameters
11 +# --inputPath=PATH     Path to geo_sentences_to_check_fixed.csv
12 +#                      e.g. /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
13 +# --inputFile=FILE     File geo_sentences_to_check_fixed.csv
14 +# --inputPathMco=PATH  Path to the MCO term file
15 +#                      e.g. /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
16 +# --inputFileMco=FILE  File with MCO terms, GC_Terms.txt (TSV)
17 +# --outputPath=PATH    Path to place the MCO terms that appeared in the input file
18 +#
19 +# Output
20 +# Files with MCO terms that appeared in input file
21 +#
22 +# _v1
23 +# python check_mco_terms_in_sentences_v1.py
24 +# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
25 +# --inputFile geo_sentences_to_check_fixed.csv
26 +# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
27 +# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
28 +# --inputFileMco GC_Terms.txt
29 +# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt
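+#
+# For reference, GC_Terms.txt is expected to be tab-separated with at least the
+# TERM_TYPE and TERM_NAME columns read below (example rows are illustrative only):
+#   TERM_TYPE   TERM_NAME
+#   medium      M9 minimal medium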
30 +
31 +'''
32 +REVIEW RESULT:
33 +Only the following terms were found in the sentences extracted from the curation of the soft files:
34 +TERM_NAME
35 +L broth (also annotated by the curator)
36 +MOPS (annotated by the curator as <Med> MOPS minimal glucose media </Med>)
37 +glucose (also annotated by the curator, but not as an isolated word)
38 +nitrate (also annotated by the curator, both in isolation as <Supp> nitrate </Supp> and as part of several curator-annotated supplements)
39 +M9 minimal medium (also annotated by the curator)
40 +OD600 of 0.3 (also annotated by the curator)
41 +Escherichia coli (we are not considering organisms)
42 +LB medium (not annotated by the curator)
43 +'''
44 +
45 +##########################################
46 +# MAIN PROGRAM #
47 +##########################################
48 +
49 +if __name__ == "__main__":
50 + # Defining parameters
51 + parser = argparse.ArgumentParser(
52 +        prog='check_mco_terms_in_sentences_v1.py',
53 +        description='Check whether MCO terms appear in the raw sentences extracted from the soft files.',
54 + epilog='')
55 + parser.add_argument("--inputPath", dest="inputPath",
56 + help="Path to extracted sentences from softfiles", metavar="PATH")
57 + parser.add_argument("--inputFile", dest="inputFile",
58 + help="Input extracted sentences from softfiles", metavar="FILE")
59 + parser.add_argument("--outputPath", dest="outputPath",
60 + help="Path to place MCO terms that appeared in input file", metavar="PATH")
61 + parser.add_argument("--inputPathMco", dest="inputPathMco",
62 + help="Path to MCO file", metavar="PATH")
63 + parser.add_argument("--inputFileMco", dest="inputFileMco",
64 + help="MCO file", metavar="FILE")
65 + args = parser.parse_args()
66 +
67 + print('-------------------------------- PARAMETERS --------------------------------')
68 + print("Path to extracted sentences from softfiles: " + args.inputPath)
69 + print("Input extracted sentences from softfiles: " + args.inputFile)
70 + print("Path to place MCO terms that appeared in input file: " + args.outputPath)
71 + print("Path to MCO file: " + args.inputPathMco)
72 + print("MCO file: " + args.inputFileMco)
73 + print('-------------------------------- PROCESSING --------------------------------')
74 +
75 + df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
76 + print(df_sentences_to_check.head(3))
77 + print(df_sentences_to_check.shape)
78 +
79 + df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
80 + print(df_mco_terms.head(3))
81 + print(df_mco_terms.shape)
82 +
83 + df_mco_terms_found = pd.DataFrame(columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
84 +
85 + text_sentences = []
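+    # Each token in transformed_sentence has the form word|lemma|xpos|tag
+    # (e.g. "MOPS|MOPS|NN|Med"), so split("|")[0] recovers the surface word
+    # and " ".join() rebuilds the raw sentence text.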
86 + for ind in df_sentences_to_check.index:
87 + line_trans = df_sentences_to_check['transformed_sentence'][ind]
88 + list_line = line_trans.split()
89 + list_sentence = [tokens.split("|")[0] for tokens in list_line]
90 + text_sentence = " ".join(list_sentence)
91 + # print(text_sentence)
92 + if text_sentence not in text_sentences:
93 + text_sentences.append(text_sentence)
94 +
95 + nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
96 +
 97 +    # Tokenize each MCO term with stanza so that its spacing matches the
 98 +    # tokenized sentences, then look for the term as a substring.
 99 +    for ind in df_mco_terms.index:
100 +        term_type = df_mco_terms['TERM_TYPE'][ind]
101 +        term_name = df_mco_terms['TERM_NAME'][ind]
102 +        doc = nlp(term_name)
103 +        word_list = [w.text for w in doc.sentences[0].words]
104 +        term_name_new = " ".join(word_list)
105 +        sentences_found = [sent for sent in text_sentences if term_name_new in sent]
106 +        for s in sentences_found:
107 +            print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
108 +            new_row = {'TERM_TYPE': term_type,
109 +                       'TERM_NAME': term_name,
110 +                       'SENTENCE': s}
111 +            df_mco_terms_found = pd.concat([df_mco_terms_found, pd.DataFrame([new_row])], ignore_index=True)
112 +    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'), sep="\t")
@@ -15,11 +15,11 @@ import pandas as pd
 # Files with sentences obtained from XML Soft files
 #
 # Examples
-# python extract-sentences-from-softfiles.py
+# python extract-sentences-from-softfiles_v2.py
 # --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
 # --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
 #
-# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences

 ##########################################
@@ -70,7 +70,7 @@ if __name__ == "__main__":
     tags = ['Gtype', 'Med', 'Phase', 'Supp',
             'Temp', 'OD', 'Anti', 'Agit',
             'Air', 'Vess', 'pH']
-    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
+    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
     all_tags = tags + deleted_tags
     # Regex to check if line has a tag
     regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
@@ -89,9 +89,11 @@ if __name__ == "__main__":
     testing_file = "GSE54899_family_retagged-05242019_validated.xml"

     # Define stanza pipeline for sentence segmentation
-    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
+    # nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
     # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
-    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

     # Store field_name (bangline) and field_text
     field_name = ""
@@ -117,7 +119,7 @@ if __name__ == "__main__":
     for path, dirs, files in os.walk(args.inputPath):
         # For each file in dir
         for file in files:
-            if file == testing_file:
+            # if file == testing_file:
                 print(" Reading file..." + str(file))
                 with open(os.path.join(args.inputPath, file)) as iFile:
                     for line in iFile:
@@ -140,7 +142,7 @@ if __name__ == "__main__":
                         hash_field_name[field_name] += 1
                     else:
                         hash_field_name[field_name] = 1
-                    original_sentence = field_text
+                    # original_sentence = field_text
                     # delete GC tags
                     modified_sentence = regex_delete_tag.sub("", field_text)
                     modified_sentence = regex_delete_tag.sub("", modified_sentence)
@@ -173,6 +175,8 @@ if __name__ == "__main__":
                             gc_tag = "O"
                     list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                 transformed_sentence = " ".join(list_transformed_sentence)
+                original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
+                original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                 new_row = {'serie': serie,
                            'serie_pubmed_id': serie_pubmed_id,
                            'sample': sample,
@@ -182,75 +186,3 @@ if __name__ == "__main__":
                            'transformed_sentence': transformed_sentence}
                 df_sentences_to_check = df_sentences_to_check.append(new_row, ignore_index=True)
     df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
185 - #print(token)
186 - quit()
187 -
188 - ## End of tagging
189 - out_labels = {
190 - '</Gtype>': 'O',
191 - '</Gversion>': 'O',
192 - '</Med>': 'O',
193 - '</Phase>': 'O',
194 - '</Substrain>': 'O',
195 - '</Supp>': 'O',
196 - '</Strain>': 'O',
197 - '</Technique>': 'O',
198 - '</Temp>': 'O',
199 - '</OD>': 'O',
200 - '</Anti>': 'O',
201 - '</Agit>': 'O',
202 - '</Air>': 'O',
203 - '</Vess>': 'O',
204 - '</pH>': 'O'}
205 - old_labels = {
206 - '<Orgn>': 'O',
207 - '</Orgn>': 'O'
208 - }
209 -
210 - # Other label
211 - flag = 'O'
212 - lista = []
213 - # First sentence
214 - sentence = ''
215 - n = 0
216 - with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
217 - for line in input_file:
218 - if len(line.split('\t')) > 1:
219 - w = line.split('\t')[1]
220 - if w in in_labels or w in out_labels:
221 - # Tagging
222 - if w in in_labels.keys(): flag = in_labels[w]
223 - if w in out_labels: flag = out_labels[w]
224 - else:
225 - if w == "PGCGROWTHCONDITIONS":
226 - n = n + 1
227 - words = sentence.split(' ')
228 - # End of sentence
229 - tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
230 - # At least one true-tag on sentence
231 - if len(tags) > 0:
232 - lista.append(sentence)
233 - # New setence
234 - sentence = ''
235 - elif w not in old_labels.keys():
236 - # Building and save tagging sentence
237 - sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
238 -
239 - print("Number of sentences with at least one tag: " + str(len(lista)))
240 - print("Number of sentences from CoreNLP: " + str(n))
241 -
242 - # Split 70 30 training and test sentences
243 - trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
244 - testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
245 - print("Number of sentences for training: " + str(len(trainingIndex)))
246 - print("Number of sentences for test: " + str(len(testIndex)))
247 -
248 - with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
249 - Data = [lista[i] for i in trainingIndex]
250 - oFile.write('\n'.join(Data))
251 -
252 - with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
253 - Data = [lista[i] for i in testIndex]
254 - oFile.write('\n'.join(Data))
255 -
256 - print("==================================END===================================")
1 +import stanza
2 +import argparse
3 +import re
4 +import os
5 +import pandas as pd
6 +
7 +# Objective
8 +# Sentence extraction from XML Soft files. _v3 adds dictionary-based NER of MCO conditions (see the sketch after this listing)
9 +#
10 +# Input parameters
11 +# --inputPath=PATH Path to XML Soft files
12 +# --outputPath=PATH Path to place output files
13 +#
14 +# Output
15 +# Files with sentences obtained from XML Soft files
16 +#
17 +# Examples
18 +# python extract-sentences-from-softfiles_v2.py
19 +# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
20 +# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
21 +#
22 +# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
23 +
24 +# _v3
25 +# python extract-sentences-from-softfiles_v3.py
26 +# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
27 +# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
28 +# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
29 +# --inputFileMco
30 +
31 +##########################################
32 +# MAIN PROGRAM #
33 +##########################################
34 +
35 +if __name__ == "__main__":
36 + # Defining parameters
37 + parser = argparse.ArgumentParser(
38 + prog='extract-sentences-from-softfiles',
39 +        description='Sentence extraction from XML Soft files.',
40 + epilog='')
41 + parser.add_argument("--inputPath", dest="inputPath",
42 + help="Path to XML Soft files", metavar="PATH")
43 + parser.add_argument("--outputPath", dest="outputPath",
44 + help="Path for output files", metavar="PATH")
45 + parser.add_argument("--inputPathMco", dest="inputPathMco",
46 + help="Path to MCO file", metavar="PATH")
47 + parser.add_argument("--inputFileMco", dest="inputFileMco",
48 + help="MCO file", metavar="FILE")
49 + args = parser.parse_args()
50 +
51 + print('-------------------------------- PARAMETERS --------------------------------')
52 + print("Path to XML Soft files: " + args.inputPath)
53 + print("Path to output files: " + args.outputPath)
54 + print("Path to MCO file: " + args.inputPathMco)
55 + print("MCO file: " + args.inputFileMco)
56 + print('-------------------------------- PROCESSING --------------------------------')
57 +
58 + ## Tags of GCs into consideration
59 + # culture medium, medium supplements, aeration, temperature,
60 + # pH, agitation, growth phase, optical density, genetic background
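+    # NOTE: the dict below is left over from an earlier version; it is
+    # immediately overwritten by the tags list defined after it.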
61 + tags = {
62 + '<Gtype>': 'Gtype',
63 + # '<Gversion>': 'Gversion',
64 + '<Med>': 'Med',
65 + '<Phase>': 'Phase',
66 + # '<Substrain>': 'Substrain',
67 + '<Supp>': 'Supp',
68 + # '<Strain>': 'Strain',
69 + # '<Technique>': 'Technique',
70 + '<Temp>': 'Temp',
71 + '<OD>': 'OD',
72 + '<Anti>': 'Anti',
73 + '<Agit>': 'Agit',
74 + '<Air>': 'Air',
75 + '<Vess>': 'Vess',
76 + '<pH>': 'pH'
77 + }
78 + #tags = ['<Gtype>', '<Med>', '<Phase>', '<Supp>',
79 + # '<Temp>', '<OD>', '<Anti>', '<Agit>',
80 + # '<Air>', '<Vess>', '<pH>']
81 + #deleted_tags = ['<Gversion>', '<Substrain>', '<Strain>', '<Technique>']
82 + tags = ['Gtype', 'Med', 'Phase', 'Supp',
83 + 'Temp', 'OD', 'Anti', 'Agit',
84 + 'Air', 'Vess', 'pH']
85 + deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
86 + all_tags = tags + deleted_tags
87 + # Regex to check if line has a tag
88 + regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
89 + # Regex to delete tags
90 + regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
91 + # Regex to substitute tags
92 + regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
93 + regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
94 + #p = re.compile(r'blue (?P<animal>dog|cat)')
95 + #p.sub(r'gray \g<animal>', s)
96 + # Regex to tag GCs
97 + regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
98 + regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
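+    # Round trip of the substitutions above, e.g. for a Med tag:
+    #   '<Med>MOPS</Med>'  ->  ' INI_Med MOPS END_Med '  (fed to stanza)
+    # In the tagging loop below, tokens between INI_Med and END_Med receive
+    # gc_tag 'Med', and the INI_/END_ marker tokens themselves are skipped.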
99 +
100 + # Testing file: GSE54899_family_retagged-05242019_validated.xml
101 + testing_file = "GSE54899_family_retagged-05242019_validated.xml"
102 +
103 + # Define stanza pipeline for sentence segmentation
104 + # nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
105 + # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
106 + # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
107 + # Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
108 + nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
109 +
110 + # Store field_name (bangline) and field_text
111 + field_name = ""
112 + field_text = ""
113 +
114 + # Store list of unique field_name
115 + hash_field_name = {}
116 +
117 +    # Store sentences from fields that contained at least one GC tag,
118 +    # so that a curator can review them
119 + df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence', 'modified_sentence', 'transformed_sentence'])
120 +
121 + # Store serie number
122 + # ^SERIES = GSE54899
123 + serie = ""
124 + # Store series pubmed id
125 + # !Series_pubmed_id = 25222563
126 + serie_pubmed_id = ""
127 + # Store sample
128 + # ^SAMPLE = GSM1326335
129 + sample = ""
130 +
131 + for path, dirs, files in os.walk(args.inputPath):
132 + # For each file in dir
133 + for file in files:
134 + # if file == testing_file:
135 + print(" Reading file..." + str(file))
136 + with open(os.path.join(args.inputPath, file)) as iFile:
137 + for line in iFile:
138 + line = line.rstrip('\n')
139 + if line.find(" = ") == -1:
140 + continue
141 + list_line = line.split(" = ")
142 + field_name = list_line[0]
143 + #print("field_name: {}".format(field_name))
144 + field_text = list_line[1]
145 + #print("field_text: {}".format(field_text))
146 + if field_name == "^SERIES":
147 + serie = field_text
148 + elif field_name == "!Series_pubmed_id":
149 + serie_pubmed_id = field_text
150 + elif field_name == "^SAMPLE":
151 + sample = field_text
152 + elif regex_has_tag.search(line): # Contains GC tag
153 + if field_name in hash_field_name:
154 + hash_field_name[field_name] += 1
155 + else:
156 + hash_field_name[field_name] = 1
157 + # original_sentence = field_text
158 + # delete GC tags
159 + modified_sentence = regex_delete_tag.sub("", field_text)
160 + modified_sentence = regex_delete_tag.sub("", modified_sentence)
161 + # substitute tags
162 + # p = re.compile(r'blue (?P<animal>dog|cat)')
163 + # p.sub(r'gray \g<animal>', s)
164 + modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
165 + modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
166 + doc = nlp(modified_sentence)
167 + for i, sentence in enumerate(doc.sentences):
168 + # print(sentence.text)
169 + list_transformed_sentence = []
170 + # For GC tag
171 + gc_tag = "O"
172 + in_tag = False
173 + for word in sentence.words:
174 + result = regex_gc_ini_tag.match(word.text)
175 + if result:
176 + gc_tag = result.group("tag")
177 + in_tag = True
178 + continue
179 + else:
180 + result = regex_gc_end_tag.match(word.text)
181 + if result:
182 + gc_tag = "O"
183 + in_tag = False
184 + continue
185 + else:
186 + if not in_tag:
187 + gc_tag = "O"
188 + list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
189 + transformed_sentence = " ".join(list_transformed_sentence)
190 + original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
191 + original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
192 + new_row = {'serie': serie,
193 + 'serie_pubmed_id': serie_pubmed_id,
194 + 'sample': sample,
195 + 'field_name': field_name,
196 + 'original_sentence': original_sentence,
197 + 'modified_sentence': sentence.text,
198 + 'transformed_sentence': transformed_sentence}
199 +            df_sentences_to_check = pd.concat([df_sentences_to_check, pd.DataFrame([new_row])], ignore_index=True)
200 + df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
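
Note: this _v3 listing parses --inputPathMco and --inputFileMco but does not use them yet, so the dictionary-based NER announced in its header still has to be wired in. Below is a minimal sketch of that lookup, assuming GC_Terms.txt keeps the tab-separated TERM_TYPE and TERM_NAME columns read by check_mco_terms_in_sentences_v1.py above; the helper name find_mco_terms is hypothetical.

    import stanza
    import pandas as pd

    def find_mco_terms(sentence_text, df_mco_terms, nlp):
        # Hypothetical helper: collect (TERM_TYPE, TERM_NAME) pairs whose
        # stanza-tokenized TERM_NAME occurs verbatim in an already-tokenized sentence.
        found = []
        for ind in df_mco_terms.index:
            term_name = df_mco_terms['TERM_NAME'][ind]
            doc = nlp(term_name)
            term_tokenized = " ".join(w.text for w in doc.sentences[0].words)
            if term_tokenized in sentence_text:
                found.append((df_mco_terms['TERM_TYPE'][ind], term_name))
        return found

    # Example wiring, mirroring the v1 checker:
    # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
    # df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
    # matches = find_mco_terms(sentence.text, df_mco_terms, nlp)

Tokenizing every term on each call is slow; for many sentences the tokenized terms should be precomputed once, as the v1 checker effectively does per term.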
1 +import pandas as pd
2 +import os
3 +
4 +def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
5 + df_sentences_to_check = pd.read_csv(os.path.join(inputPath, inputFile))
6 + df_sentences_to_check.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
7 + df_sentences_to_check = df_sentences_to_check.sort_values(by=['original_sentence'])
8 + print(df_sentences_to_check.head(5))
9 + with open(os.path.join(outputPath, outputFile), mode='w') as ofile:
10 + ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
11 + ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
12 + for ind in df_sentences_to_check.index:
13 +        # One <row> per sentence; <corrected_sentence> starts as a copy of <original_sentence> for the curator to amend
14 + line = '<row id="{}">\n'.format(df_sentences_to_check['row'][ind])
15 + line = line + "\t<serie>{}</serie>\n".format(df_sentences_to_check['serie'][ind])
16 + line = line + "\t<serie_pubmed_id>{}</serie_pubmed_id>\n".format(df_sentences_to_check['serie_pubmed_id'][ind])
17 + line = line + "\t<sample>{}</sample>\n".format(df_sentences_to_check['sample'][ind])
18 + line = line + "\t<field_name>{}</field_name>\n".format(df_sentences_to_check['field_name'][ind])
19 + line = line + "\t<original_sentence>{}</original_sentence>\n".format(df_sentences_to_check['original_sentence'][ind])
20 + line = line + "\t<corrected_sentence>{}</corrected_sentence>\n".format(df_sentences_to_check['original_sentence'][ind])
21 + line = line + "</row>\n"
22 + ofile.write(line)
23 + ofile.write('</gcs_to_check>\n')
24 +
25 +transform_sentence_to_check_to_XML(inputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
26 + outputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
27 + inputFile='geo_sentences_to_check_fixed.csv',
28 + #inputFile='geo_sentences_to_check.csv',
29 + outputFile='geo_sentences_to_check_fixed.xml'
30 + )
\ No newline at end of file
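
For reference, each <row> emitted by transform_sentence_to_check_to_XML has this shape (series, pubmed id, and sample values taken from the examples in the comments above; the rest are placeholders):

    <row id="0">
        <serie>GSE54899</serie>
        <serie_pubmed_id>25222563</serie_pubmed_id>
        <sample>GSM1326335</sample>
        <field_name>...</field_name>
        <original_sentence>...</original_sentence>
        <corrected_sentence>...</corrected_sentence>
    </row>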