# extract-sentences-from-softfiles_v2.py
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentence extraction from XML SOFT files.
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences extracted from the XML SOFT files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
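#
# Note: in its current state the script processes only the single file named in
# testing_file below and stops (quit()) right after writing geo_sentences_to_check.csv.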
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = argparse.ArgumentParser(
prog='extract-sentences-from-softfiles',
        description='Sentence extraction from XML SOFT files.',
epilog='')
parser.add_argument("--inputPath", dest="inputPath",
help="Path to XML Soft files", metavar="PATH")
parser.add_argument("--outputPath", dest="outputPath",
help="Path for output files", metavar="PATH")
args = parser.parse_args()
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to XML Soft files: " + args.inputPath)
print("Path to output files: " + args.outputPath)
print('-------------------------------- PROCESSING --------------------------------')
    ## Growth-condition (GC) tags taken into consideration:
    ## culture medium (Med), medium supplements (Supp), aeration (Air),
    ## temperature (Temp), pH, agitation (Agit), growth phase (Phase),
    ## optical density (OD), genetic background (Gtype), antibody (Anti),
    ## vessel (Vess)
tags = ['Gtype', 'Med', 'Phase', 'Supp',
'Temp', 'OD', 'Anti', 'Agit',
'Air', 'Vess', 'pH']
deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
all_tags = tags + deleted_tags
# Regex to check if line has a tag
regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
# Regex to delete tags
regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
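    # e.g. (hypothetical input) regex_delete_tag.sub('', 'a <Strain> K-12 </Strain> strain')
    # -> 'a  K-12  strain' (tags removed, tagged text kept)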
# Regex to substitute tags
regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
# Regex to tag GCs
regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
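    # Illustrative example (hypothetical input, not from the corpus):
    #   '<Med> LB </Med> with <Supp> 2% glucose </Supp>'
    # becomes, after regex_subs_ini_tag / regex_subs_end_tag,
    #   ' INI_Med  LB  END_Med  with  INI_Supp  2% glucose  END_Supp '
    # so the INI_/END_ markers survive tokenization as standalone tokens that
    # regex_gc_ini_tag / regex_gc_end_tag can match below.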
    # Process only this testing file for now
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # Define stanza pipeline for sentence segmentation (currently unused below)
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Define stanza pipeline for lemmatization and POS tagging without sentence segmentation
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
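    # Note: with tokenize_no_ssplit=True, stanza skips sentence splitting and
    # treats each blank-line-separated chunk as a single sentence, so a one-line
    # field yields exactly one entry in doc.sentences.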
# Store field_name (bangline) and field_text
field_name = ""
field_text = ""
# Store list of unique field_name
hash_field_name = {}
    # Store sentences from fields that contained at least one GC tag;
    # this list is saved so that a curator can review it manually.
df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence', 'modified_sentence', 'transformed_sentence'])
# Store serie number
# ^SERIES = GSE54899
serie = ""
# Store series pubmed id
# !Series_pubmed_id = 25222563
serie_pubmed_id = ""
# Store sample
# ^SAMPLE = GSM1326335
sample = ""
for path, dirs, files in os.walk(args.inputPath):
# For each file in dir
for file in files:
if file == testing_file:
print(" Reading file..." + str(file))
                # Join with the walked directory (path), not the root, so files
                # in subdirectories are also opened correctly
                with open(os.path.join(path, file)) as iFile:
for line in iFile:
line = line.rstrip('\n')
if line.find(" = ") == -1:
continue
list_line = line.split(" = ")
field_name = list_line[0]
#print("field_name: {}".format(field_name))
field_text = list_line[1]
#print("field_text: {}".format(field_text))
if field_name == "^SERIES":
serie = field_text
elif field_name == "!Series_pubmed_id":
serie_pubmed_id = field_text
elif field_name == "^SAMPLE":
sample = field_text
elif regex_has_tag.search(line): # Contains GC tag
if field_name in hash_field_name:
hash_field_name[field_name] += 1
else:
hash_field_name[field_name] = 1
                            original_sentence = field_text
                            # Delete tags of GCs not considered (a single sub call
                            # already removes every occurrence)
                            modified_sentence = regex_delete_tag.sub("", field_text)
# substitute tags
modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
doc = nlp(modified_sentence)
                            # Each field is a single line, so doc normally holds one sentence
                            for sentence in doc.sentences:
                                # print(sentence.text)
                                list_transformed_sentence = []
                                # Current GC label; 'O' means outside any GC tag
                                gc_tag = "O"
                                in_tag = False
                                for word in sentence.words:
                                    # Opening marker (INI_Tag): label the following words with Tag
                                    result = regex_gc_ini_tag.match(word.text)
                                    if result:
                                        gc_tag = result.group("tag")
                                        in_tag = True
                                        continue
                                    # Closing marker (END_Tag): the following words are outside again
                                    result = regex_gc_end_tag.match(word.text)
                                    if result:
                                        gc_tag = "O"
                                        in_tag = False
                                        continue
                                    if not in_tag:
                                        gc_tag = "O"
                                    # Token format: word|lemma|POS|label
                                    list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
transformed_sentence = " ".join(list_transformed_sentence)
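                                # Example (hypothetical) transformed sentence:
                                #   'grown|grow|VBN|O in|in|IN|O LB|LB|NN|Med'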
                                new_row = {'serie': serie,
                                           'serie_pubmed_id': serie_pubmed_id,
                                           'sample': sample,
                                           'field_name': field_name,
                                           'original_sentence': original_sentence,
                                           'modified_sentence': sentence.text,
                                           'transformed_sentence': transformed_sentence}
                                # DataFrame.append was removed in pandas 2.0;
                                # concatenate a one-row frame instead
                                df_sentences_to_check = pd.concat([df_sentences_to_check, pd.DataFrame([new_row])], ignore_index=True)
                # Save all collected sentences for manual curation
                df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
                # Stop after the single testing file; remove quit() to process
                # every file under inputPath
                quit()
    ## End of tagging
    ## Legacy code from the previous version, kept for reference; it is
    ## unreachable (quit() above ends the program) and relies on names not
    ## defined here (in_labels, random, args.inputFile, args.index,
    ## args.trainingFile, args.testFile).
out_labels = {
'</Gtype>': 'O',
'</Gversion>': 'O',
'</Med>': 'O',
'</Phase>': 'O',
'</Substrain>': 'O',
'</Supp>': 'O',
'</Strain>': 'O',
'</Technique>': 'O',
'</Temp>': 'O',
'</OD>': 'O',
'</Anti>': 'O',
'</Agit>': 'O',
'</Air>': 'O',
'</Vess>': 'O',
'</pH>': 'O'}
old_labels = {
'<Orgn>': 'O',
'</Orgn>': 'O'
}
    # Default label outside any tag
    flag = 'O'
    # Sentences containing at least one GC tag
    lista = []
    # Sentence accumulator
    sentence = ''
n = 0
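    # (Legacy) The loop below expected CoreNLP-style tab-separated input: one
    # token per line with the word form in the second column, tag tokens such
    # as '<Gtype>'/'</Gtype>' interleaved, and the literal token
    # 'PGCGROWTHCONDITIONS' marking the end of a sentence.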
with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging: switch the current label on opening/closing tag tokens
                    if w in in_labels:
                        flag = in_labels[w]
                    if w in out_labels:
                        flag = out_labels[w]
else:
if w == "PGCGROWTHCONDITIONS":
n = n + 1
words = sentence.split(' ')
# End of sentence
tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
# At least one true-tag on sentence
if len(tags) > 0:
lista.append(sentence)
                        # New sentence
sentence = ''
elif w not in old_labels.keys():
                        # Build and append the tagged tokens to the current sentence
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
print("Number of sentences with at least one tag: " + str(len(lista)))
print("Number of sentences from CoreNLP: " + str(n))
    # Split sentences 70/30 into training and test sets
trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
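    # (Legacy) Note: random.sample is unseeded, so the 70/30 split differs from
    # run to run; calling random.seed(...) first would make it reproducible.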
print("Number of sentences for training: " + str(len(trainingIndex)))
print("Number of sentences for test: " + str(len(testIndex)))
with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
Data = [lista[i] for i in trainingIndex]
oFile.write('\n'.join(Data))
with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
Data = [lista[i] for i in testIndex]
oFile.write('\n'.join(Data))
print("==================================END===================================")