# extract-sentences-from-softfiles_v3.py
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentence extraction from XML SOFT files. _v3 adds dictionary-based NER of MCO conditions.
#
# Input parameters
# --inputPath=PATH        Path to XML SOFT files
# --outputPath=PATH       Path to place output files
# --inputPathMco=PATH     Path to the MCO file (v3 only)
# --inputFileMco=FILE     MCO file (v3 only)
#
# Output
# Files with sentences obtained from the XML SOFT files
#
# Examples
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# _v3
# python extract-sentences-from-softfiles_v3.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = argparse.ArgumentParser(
prog='extract-sentences-from-softfiles',
description='Sentences extraction from XML Soft files.',
epilog='')
parser.add_argument("--inputPath", dest="inputPath",
help="Path to XML Soft files", metavar="PATH")
parser.add_argument("--outputPath", dest="outputPath",
help="Path for output files", metavar="PATH")
parser.add_argument("--inputPathMco", dest="inputPathMco",
help="Path to MCO file", metavar="PATH")
parser.add_argument("--inputFileMco", dest="inputFileMco",
help="MCO file", metavar="FILE")
args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    # format() tolerates missing (None) optional arguments, unlike string concatenation
    print("Path to XML SOFT files: {}".format(args.inputPath))
    print("Path to output files: {}".format(args.outputPath))
    print("Path to MCO file: {}".format(args.inputPathMco))
    print("MCO file: {}".format(args.inputFileMco))
print('-------------------------------- PROCESSING --------------------------------')
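    # NOTE: v3 accepts the MCO arguments, but the dictionary is not yet consumed below.
    # A minimal sketch of loading it for the planned dictionary-based NER step
    # (the CSV layout and the TERM_NAME column are assumptions; adjust to the
    # actual MCO export from RegulonDB):
    # mco_terms = set()
    # if args.inputPathMco and args.inputFileMco:
    #     df_mco = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco))
    #     mco_terms = set(df_mco['TERM_NAME'].str.lower())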
    ## GC (growth condition) tags taken into consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
tags = ['Gtype', 'Med', 'Phase', 'Supp',
'Temp', 'OD', 'Anti', 'Agit',
'Air', 'Vess', 'pH']
deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
all_tags = tags + deleted_tags
# Regex to check if line has a tag
regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
# Regex to delete tags
regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
# Regex to substitute tags
regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
# Regex to tag GCs
regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
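    # For example, the fragment '<Med>LB medium</Med>' becomes
    # ' INI_Med LB medium</Med>' after regex_subs_ini_tag.sub(r' INI_\g<tag> ', ...)
    # and ' INI_Med LB medium END_Med ' after regex_subs_end_tag.sub(r' END_\g<tag> ', ...)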
# Testing file: GSE54899_family_retagged-05242019_validated.xml
testing_file = "GSE54899_family_retagged-05242019_validated.xml"
# Define stanza pipeline for sentence segmentation
# nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
# Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
# nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
# Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
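    # nlp(text) returns a Document; each item in Document.sentences exposes .text and
    # .words, and every word carries the .text, .lemma and .xpos attributes used below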
# Store field_name (bangline) and field_text
field_name = ""
field_text = ""
# Store list of unique field_name
hash_field_name = {}
    # Store sentences from fields that contained at least one GC tag,
    # so that a curator can review them.
    # Rows are collected in a list and converted to a DataFrame at the end:
    # appending row by row with DataFrame.append is slow and was removed in pandas 2.0.
    rows_sentences_to_check = []
# Store serie number
# ^SERIES = GSE54899
serie = ""
# Store series pubmed id
# !Series_pubmed_id = 25222563
serie_pubmed_id = ""
# Store sample
# ^SAMPLE = GSM1326335
sample = ""
for path, dirs, files in os.walk(args.inputPath):
# For each file in dir
for file in files:
# if file == testing_file:
print(" Reading file..." + str(file))
            # Join with the walk's current directory so files in subfolders are also found
            with open(os.path.join(path, file)) as iFile:
for line in iFile:
line = line.rstrip('\n')
if line.find(" = ") == -1:
continue
                    # Split only on the first " = " so values containing " = " stay intact
                    list_line = line.split(" = ", 1)
field_name = list_line[0]
#print("field_name: {}".format(field_name))
field_text = list_line[1]
#print("field_text: {}".format(field_text))
if field_name == "^SERIES":
serie = field_text
elif field_name == "!Series_pubmed_id":
serie_pubmed_id = field_text
elif field_name == "^SAMPLE":
sample = field_text
elif regex_has_tag.search(line): # Contains GC tag
if field_name in hash_field_name:
hash_field_name[field_name] += 1
else:
hash_field_name[field_name] = 1
                        # Delete out-of-scope GC tags (deleted_tags); one pass removes all occurrences
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Replace '<Tag>'/'</Tag>' with ' INI_Tag '/' END_Tag ' marker tokens,
                        # so that the tags survive tokenization as standalone words
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
# print(sentence.text)
list_transformed_sentence = []
# For GC tag
gc_tag = "O"
in_tag = False
                            for word in sentence.words:
                                # Opening marker (INI_Tag): start labeling words with this GC tag
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                # Closing marker (END_Tag): back to the outside label
                                if regex_gc_end_tag.match(word.text):
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                # word|lemma|POS|GC-tag
                                list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
transformed_sentence = " ".join(list_transformed_sentence)
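                            # e.g. "grown|grow|VBN|O in|in|IN|O LB|LB|NNP|Med medium|medium|NN|Med"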
original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                            new_row = {'serie': serie,
                                       'serie_pubmed_id': serie_pubmed_id,
                                       'sample': sample,
                                       'field_name': field_name,
                                       'original_sentence': original_sentence,
                                       'modified_sentence': sentence.text,
                                       'transformed_sentence': transformed_sentence}
                            rows_sentences_to_check.append(new_row)
    # One row per extracted sentence, for manual curation of the GC annotations
    df_sentences_to_check = pd.DataFrame(rows_sentences_to_check,
                                         columns=['serie', 'serie_pubmed_id', 'sample', 'field_name',
                                                  'original_sentence', 'modified_sentence', 'transformed_sentence'])
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))