sentence-filter_v02.py
11.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: Filter sentences with specific entities.
# Also extract attributive sentences: effect-TF
# And autoregulation: regulates its own gene
# CFMC 2022-03-08: We added updating tsv file with idsentence, sentence and section (.pre.tsv)
# to indicate filtered sentences.
# Parameters:
# 1) --inputFileWord Path and filename to read feature word file.
# 2) --inputFileTrans Path and filename to read transformed file.
# 3) --outputPath Path to place output file.
# 4) --outputFile Output file.
# 5) --filter FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath Path for attributive cases: ArgP-regulated genes
# 8) --dicPath Path for dictionary
# 9) --dicFile Path for dictionary file normalized_Effects.json
# 10) --autoPath Path for autoregulation cases: regulates its own gene
# /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
# Output:
# 1) Filtered sentences.
# 2) Attributive sentences
# 3) Autoregulation sentences
###########################################################
# MAIN PROGRAM #
###########################################################
def getEntities(tline, filt):
    """Collect entity words tagged GENE/TU/TF/EFFECT from a transformed line.

    :param tline: space-separated tokens in ``word|lemma|tag`` format.
    :param filt: "FILT1" ((GENE OR TU) AND TF) or "FILT2"
        ((GENE OR TU) AND EFFECT AND TF). Both filters collect the same
        tag set; any other value yields an empty dict.
    :return: dict mapping each entity word to its tag; when a word appears
        with several tags, the first occurrence wins.
    """
    entities = {}
    tline = tline.rstrip('\n\r ')
    if filt not in ("FILT1", "FILT2"):
        return entities
    for token in tline.split(" "):
        parts = token.split("|")
        if len(parts) < 3:
            # Skip malformed tokens instead of raising IndexError.
            continue
        w = parts[0]
        t = parts[2]
        if t in ("GENE", "TU", "TF", "EFFECT") and w not in entities:
            entities[w] = t
    return entities
if __name__ == "__main__":
    # ---- Parameter definition ----
    parser = OptionParser()
    parser.add_option("--inputFileWord", dest="inputFileWord",
                      help="Path and filename to read feature word file", metavar="PATH")
    parser.add_option("--inputFileTrans", dest="inputFileTrans",
                      help="Path and filename to read transformed file", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--outputFile", dest="outputFile",
                      help="Output file", metavar="FILE")
    parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None,
                      help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT")
    parser.add_option("--attrPath", dest="attrPath",
                      help="Output path attributive sentences", metavar="PATH")
    parser.add_option("--dicPath", dest="dicPath",
                      help="Output path dictionary", metavar="PATH")
    parser.add_option("--dicFile", dest="dicFile",
                      help="Output file dictionary normalized_Effects.json", metavar="FILE")
    parser.add_option("--autoPath", dest="autoPath",
                      help="Output path autoregulation sentences", metavar="PATH")
    parser.add_option("--tsvPath", dest="tsvPath",
                      help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("None parameters indicated.")
        sys.exit(1)

    # ---- Printing parameter values ----
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path and filename to read feature word file: " + str(options.inputFileWord))
    print("Path and filename to read transformed file: " + str(options.inputFileTrans))
    print("Output path: " + str(options.outputPath))
    print("Output file: " + str(options.outputFile))
    print("Filter: " + str(options.filter))
    print("Output path attributive sentences: " + str(options.attrPath))
    print("Output path autoregulation sentences: " + str(options.autoPath))
    print("Output path dictionary: " + str(options.dicPath))
    print("Output file dictionary normalized_Effects.json: " + str(options.dicFile))
    print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))

    # ---- Loading normalized effects ----
    # Only past-participle effect names (ending in 'd') plus "dependent" are
    # used to build the attributive-sentence regex alternation.
    hashNormalizedEffects = {}
    with open(os.path.join(options.dicPath, options.dicFile)) as diccFile:
        hashNormalizedEffects = json.load(diccFile)
    listEffects = [eff for eff in hashNormalizedEffects.keys() if eff.endswith('d')]
    listEffects.append("dependent")
    effects = "|".join(listEffects)
    print("Effects: {}".format(effects))

    t0 = time()
    count = 0
    hashEntities = {}
    hashAttrSent = {}
    hashAutoSent = {}
    # Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
    # We decided to extract all sentences containing effect-TF because we observed some patterns where
    # "gene" does not appear, then, to recover these examples we employ a more general rule to separate
    # attributive sentences.
    reAttrSent = re.compile(r'(' + effects + r')\|[^|]+\|TF')
    # We decided to extract all sentences containing autoregulation, e.g.:
    #   FimZ also positively regulates its own transcription
    #   ArgP protein represses its own synthesis
    #   OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
    #   of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
    reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ')
    # Filter regexes are loop-invariant: compile them once, outside the loop.
    reGENETU = re.compile(r'(\|GENE|\|TU)')
    reEFFECT = re.compile(r'\|EFFECT')
    reTF = re.compile(r'\|TF')
    aFilter = options.filter
    print(" Processing file...{}".format(options.inputFileTrans))
    with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile:
        with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile:
            # CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf)
            file = options.inputFileTrans[options.inputFileTrans.rfind("/")+1:]
            file_tsv = file.replace(".tra.txt", ".pre.tsv")
            tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
            print("tsv_file.shape: {}".format(tsv_file.shape))
            tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
            print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
            tsv_file_new = tsv_file_filtered.reset_index(drop=True)
            # i indexes rows of tsv_file_new in lockstep with the sentence files.
            i = 0
            for tLine, wLine in zip(tFile, wFile):
                # FILT1: (GENE OR TU) AND TF
                # FILT2: (GENE OR TU) AND EFFECT AND TF
                if aFilter is not None:
                    tCount = str(count)
                    if aFilter == "FILT1":
                        if not (reGENETU.search(tLine) and reTF.search(tLine)):
                            # CFMC 2022-03-08: mark the sentence as discarded.
                            tsv_file_new.at[i, 'status'] = 0
                            i += 1
                            continue
                        else:
                            oFile.write(wLine)
                            if tCount not in hashEntities:
                                hashEntities[tCount] = getEntities(tLine, aFilter)
                            # Attributive sentences: effect-TF patterns.
                            if reAttrSent.search(tLine):
                                if tCount not in hashAttrSent:
                                    hashAttrSent[tCount] = tLine
                            # Autoregulation sentences: "regulates its own ..."
                            if reAutoSent.search(tLine):
                                if tCount not in hashAutoSent:
                                    hashAutoSent[tCount] = tLine
                    elif aFilter == "FILT2":
                        if not (reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine)):
                            # BUG FIX: these two statements were placed after
                            # `continue` and therefore never executed, so
                            # rejected FILT2 sentences kept status 1 and `i`
                            # fell out of sync with the tsv rows.
                            tsv_file_new.at[i, 'status'] = 0
                            i += 1
                            continue
                        else:
                            oFile.write(wLine)
                            if tCount not in hashEntities:
                                hashEntities[tCount] = getEntities(tLine, aFilter)
                            if reAttrSent.search(tLine):
                                if tCount not in hashAttrSent:
                                    hashAttrSent[tCount] = tLine
                            if reAutoSent.search(tLine):
                                if tCount not in hashAutoSent:
                                    hashAutoSent[tCount] = tLine
                    # count numbers only the sentences that passed the filter.
                    count += 1
                    i += 1
            # Propagate the updated statuses back onto the full tsv by idsentence.
            merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left')
            tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int)
            tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
            print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
            tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t')

    # Persist the entities found per kept sentence.
    with open(os.path.join(options.outputPath, options.outputFile.replace(".txt", ".ents.json")), "w", encoding="utf-8",
              errors="replace") as eFile:
        json.dump(hashEntities, eFile)
    # One file per attributive sentence, keyed by sentence number.
    for f, sent in hashAttrSent.items():
        listPath = options.inputFileTrans.split('/')
        fileName = listPath[-1]
        fileName = fileName.replace('.tra.', '.att.' + f + '.')
        print("Save file {}".format(fileName))
        with open(os.path.join(options.attrPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
            aFile.write(sent)
    # One file per autoregulation sentence, keyed by sentence number.
    for f, sent in hashAutoSent.items():
        listPath = options.inputFileTrans.split('/')
        fileName = listPath[-1]
        fileName = fileName.replace('.tra.', '.auto.' + f + '.')
        print("Save file {}".format(fileName))
        with open(os.path.join(options.autoPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
            aFile.write(sent)
    print("Files split in: %fs" % (time() - t0))