get-TRN-Organism-v1.py
12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to the TRN table
# Parameters:
# 1) --trnPath Path to TRN detail table
# 2) --trnFile File of TRN detail table
# 3) --outputPath Output path
# 4) --organismPath Path to Organism annotation table
# 5) --organismFile File of Organism annotation table
# Output:
# 1) Tsv file detail with:
# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
# OrganismScore = {
# If only salmonella or only non identified organism = 1,
# If (startswith salmonella or non identified organism) and other organisms = 0.5
# If only other organisms = 0
# }
# Execution:
# python3.4 get-TRN-Organism-v1.py
# Local
# python get-TRN-Organism-v1.py
# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --trnFile STMTRN_all.detail.tsv
# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results
# --organismFile annotations_STMTRN_all.sentences.csv
# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv
###########################################################
# MAIN PROGRAM #
###########################################################
# Organism labels that do not point to a concrete species (generic entries,
# plasmids, cloning vectors). A mention of one of these is scored the same
# way as a Salmonella mention. Matching against this set is exact and
# case-sensitive.
_NON_IDENTIFIED_ORGANISMS = frozenset([
    'unidentified plasmid',
    'unidentified',
    'bacterium',
    'bacterium IFAM-3211',
    'bacterium IFAM-2074',
    'bacterium IFAM-1493',
    'bacterium IFAM-3215',
    'bacterium IFAM-3359',
    'hybrid',
    'Vector pMC1403',
    'Transposon Tn10',
    'unidentified cloning vector',
    'Plasmid F',
    'Cloning vector pUC19',
])


def _count_salmonella_or_non_identified(list_temp):
    """Count the entries of *list_temp* that are Salmonella or non-identified.

    An entry counts when its lower-cased form starts with ``"salmonella"``
    or when it is exactly one of ``_NON_IDENTIFIED_ORGANISMS``.
    """
    return sum(
        1 for organism in list_temp
        if organism.lower().startswith("salmonella")
        or organism in _NON_IDENTIFIED_ORGANISMS
    )


def only_salmonella_or_non_identified_organism(list_temp):
    """Return True when every organism is Salmonella or non-identified.

    :param list_temp: list of organism name strings.
    :return: True if all entries match (also True for an empty list),
        False otherwise.
    """
    return _count_salmonella_or_non_identified(list_temp) == len(list_temp)


def salmonella_or_non_identified_and_other_organisms(list_temp):
    """Return True when the list mixes matching and non-matching organisms.

    :param list_temp: list of organism name strings.
    :return: True if at least one entry is Salmonella/non-identified and at
        least one entry is neither; False otherwise (including empty lists).
    """
    return 0 < _count_salmonella_or_non_identified(list_temp) < len(list_temp)


def only_other_organims(list_temp):
    """Return True when no organism is Salmonella or non-identified.

    :param list_temp: list of organism name strings.
    :return: True if no entry matches (also True for an empty list),
        False otherwise.
    """
    return _count_salmonella_or_non_identified(list_temp) == 0
if __name__ == "__main__":
    # Script entry point:
    #   1. parse and validate the command-line options,
    #   2. copy organism annotations into the TRN detail table and score them,
    #   3. group the remaining sentences by regulatory interaction (RI),
    #   4. write a "summary_org" and a "detail_org" TSV file.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--trnPath", dest="trnPath",
                      help="Path to TRN detail table", metavar="PATH")
    parser.add_option("--trnFile", dest="trnFile",
                      help="File of TRN detail table", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--organismPath", dest="organismPath",
                      help="Path to organism annotation table", metavar="PATH")
    parser.add_option("--organismFile", dest="organismFile",
                      help="File of organism annotation table", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # Positional arguments are not accepted. parser.error() prints the
        # message and terminates the process itself, so no sys.exit() follows.
        parser.error("None parameter entered.")

    # Every option is needed to build a path below; report missing ones here
    # instead of failing later inside os.path.join() with a TypeError.
    required_options = [
        ("--trnPath", options.trnPath),
        ("--trnFile", options.trnFile),
        ("--outputPath", options.outputPath),
        ("--organismPath", options.organismPath),
        ("--organismFile", options.organismFile),
    ]
    missing_options = [name for name, value in required_options if value is None]
    if missing_options:
        parser.error("Missing required parameter(s): {}".format(", ".join(missing_options)))

    # Output names are derived by replacing "detail" in the input file name.
    # Without that substring both outputs would get the input's own name
    # (overwriting each other, and the input when the paths coincide).
    if "detail" not in options.trnFile:
        parser.error("--trnFile must contain 'detail' to derive output file names.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to TRN detail table: " + str(options.trnPath))
    print("File of TRN detail table: " + str(options.trnFile))
    print("Output path: " + str(options.outputPath))
    print("Path to organism annotation table: " + str(options.organismPath))
    print("File of organism annotation table: " + str(options.organismFile))

    # Load organism annotation table
    print("Loading organism annotation table")
    df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',')
    print("Total de frases anotadas con organism: {}".format(df_organisms.shape[0]))

    # Load TRN detail table
    print("Loading TRN detail table")
    df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t')
    print("Total de frases en TRN: {}".format(df_detail.shape[0]))

    # Fix column for organism. We changed this issue in get-TRN-v2.py
    df_detail = df_detail.rename(columns={"Organism": "Organisms"})
    # An entirely empty column is read as float64; make it an object column so
    # organism strings can be stored in it without a dtype conflict.
    df_detail['Organisms'] = df_detail['Organisms'].astype(object)
    # Sentences without an organism annotation keep the default score of 1.
    df_detail['OrganismScore'] = 1.00
    print(df_detail.columns)

    # OrganismScore = {
    #   If only salmonella or only non identified organism = 1,
    #   If (startswith salmonella or non identified organism) and other organisms = 0.5
    #   If only other organisms = 0
    # }
    for idx in df_organisms.index:
        organisms = df_organisms['Organisms'][idx]
        # Map the annotated sentence number to a row label of the detail table.
        # NOTE(review): the offset of 2 presumably accounts for a header line
        # plus 1-based numbering - confirm against the annotation tool.
        row = int(df_organisms['SentenceNumberInFile'][idx]) - 2
        if row not in df_detail.index:
            # Writing to an unknown label with .at would append a new, mostly
            # empty row to the table; skip the annotation instead.
            print("WARNING: sentence number {} has no row in TRN detail table; skipped".format(row + 2),
                  file=sys.stderr)
            continue

        # Use .at on the DataFrame itself: chained assignment such as
        # df_detail.Organisms[row] = ... may write to a temporary copy.
        df_detail.at[row, 'Organisms'] = organisms
        list_organisms = organisms.split(';')
        if only_salmonella_or_non_identified_organism(list_organisms):
            df_detail.at[row, 'OrganismScore'] = 1.00
        elif salmonella_or_non_identified_and_other_organisms(list_organisms):
            df_detail.at[row, 'OrganismScore'] = 0.50
        elif only_other_organims(list_organisms):
            df_detail.at[row, 'OrganismScore'] = 0.00

    # Group sentences by RI. The key is the tab-joined
    # (TF, TypeRegulated, Regulated, Effect) tuple, which doubles as the first
    # four output columns.
    sentence_columns = [
        'PMID', 'NumSentence', 'TypeSentence', 'Sentence', 'OriginalIdSentence',
        'OriginalSentence', 'SectionNum', 'SectionName', 'Organisms', 'OrganismScore',
    ]
    # Six trailing columns (KT, CL, Source, Speculation, Negation,
    # ConfirmationLevel) are written empty by this script.
    empty_trailing_fields = [""] * 6
    sentence_types = ("ver/dev", "dev", "att", "auto")

    hashPredictedRIs = {}           # RI key -> list of tab-joined sentence records
    hashPredictedRIsTypeCount = {}  # RI key -> {sentence type: number of sentences}
    hashPredictedRIsScore = {}      # RI key -> score (always 1; no penalization applied)

    for idx in df_detail.index:
        # Sentences that mention only other organisms are discarded.
        if df_detail['OrganismScore'][idx] == 0:
            continue
        llave = "{}\t{}\t{}\t{}".format(df_detail['TF'][idx], df_detail['TypeRegulated'][idx],
                                        df_detail['Regulated'][idx], df_detail['Effect'][idx])
        type_sent = df_detail['TypeSentence'][idx]
        fields = [df_detail[column][idx] for column in sentence_columns] + empty_trailing_fields
        record = "\t".join("{}".format(field) for field in fields)

        if llave not in hashPredictedRIs:
            hashPredictedRIs[llave] = []
            hashPredictedRIsTypeCount[llave] = dict.fromkeys(sentence_types, 0)
            hashPredictedRIsScore[llave] = 1
        hashPredictedRIs[llave].append(record)
        # Sentence types outside the four known ones are counted only in the
        # total (SentCount), not in a per-type column.
        if type_sent in hashPredictedRIsTypeCount[llave]:
            hashPredictedRIsTypeCount[llave][type_sent] += 1

    print("Total RIs en TRN con organismo: {}".format(len(hashPredictedRIs)))

    # Summary file: one line per RI. "dev" sentences are tallied above but the
    # summary has no Dev column, so that count is not written.
    summary_file = os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org"))
    with open(summary_file, mode="w", encoding="utf-8") as oFile:
        oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
        for k, v in hashPredictedRIs.items():
            RI_value = "True"
            type_count = hashPredictedRIsTypeCount[k]
            oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, len(v), type_count["ver/dev"],
                                                              type_count["att"], type_count["auto"],
                                                              hashPredictedRIsScore[k], RI_value))

    # Detail file: one line per retained sentence, prefixed with its RI key.
    detail_file = os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org"))
    with open(detail_file, mode="w", encoding="utf-8") as oFile:
        oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n")
        i = 0
        for k, v in hashPredictedRIs.items():
            for s in v:
                oFile.write("{}\t{}\n".format(k, s))
                i += 1
    print("Total de frases en TRN organismo: {}".format(i))