# Bacterial regulatory interaction extraction system

## Prerequisites
1. The input file must be tokenized and sentence-split.
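The pipeline expects two parallel views of the sentences (see the parameter list in automatic-extraction-ris-gcs.sh): a plain-token file and a transformed file with word|lemma|pos triplets. A hypothetical one-sentence example (tokens, lemmas, and tags are illustrative):

```
# word file
ArgP activates argO expression .
# transformed file
ArgP|argp|NN activates|activate|VBZ argO|argo|NN expression|expression|NN .|.|.
```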
## Run
### Several files
Set the filenames and paths in run-several-files.sh.
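For a single file, a minimal sketch of the 13-argument call to the main script (all paths and filenames below are illustrative; see the commented example in the header of automatic-extraction-ris-gcs.sh for a real invocation):

```bash
./automatic-extraction-ris-gcs.sh input/file.tra.word.txt input/file.tra.txt \
  ri-openie-extraction/ris.txt predicted-ris-gcs Y Y FILT1 \
  reference-path reference.txt evaluation-reports evaluation.txt \
  diccionario-STM-LT2-v7.0.SYNONYMS.json sentences.tsv
```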
## Acknowledgments
This work was supported by UNAM-PAPIIT IA203420.
#!/bin/bash
# Main script for automatic extraction of regulatory interactions

# Parameters
# 1: Path and filename of the preprocessed sentences in token (word) format
# 2: Path and filename of the preprocessed sentences in transformed format (word|lemma|pos)
# 3: Path and filename of the file for OpenIE processing
# 4: Output path for the a1 and a2 files with RIs and GCs
# 5: Simplify? Y/N
# 6: Separate verbal and deverbal sentences? Y/N
# 7: Filter for sentences containing entities. FILT1 = (GENE OR TU) AND TF
# 8: Path with the reference a1 and a2 files (true RIs and GCs)
# 9: Reference file (true RIs and GCs)
# 10: Path for saving the evaluation file
# 11: File for saving the evaluation results against the reference
# 12: File with TF synonyms
# 13: Path of the tsv file with section, sentence id, and sentence (extracted from jsonpdf)

# RUN EXTRACTION FOR L&C STM
# ./automatic-extraction-ris-gcs.sh
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# Y Y FILT1
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference
# unused.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports
# unused.txt
# diccionario-STM-LT2-v7.0.SYNONYMS.json
# 1>uno-STM-LC.txt
# 2>dos-STM-LC.txt
# ./automatic-extraction-ris-gcs.sh /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs Y Y FILT1 /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference unused.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports unused.txt diccionario-STM-LT2-v7.0.SYNONYMS.json 1>uno-STM-LC.txt 2>dos-STM-LC.txt

# Some help
# Filename without path: filename=$(basename "$fullfile")
# Filename extension: extension="${filename##*.}"
# Filename without extension: filename="${filename%.*}"
# For "argument list too long" errors across many files: find . -print0 | xargs -0 grep AcrR

PATH_TO_CORENLP=/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09
DICC_PATH=/home/cmendezc/terminologicalResources
ISIMP_PATH=/home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/sentence-simplification/isimp_v2

SCRIPT_PATH=$(cd `dirname $0` && pwd)
INPUT_PATH=$1
INPUT_PATH_TRANS=$2
OUTPUT_FILE=$3
OUTPUT_PATH=$4
INPUT_NAME_EXT=$(basename "$INPUT_PATH")
INPUT_NAME="${INPUT_NAME_EXT%.*}"
# Simplify sentences?
SIMPLIFY=$5
# Separate sentences with deverbal effect?
DEVERBAL_SEPARATOR=$6
FILT=$7
TRUE_PATH=$8
TRUE_FILE=$9
PATH_EVAL=${10}
FILE_EVAL=${11}
DICC_SYNON=${12}
# CFMC 2022-03-09: tsv file with section, sentence id, sentence (extracted from jsonpdf)
TSV_PATH=${13}

# Validate arguments
if [[ ! ("$#" == 13 ) ]]; then
    echo 'Usage: ./automatic-extraction-ris-gcs.sh <inputPath_wordFile>
    <inputPath_taggedFile> <outputFile> <outputPath> <simplify?> <deverbal_separator?>
    <filter> <true_path> <true_file> <path_evaluation_report> <file_evaluation_report>
    <dictionary_TFs_synonyms> <path_tsv_file>'
    exit 1
fi

echo "********** SELECTED PARAMETERS **********"
echo "INPUT PATH: $INPUT_PATH"
echo "INPUT PATH TRANSFORMED FILE: $INPUT_PATH_TRANS"
echo "OUTPUT FILE: $OUTPUT_FILE"
echo "OUTPUT PATH: $OUTPUT_PATH"
echo "SIMPLIFY SENTENCES? $SIMPLIFY"
echo "SEPARATE DEVERBAL SENTENCES? $DEVERBAL_SEPARATOR"
echo "FILTER SENTENCES WITH ENTITIES? $FILT"
echo "REFERENCE (TRUE) PATH: $TRUE_PATH"
echo "REFERENCE (TRUE) FILE: $TRUE_FILE"
echo "PATH EVALUATION REPORT: $PATH_EVAL"
echo "FILE EVALUATION REPORT: $FILE_EVAL"
echo "DICTIONARY OF SYNONYMS OF TFS: $DICC_SYNON"

echo "********** SELECTED PROCESSES **********"
CLEAN_OUTPUT=FALSE
echo "   Clean output paths: $CLEAN_OUTPUT"

FILTER=TRUE
echo "   Filter sentences: $FILTER"

CLEAN=TRUE
echo "   Clean sentences for iSimp: $CLEAN"

SEPARATE=TRUE
echo "   Separate sentences for iSimp: $SEPARATE"

SIMPLI=TRUE
echo "   Simplify sentences: $SIMPLI"

DEVERBAL=TRUE
echo "   Separate deverbal and verbal sentences: $DEVERBAL"

DEVTAG=TRUE # Needs DEVERBAL=TRUE
echo "   Tag sentences to separate deverbal and verbal sentences: $DEVTAG"

DEVSEPAR=TRUE # Needs DEVERBAL=TRUE
echo "   Do separate deverbal and verbal sentences: $DEVSEPAR"

EXTDEVERBAL=TRUE
echo "   Extract RI deverbal: $EXTDEVERBAL"

OPENIE=TRUE
echo "   OpenIE triplet extraction: $OPENIE"

EXTOPENIE=TRUE
echo "   Extract RI verbal: $EXTOPENIE"

EXTATTRIB=TRUE
echo "   Extract RI attributive: $EXTATTRIB"

EXTAUTOREG=TRUE
echo "   Extract RI autoregulation: $EXTAUTOREG"

EXTGC=FALSE
echo "   Extract growth conditions: $EXTGC"

EVAL=FALSE
echo "   Evaluate extraction: $EVAL"

EVALGC=FALSE
echo "   Evaluate growth condition extraction: $EVALGC"

#########################
# Cleaning output paths #
#########################
if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
    if [ -z "$(ls -A $OUTPUT_PATH/complete-ris/)" ]; then :
    else
        #echo "Not Empty"
        # Original: rm $OUTPUT_PATH/complete-ris/*
        find $OUTPUT_PATH/complete-ris -maxdepth 1 -name '*.*' -delete
    fi
    if [ -z "$(ls -A $OUTPUT_PATH/incomplete-ris/)" ]; then :
    else
        #echo "Not Empty"
        # Original: rm $OUTPUT_PATH/incomplete-ris/*
        find $OUTPUT_PATH/incomplete-ris -maxdepth 1 -name '*.*' -delete
    fi
fi # if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
#################
# Preliminaries #
#################
# Clone and update the simplification pipeline
#if [ ! -d "./sentence-simplification" ]
#    then
#        echo Downloading sentence simplifier...
#        git clone https://github.com/ezojg/sentence-simplification
#    else
#        cd ./sentence-simplification
#        git pull origin master
#        cd ..
#fi
# Check for iSimp
#if [ ! -d "./sentence-simplification/isimp_v2" ]
#    then
#        echo ERROR: ./sentence-simplification/isimp_v2 not found. Please manually copy iSimp to said path.
#        exit 1
#fi

if [ "$FILTER" = "TRUE" ]; then
echo "********** FILTER SENTENCES **********"
####################################################
# Filter sentences with entities of interest       #
# and collect attributive examples (ArgP-regulated) #
####################################################
# INPUT:
#   1) --inputFileWord $INPUT_PATH input file with the 'word' feature
#   2) --inputFileTrans $INPUT_PATH_TRANS transformed input file
#   3) --outputPath $SCRIPT_PATH/filtered-sentences
#   4) --outputFile filtered-sentences.txt output file
#   5) --filter $FILT
#        FILT1: (GENE OR TU) AND TF
#        FILT2: (GENE OR TU) AND EFFECT AND TF
#   6) --attrPath $SCRIPT_PATH/attributive-sentences path for attributive cases: ArgP-regulated genes
#   7) --attrFile attributive-sentences.txt file for attributive cases: ArgP-regulated genes
#   $DICC_PATH/normalized_Effects.json

cd $SCRIPT_PATH
if [ -z "$(ls -A ./filtered-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./filtered-sentences/*
fi
if [ -z "$(ls -A ./attributive-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./attributive-sentences/*
fi
if [ -z "$(ls -A ./autoregulation-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./autoregulation-sentences/*
fi
# CFMC 2022-03-09: To update the tsv file with filtered sentences
# python3.4 $SCRIPT_PATH/sentence-filter.py --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
python3.4 $SCRIPT_PATH/sentence-filter_v02.py --tsvPath $TSV_PATH --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
fi # if [ "$FILTER" = "TRUE" ]; then

if [ "$CLEAN" = "TRUE" ]; then
echo "********** CLEAN SENTENCES **********"
#############################
# Clean sentences for iSimp #
#############################
# INPUT - PREVIOUS OUTPUT: filtered sentences $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt
# Output path and file: $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
if [ -z "$(ls -A ./format/sanitized_sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./format/sanitized_sentences/*
fi
# Original Daniel: python2 $SCRIPT_PATH/format/regex-before.py $INPUT_PATH $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
python2 $SCRIPT_PATH/format/regex.py $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
fi # if [ "$CLEAN" = "TRUE" ]; then

if [ "$SEPARATE" = "TRUE" ]; then
echo "********** SEPARATE SENTENCES **********"
################################
# Separate sentences for iSimp #
################################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/sanitized_sentences/$l
# Output path and file: $SCRIPT_PATH/format/split_sentences/$BARE_NAME
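# Naming sketch (hypothetical file): a 12-line sanitized file foo.word.txt
# is split below into foo_00.spt ... foo_11.spt, because the numeric suffix
# width (-a) is set to the number of digits in the file's line count.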
cd $SCRIPT_PATH
if [ -z "$(ls -A ./format/split_sentences/)" ]; then :
else
    rm ./format/split_sentences/*
fi
cd ./format/sanitized_sentences
for l in $(\ls $INPUT_NAME*)
do
    # echo $l
    BARE_NAME=$(echo $l | cut -f 1 -d '.')
    BARE_NAME+="_"
    LENGTH="$(wc -l < $l)"
    LENGTH="$(echo "${#LENGTH}")"
    split -a $LENGTH -d -l 1 --additional-suffix=.spt $SCRIPT_PATH/format/sanitized_sentences/$l $SCRIPT_PATH/format/split_sentences/$BARE_NAME
done
fi # if [ "$SEPARATE" = "TRUE" ]; then

if [ "$SIMPLI" = "TRUE" ]; then
echo "********** SIMPLIFY SENTENCES **********"
######################
# Simplify sentences #
######################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/split_sentences
# Output file: $OUTPUT_FILE
# Path to iSimp: $ISIMP_PATH
# CALL: ./sentence-simplification/sentence-simplification-main.sh
# CALL: $ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
# CALL: $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
#       $OUTPUT_INDEX_FILE_PATH = $OUTPUT_FILE
# OUTPUT: simplified sentences in path ./algorithm_sentences

# while true; do
#     read -p "Do you wish to simplify sentences? [Y/N]: " yn
#     case $yn in
#         [Yy]* ) SIMP=1; break;;
#         [Nn]* ) SIMP=0; break;;
#         * ) echo "Please answer yes [Y] or no [N].";;
#     esac
# done
case $SIMPLIFY in
    [Yy]* )
        SIMP=1
        ;;
    [Nn]* )
        SIMP=0
        ;;
    * )
        SIMP=1
        ;;
esac
cd $SCRIPT_PATH
if [ $SIMP == 1 ]
    then # WITH SIMPLIFICATION
        echo "********** YES SIMPLIFY SENTENCES **********"
        # Copy file to sentence-simplification
        #FILE_NAME=$(basename "$INPUT_PATH")
        # Call the simplification pipeline AND create a file with the paths of the simplified sentences
        ./sentence-simplification/sentence-simplification-main.sh $SCRIPT_PATH/format/split_sentences $OUTPUT_FILE $ISIMP_PATH
        #echo "Input: $SCRIPT_PATH/format/split_sentences -- output: $OUTPUT_FILE"
        #echo "Sentences simplified. Paths to simplified sentences saved in $OUTPUT_FILE"
    else # WITHOUT SIMPLIFICATION
        echo "********** NO SIMPLIFY SENTENCES **********"
        if [ -z "$(ls -A ./sentence-simplification/algorithm_sentences/)" ]; then :
        else
            #echo "Not Empty"
            rm ./sentence-simplification/algorithm_sentences/*
        fi
        ls $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE
        cp $SCRIPT_PATH/format/split_sentences/* $SCRIPT_PATH/sentence-simplification/algorithm_sentences
        #echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi
fi # if [ "$SIMPLI" = "TRUE" ]; then

if [ "$DEVERBAL" = "TRUE" ]; then
echo "********** SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
######################
# Deverbal separator #
######################
# $PATH_TO_CORENLP
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/sentence-simplification/algorithm_sentences
# Output path: $SCRIPT_PATH/deverbal-separator/separated_sentences
# $DICC_PATH
# $DEVTAG POS tagging of sentences
# $DEVSEPAR Do separate sentences
# CALL: java -cp "$PATH_TO_CORENLP/*"
# $SCRIPT_PATH/filter.py
# OUTPUT: sentences separated into two paths according to verbal/deverbal effect

case $DEVERBAL_SEPARATOR in
    [Yy]* )
        DEVSEP=1
        ;;
    [Nn]* )
        DEVSEP=0
        ;;
    * )
        DEVSEP=1
        ;;
esac
if [ $DEVSEP == 1 ]
    then # WITH DEVERBAL SEPARATOR

        #if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/)" ]; then :
        #else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/*
            # find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -maxdepth 1 -name '*.vrb' -delete
        #fi
        #if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/)" ]; then :
        #else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*
            # find $SCRIPT_PATH/deverbal-separator/separated_sentences/dev -maxdepth 1 -name '*.dev' -delete
        #fi

        echo "********** YES SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
        # Original Daniel 2018-12-06: ./deverbal-separator/separator.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
        ./deverbal-separator/separator-v02.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
    else # WITHOUT DEVERBAL SEPARATOR
        echo "********** NO SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
        ls $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE
        #echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi # [ $DEVSEP == 1 ]
fi # if [ "$DEVERBAL" = "TRUE" ]; then

if [ "$EXTDEVERBAL" = "TRUE" ]; then
echo "********** EXTRACT RI DEVERBAL **********"
#######################
# Extract RI deverbal #
#######################
# INPUT: deverbal files $(dirname ${file}) $(basename ${file})
# Output path: $OUTPUT_PATH $(basename ${file%.*})
# $DICC_PATH/names_EFFECT_ONTOGENE.txt
# $DICC_PATH/names_GENE.txt
# $DICC_PATH/names_GENE_ONTOGENE.txt
# $DICC_PATH/names_GENE_SYN.txt
# $DICC_PATH/names_TU.txt
# $DICC_PATH/names_TU_ONTOGENE.txt
# $DICC_PATH/names_TF_1grams.txt
# $DICC_PATH/names_TF_2grams.txt
# $DICC_PATH/names_TF_3grams.txt
# $DICC_PATH/names_TF_4grams.txt
# $DICC_PATH/names_TF_5Moregrams.txt
# $DICC_PATH/names_TF_ONTOGENE.txt
# $DICC_PATH/normalized_Effects.json
# OUTPUT: standoff files with RIs
# PATH OF ALREADY TAGGED ENTITIES: $SCRIPT_PATH/filtered-sentences
# FILE OF ALREADY TAGGED ENTITIES: filtered-sentences.ents.json
    for file in $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*.*
    do
        #python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-OriginalDaniel.py $file $OUTPUT_PATH/$(basename ${file%.*}) $DICC_PATH/names_EFFECT_ONTOGENE.txt $DICC_PATH/names_GENE.txt $DICC_PATH/names_GENE_ONTOGENE.txt $DICC_PATH/names_GENE_SYN.txt $DICC_PATH/names_TU.txt $DICC_PATH/names_TU_ONTOGENE.txt $DICC_PATH/names_TF_1grams.txt $DICC_PATH/names_TF_2grams.txt $DICC_PATH/names_TF_3grams.txt $DICC_PATH/names_TF_4grams.txt $DICC_PATH/names_TF_5Moregrams.txt $DICC_PATH/names_TF_ONTOGENE.txt
        #echo "Dir file: $(dirname ${file})"
        #echo "File $(basename ${file})"
        #echo "OUTPUT_PATH $OUTPUT_PATH"
        #echo "File $(basename ${file%.*})"
        echo "Dir and files: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        #python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v02.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
        python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v03.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
    done
fi # if [ "$EXTDEVERBAL" = "TRUE" ]; then

if [ "$OPENIE" = "TRUE" ]; then
echo "********** OPENIE TRIPLET EXTRACTION **********"
    ########################
    # OpenIE RI extraction #
    ########################
    # Join the verbal sentences into one file list for OpenIE extraction
    # Error: /bin/ls: Argument list too long: ls $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE
    echo "   Join verbal sentences into file for OpenIE extraction"
    find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -type f -name '*' > $OUTPUT_FILE
    #echo "Verbal sentences separated. Paths to verbal sentences saved in $OUTPUT_FILE"

    echo "   CoreNLP OpenIE..."
    java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.naturalli.OpenIE -filelist $OUTPUT_FILE -triple.strict false -triple.all_nominals true -format reverb > $OUTPUT_FILE.reverb
fi # if [ "$OPENIE" = "TRUE" ]; then

if [ "$EXTOPENIE" = "TRUE" ]; then
    echo "********** OPENIE RI EXTRACTION **********"
    ########################
    # OpenIE RI extraction #
    ########################
    # Nacho's oie_compress was replaced by a program written by CMC that analyzes
    # the triplets and keeps those suggesting the participants and the effect
    # Paste input and output for fancy printing
    # Original Nacho: echo "   Fancy printing..."
    # Original Nacho: > $OUTPUT_FILE.fuzzy
    # Original Nacho: python3 oie_compress.py --oies $OUTPUT_FILE.reverb --op fuzzy --ris $DICC_PATH/normalized_Effects.json --out $OUTPUT_FILE.fuzzy
    #
    # --inputFile $OUTPUT_FILE.reverb file obtained with CoreNLP
    # --outputPath $OUTPUT_PATH
    # --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
    # --diccFile Before: termFilesTag_RIE_GCE_SYSTEM_ECCO.json
    # --diccEffect normalized_Effects.json
    # --format standoff
    # --diccEPAth $DICC_PATH
    # OUTPUT: standoff files with RIs

    # python3.4 $SCRIPT_PATH/ri-openie-extraction.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json --diccEffect normalized_Effects.json --format standoff
    python3.4 $SCRIPT_PATH/ri-openie-extraction-v02.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $SCRIPT_PATH/filtered-sentences --diccFile filtered-sentences.ents.json --diccEffect normalized_Effects.json --diccEPAth $DICC_PATH --format standoff

    # Join into a single file
    # Sort fuzzy
    # Original Nacho: echo "   Sort fuzzy..."
    # Gets the effect type
    # Original Nacho: sort $OUTPUT_FILE.fuzzy -o $OUTPUT_FILE.fuzzy
    # Concatenate
    # CMC removed the following lines because simplification was
    # already discriminated before
    #if [ $SIMP == 1 ]
    #then # WITH SIMPLIFICATION
    #ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    #awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    #cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    #paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    #else # WITHOUT SIMPLIFICATION
    #ls -l $SCRIPT_PATH/format/split_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    #awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    #cat $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE.als
    #paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    #fi
    # Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    # Original Nacho: echo "   Creating ils, fls and als files..."
    # Original Nacho: if [ $DEVSEP == 1 ]
    # Original Nacho: then # WITH DEVERBAL SEPARATOR
    # Original Nacho: ls -l $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho: cat $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE.als
    # Original Nacho: else # WITHOUT DEVERBAL SEPARATOR
    # Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    # Original Nacho: fi
    # Original Nacho: echo "   Paste merger..."
    # Original Nacho: paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    # Original Nacho: echo "   Create dsp file..."
    # Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
    # rm $(dirname "$OUTPUT_FILE")/*.fls
    # rm $(dirname "$OUTPUT_FILE")/*.ils
    # rm $(dirname "$OUTPUT_FILE")/*.als
    #rm $SCRIPT_PATH/*.merger
    #rm $SCRIPT_PATH/*.reverb
    #rm $SCRIPT_PATH/*.fuzzy
fi # if [ "$EXTOPENIE" = "TRUE" ]; then

if [ "$EXTATTRIB" = "TRUE" ]; then
    echo "********** ATTRIBUTIVE RI EXTRACTION **********"
    #############################
    # Attributive RI extraction #
    #############################
    # Attributive RI extraction, such as "ArgP-regulated genes"
    #
    # --inputPath $SCRIPT_PATH/attributive-sentences
    # --outputPath $OUTPUT_PATH
    # --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
    # --diccEffect normalized_Effects.json
    # OUTPUT: standoff files with RIs

    for file in $SCRIPT_PATH/attributive-sentences/*.*
    do
        echo "Dir file: $(dirname ${file})"
        echo "File: $(basename ${file})"
        # echo "OUTPUT_PATH $OUTPUT_PATH"
        # echo "File $(basename ${file%.*})"
        # echo "All $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        if [ "$(basename ${file})" = "*.*" ]; then
            echo "No attributive sentences found"
        else
            python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
        fi
    done

fi # if [ "$EXTATTRIB" = "TRUE" ]; then

if [ "$EXTAUTOREG" = "TRUE" ]; then
    echo "********** AUTOREGULATION RI EXTRACTION **********"
    ################################
    # Autoregulation RI extraction #
    ################################
    # Autoregulation RI extraction, such as "ArgP protein represses its own synthesis"
    #
    # --inputPath $SCRIPT_PATH/autoregulation-sentences
    # --outputPath $OUTPUT_PATH
    # --diccPath $DICC_PATH
    # --diccEffect normalized_Effects.json
    # OUTPUT: standoff files with RIs

    for file in $SCRIPT_PATH/autoregulation-sentences/*.*
    do
        echo "Dir file: $(dirname ${file})"
        echo "File: $(basename ${file})"
        # echo "OUTPUT_PATH $OUTPUT_PATH"
        # echo "File $(basename ${file%.*})"
        # echo "All $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        if [ "$(basename ${file})" = "*.*" ]; then
            echo "No autoregulation sentences found"
        else
            python3 $SCRIPT_PATH/ri-autoregulation-extraction-v01.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
        fi
    done

fi # if [ "$EXTAUTOREG" = "TRUE" ]; then

if [ "$EXTGC" = "TRUE" ]; then
    echo "********** EXTRACT GROWTH CONDITIONS **********"
    #############################
    # Extract growth conditions #
    #############################
    python3.4 $SCRIPT_PATH/extract-gcs/extract-gcs-regex.py --inputPath $OUTPUT_PATH/complete-ris --outputPath $OUTPUT_PATH/complete-ris --termPath /home/cmendezc/terminologicalResources
    #python3 ./GCs-regex-before.py ./ejemplo_11.spt
    #/home/elwe/Documents/prueba3/RIE_reordenado/RI-searcher/GC/ejemplo_11.spt ./ejemplo_11.a2
    #./names_GC_ECCO_1grams.txt ./names_GC_ECCO_2grams.txt ./names_GC_ECCO_3grams.txt
    #./names_GC_ECCO_4grams.txt ./names_GC_ECCO_5Moregrams.txt
fi # if [ "$EXTGC" = "TRUE" ]; then

if [ "$EVAL" = "TRUE" ]; then
    echo "********** EVALUATE EXTRACTION **********"
    if [ "$EVALGC" = "TRUE" ]; then
        echo "********** EVALUATE GROWTH CONDITION EXTRACTION **********"
        python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON --evaluateGCs
    else
        echo "********** EVALUATE WITHOUT GROWTH CONDITION EXTRACTION **********"
        python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON
    fi # if [ "$EVALGC" = "TRUE" ]; then
fi # if [ "$EVAL" = "TRUE" ]; then

# import fileinput
# import regex as re
# from regex import finditer
import sys
import json

if len(sys.argv) != 3:
    # Original Daniel: sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <EFFs_dictionary> \n" )
    sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <normalized_Effects> \n")
    sys.stderr.flush()
    sys.exit(2)

# Read the input file
text_file = open(sys.argv[1], "r")
dato = text_file.read()
text_file.close()

# Read the dictionary

# Loading normalized effects
# print('Loading normalized effects...')
with open(sys.argv[2]) as diccFile:
    hashNormalizedEffects = json.load(diccFile)
DICC = list(hashNormalizedEffects.keys())

# Original Daniel: text_file = open( sys.argv[2], "r" )
# Original Daniel: DICC = text_file.read().splitlines()
# Original Daniel: text_file.close()

# Declare flags
is_dev = False
is_vrb = False


# DICC
# 2018-11-30 CMC: We separated nouns and only the past participle for deverbal processing,
# and all other verb forms as verbal
# VRB: VB verb, base form: think
# VRB: VBZ verb, 3rd person singular present: she thinks
# VRB: VBP verb, non-3rd person singular present: I think
# VRB: VBD verb, past tense: they thought
# DEV: VBN verb, past participle: a sunken ship
# VRB: VBG verb, gerund or present participle: thinking is fun
# extend/VBP
for i in range(len(DICC)):
    # print(DICC[i])
    for token in dato.split():
        word = token[:token.find("/")]
        tag = token[token.find("/")+1:]
        # print("word: {}".format(word))
        # print("tag: {}".format(tag))
        if (DICC[i] in word) and (("NN" in tag) or ("VBN" == tag)):
            is_dev = True
            # print("deverbal: " + word)
        if (DICC[i] in word) and ("VB" in tag):
            is_vrb = True
            # print("verbal: " + word)

if is_dev and is_vrb:
    sys.exit(11)
elif is_dev:
    sys.exit(12)
elif is_vrb:
    sys.exit(13)
else:
    sys.exit(10)

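# Usage sketch (file names are illustrative; this mirrors how
# deverbal-separator/separator-v02.sh consumes the exit codes):
#   python3 filter-v03.py tagged-line/sentence_01.spt normalized_Effects.json
#   echo $?   # 11: deverbal and verbal, 12: deverbal only, 13: verbal only, 10: neither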
#!/bin/bash
# Separates sentences into deverbal (.dev) and verbal (.vrb)

# Original Daniel: PATH_TO_CORENLP=/home/elwe/Documents/temporal/CoreNLP

# Validate arguments
if [[ ! ("$#" == 6 ) ]]; then
    echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
    exit 1
fi

SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Original Daniel: INPUT_PATH=$1 # folder containing the files to separate
# Original Daniel: OUTPUT_PATH=$2
PATH_TO_CORENLP=$1
INPUT_PATH=$2 # folder containing the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
# Tag sentences to separate deverbal and verbal sentences: $DEVTAG
TAG=$5
# Do separate deverbal and verbal sentences: $DEVSEPAR
SEP=$6

if [ $TAG == "TRUE" ]
    then # TAG WITH THE STANFORD TOOLS

        if [ -z "$(ls -A $SCRIPT_PATH/tagged/)" ]; then :
        else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged/*
            find $SCRIPT_PATH/tagged -maxdepth 1 -name '*.conll' -delete
        fi

        # Added by CMC
        if [ -z "$(ls -A $SCRIPT_PATH/tagged-line/)" ]; then :
        else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged-line/*
            find $SCRIPT_PATH/tagged-line -maxdepth 1 -name '*.spt' -delete
        fi

        for j in $INPUT_PATH/*
        do
            #echo $j
            # Original Daniel: java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.parser.lexparser.LexicalizedParser -writeOutputFiles -retainTMPSubcategories -outputFormat "wordsAndTags" $SCRIPT_PATH/englishPCFG.ser.gz $j
            # Command line: java -cp "/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file datos_0.spt -outputDirectory tagged
            # java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
            # With parse: java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
            java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
        done

        # Original Daniel: mv $INPUT_PATH/*.stp $SCRIPT_PATH/tagged/
        for j in $SCRIPT_PATH/tagged/*
        do
            # Original Daniel: awk 'NF {print $2 "/" $4}' tagged/$j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${j%.spt}"
            filename=$(basename "$j")
            #filename="${filename%.*}"
            awk 'NF {print $2 "/" $4}' $j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${filename%.*}.spt"
            # Original Daniel: mv "$j" "${j%.stp}"
        done
fi # if [ $TAG == "TRUE" ]

if [ $SEP == "TRUE" ]
    then # SEPARATE FILES

        # Original Daniel: if [ -z "$(ls -A $OUTPUT_PATH)" ]; then :
        # Modified by Carlos Méndez
        if [ -z "$(ls -A $OUTPUT_PATH/dev)" ]; then :
        else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/dev/*
            find $OUTPUT_PATH/dev -maxdepth 1 -name '*.dev' -delete
        fi

        if [ -z "$(ls -A $OUTPUT_PATH/vrb)" ]; then :
        else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/vrb/*
            find $OUTPUT_PATH/vrb -maxdepth 1 -name '*.vrb' -delete
        fi

        for j in $SCRIPT_PATH/tagged-line/*
        do
            # Original Daniel: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/names_EFFECT_ONTOGENE.txt
            # CMC 2018-12-04: Without separating verbal forms: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/normalized_Effects.json
            # CMC 2018-12-11: With separating verbal forms: python3 $SCRIPT_PATH/filter-v02.py $j $DICC_PATH/normalized_Effects.json
            # CMC 2018-12-11: Considering only the passive verbal form as deverbal: VBN verb, past participle
            python3 $SCRIPT_PATH/filter-v03.py $j $DICC_PATH/normalized_Effects.json
            VAR=$?
            # filename=${j##*/}
            # inputfile=${filename%.spt}
            # exit

            if [ $VAR == 11 ]; then :
                # contains dev and vrb
                # Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
                # Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
                #echo "Deverbal and verbal"
                cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
                cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            elif [ $VAR == 12 ]; then :
                # contains dev
                #echo "Deverbal"
                cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
                # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            elif [ $VAR == 13 ]; then :
                # contains vrb
                #echo "Verbal"
                cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
                # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            elif [ $VAR == 10 ]; then :
                # appears to contain neither dev nor vrb
                echo "Neither deverbal nor verbal"
                cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
                # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            fi
        done
fi # if [ $SEP == "TRUE" ]
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re

__author__ = 'CMendezC'


# Objective: evaluate predicted interactions in standoff format
# versus true interactions in tab format
# v04: add synonyms of TFs

# Parameters:
#   1) --truePath Path for true interactions
#   2) --trueFile File for true interactions
#   3) --predictedPath Path for predicted interactions
#   4) --outputPath Output path
#   5) --outputFile File for saving results
#   6) --evaluateGCs Evaluate with GCs
#   7) --diccPath Dictionary path
#   8) --diccSynon File with synonyms of TFs

# Output:
#   1) File with TP, FP, FN and the scores Precision, Recall, F1

# Execution:
# python3.4 evaluate-ris-gcs-standoff.py
#   --truePath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/analysis-validation-data-sets
#   --trueFile ris-analysis-reference.txt
#   --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris-gcs
#   --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/evaluation-reports
#   --outputFile evaluation-riegce-system-ris-analysis.txt
#   --diccPath /home/cmendezc/terminologicalResources
#   --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
#   --evaluateGCs

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

def updateHashPredicted(pr, hashP, pm, sF, ef):
    if pr not in hashP:
        hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
        hashP[pr] = hashTemp
    else:
        hashTemp = hashP[pr]
        if pm in hashTemp["pmids"]:
            hashP[pr]["pmids"][pm].append(sF)
        else:
            hashP[pr]["pmids"][pm] = [sF]


def getSummary(r, hashTemp):
    pmids = 0
    sentences = 0
    orieff = ""
    if r in hashTemp:
        # print("r: {}".format(r))
        orieff = hashTemp[r]["orieff"]
        for pmid in hashTemp[r]["pmids"]:
            pmids += 1
            # print("PMID with sentences: {}".format(pmid))
            for sent in hashTemp[r]["pmids"][pmid]:
                sentences += 1
    else:
        return "WARNING: no data available!"
    return "Articles: {}\tSentences: {}\tOriginal effect: {}".format(pmids, sentences, orieff)


def getDetail(r, hashTemp):
    return_text = ""
    sentences = 0
    aHash = {}
    if r in hashTemp:
        for pmid in hashTemp[r]["pmids"]:
            for sent in hashTemp[r]["pmids"][pmid]:
                sentences += 1
            if pmid not in aHash:
                aHash[pmid] = sentences
            else:
                return "WARNING: PMID duplicated!"
    else:
        return "WARNING: no data available!"
    for p, s in sorted(aHash.items(), key=operator.itemgetter(1), reverse=True):
        return_text += "\tPMID {}: {} sentences\n".format(p, s)

    return return_text


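# Hypothetical illustration of the synonym lookup below: with
# hashSynon = {"Fnr": "FNR"}, get_standard_name("Fnr") returns "FNR" directly,
# and get_standard_name("Fnr-dependent") returns "FNR-dependent" through the
# startswith/replace branch.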
def get_standard_name(regSynon):
    reg = ""
    if regSynon in hashSynon:
        reg = hashSynon[regSynon]
    else:
        for syn, std in hashSynon.items():
            if regSynon.startswith(syn):
                reg = regSynon.replace(syn, std, 1)
                break
    return reg


def isCorrect(ripr, listT, rtype):
    # The predicted regulator starts with the entity
    # Effect and regulated coincide
    # Regulator coincides with activator or repressor
    # We return a flag to indicate the type of matching: full
    list_ripr = ripr.split('\t')
    regulator = list_ripr[0]
    regulatorStdName = ""
    if use_synonyms:
        regulatorStdName = get_standard_name(regulator)
    for rit in listT:
        # print("RI TRUE: {}".format(rit))
        listRT = rit.split('\t')
        regulatorT = listRT[0]
        regexRegulatorStarts = re.compile(r'(' + regulatorT + r').+')
        if rtype == "ri":
            regulated = list_ripr[1]
            regulatedT = listRT[1]
            if (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT:
                return (rit, 'Full')
            # For cases where the regulator is part of a word, such as ArgP-regulated
            result = regexRegulatorStarts.match(regulator)
            if result:
                # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                regulator = result.group(1)
                if regulator == regulatorT and regulated == regulatedT:
                    return (rit, 'Start')
            else:
                if use_synonyms:
                    result = regexRegulatorStarts.match(regulatorStdName)
                    if result:
                        # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                        regulator = result.group(1)
                        if regulator == regulatorT and regulated == regulatedT:
                            return (rit, 'Start')
        elif rtype == "rief":
            effect = list_ripr[2]
            regulated = list_ripr[1]
            effectT = listRT[2]
            regulatedT = listRT[1]
            # if ripr == "ArgP\ttargets\tregulator":
            #     print("RI-PREDICT: ArgP\ttargets\tregulator")
            #     print("   PREDICT: regulator {} effect {} regulated {}".format(regulator, effect, regulated))
            #     print("   TRUE: regulator {} effect {} regulated {}".format(regulatorT, effectT, regulatedT))
            if (regulator == regulatorT or regulatorStdName == regulatorT) and effect == effectT and regulated == regulatedT:
                return (rit, 'Full')
            elif (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT and effect == "regulator" and (effectT == "activator" or effectT == "repressor"):
                # if ripr == "ArgP\ttargets\tregulator":
                #     print("   Correct RI with regulator: {}".format(ripr))
                # return rit -- CMC 2018-10-14: I think this should be the predicted RI, because
                # otherwise the output lists lose whether it was correct or not
                return (ripr, 'Regulator')
            else:
                # For cases where the regulator is part of a word, such as ArgP-regulated
                result = regexRegulatorStarts.match(regulator)
                if result:
                    # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                    regulator = result.group(1)
                    if regulator == regulatorT and effect == effectT and regulated == regulatedT:
                        return (rit, 'Start')
                    elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (effectT == "activator" or effectT == "repressor"):
                        # if ripr == "ArgP\ttargets\tregulator":
                        #     print("   Correct RI with regulator: {}".format(ripr))
                        # return rit -- CMC 2018-10-14: I think this should be the predicted RI, because
                        # otherwise the output lists lose whether it was correct or not;
                        # in this case only the regulator is used
                        # return rit
                        return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
                else:
                    if use_synonyms:
                        result = regexRegulatorStarts.match(regulatorStdName)
                        if result:
                            if regulator == regulatorT and effect == effectT and regulated == regulatedT:
                                return (rit, 'Start')
                            elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (effectT == "activator" or effectT == "repressor"):
                                # if ripr == "ArgP\ttargets\tregulator":
                                #     print("   Correct RI with regulator: {}".format(ripr))
                                # return rit -- CMC 2018-10-14: I think this should be the predicted RI, because
                                # otherwise the output lists lose whether it was correct or not;
                                # in this case only the regulator is used
                                # return rit
                                return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')

    # CMC 2018-10-14: Review riefgc because it has not been updated
    # elif rtype == "riefgc":
    #     effect = list_ripr[2]
    #     regulated = list_ripr[1]
    #     gc = list_ripr[3]
    #     effectT = listRT[2]
    #     regulatedT = listRT[1]
    #     gcT = listRT[3]
    #     if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
    #         return rit
    #     elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
    #         return rit
    #     else:
    #         # For cases where the regulator is part of a word, such as ArgP-regulated
    #         result = regexRegulatorStarts.match(regulator)
    #         if result:
    #             #print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
    #             regulator = result.group(1)
    #             if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
    #                 return rit
    #             elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
    #                 return rit
    return ('', '')


def get_scores_rules(listTrue, listPredicted, hashTemp, title, ri_type):
    print("Evaluation")
    # print(listPredicted)
    # Precision = correctly extracted / predicted
    # Recall = correctly extracted / reference
    # F1 = 2 * ((Precision * Recall) / (Precision + Recall))
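    # Worked example (hypothetical numbers): with 8 correct out of 10 predicted
    # and 7 recovered out of 12 in the reference, Precision = 8/10 = 0.8,
    # Recall = 7/12 = 0.583, and F1 = 2 * (0.8 * 0.583) / (0.8 + 0.583) = 0.675.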
    correct = 0
    incorrect = 0
    # For registering correct and incorrect RIs
    hashPredicted = {}
    # To print output RIs
    hashOutputRIs = {}
    # For registering unrecovered RIs
    hashUnrecovered = {}

    predicted = len(listPredicted)
    print("len(listPredicted): {}".format(predicted))
    reference = len(listTrue)
    # print("Reference: {}".format(reference))

    listRecovered = []
    for ri_pred in listPredicted:
        print("ri_pred: {}".format(ri_pred))
        # if ri_pred in hashPredicted:
        #     print("WARNING: RI predicted {} duplicated {}".format(ri_pred, hashPredicted[ri_pred]))
        # else:
        # First, all predicted RIs are incorrect
        #     hashPredicted[ri_pred] = "incorrect"
        # if ri_pred in listTrue:
        #     hashPredicted[ri_pred] = "correct"
        #     listRecovered.append(ri_pred)
        #     correct += 1
        #     continue
        riTrue = ''
        result = isCorrect(ri_pred, listTrue, ri_type)
        riResult = result[0]
        matchType = result[1]
        if riResult != '':
            if riResult not in hashOutputRIs:
                hashOutputRIs[riResult] = "Correct"
            if ri_pred not in hashPredicted:
                hashPredicted[ri_pred] = "correct"
                print("ri_pred {} correct".format(ri_pred))
                correct += 1
            # Complete matching, or the predicted regulator starts with the entity
            if matchType == 'Full' or matchType == 'Start':
                # ri_pred matches ri_true
                if riResult in listRecovered:
                    print("WARNING: riResult {} already in listRecovered".format(riResult))
                else:
                    listRecovered.append(riResult)
        else:
            incorrect += 1
            if riResult not in hashOutputRIs:
                hashOutputRIs[riResult] = "Incorrect"
            if ri_pred not in hashPredicted:
                hashPredicted[ri_pred] = "incorrect"
                print("ri_pred {} incorrect".format(ri_pred))

    if len(hashPredicted) != predicted:
        print("ERROR: number of predicted RIs mismatch")
        # return
    print("Predicted: {}".format(predicted))
    print("len(hashPredicted): {}".format(len(hashPredicted)))

    cor = 0
    inc = 0
    for r, v in hashPredicted.items():
        if v == "correct":
            cor += 1
        elif v == "incorrect":
            inc += 1
    if cor != correct:
        print("ERROR: number of correct RIs mismatch")
        # return
    if inc != incorrect:
        print("ERROR: number of incorrect RIs mismatch")
        # return
    print("Correct: {}".format(correct))
    print("Incorrect: {}".format(incorrect))

    unrecovered = 0
    recovered = 0  # Only when it coincides with the reference,
    # without considering a Regulator correct when Activator or Repressor appears in the reference
    listRecovered2 = []
    listUnrecovered = []
    for ri in listTrue:
        if ri not in listRecovered:
            if ri in listUnrecovered:
                print("WARNING: ri {} already in listUnrecovered".format(ri))
            else:
                listUnrecovered.append(ri)
                unrecovered += 1
        else:
            if ri in listRecovered2:
                print("WARNING: ri {} already in listRecovered2".format(ri))
            else:
                listRecovered2.append(ri)
                recovered += 1

    print("Len listRecovered: {}".format(len(listRecovered)))
    print("Len listRecovered2: {}".format(len(listRecovered2)))
    print("Len listUnrecovered: {}".format(len(listUnrecovered)))
    # if (unrecovered + correct) != reference:
    #     print("ERROR: number of unrecovered {} + correct {} and reference {} RIs mismatch".format(unrecovered, correct, reference))
    #     return

    print("{}".format(title))
    print("Predicted: {}".format(predicted))
    print("Reference: {}".format(reference))
    print("Unrecovered: {}".format(unrecovered))
    print("Recovered: {}".format(recovered))

    precision = correct / predicted
    print("Precision = correct / predicted: {}".format(precision))
    # recall = correct / reference
    # We calculate recall as a recovery rate, because correct instances are calculated
    # considering a Regulator correct when Activator or Repressor appears in the reference
    recall = recovered / reference
    print("Recall = recovered / reference: {}".format(recall))
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("F1: {}".format(f1))

    with open(os.path.join(options.outputPath, options.outputFile), mode="a", errors="replace") as oFile:
        oFile.write("{}\n".format(title))
        oFile.write("Predicted: {}\n".format(predicted))
        oFile.write("Reference: {}\n".format(reference))
        oFile.write("Correct: {}\n".format(correct))
        oFile.write("Incorrect: {}\n".format(incorrect))
        oFile.write("Unrecovered: {}\n".format(unrecovered))
        oFile.write("Recovered: {}\n".format(recovered))
        oFile.write("Precision = correct / predicted: {}\n".format(precision))
        oFile.write("Recall = recovered / reference: {}\n".format(recall))
        oFile.write("F1: {}\n".format(f1))
        oFile.write("Unrecovered instances:\n")
        for r in sorted(listUnrecovered):
            oFile.write("\tUnrecovered: {}\n".format(r))
        oFile.write("Recovered instances:\n")
        for r in sorted(listRecovered):
            oFile.write("\tRecovered: {}\n".format(r))
        oFile.write("Incorrect instances:\n")
        for r, v in sorted(hashPredicted.items()):
            if v == "incorrect":
                oFile.write("\tIncorrect: {}\n".format(r))
        oFile.write("Correct instances:\n")
        for r, v in sorted(hashPredicted.items()):
            if v == "correct":
                oFile.write("\tCorrect: {}\n".format(r))
        # oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp)))
        # oFile.write("\t{}\n".format(getDetail(r, hashTemp)))

def get_scores(listTrue, listPredicted, hashTemp, title):
    # Precision = correctly extracted / extracted
    # Recall = correctly extracted / reference
    # F1 = 2 * ((Precision * Recall) / (Precision + Recall))
    print("{}".format(title))
    # print("listTrue: {}".format(listTrue))
    # print("listPredicted: {}".format(listPredicted))
    print("Predicted: {}".format(len(listPredicted)))
    print("Reference: {}".format(len(listTrue)))
    correct = set(listTrue) & set(listPredicted)
    print("Correct: {} ({})".format(len(correct), len(correct) / len(listPredicted)))
    incorrect = set(listPredicted) - set(listTrue)
    print("Incorrect: {} ({})".format(len(incorrect), len(incorrect) / len(listPredicted)))
    unrecovered = set(listTrue) - set(listPredicted)
    print("Unrecovered: {} ({})".format(len(unrecovered), len(unrecovered) / len(listTrue)))
    precision = len(correct) / len(listPredicted)
    print("Precision: {}".format(precision))
    recall = len(correct) / len(listTrue)
    print("Recall: {}".format(recall))
    f1 = 2 * ((precision * recall) / (precision + recall))
    print("F1: {}".format(f1))

    with open(os.path.join(options.outputPath, options.outputFile), mode="a") as oFile:
        oFile.write("{}\n".format(title))
        oFile.write("Predicted: {}\n".format(len(listPredicted)))
        oFile.write("Reference: {}\n".format(len(listTrue)))
        oFile.write("Correct: {}\n".format(len(correct)))
        oFile.write("Incorrect: {}\n".format(len(incorrect)))
        oFile.write("Unrecovered: {}\n".format(len(unrecovered)))
        oFile.write("Precision: {}\n".format(precision))
        oFile.write("Recall: {}\n".format(recall))
        oFile.write("F1: {}\n".format(f1))
        oFile.write("Correct instances:\n")
        for r in sorted(correct):
            oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp)))
            oFile.write("\t{}\n".format(getDetail(r, hashTemp)))
        oFile.write("Incorrect instances:\n")
        for r in sorted(incorrect):
            oFile.write("\t{}\n".format(r))
        oFile.write("Unrecovered instances:\n")
        for r in sorted(unrecovered):
            oFile.write("\t{}\n".format(r))

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--truePath", dest="truePath",
                      help="Path of true ris gcs", metavar="PATH")
    parser.add_option("--trueFile", dest="trueFile",
                      help="File of true ris gcs", metavar="FILE")
    parser.add_option("--predictedPath", dest="predictedPath",
                      help="Path of predicted ris gcs", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--outputFile", dest="outputFile",
                      help="File for saving results", metavar="FILE")
    parser.add_option("--evaluateGCs", default=False,
                      action="store_true", dest="evaluateGCs",
                      help="Evaluate GCs?")
    parser.add_option("--diccPath", dest="diccPath",
                      help="Path to dictionary", metavar="PATH")
    parser.add_option("--diccSynon", dest="diccSynon",
                      help="File with synonyms", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of true ris gcs: " + str(options.truePath))
    print("File of true ris gcs: " + str(options.trueFile))
    print("Path of predicted ris gcs: " + str(options.predictedPath))
    print("Output path: " + str(options.outputPath))
    print("File for saving results: " + str(options.outputFile))
    print("Evaluate GCs: " + str(options.evaluateGCs))
    print("Path to dictionary: " + str(options.diccPath))
    print("File with synonyms: " + str(options.diccSynon))

    use_synonyms = False
    hashSynon = {}
    if options.diccPath != None and options.diccSynon != "no-synonyms":
        print("***** Using synonyms *****")
        use_synonyms = True
        print('Loading dictionary of synonyms...')
        with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon:
            hashSynon = json.load(diccSynon)
        print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon)))

    listTrueRI = []  # Without effect or GC
    listTrueRIEF = []  # With effect, without GC
    if options.evaluateGCs:
        listTrueRIEFGC = []  # With effect and GC
    # Read and process the reference
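    # A reference line is tab-separated. With more than four columns, regulator,
    # regulated, and effect are read from columns 3-5 (GC from column 6);
    # otherwise from columns 1-3 (GC from column 4). A hypothetical line
    # without GC: "ArgP<TAB>argO<TAB>activator"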
463 + with open(os.path.join(options.truePath, options.trueFile), mode="r", encoding="utf-8") as iFile:
464 + for line in iFile:
465 + line = line.strip('\n')
466 + if line.startswith("#"):
467 + continue
468 + listElem = line.split('\t')
469 + if len(listElem) > 4:
470 + regulator = listElem[2]
471 + regulated = listElem[3]
472 + effect = listElem[4]
473 + if options.evaluateGCs:
474 + gc = listElem[5]
475 + else:
476 + regulator = listElem[0]
477 + regulated = listElem[1]
478 + effect = listElem[2]
479 + if options.evaluateGCs:
480 + gc = listElem[3]
481 + if effect == "binding":
482 + effect = "regulator"
483 + ri = "{}\t{}".format(regulator, regulated)
484 + if ri not in listTrueRI:
485 + listTrueRI.append(ri)
486 + rief = "{}\t{}\t{}".format(regulator, regulated, effect)
487 + if rief not in listTrueRIEF:
488 + listTrueRIEF.append(rief)
489 + if options.evaluateGCs:
490 + riefgc = "{}\t{}\t{}\t{}".format(regulator, regulated, effect, gc)
491 + if riefgc not in listTrueRIEFGC:
492 + listTrueRIEFGC.append(riefgc)
493 +    print("  RIs in reference before regulator filtering: {}".format(len(listTrueRI)))
494 +    print("  RIEFs in reference before regulator filtering: {}".format(len(listTrueRIEF)))
495 +    if options.evaluateGCs:
496 +        print("  RIEFGCs in reference before regulator filtering: {}".format(len(listTrueRIEFGC)))
497 +
498 +    # Drop RIs with effect "regulator" when the same pair also occurs with "activator" or "repressor" (see the sketch below)
499 + listRITemp = []
500 + for ri in listTrueRIEF:
501 + listRI = ri.split('\t')
502 + regulator = listRI[0]
503 + regulated = listRI[1]
504 + effect = listRI[2]
505 + if effect == "regulator":
506 + tempRIA = "{}\t{}\t{}".format(regulator, regulated, "activator")
507 + tempRIR = "{}\t{}\t{}".format(regulator, regulated, "repressor")
508 + if tempRIA in listTrueRIEF or tempRIR in listTrueRIEF:
509 + pass
510 +                # print("RI regulator matches RI activator/repressor: {}".format(ri))
511 + # listTrueRIEF.remove(ri)
512 + else:
513 + # print("Len before: {}".format(len(listRITemp)))
514 + listRITemp.append(ri)
515 + # print("Len after: {}".format(len(listRITemp)))
516 + else:
517 + listRITemp.append(ri)
518 + listTrueRIEF = listRITemp
519 +
520 +    print("  RIEFs in reference after regulator filtering: {}".format(len(listTrueRIEF)))
521 +    if options.evaluateGCs:
522 +        # Build a new list; removing from listTrueRIEFGC while iterating it would skip elements
523 +        listRITemp = []
524 +        for ri in listTrueRIEFGC:
525 +            regulator, regulated, effect, gc = ri.split('\t')
526 +            if effect == "regulator":
527 +                tempRIGCA = "{}\t{}\t{}\t{}".format(regulator, regulated, "activator", gc)
528 +                tempRIGCR = "{}\t{}\t{}\t{}".format(regulator, regulated, "repressor", gc)
529 +                if tempRIGCA in listTrueRIEFGC or tempRIGCR in listTrueRIEFGC:
530 +                    continue
531 +            listRITemp.append(ri)
532 +        listTrueRIEFGC = listRITemp
533 +        print("  RIEFGCs in reference after regulator filtering: {}".format(len(listTrueRIEFGC)))
534 +
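The filtering above can be read as a single rule; this sketch is a compact equivalent of the RIEF loop, for reading only (not the code actually run):

```python
def drop_shadowed_regulator_ris(ris):
    # Keep a triple with effect "regulator" only when no "activator" or
    # "repressor" triple exists for the same regulator-regulated pair.
    kept = []
    for ri in ris:
        regulator, regulated, effect = ri.split('\t')
        if effect == "regulator" and any(
                "{}\t{}\t{}".format(regulator, regulated, e) in ris
                for e in ("activator", "repressor")):
            continue
        kept.append(ri)
    return kept
```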
535 + listPredictedRI = []
536 + hashPredictedRI = {}
537 + listPredictedRIEF = []
538 + hashPredictedRIEF = {}
539 + if options.evaluateGCs:
540 + listPredictedRIEFGC = []
541 + hashPredictedRIEFGC = {}
542 + hashFiles = {}
543 + for path, dirs, files in os.walk(options.predictedPath):
544 + for file in files:
545 + if file.endswith(".a1"):
546 + filename = file[:-3]
547 + if filename not in hashFiles:
548 + hashFiles[filename] = 1
549 + else:
550 + hashFiles[filename] += 1
551 + print("Files: {}".format(len(hashFiles)))
552 +
553 + hashEntities = {}
554 + processedFiles = 0
555 + for file in sorted(hashFiles.keys()):
556 + print("File: {}".format(file))
557 + pmid = file[:file.find("_")]
558 + # print("pmid {}".format(pmid))
559 + sentenceFile = file[:file.find("-", file.find("_"))] + ".txt"
560 + hashEntities = {}
561 + hashOriginalEffect = {}
562 + with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File:
563 + for line in a1File:
564 + line = line.strip('\n')
565 + listLine1 = line.split('\t')
566 + listLine2 = listLine1[1].split(' ')
567 + entity = listLine2[0]
568 + idEntity = listLine1[0]
569 + originalEffect = listLine1[2]
570 + if entity.startswith("EFFECT"):
571 + entity = entity[entity.find(".") + 1:]
572 + print("Entity: {}".format(entity))
573 + entity = entity.replace("_dev", "")
574 + print("Entity without _dev: {}".format(entity))
575 + if idEntity not in hashOriginalEffect:
576 + hashOriginalEffect[idEntity] = originalEffect
577 + else:
578 + entity = listLine1[2]
579 + if idEntity not in hashEntities:
580 + hashEntities[idEntity] = entity
581 + print("hashEntities: {}".format(hashEntities))
582 +
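For orientation, a brat-style .a1 line of the shape this loop expects, and how it is decomposed (entity values hypothetical):

```python
# Hypothetical .a1 line: tab-separated id, "TYPE start end", surface text
line = "T3\tEFFECT.activator_dev 10 20\tactivation"
idEntity, annotation, originalEffect = line.split('\t')
entity = annotation.split(' ')[0]       # "EFFECT.activator_dev"
entity = entity[entity.find(".") + 1:]  # "activator_dev"
entity = entity.replace("_dev", "")     # "activator"
assert (idEntity, entity, originalEffect) == ("T3", "activator", "activation")
```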
583 + with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File:
584 + for line in a2File:
585 + # print("Line a2: {}".format(line))
586 + # R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4
587 + line = line.strip('\n')
588 + listLine1 = line.split('\t')
589 + listLine2 = listLine1[1].split(' ')
590 + regulator = listLine2[2]
591 + regulator = regulator[regulator.find(":") + 1:]
592 + regulated = listLine2[1]
593 + regulated = regulated[regulated.find(":") + 1:]
594 + effect = listLine2[0]
595 + effect = effect[effect.find(".") + 1:]
596 + # print("effect: {}".format(hashEntities[effect]))
597 + # if hashEntities[effect] == "binding":
598 + # continue
599 + if options.evaluateGCs:
600 + gc = listLine2[3]
601 + gc = gc[gc.find(":") + 1:]
602 +
603 + pri = "{}\t{}".format(hashEntities[regulator], hashEntities[regulated])
604 + if pri not in listPredictedRI:
605 + listPredictedRI.append(pri)
606 + updateHashPredicted(pri, hashPredictedRI, pmid, sentenceFile, None)
607 +
608 + prief = "{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], hashEntities[effect])
609 + print("prief: {}".format(prief))
610 + if prief not in listPredictedRIEF:
611 + listPredictedRIEF.append(prief)
612 + updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect])
613 +
614 + if options.evaluateGCs:
615 + priefgc = "{}\t{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated],
616 + hashEntities[effect], hashEntities[gc])
617 + if priefgc not in listPredictedRIEFGC:
618 + listPredictedRIEFGC.append(priefgc)
619 + updateHashPredicted(priefgc, hashPredictedRIEFGC, pmid, sentenceFile, hashOriginalEffect[effect])
620 + processedFiles += 1
621 +
622 + print("Processed files: {}".format(processedFiles))
623 +    with open(os.path.join(options.outputPath, options.outputFile), mode="w") as oFile:
624 +        pass  # create or truncate the results file
625 + get_scores_rules(listTrueRIEF, listPredictedRIEF, hashPredictedRIEF,
626 + "Scores regulator-regulated-effect (without gc)", "rief")
627 + get_scores_rules(listTrueRI, listPredictedRI, hashPredictedRI, "Scores regulator-regulated (without effect nor gc)",
628 + "ri")
629 + if options.evaluateGCs:
630 + get_scores_rules(listTrueRIEFGC, listPredictedRIEFGC, hashPredictedRIEFGC,
631 + "Scores regulator-regulated-effect-gc", "riefgc")
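get_scores_rules is defined earlier in this script; as rough orientation only, a minimal precision/recall/F1 over the true and predicted lists could look like this sketch (not the actual implementation):

```python
def sketch_scores(list_true, list_predicted):
    # Set-overlap scores over tab-joined tuples; a sketch, not get_scores_rules.
    tp = len(set(list_true) & set(list_predicted))
    precision = tp / len(list_predicted) if list_predicted else 0.0
    recall = tp / len(list_true) if list_true else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return precision, recall, f1
```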
1 +import fileinput
2 +#import regex as re
3 +#from regex import finditer
4 +# This script originally used the third-party "regex" package, whose finditer
5 +# supports overlapped=True. The standard "re" module used here (Python 3) has
6 +# no such option, so overlapped matching was dropped; see the sketch below.
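If overlapped matching were ever needed again under the standard re module, the usual workaround is a zero-width lookahead with a capturing group, e.g.:

```python
import re

# The lookahead match itself is zero-width, so the scan can restart
# inside a previous hit, emulating regex's overlapped=True.
for m in re.finditer(r'(?=(ana))', 'banana'):
    print(m.span(1))  # prints (1, 4) then (3, 6)
```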
7 +import re
8 +from re import finditer
9 +import sys
10 +import os
11 +import json
12 +
13 +if len(sys.argv) != 8:
14 +    sys.stderr.write("E: usage: " + sys.argv[
15 +        0] + " <input_path> <input_file> <output_path> <output_file> <normalized_Effects> <entity_path> <entity_file>\n")
16 +    sys.stderr.flush()
17 +    exit(2)
18 +
19 +# READ INPUT FILE
20 +# Original Daniel: text_file = open( sys.argv[1], "r" )
21 +# Original Daniel: dato = text_file.read()
22 +# Original Daniel: text_file.close()
23 +filename = sys.argv[2]
24 +input_file = open(os.path.join(sys.argv[1], filename), "r")
25 +#print("Input file: {}".format(os.path.join(sys.argv[1], sys.argv[2])))
26 +dato = input_file.read()
27 +input_file.close()
28 +
29 +# Loading normalized effects
30 +# print('Loading normalized effects...')
31 +with open(os.path.join(sys.argv[5])) as diccFile:
32 + hashNormalizedEffects = json.load(diccFile)
33 +
34 +# USING ALREADY TAGGED ENTITIES OF THE FILE (in filter sentence step)
35 +#<entity_path> <entity_file>
36 +# READ DICTIONARY WITH ALREADY TAGGED ENTITIES
37 +entity_path = sys.argv[6]
38 +entity_file = sys.argv[7]
39 +print('Loading dictionaries with already tagged entities...')
40 +with open(os.path.join(entity_path, entity_file)) as entFile:
41 + hashDicc = json.load(entFile)
42 +print(' Loading dictionaries with already tagged entities... Done!')
43 +# CREATE LISTS WITH ALREADY TAGGED ENTITIES OF THE FILE
44 +regexNumFile = re.compile(r'_([0-9]+)[.-]')
45 +result = regexNumFile.search(filename)
46 +numFile = ""
47 +inumFile = 0
48 +if result:
49 + inumFile = int(result.group(1))
50 + numFile = str(inumFile)
51 + print("Numfile: {}".format(numFile))
52 +else:
53 + print("WARNING: numfile not found in filename")
54 +
55 +ATEREG1 = []
56 +PTEREG1GENE = []
57 +PTEREG1TU = []
58 +listEffects = []
59 +
60 +if numFile in hashDicc:
61 + hashTemp = hashDicc[numFile]
62 + # print("hashDicc[numFile]: {}".format(hashTemp))
63 + for k, v in hashTemp.items():
64 + if v == "TF":
65 + # print("Verifiying TF")
66 + if k not in ATEREG1:
67 + # print(" TF {}".format(k))
68 + ATEREG1.append(k)
69 + elif v == "GENE":
70 + if k not in PTEREG1GENE:
71 + PTEREG1GENE.append(k)
72 + elif v == "TU":
73 + if k not in PTEREG1TU:
74 + PTEREG1TU.append(k)
75 + elif v == "EFFECT":
76 + if k not in listEffects:
77 + listEffects.append(k)
78 + else:
79 + print("WARNING: entity not found in dictionaries")
80 +else:
81 + print("WARNING: numfile not found in dictionaries")
82 +
83 +# STRIP EXTENSION FROM FILE NAME
84 +# Original Daniel: split_line = sys.argv[2]
85 +output_path = sys.argv[3]
86 +# Original Daniel: split_line = split_line[:-4]
87 +# Original Daniel: file_name = split_line + ".a2"
88 +input_file_name = sys.argv[2]
89 +# Original Daniel: open( file_name , 'w').close()
90 +file_name_entities_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a1")
91 +file_name_interactions_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a2")
92 +file_name_entities_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a1")
93 +file_name_interactions_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a2")
94 +
95 +file_name_text_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".txt")
96 +file_name_text_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".txt")
97 +
98 +open(file_name_entities_complete, 'w').close()
99 +open(file_name_interactions_complete, 'w').close()
100 +# Original Daniel: open( file_name , 'w').close()
101 +open(file_name_entities_incomplete, 'w').close()
102 +open(file_name_interactions_incomplete, 'w').close()
103 +
104 +# declare variables
105 +# Original Daniel: impresion = []
106 +impresionEntities = []
107 +impresionInteractionsComplete = []
108 +impresionInteractionsIncomplete = []
109 +salida_a2 = []
110 +salida_a2_trimmed = []
111 +salida_a2_str = []
112 +q2line = ()
113 +listadeRIs = []
114 +posiblesminimos = [[], []]
115 +posiblesmaximos = [[], []]
116 +listasecundaria = []
117 +listasecundaria_trimmed = []
121 +
122 +# Effects
123 +for i in range(len(listEffects)):
124 + if listEffects[i] in dato:
125 +        for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at|for)\b)', dato):  # "of", "for" or "at" to the right of EFF
126 + # Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at)\b)', dato,
127 + # Original Daniel: overlapped=True): # "of" o "at" a la derecha de EFF
128 + spantup = match.span(1)
129 + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1))
130 + if match.group(1).lower() in hashNormalizedEffects:
131 + effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()]
132 + else:
133 + effect = "EFFECT." + "deverbal_effect"
134 + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1))
135 + a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))
136 + #print("Append effect a2line: {}".format(a2line))
137 + salida_a2.append(a2line)
138 +        for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato):  # "by" to the right of EFF
139 + # Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato,
140 + # Original Daniel: overlapped=True): # "by" a la derecha de EFF
141 + spantup = match.span(1)
142 + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1))
143 + if match.group(1).lower() in hashNormalizedEffects:
144 + effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()]
145 + else:
146 + effect = "EFFECT." + "deverbal_effect"
147 + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1))
148 + a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))
149 + salida_a2.append(a2line)
150 + #print("Append effect a2line: {}".format(a2line))
151 +        for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato):  # "is the" plus 0-1 words to the left of EFF
152 + # Original Daniel: for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato,
153 + # Original Daniel: overlapped=True): # "is the" 0-1 palabras a la izquierda de EFF
154 + spantup = match.span(3)
155 + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(3))
156 +            if match.group(3).lower() in hashNormalizedEffects:  # group(3) is the effect word, not the "is the" prefix
157 +                effect = "EFFECT." + hashNormalizedEffects[match.group(3).lower()]
158 + else:
159 + effect = "EFFECT." + "deverbal_effect"
160 + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(3))
161 + a2line = (effect, spantup[0], spantup[1] - 1, match.group(3))
162 + salida_a2.append(a2line)
163 + #print("Append effect a2line: {}".format(a2line))
164 +#print("Effects salida_a2: {}".format(salida_a2))
165 +
166 +# PTEREG1GENE regulated entities (patient role), GENE
167 +for i in range(len(PTEREG1GENE)):
168 + if PTEREG1GENE[i] in dato:
169 + # print(PTEREG1GENE[i])
170 +        for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato):  # "of", "for" or "at" 0-1 words to the left of the regulated entity
171 + # Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato,
172 + # Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado
173 + spantup = match.span(3)
174 + # print("match {} spantup {}".format(match.group(3), match.span(3)))
175 + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3))
176 + a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(3))
177 + salida_a2.append(a2line)
178 + # print("Append regulados a2line: {}".format(a2line))
179 +        for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato):  # regulated entities without pattern
180 + # Original Daniel: for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato, overlapped=True): # regulados sin patron
181 + spantup = match.span(1)
182 + # print("match {} spantup {}".format(match.group(1), match.span(1)))
183 + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1))
184 + a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(1))
185 + listasecundaria.append(a2line)
186 +#print("Effects regulated gene listasecundaria: {}".format(listasecundaria))
187 +
188 +# CMC: ADDED TO SEPARATE REGULATED GENE AND TU
189 +# PTEREG1TU regulated entities (patient role), TU
190 +for i in range(len(PTEREG1TU)):
191 + if PTEREG1TU[i] in dato:
192 + # print(PTEREG1TU[i])
193 +        for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato):  # "of", "for" or "at" 0-1 words to the left of the regulated entity
194 + # Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato,
195 + # Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado
196 + spantup = match.span(3)
197 + # print("match: " + match.group(3))
198 + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3))
199 + a2line = ('TU', spantup[0], spantup[1] - 1, match.group(3))
200 + salida_a2.append(a2line)
201 + # print("Append regulados a2line: {}".format(a2line))
202 +        for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato):  # regulated entities without pattern
203 + # for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato, overlapped=True): # regulados sin patron
204 + spantup = match.span(1)
205 + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1))
206 + a2line = ('TU', spantup[0], spantup[1] - 1, match.group(1))
207 + listasecundaria.append(a2line)
208 +#print("Effects regulated tu listasecundaria: {}".format(listasecundaria))
209 +
210 +# ATEREG1 regulator entities (agent role), TF
211 +for i in range(len(ATEREG1)):
212 + if ATEREG1[i] in dato:
213 + # print(ATEREG1[i])
214 +        for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato):  # "by" 0-1 words to the left of the regulator
215 + # Original Daniel: for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato,
216 + # Original Daniel: overlapped=True): # "by" 0-1 palabras a la izq de regulado
217 + spantup = match.span(2)
218 + # print("match: " + match.group(2))
219 + # print("match {} spantup {}".format(match.group(2), match.span(2)))
220 + # Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(2))
221 + a2line = ('TF', spantup[0], spantup[1] - 1, match.group(2))
222 + salida_a2.append(a2line)
223 + #print("Append regulator a2line: {}".format(a2line))
224 +        for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato):  # regulators without pattern
225 + # for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato, overlapped=True): # reguladores sin patron
226 + spantup = match.span(1)
227 + # print("match {} spantup {}".format(match.group(1), match.span(1)))
228 + # Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(1))
229 + a2line = ('TF', spantup[0], spantup[1] - 1, match.group(1))
230 + listasecundaria.append(a2line)
231 + #print("Append regulator a2line: {}".format(a2line))
232 +#print("Regulator agents salida_a2: {}".format(salida_a2))
233 +#print("Regulator agents listasecundaria: {}".format(listasecundaria))
234 +
235 +# Remove duplicate tags and tags contained within others
236 +if salida_a2:
237 + salida_a2.sort(key=lambda tup: tup[1])
238 + salida_a2_trimmed.append(salida_a2[0])
239 + for i in range(len(salida_a2)):
240 + copiar = True
241 + for j in range(len(salida_a2_trimmed)):
242 + if ((salida_a2[i][1] >= salida_a2_trimmed[j][1]) and (salida_a2[i][2] <= salida_a2_trimmed[j][2])):
243 + copiar = False
244 + if copiar:
245 + salida_a2_trimmed.append(salida_a2[i])
246 +if listasecundaria:
247 + listasecundaria.sort(key=lambda tup: tup[1])
248 + listasecundaria_trimmed.append(listasecundaria[0])
249 + for i in range(len(listasecundaria)):
250 + copiar = True
251 + for j in range(len(listasecundaria_trimmed)):
252 + if ((listasecundaria[i][1] >= listasecundaria_trimmed[j][1]) and (
253 + listasecundaria[i][2] <= listasecundaria_trimmed[j][2])):
254 + copiar = False
255 + if copiar:
256 + listasecundaria_trimmed.append(listasecundaria[i])
257 +# print("Without repetitions salida_a2_trimmed: {}".format(salida_a2_trimmed))
258 +#print("Without repetitions listasecundaria_trimmed: {}".format(listasecundaria_trimmed))
259 +
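A small illustration of the containment filter above, with hypothetical (type, start, end, text) tuples:

```python
spans = [('TF', 0, 3, 'FNR'), ('GENE', 5, 8, 'narK'), ('GENE', 5, 7, 'nar')]
kept = [spans[0]]
for cand in spans:
    # drop a candidate whose span lies inside an already kept span
    if not any(cand[1] >= k[1] and cand[2] <= k[2] for k in kept):
        kept.append(cand)
assert kept == [('TF', 0, 3, 'FNR'), ('GENE', 5, 8, 'narK')]
```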
260 +# Assign identifiers (TX) to entities (effect, regulator, regulated)
261 +lastID = 0
262 +for i in range(len(salida_a2_trimmed)):
263 + # if sys.argv[2].find('355') > -1:
264 + # print("i : {}".format(i))
265 + salida_a2_trimmed[i] = list(salida_a2_trimmed[i])
266 + ID = "T" + str(i + 1)
267 + salida_a2_trimmed[i].insert(0, ID)
268 + lastID = i + 1
269 + # if sys.argv[2].find('355') > -1:
270 + # print("lastID : {}".format(lastID))
271 +
272 +for i in range(len(listasecundaria_trimmed)):
273 + # if sys.argv[2].find('355') > -1:
274 + # print("i : {}".format(i))
275 + # print("lastID : {}".format(lastID))
276 + listasecundaria_trimmed[i] = list(listasecundaria_trimmed[i])
277 + ID = "T" + str(i + 1 + lastID)
278 + listasecundaria_trimmed[i].insert(0, ID)
279 +
280 +# print("With identifiers salida_a2_trimmed: {}".format(salida_a2_trimmed))
281 +#print("With identifiers listasecundaria_trimmed: {}".format(listasecundaria_trimmed))
282 +
283 +#print("salida_a2_trimmed") #########################
284 +#print(salida_a2_trimmed) #########################
285 +#print("listasecundaria_trimmed")
286 +#print(listasecundaria_trimmed)
287 +
288 +# Build Regulatory Interactions
289 +i = 0
290 +while i < int(len(salida_a2_trimmed)):
291 + if "EFFECT" in salida_a2_trimmed[i][1]:
292 +        # SEARCH FOR THE REGULATED ENTITY TO THE RIGHT
293 +        nuevaRI = [salida_a2_trimmed[i][0], "", ""]  # effect, theme, cause
294 + ref = ""
295 + posiblesminimos = [[], []]
296 + j = 0
297 + while j < int(len(salida_a2_trimmed)):
298 + # Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
299 + if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
300 + posiblesminimos[0].append(salida_a2_trimmed[j][2])
301 + posiblesminimos[1].append(salida_a2_trimmed[j][0])
302 + j = j + 1
303 + if posiblesminimos[0]:
304 + refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
305 + ref = posiblesminimos[1][refpointer]
306 +        # if none found, SEARCH FOR THE REGULATED ENTITY TO THE LEFT
307 + if not ref:
308 + posiblesmaximos = [[], []]
309 + j = 0
310 + while j < int(len(salida_a2_trimmed)):
311 + # Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
312 + if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
313 + posiblesmaximos[0].append(salida_a2_trimmed[j][3])
314 + posiblesmaximos[1].append(salida_a2_trimmed[j][0])
315 + j = j + 1
316 + if posiblesmaximos[0]:
317 + refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0]))
318 + ref = posiblesmaximos[1][refpointer]
319 + nuevaRI[1] = ref
320 +        # SEARCH FOR THE REGULATOR TO THE RIGHT
321 + ref = ""
322 + posiblesminimos = [[], []]
323 + j = 0
324 + while j < int(len(salida_a2_trimmed)):
325 + # Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
326 + if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
327 + posiblesminimos[0].append(salida_a2_trimmed[j][2])
328 + posiblesminimos[1].append(salida_a2_trimmed[j][0])
329 + j = j + 1
330 + if posiblesminimos[0]:
331 + refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
332 + ref = posiblesminimos[1][refpointer]
333 +        # if none found, SEARCH FOR THE REGULATOR TO THE LEFT
334 + if not ref:
335 + posiblesmaximos = [[], []]
336 + j = 0
337 + while j < int(len(salida_a2_trimmed)):
338 + # Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
339 + if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
340 + posiblesmaximos[0].append(salida_a2_trimmed[j][3])
341 + posiblesmaximos[1].append(salida_a2_trimmed[j][0])
342 + j = j + 1
343 + if posiblesmaximos[0]:
344 + refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0]))
345 + ref = posiblesmaximos[1][refpointer]
346 + nuevaRI[2] = ref
347 + listadeRIs.append(nuevaRI)
348 + i = i + 1
349 +
350 +# SECOND PHASE: SEARCH FOR REGULATORS AND REGULATED ENTITIES
351 +i = 0
352 +while i < int(len(listadeRIs)):
353 +    if not listadeRIs[i][1]:  # no regulated entity
354 + ref = ""
355 + posiblesminimos = [[], []]
356 +        # SEARCH FOR THE REGULATED ENTITY TO THE RIGHT
357 + j = 0
358 + while j < int(len(listasecundaria_trimmed)):
359 + for k in range(len(salida_a2_trimmed)):
360 + if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
361 + ind = k
362 + # Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
363 + if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
364 + posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3]))
365 + posiblesminimos[1].append(listasecundaria_trimmed[j][0])
366 + j = j + 1
367 +        # SEARCH FOR THE REGULATED ENTITY TO THE LEFT
368 + j = 0
369 + while j < int(len(listasecundaria_trimmed)):
370 + for k in range(len(salida_a2_trimmed)):
371 + if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
372 + ind = k
373 + # Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
374 + if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
375 + posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3]))
376 + posiblesminimos[1].append(listasecundaria_trimmed[j][0])
377 + j = j + 1
378 +        # CHOOSE THE NEAREST REGULATED ENTITY
379 + if posiblesminimos[0]:
380 + refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
381 + ref = posiblesminimos[1][refpointer]
382 + # print(ref)
383 + listadeRIs[i][1] = ref
384 +    if not listadeRIs[i][2]:  # no regulator
385 + ref = ""
386 + posiblesminimos = [[], []]
387 +        # SEARCH FOR THE REGULATOR TO THE RIGHT
388 + j = 0
389 + while j < int(len(listasecundaria_trimmed)):
390 + for k in range(len(salida_a2_trimmed)):
391 + if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
392 + ind = k
393 + # Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
394 + if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
395 + posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3]))
396 + posiblesminimos[1].append(listasecundaria_trimmed[j][0])
397 + j = j + 1
398 +        # SEARCH FOR THE REGULATOR TO THE LEFT
399 + j = 0
400 + while j < int(len(listasecundaria_trimmed)):
401 + for k in range(len(salida_a2_trimmed)):
402 + if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
403 + ind = k
404 + # Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
405 + if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
406 + posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3]))
407 + posiblesminimos[1].append(listasecundaria_trimmed[j][0])
408 + j = j + 1
409 +        # CHOOSE THE NEAREST REGULATOR
410 + if posiblesminimos[0]:
411 + refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
412 + ref = posiblesminimos[1][refpointer]
413 + # print(ref)
414 + listadeRIs[i][2] = ref
415 + i = i + 1
416 +#print("ListadeRIs: {}".format(listadeRIs))
417 +
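Both phases reduce to picking the candidate with the smallest gap to the effect span; schematically (a sketch with hypothetical tuples, not the code actually run):

```python
def nearest_id(effect_start, effect_end, candidates):
    # candidates: (tid, start, end) tuples; returns the id with the smallest
    # gap to the effect span, searching both sides, or "" if none qualifies.
    gaps = []
    for tid, start, end in candidates:
        if start > effect_end:        # candidate to the right of the effect
            gaps.append((start - effect_end, tid))
        elif end < effect_start:      # candidate to the left of the effect
            gaps.append((effect_start - end, tid))
    return min(gaps)[0] if gaps else ""
```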
418 +# Choose regulators and regulated entities from listasecundaria to be printed
419 +setmem = []
420 +k = 0
421 +while k < int(len(listadeRIs)):
422 + j = 0
423 + copysec = False
424 + #while j < int(len(listasecundaria_trimmed)):
425 + while j < len(listasecundaria_trimmed):
426 + # print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs))
427 + # Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
428 + if listasecundaria_trimmed[j][0] == listadeRIs[k][2]:
429 + # print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][2] {}".format(listasecundaria_trimmed[j][0],
430 + # listadeRIs[k][2]))
431 + copysec = True
432 + # print("j: {}".format(j))
433 + indj = j
434 + j = j + 1
435 + if copysec:
436 + setmem.append(listasecundaria_trimmed[indj])
437 + # print("setmen: {}".format(setmem))
438 +
439 +    #### CMC: I ADDED THIS CODE TO SEARCH FOR REGULATED ENTITIES, SINCE THE CODE ABOVE SEARCHES FOR REGULATORS
440 + j = 0
441 + copysec = False
442 + #while j < int(len(listasecundaria_trimmed)):
443 + while j < len(listasecundaria_trimmed):
444 + # print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs))
445 + # Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
446 + if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
447 + # print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][1] {}".format(listasecundaria_trimmed[j][0],
448 + # listadeRIs[k][1]))
449 + copysec = True
450 + # print("j: {}".format(j))
451 + indj = j
452 + j = j + 1
453 + if copysec:
454 + setmem.append(listasecundaria_trimmed[indj])
455 + # print("setmen: {}".format(setmem))
456 +
457 + k = k + 1
458 +setmem = sorted(setmem)
459 +# print("setmen: {}".format(setmem))
460 +dedup = [setmem[i] for i in range(len(setmem)) if i == 0 or setmem[i] != setmem[i - 1]]
461 +# print("dedup: {}".format(dedup))
462 +salida_a2_trimmed.extend(dedup)
463 +#print("salida_a2_trimmed after listasecundaria_trimmed: {}".format(salida_a2_trimmed))
464 +
465 +# Assign identifiers (RX) to events (RIs)
466 +for i in range(len(listadeRIs)):
467 + # Original Daniel: ID = "E" + str(i+1)
468 + ID = "R" + str(i + 1)
469 + listadeRIs[i].insert(0, ID)
470 +#print("Con identificadores ListadeRIs: {}".format(listadeRIs))
471 +
472 +# BUILD LIST OF EVENTS (RX) AND ENTITIES (TX) IN PRINT FORMAT
473 +for i in range(len(salida_a2_trimmed)):
474 + linea = str(salida_a2_trimmed[i][0]) + ' ' + str(salida_a2_trimmed[i][1]) + ' ' + str(
475 + salida_a2_trimmed[i][2]) + ' ' + str(salida_a2_trimmed[i][3]) + ' ' + str(salida_a2_trimmed[i][4])
476 + # Original Daniel: impresion.append(linea)
477 + impresionEntities.append(linea)
478 +
479 +for i in range(len(listadeRIs)):
480 + if listadeRIs[i][2] and listadeRIs[i][3]:
481 + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) + ' ' + 'Cause:' + str(listadeRIs[i][3])
482 + linea = str(listadeRIs[i][0]) + ' ' + "Interaction." + str(listadeRIs[i][1]) + ' ' + 'Target:' + str(
483 + listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3])
484 + # Original Daniel: elif listadeRIs[i][2]:
485 + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2])
486 + # Original Daniel: elif listadeRIs[i][3]:
487 + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Cause:' + str(listadeRIs[i][3])
488 + # Original Daniel: else:
489 + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1])
490 + # Original Daniel: impresion.append(linea)
491 + impresionInteractionsComplete.append(linea)
492 + #print("Interaction complete: {}".format(linea))
493 + linea = str(listadeRIs[i][0]) + ' ' + "Interaction.regulator" + ' ' + 'Target:' + str(
494 + listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3])
495 + impresionInteractionsIncomplete.append(linea)
496 +
497 +#print("Entities: {}".format(impresionEntities))
498 +
499 +# Write entities of complete interactions to a1
500 +with open(file_name_entities_complete, "a") as save_file:
501 +    for line in impresionEntities:
502 +        save_file.write(line)
503 +        save_file.write("\n")
506 +
507 +# Write entities of incomplete interactions to a1
508 +with open(file_name_entities_incomplete, "a") as save_file:
509 +    for line in impresionEntities:
510 +        save_file.write(line)
511 +        save_file.write("\n")
514 +
515 +# Write complete interactions (regulator, effect, regulated)
516 +# print("InteractionsComplete: {}".format(impresionInteractionsComplete))
517 +with open(file_name_interactions_complete, "a") as save_file:
518 +    for line in impresionInteractionsComplete:
519 +        save_file.write(line)
520 +        save_file.write("\n")
523 +
524 +# Write incomplete interactions (regulator, "regulator", regulated)
525 +# print("InteractionsIncomplete: {}".format(impresionInteractionsIncomplete))
526 +with open(file_name_interactions_incomplete, "a") as save_file:
527 +    for line in impresionInteractionsIncomplete:
528 +        save_file.write(line)
529 +        save_file.write("\n")
532 +
533 +with open(file_name_text_complete, mode="w") as txtFile:
534 + txtFile.write(dato)
535 +with open(file_name_text_incomplete, mode="w") as txtFile:
536 + txtFile.write(dato)
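For orientation, the emitted brat-like lines have the following shapes (entity names hypothetical):

```python
a1_entity_lines = [
    "T1 EFFECT.repressor 10 19 repression",
    "T2 GENE 35 38 ompF",
    "T3 TF 50 53 OmpR",
]
a2_complete_line = "R1 Interaction.T1 Target:T2 Agent:T3"            # complete-ris
a2_incomplete_line = "R1 Interaction.regulator Target:T2 Agent:T3"   # incomplete-ris
```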
1 +import fileinput
2 +import re
3 +import sys
4 +
5 +if len(sys.argv) < 3:
6 +    sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <output_file>\n")
7 +    sys.stderr.flush()
8 +
9 +    exit(2)
10 +else:
11 +    print("Ok.")
12 +
13 +# READ INPUT FILE
14 +text_file = open( sys.argv[1], "r" )
15 +dato = text_file.read().splitlines()
16 +text_file.close()
17 +
18 +
19 +# STRIP EXTENSION FROM FILE NAME
20 +split_line = sys.argv[2]
21 +split_line = split_line[:-4]
22 +file_name=""
23 +file_name = split_line + ".san"
24 +open( file_name , 'w').close()
25 +
26 +# APPLY REGEXES AND WRITE RESULT TO ARGV 2
27 +for line in dato:
28 +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]', '', line.rstrip())  # removes (_NNNNa_)
29 +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]', '', line.rstrip())  # removes [_NNNNa_]
30 +    line = re.sub('[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]', '', line.rstrip())  # removes (_NN,NN,NN_)
31 +    line = re.sub('[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]', '', line.rstrip())  # removes [_NN,NN,NN_]
32 +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip())  # removes (_num_)
33 +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip())  # removes (_num.num_)
34 +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip())  # removes (_num-num_)
35 +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip())  # removes [_num_]
36 +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip())  # removes [_num.num_]
37 +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip())  # removes [_num-num_]
38 +    line = re.sub('[\(]\s[a-zA-Z]{1}\s[\)]', '', line.rstrip())  # removes (_alpha_)
39 +    line = re.sub('[\[]\s[a-zA-Z]{1}\s[\]]', '', line.rstrip())  # removes [_alpha_]
40 +    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip())  # removes (_Roman_)
41 +    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip())  # removes (_Roman-Roman_)
42 +    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip())  # removes (_roman_)
43 +    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip())  # removes (_roman-roman_)
44 +    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip())  # removes [_Roman_]
45 +    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip())  # removes [_Roman-Roman_]
46 +    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip())  # removes [_roman_]
47 +    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip())  # removes [_roman-roman_]
48 +    line = re.sub('[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.|table)\s[^\(|^\)]*[\)]', '', line.rstrip(), flags=re.I)  # removes (_fig./figure/see/i.e./e.g./tab./table_); a missing "|" between "tab ." and "table" was fixed
49 +    line = re.sub('  ', ' ', line.rstrip())  # collapses double spaces left behind by the removals
50 + #print(line)
51 +
52 +
53 +    with open(file_name, "a") as save_file:
54 +        save_file.write(line)
55 +        save_file.write("\n")
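A quick illustration of what one of the substitutions removes on tokenized input (the sentence is hypothetical); the final pass then collapses the leftover double space:

```python
import re

line = "FNR represses narK expression ( Stewart , 1982 ) under anaerobiosis ."
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line)  # removes (_num_)
line = re.sub('  ', ' ', line)  # collapse the double space left behind
print(line)  # FNR represses narK expression under anaerobiosis .
```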
1 +# -*- coding: UTF-8 -*-
2 +import operator
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +import json
7 +import re
8 +import pandas as pd
9 +
10 +__author__ = 'CMendezC'
11 +
12 +
13 +# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to TRN table
14 +
15 +# Parameters:
16 +# 1) --trnPath Path to TRN detail table
17 +# 2) --trnFile File of TRN detail table
18 +# 3) --outputPath Output path
19 +# 4) --organismPath Path to Organism annotation table
20 +# 5) --organismFile File of Organism annotation table
21 +
22 +# Output:
23 +# 1) Tsv file detail with:
24 +# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
25 +# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
26 +# OrganismScore = {
27 +# If only salmonella or only non identified organism = 1,
28 +# If (startswith salmonella or non identified organism) and other organisms = 0.5
29 +# If only other organisms = 0
30 +# }
31 +
32 +# Execution:
33 +# python3.4 get-TRN-Organism-v1.py
34 +
35 +# Local
36 +# python get-TRN-Organism-v1.py
37 +# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
38 +# --trnFile STMTRN_all.detail.tsv
39 +# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
40 +# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results
41 +# --organismFile annotations_STMTRN_all.sentences.csv
42 +# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv
43 +
44 +###########################################################
45 +# MAIN PROGRAM #
46 +###########################################################
47 +
48 +def only_salmonella_or_non_identified_organism(list_temp):
49 + non_identified_organisms = [
50 + 'unidentified plasmid',
51 + 'unidentified',
52 + 'bacterium',
53 + 'bacterium IFAM-3211',
54 + 'bacterium IFAM-2074',
55 + 'bacterium IFAM-1493',
56 + 'bacterium IFAM-3215',
57 + 'bacterium IFAM-3359',
58 + 'hybrid',
59 + 'Vector pMC1403',
60 + 'Transposon Tn10',
61 + 'unidentified cloning vector',
62 + 'Plasmid F',
63 + 'Cloning vector pUC19'
64 + ]
65 + matches = 0
66 + for o in list_temp:
67 + if o.lower().startswith("salmonella") or o in non_identified_organisms:
68 + matches += 1
69 + if matches == len(list_temp):
70 + return True
71 + else:
72 + return False
73 +
74 +def salmonella_or_non_identified_and_other_organisms(list_temp):
75 + non_identified_organisms = [
76 + 'unidentified plasmid',
77 + 'unidentified',
78 + 'bacterium',
79 + 'bacterium IFAM-3211',
80 + 'bacterium IFAM-2074',
81 + 'bacterium IFAM-1493',
82 + 'bacterium IFAM-3215',
83 + 'bacterium IFAM-3359',
84 + 'hybrid',
85 + 'Vector pMC1403',
86 + 'Transposon Tn10',
87 + 'unidentified cloning vector',
88 + 'Plasmid F',
89 + 'Cloning vector pUC19'
90 + ]
91 + matches = 0
92 + for o in list_temp:
93 + if o.lower().startswith("salmonella") or o in non_identified_organisms:
94 + matches += 1
95 + if matches < len(list_temp) and matches > 0:
96 + return True
97 + else:
98 + return False
99 +
100 +def only_other_organisms(list_temp):
101 + non_identified_organisms = [
102 + 'unidentified plasmid',
103 + 'unidentified',
104 + 'bacterium',
105 + 'bacterium IFAM-3211',
106 + 'bacterium IFAM-2074',
107 + 'bacterium IFAM-1493',
108 + 'bacterium IFAM-3215',
109 + 'bacterium IFAM-3359',
110 + 'hybrid',
111 + 'Vector pMC1403',
112 + 'Transposon Tn10',
113 + 'unidentified cloning vector',
114 + 'Plasmid F',
115 + 'Cloning vector pUC19'
116 + ]
117 + matches = 0
118 + for o in list_temp:
119 + if o.lower().startswith("salmonella") or o in non_identified_organisms:
120 + matches += 1
121 + if matches == 0:
122 + return True
123 + else:
124 + return False
125 +
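A minimal sanity check of the three predicates, with hypothetical mention lists (scores as defined in the header comment):

```python
assert only_salmonella_or_non_identified_organism(["Salmonella enterica"])  # -> score 1.0
assert salmonella_or_non_identified_and_other_organisms(
    ["Salmonella enterica", "Escherichia coli"])                            # -> score 0.5
assert only_other_organisms(["Escherichia coli"])                           # -> score 0.0
```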
126 +if __name__ == "__main__":
127 + # Parameter definition
128 + parser = OptionParser()
129 + parser.add_option("--trnPath", dest="trnPath",
130 + help="Path to TRN detail table", metavar="PATH")
131 + parser.add_option("--trnFile", dest="trnFile",
132 + help="File of TRN detail table", metavar="FILE")
133 + parser.add_option("--outputPath", dest="outputPath",
134 + help="Output path", metavar="PATH")
135 + parser.add_option("--organismPath", dest="organismPath",
136 + help="Path to organism annotation table", metavar="PATH")
137 + parser.add_option("--organismFile", dest="organismFile",
138 + help="File of organism annotation table", metavar="FILE")
139 +
140 + (options, args) = parser.parse_args()
141 + if len(args) > 0:
142 +        parser.error("No positional arguments expected.")
143 + sys.exit(1)
144 +
145 + # Printing parameter values
146 + print('-------------------------------- PARAMETERS --------------------------------')
147 + print("Path to TRN detail table: " + str(options.trnPath))
148 + print("File of TRN detail table: " + str(options.trnFile))
149 + print("Output path: " + str(options.outputPath))
150 + print("Path to organism annotation table: " + str(options.organismPath))
151 + print("File of organism annotation table: " + str(options.organismFile))
152 +
153 + # Load organism annotation table
154 + print("Loading organism annotation table")
155 + df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',')
156 +    print("Total sentences annotated with organisms: {}".format(df_organisms.shape[0]))
157 +
158 + # Load TRN detail table
159 + print("Loading TRN detail table")
160 + df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t')
161 +    print("Total sentences in TRN: {}".format(df_detail.shape[0]))
162 +
163 +    # Fix column name for organisms; this was corrected at the source in get-TRN-v2.py
164 + df_detail = df_detail.rename(columns={"Organism": "Organisms"})
165 + df_detail['OrganismScore'] = 1.00
166 + print(df_detail.columns)
167 + #print(df_detail['Sentence'].head(15))
168 +
169 + for idx in df_organisms.index:
170 + organisms = df_organisms['Organisms'][idx]
171 + SentenceNumberInFile = df_organisms['SentenceNumberInFile'][idx]
172 +        SentenceNumberInFile = SentenceNumberInFile - 2  # align the annotation row number with the 0-based dataframe index
173 +        # print("Organisms before: {}".format(df_detail.Organisms[SentenceNumberInFile]))
174 +        df_detail.loc[SentenceNumberInFile, 'Organisms'] = organisms  # .loc avoids pandas chained-assignment pitfalls
175 + # print("Organisms assigned: {}".format(df_detail.Organisms[SentenceNumberInFile]))
176 +
177 + # OrganismScore = {
178 + # If only salmonella or only non identified organism = 1,
179 + # If (startswith salmonella or non identified organism) and other organisms = 0.5
180 + # If only other organisms = 0
181 + # }
182 + list_organisms = organisms.split(';')
183 + # print(" OrganismScore before: {}".format(df_detail.OrganismScore[SentenceNumberInFile]))
184 +        if only_salmonella_or_non_identified_organism(list_organisms):
185 +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 1.00
186 +        elif salmonella_or_non_identified_and_other_organisms(list_organisms):
187 +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.50
188 +        elif only_other_organisms(list_organisms):
189 +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.00
190 + # print(" OrganismScore assigned: {}".format(df_detail.OrganismScore[SentenceNumberInFile]))
191 +
192 + hashPredictedRIs = {}
193 + hashPredictedRIsCount = {}
194 + hashPredictedRIsCountVer = {}
195 + hashPredictedRIsCountDev = {}
196 + hashPredictedRIsCountAtt = {}
197 + hashPredictedRIsCountAuto = {}
198 + hashPredictedRIsScore = {}
199 + hashPredictedRIsRI = {}
200 + for idx in df_detail.index:
201 + tf = df_detail['TF'][idx]
202 + TypeRegulated = df_detail['TypeRegulated'][idx]
203 + Regulated = df_detail['Regulated'][idx]
204 + Effect = df_detail['Effect'][idx]
205 + pmid = df_detail['PMID'][idx]
206 + numsent = df_detail['NumSentence'][idx]
207 + type_sent = df_detail['TypeSentence'][idx]
208 + sentence = df_detail['Sentence'][idx]
209 + original_idsentence = df_detail['OriginalIdSentence'][idx]
210 + original_sentence = df_detail['OriginalSentence'][idx]
211 + section_num = df_detail['SectionNum'][idx]
212 + section_name = df_detail['SectionName'][idx]
213 + organisms = df_detail['Organisms'][idx]
214 + organism_score = df_detail['OrganismScore'][idx]
215 + llave = "{}\t{}\t{}\t{}".format(tf, TypeRegulated, Regulated, Effect)
216 + if organism_score == 0:
217 + continue
218 + if llave in hashPredictedRIs:
219 + hashPredictedRIs[llave].append(
220 + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence,
221 + original_sentence, section_num, section_name, organisms,
222 + organism_score, "", "", "", "", "", ""))
223 + hashPredictedRIsCount[llave] += 1
224 + if type_sent == "ver/dev":
225 + hashPredictedRIsCountVer[llave] += 1
226 + elif type_sent == "dev":
227 + hashPredictedRIsCountDev[llave] += 1
228 + elif type_sent == "att":
229 + hashPredictedRIsCountAtt[llave] += 1
230 + elif type_sent == "auto":
231 + hashPredictedRIsCountAuto[llave] += 1
232 + # if organism_score == 0.5:
233 + # We penalize RI
234 + # hashPredictedRIsScore[llave] -= 0.05
235 +
236 + else:
237 + hashPredictedRIs[llave] = [
238 + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence,
239 + original_sentence, section_num, section_name, organisms,
240 + organism_score, "", "", "", "", "", "")]
241 + hashPredictedRIsCount[llave] = 1
242 + hashPredictedRIsCountVer[llave] = 0
243 + hashPredictedRIsCountDev[llave] = 0
244 + hashPredictedRIsCountAtt[llave] = 0
245 + hashPredictedRIsCountAuto[llave] = 0
246 + hashPredictedRIsScore[llave] = 1
247 + if type_sent == "ver/dev":
248 + hashPredictedRIsCountVer[llave] = 1
249 + elif type_sent == "dev":
250 + hashPredictedRIsCountDev[llave] = 1
251 + elif type_sent == "att":
252 + hashPredictedRIsCountAtt[llave] = 1
253 + elif type_sent == "auto":
254 + hashPredictedRIsCountAuto[llave] = 1
255 + # if organism_score == 0.5:
256 + # We penalize RI
257 + # hashPredictedRIsScore[llave] -= 0.05
258 +
259 +    print("Total RIs in TRN with organism: {}".format(len(hashPredictedRIs)))
260 + with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org")), mode="w") as oFile:
261 + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
262 + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
263 + for k,v in hashPredictedRIs.items():
264 + RI_value = "True"
265 + # if hashPredictedRIsScore[k] < 1:
266 + # RI_value = "Possible"
267 + oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k],
268 + hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k],
269 + hashPredictedRIsScore[k], RI_value))
270 + with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org")), mode="w") as oFile:
271 + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
272 + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n")
273 + i = 0
274 + for k,v in hashPredictedRIs.items():
275 + for s in v:
276 + oFile.write("{}\t{}\n".format(k, s))
277 + i += 1
278 +    print("Total sentences in TRN with organism: {}".format(i))
279 +
1 +# -*- coding: UTF-8 -*-
2 +import operator
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +import json
7 +import re
8 +import pandas as pd
9 +
10 +__author__ = 'CMendezC'
11 +
12 +
13 +# Objective: generate TRN
14 +# CFMC 2022-03-11: We added:
15 +# 1) Section of output sentences
16 +# 2)
17 +
18 +# Parameters:
19 +# 1) --predictedPath Path for predicted interactions
20 +# 2) --outputPath Output path
21 +# 3) --outputFile Prefix file for saving TRN
22 +# 4) --diccPath Dictionary path
23 +# 5) --diccSynon File with synonyms of TFs
24 +# 6) --tsvPath Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf
25 +# 7) --jsonpdfPath Path to read jsonpdf file to extract section name
26 +
27 +# Output:
28 +# 1) Tsv file detail with:
29 +# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
30 +# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
31 +
32 +# 1) Tsv file summary with:
33 +# TF TypeRegulated Regulated Effect SentCount Ver/Dev Att Auto Score RI (True/False)
34 +
35 +# Execution:
36 +# Version 2 TRN Salmonella
37 +# python3.4 get-TRN-v2.py
38 +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
39 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn
40 +# --outputFile STMTRN_v2
41 +# --diccPath /home/cmendezc/terminologicalResources
42 +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
43 +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv
44 +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf
45 +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_v2 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf
46 +
47 +# articulos_sal_4
48 +# python3.4 get-TRN-v2.py
49 +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
50 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn
51 +# --outputFile STMTRN_articulos_sal_4
52 +# --diccPath /home/cmendezc/terminologicalResources
53 +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
54 +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv
55 +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf
56 +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_4 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf
57 +
58 +# articulos_sal_1
59 +# python3.4 get-TRN-v2.py
60 +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
61 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn
62 +# --outputFile STMTRN_articulos_sal_1
63 +# --diccPath /home/cmendezc/terminologicalResources
64 +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
65 +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv
66 +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf
67 +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_1 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf
68 +
69 +# all = articulos_sal_1 + articulos_sal_2 + articulos_sal_3 + articulos_sal_4
70 +# python3.4 get-TRN-v2.py
71 +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
72 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn
73 +# --outputFile STMTRN_all
74 +# --diccPath /home/cmendezc/terminologicalResources
75 +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
76 +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv
77 +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf
78 +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_all --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf
79 +
80 +####
81 +# python3.4 get-TRN-v1.py
82 +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
83 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn
84 +# --outputFile STMTRN
85 +# --diccPath /home/cmendezc/terminologicalResources
86 +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
87 +# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
88 +
89 +# With dataset automatic-extraction-STM-RIs-dataset
90 +# python3.4 get-TRN-v1.py
91 +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
92 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn
93 +# --outputFile STM-RIs-dataset
94 +# --diccPath /home/cmendezc/terminologicalResources
95 +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
96 +# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STM-RIs-dataset --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
97 +
98 +###########################################################
99 +# MAIN PROGRAM #
100 +###########################################################
101 +
102 +def updateHashPredicted(pr, hashP, pm, sF, ef):
103 + # updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect])
104 + if pr not in hashP:
105 + hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
106 + hashP[pr] = hashTemp
107 + else:
108 + hashTemp = hashP[pr]
109 + if pm in hashTemp["pmids"]:
110 + hashP[pr]["pmids"][pm].append(sF)
111 + else:
112 + hashP[pr]["pmids"][pm] = [sF]
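+# Illustrative sketch (not from the original): after two calls with the same key 'pr',
+# hashP[pr] == {"pmids": {"12345678": ["12345678_3.txt", "12345678_9.txt"]}, "orieff": <effect>}
+# (PMID and filenames hypothetical). Note "orieff" (the original effect string) is only
+# recorded the first time a key is seen.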
113 +
114 +def get_standard_name(regSynon):
115 + reg = regSynon
116 + if regSynon in hashSynon:
117 + reg = hashSynon[regSynon]
118 + else:
119 + for syn, std in hashSynon.items():
120 + if regSynon.startswith(syn):
121 + reg = regSynon.replace(syn, std, 1)
122 + break
123 + return reg
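+# Example (hedged, hypothetical synonym entries): with hashSynon = {"STM1234": "invF"},
+#   get_standard_name("STM1234")           -> "invF"            (exact match)
+#   get_standard_name("STM1234-regulated") -> "invF-regulated"  (prefix match, first hit wins)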
124 +
125 +if __name__ == "__main__":
126 + # Parameter definition
127 + parser = OptionParser()
128 + parser.add_option("--predictedPath", dest="predictedPath",
129 + help="Path predicted ris gcs", metavar="PATH")
130 + parser.add_option("--outputPath", dest="outputPath",
131 + help="Output path", metavar="PATH")
132 + parser.add_option("--outputFile", dest="outputFile",
133 + help="Preffix file for saving results", metavar="FILE")
134 + parser.add_option("--diccPath", dest="diccPath",
135 + help="Path to dictionary", metavar="PATH")
136 + parser.add_option("--diccSynon", dest="diccSynon",
137 + help="File with synonyms", metavar="FILE")
138 + parser.add_option("--tsvPath", dest="tsvPath",
139 + help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
140 + parser.add_option("--jsonpdfPath", dest="jsonpdfPath",
141 + help="Path to read jsonpdf file to extract section name", metavar="PATH")
142 +
143 + (options, args) = parser.parse_args()
144 + if len(args) > 0:
145 + parser.error("None parameter entered.")
146 + sys.exit(1)
147 +
148 + # Printing parameter values
149 + print('-------------------------------- PARAMETERS --------------------------------')
150 + print("Path predicted ris gcs: " + str(options.predictedPath))
151 + print("Output path: " + str(options.outputPath))
152 + print("Preffix file for saving results: " + str(options.outputFile))
153 + print("Path to dictionary: " + str(options.diccPath))
154 + print("File with synonyms: " + str(options.diccSynon))
155 + print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))
156 + print("Path to read jsonpdf file to extract section name: " + str(options.jsonpdfPath))
157 +
158 + use_synonyms = False
159 + hashSynon = {}
160 + if options.diccPath is not None and options.diccSynon != "no-synonyms":
161 + print("***** Using synonyms *****")
162 + use_synonyms = True
163 + print('Loading dictionary of synonyms...')
164 + with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon:
165 + hashSynon = json.load(diccSynon)
166 + print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon)))
167 +
168 + hashPredictedRIs = {}
169 + hashPredictedRIsCount = {}
170 + hashPredictedRIsCountVer = {}
171 + hashPredictedRIsCountDev = {}
172 + hashPredictedRIsCountAtt = {}
173 + hashPredictedRIsCountAuto = {}
174 + hashFiles = {}
175 + for path, dirs, files in os.walk(options.predictedPath):
176 + for file in files:
177 + if file.endswith(".a1"):
178 + filename = file[:-3]
179 + if filename not in hashFiles:
180 + hashFiles[filename] = 1
181 + else:
182 + hashFiles[filename] += 1
183 + print("Files: {}".format(len(hashFiles)))
184 +
185 + processedFiles = 0
186 + id_ri = 1
187 + regex_att_auto = re.compile(r"(\.att\.|\.auto\.)[0-9]*$")
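+ # For instance (illustrative): regex_att_auto.sub("", "26781240.att.12") -> "26781240",
+ # stripping the ".att.<n>" / ".auto.<n>" suffix used to mark attributive/autoregulation sentences.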
188 + for file in sorted(hashFiles.keys()):
189 + print("File: {}".format(file))
190 + type_sent = "ver/dev"
191 + if file.find("dataSet_OnlyRI_sentences") > -1:
192 + pmid = "000000"
193 + if file.find("dataSet_OnlyRI_sentences.") > -1:
194 + if file.find(".att.") > -1:
195 + numsent = file[file.find("att.") + 4:]
196 + type_sent = "att"
197 + if pmid.find(".auto.") > -1:
198 + numsent = file[file.find("auto.") + 5:]
199 + type_sent = "auto"
200 + else:
201 + numsent = file[file.find("_", file.find("_", file.find("_") + 1) + 1) + 1:file.find("-")]
202 + numsent = numsent.replace(".al", "")
203 + print("dataSet_OnlyRI_sentences numsent: {}".format(numsent))
204 + print("dataSet_OnlyRI_sentences pmid: {}".format(pmid))
205 + else:
206 + pmid = file[:file.find("_")]
207 + # print("pmid: {}".format(pmid))
208 + numsent = file[file.find("_")+1:file.find("-")]
209 + numsent = numsent.replace(".al", "")
210 + if pmid.find(".att.") > -1:
211 + # CFMC 2022-03-11: Fix error in pmid
212 + # CFMC 2022-03-11 Original: pmid = pmid.replace(".att.", "")
213 + pmid = regex_att_auto.sub("", pmid)
214 + numsent = file[file.find("att.")+4:]
215 + type_sent = "att"
216 + if pmid.find(".auto.") > -1:
217 + # CFMC 2022-03-11: Fix error in pmid
218 + # CFMC 2022-03-11 Original: pmid = pmid.replace(".auto.", "")
219 + pmid = regex_att_auto.sub("", pmid)
220 + numsent = file[file.find("auto.") + 5:]
221 + type_sent = "auto"
222 + # numsent = file[file.find("_"):file.find("-")]
223 + # print("pmid {}".format(pmid))
224 + # print("numsent: {}".format(numsent))
225 +
226 + sentenceFile = file[:file.find("-", file.find("_"))] + ".txt"
227 + hashEntitiesGenes = {}
228 + hashEntitiesTUs = {}
229 + hashEntitiesTFs = {}
230 + hashEntitiesEffects = {}
231 + hashOriginalEffect = {}
232 + regex_fix_regulator = re.compile(r'(Regulated|Binds|Bind|deverbal_effect|Regulate)')
233 + regex_fix_repressor = re.compile(r'(Repressing|Represses)')
234 + with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File:
235 + for line in a1File:
236 + line = line.strip('\n')
237 + listLine1 = line.split('\t')
238 + listLine2 = listLine1[1].split(' ')
239 + entity = listLine2[0]
240 + entity_type = listLine2[0]
241 + idEntity = listLine1[0]
242 + originalEffect = listLine1[2]
243 + if entity.startswith("EFFECT"):
244 + entity = entity[entity.find(".") + 1:]
245 + # print("Entity: {}".format(entity))
246 + if pmid.find("_dev") > -1:
247 + type_sent = "dev"
248 + entity = entity.replace("_dev", "")
249 + # print("Entity without _dev: {}".format(entity))
250 + if idEntity not in hashOriginalEffect:
251 + hashOriginalEffect[idEntity] = originalEffect
252 + if idEntity not in hashEntitiesEffects:
253 + # We fixed some wrong effects in the TRN, but the same fix must also be applied in the script where the error originates
254 + if regex_fix_regulator.match(entity):
255 + print("WARNING EFFECT: {}".format(entity))
256 + entity = regex_fix_regulator.sub("regulator", entity)
257 + print("WARNING EFFECT after: {}".format(entity))
258 + if regex_fix_repressor.match(entity):
259 + print("WARNING EFFECT: {}".format(entity))
260 + entity = regex_fix_repressor.sub("repressor", entity)
261 + print("WARNING EFFECT after: {}".format(entity))
262 + hashEntitiesEffects[idEntity] = entity
263 + else:
264 + entity = listLine1[2]
265 + if entity_type == "GENE":
266 + if idEntity not in hashEntitiesGenes:
267 + hashEntitiesGenes[idEntity] = entity
268 + elif entity_type == "TU":
269 + if idEntity not in hashEntitiesTUs:
270 + hashEntitiesTUs[idEntity] = entity
271 + elif entity_type == "TF":
272 + if idEntity not in hashEntitiesTFs:
273 + hashEntitiesTFs[idEntity] = entity
274 +
275 + # print("hashEntities: {}".format(hashEntitiesGenes))
276 + # print("hashEntities: {}".format(hashEntitiesTUs))
277 + # print("hashEntities: {}".format(hashEntitiesTFs))
278 +
279 + with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File:
280 + sentence = ''
281 + with open(os.path.join(options.predictedPath, file + ".txt"), mode="r") as txtFile:
282 + sentence = txtFile.read()
283 + listTokens = [token.split('|')[0] for token in sentence.split()]
284 + sentence = ' '.join(listTokens)
285 +
286 + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
287 + # Open jsonpdf file
288 + hash_sections = {}
289 + sentences = {}
290 + print('Loading jsonpdf file...')
291 + with open(os.path.join(options.jsonpdfPath, pmid + ".jsonpdf"), "r", encoding="utf-8", errors="replace") as jsonpdfFile:
292 + text_file = jsonpdfFile.read()
293 + if file.startswith("26781240"):
294 + text_file = text_file.replace(" \\ ", " \\\\ ")
295 + elif file.startswith("26249345"):
296 + text_file = text_file.replace('}], ', '}],"sections": {}')
297 + try:
298 + hash_jsonpdf = json.loads(text_file)
299 + print(' Loading jsonpdf file... done!')
300 + except Exception as e:
301 + print(e)
302 + print(" Loading jsonpdf file failed: {}".format(file))
+ # Fall back to empty structures so the lookups below do not fail on an undefined hash_jsonpdf
+ hash_jsonpdf = {"sections": {}, "sentences": {}}
303 + hash_sections = hash_jsonpdf["sections"]
304 + # print("Sections: {}".format(hash_sections))
305 + sentences = hash_jsonpdf["sentences"]
306 + # Open tsv file
307 + print('Loading tsv file...')
308 + file_tsv = pmid + ".pre.fil.tsv"
309 + tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
310 + # print("tsv_file.shape: {}".format(tsv_file.shape))
311 + tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
312 + # print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
313 + tsv_file_new = tsv_file_filtered.reset_index(drop=True)
314 + # print(tsv_file_new.head(10))
315 + print(' Loading tsv file... done!')
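+ # Expected tsv layout (inferred from the columns used here; not documented in the original):
+ # one row per sentence with at least 'status', 'idsentence', 'section', 'sentence' columns.
+ # Only rows with status == 1 survive the filter above, and numsent indexes into that
+ # filtered, re-indexed frame.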
316 + numsent_int = int(numsent)
317 + original_sentence = tsv_file_new.at[numsent_int, 'sentence']
318 + section_num = tsv_file_new.at[numsent_int, 'section']
319 + # print("type(section_num): {}".format(type(section_num)))
320 + original_idsentence = tsv_file_new.at[numsent_int, 'idsentence']
321 + section_num_str = str(section_num)
322 + if section_num_str in hash_sections:
323 + section_name = hash_sections[section_num_str]
324 + else:
325 + section_name = "Unknown"
326 +
327 + for line in a2File:
328 + # print("Line a2: {}".format(line))
329 + # R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4
330 + line = line.strip('\n')
331 + listLine1 = line.split('\t')
332 + listLine2 = listLine1[1].split(' ')
333 + regulator = listLine2[2]
334 + regulator = regulator[regulator.find(":") + 1:]
335 + regulated = listLine2[1]
336 + regulated = regulated[regulated.find(":") + 1:]
337 + effect = listLine2[0]
338 + effect = effect[effect.find(".") + 1:]
339 +
340 + tf = hashEntitiesTFs[regulator]
341 + if tf.endswith("ed"):
342 + tf = tf[:tf.find("-")]
343 + #else:
344 + # Clean TF names by expressions seen in TRN output file
345 + tf = re.sub(r"(/absence|controlle|activation|‐regulate|‐mediate|mediate|-regulate|regulate|ˉ|-like|-mutant|-type|-independent|-dependent|dependent|-dependant|-binding|-and|-family|-bound|-deficient|-indepen-dent|-inducing|-green|-overproducing|-or|-depletion|-repressible|-dual|-box)", "", tf)
346 + # Clean false TF names - 2329
347 + result = re.match(r"(cyclic|RHONDA|Crawford|Hulett|Rhodobacter|Danino|Huang|Neisseria|Huang|HUGHES1|Robbe-Saule|Danchin|Roberts|Furer|Hunter|Furue|Humphreys|Nacional)", tf)
348 + if result:
349 + break
351 + tf = get_standard_name(tf)
352 +
353 + # print("numsent: {}".format(numsent))
354 + # For L&C do not increment 1
355 + # CFMC 2022-03-11 Original: numsent_int = int(numsent)
356 +
357 + if regulated in hashEntitiesGenes:
358 + type_regulated = "Gene"
359 + llave = "{}\t{}\t{}\t{}".format(tf, "gene", hashEntitiesGenes[regulated],
360 + hashEntitiesEffects[effect])
361 + elif regulated in hashEntitiesTUs:
362 + type_regulated ="TU"
363 + llave = "{}\t{}\t{}\t{}".format(tf, "TU", hashEntitiesTUs[regulated],
364 + hashEntitiesEffects[effect])
365 + else:
366 + print("ERROR: Regulated entity not found!")
+ # Skip this line: 'llave' would otherwise be undefined (or stale from a previous iteration)
+ continue
367 + # Clean false cases
368 + if llave.startswith("Hu"):
369 + break
370 +
371 + if llave in hashPredictedRIs:
372 + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
+ # Fixed: 11 placeholders to match the 11 per-sentence columns in the detail header below
373 + hashPredictedRIs[llave].append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, ""))
374 + hashPredictedRIsCount[llave] += 1
375 + if type_sent == "ver/dev":
376 + # if llave in hashPredictedRIsCountVer:
377 + hashPredictedRIsCountVer[llave] += 1
378 + # else:
379 + # hashPredictedRIsCountVer[llave] = 1
380 + elif type_sent == "dev":
381 + # if llave in hashPredictedRIsCountVer:
382 + hashPredictedRIsCountDev[llave] += 1
383 + # else:
384 + # hashPredictedRIsCountDev[llave] = 1
385 + elif type_sent == "att":
386 + # if llave in hashPredictedRIsCountVer:
387 + hashPredictedRIsCountAtt[llave] += 1
388 + # else:
389 + # hashPredictedRIsCountAtt[llave] = 1
390 + elif type_sent == "auto":
391 + # if llave in hashPredictedRIsCountVer:
392 + hashPredictedRIsCountAuto[llave] += 1
393 + # else:
394 + # hashPredictedRIsCountAuto[llave] = 1
395 + else:
396 + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
+ # Fixed: 11 placeholders to match the 11 per-sentence columns in the detail header below
397 + hashPredictedRIs[llave] = ["{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")]
398 + hashPredictedRIsCount[llave] = 1
399 + hashPredictedRIsCountVer[llave] = 0
400 + hashPredictedRIsCountDev[llave] = 0
401 + hashPredictedRIsCountAtt[llave] = 0
402 + hashPredictedRIsCountAuto[llave] = 0
403 + if type_sent == "ver/dev":
404 + hashPredictedRIsCountVer[llave] = 1
405 + elif type_sent == "dev":
406 + hashPredictedRIsCountDev[llave] = 1
407 + elif type_sent == "att":
408 + hashPredictedRIsCountAtt[llave] = 1
409 + elif type_sent == "auto":
410 + hashPredictedRIsCountAuto[llave] = 1
411 +
412 + id_ri += 1
413 + processedFiles += 1
414 +
415 + print("Processed files: {}".format(processedFiles))
416 + with open(os.path.join(options.outputPath, options.outputFile + ".summary.tsv"), mode="w") as oFile:
417 + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
418 + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
419 + for k,v in hashPredictedRIs.items():
420 + oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k],
421 + hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], "1", "True"))
422 + #oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], hashPredictedRIsCountDev[k], hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], v))
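+ # Example summary row (hypothetical values, tab-separated; columns follow the header above):
+ # ArgR  gene  argH  repressor  3  2  1  0  1  True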
423 + with open(os.path.join(options.outputPath, options.outputFile + ".detail.tsv"), mode="w") as oFile:
424 + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
425 + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tConfirmationLevel\n")
426 + for k,v in hashPredictedRIs.items():
427 + for s in v:
428 + oFile.write("{}\t{}\n".format(k, s))
429 +
1 +# -*- coding: UTF-8 -*-
2 +from optparse import OptionParser
3 +import sys
4 +import os
5 +import json
6 +import operator
7 +import re
8 +from nltk.corpus import words
9 +
10 +__author__ = 'CMendezC'
11 +
12 +
13 +# Objective: obtain predicted ris from attributive sentences, such as ArgP-regulated gene argP
14 +# Input format: transformed format.
15 +# WARNING: Only one sentence per line
16 +
17 +# Parameters:
18 +# 1) --inputPath Input path
19 +# 2) --inputFile Input file
20 +# 3) --outputPath Output path
21 +# 4) --diccPath Dictionary path
22 +# 5) --diccEffect File with normalized effects
23 +
+# Unused here (left commented out in the parser below):
24 +# --diccFile JSON file with entity dictionaries
25 +# --diccEPAth Dictionary path diccEffect
26 +# --format Output format: standoff, tabs
27 +
28 +# Output:
29 +# 1) File with predicted ris combined with existing files.
30 +# Format standoff:
31 +# T1 TF 0 0 ArgP-regulated
32 +# T2 GENE 0 0 argP
33 +# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors
34 +# R1 Interaction.activator Target:T3 Agent:T1
35 +
36 +# Execution
37 +# C:\anaconda3\python ri-attributive-extraction.py
38 +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences
39 +# --inputFile ris-sentences-analysis.att.017.txt
40 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs
41 +# --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources
42 +# --diccEffect normalized_Effects.json
43 +# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
44 +# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.286.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
45 +
46 +# python3 ri-attributive-extraction.py
47 +# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences
48 +# --inputFile ris-sentences-analysis.att.017.txt
49 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs
50 +# --diccPath /home/cmendezc/terminologicalResources
51 +# --diccEffect normalized_Effects.json
52 +# python3 ri-attributive-extraction.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json
53 +
54 +###########################################################
55 +# MAIN PROGRAM #
56 +###########################################################
57 +
58 +def getPosWord(wordPos, endPos, text, termList):
59 + offsetStart = 0
60 + wordNum = 0
61 + listText = text.split()
62 + for w in listText:
63 + # if filenameBefore.find('000-2') > -1:
64 + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
65 + if wordNum >= int(wordPos):
66 + # for tok in word.split():
67 + for t in termList:
68 + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
69 + if w == t:
70 + return [w, offsetStart, offsetStart + len(w) - 1]
71 + #else:
72 + wordNum += 1
73 + offsetStart += len(w) + 1
74 + if wordNum > int(endPos):
75 + return None
76 + return None
77 +
78 +def getIdEntity(aList, etype, idE):
79 + entity = aList[0]
80 + if etype == "EFFECT":
81 + normalizedEffect = entity
82 + #print("EFFECT: {}".format(entity))
83 + if entity in hashNormalizedEffects:
84 + normalizedEffect = hashNormalizedEffects[entity]
85 + etype += "." + normalizedEffect
86 + #print("etype: {}".format(etype))
87 + entityPosStart = aList[1]
88 + entityPosEnd = aList[2]
89 + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
90 + #print("keyEntity: {}".format(keyEntity))
91 + if keyEntity not in hashEntities:
92 + idE += 1
93 + idEntity = "T{}".format(idE)
94 + hashEntities[keyEntity] = idEntity
95 + #print("New entity {}: {}".format(idEntity, keyEntity))
96 + return idEntity, idE
97 + else:
98 + idEntity = hashEntities[keyEntity]
99 + return idEntity, idE
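+# Sketch of the keyEntity convention used above (positions and names illustrative):
+#   "TF 0 13 ArgP-regulated" -> "T1", "GENE 20 23 argP" -> "T2";
+# effects get their normalized form folded into the type, e.g. "EFFECT.repressor 5 13 repressed".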
100 +
101 +def getIdInteraction(regulator, regulated, effect, idI, hashInt):
102 + #print("hashInt: {}".format(hashInt))
103 + keyInteraction = "{} {} {}".format(regulator, regulated, effect)
104 + if keyInteraction not in hashInt:
105 + idI += 1
106 + idInteraction = "R{}".format(idI)
107 + hashInt[keyInteraction] = idInteraction
108 + #print("New interaction {}: {}".format(idInteraction, keyInteraction))
109 + #return idInteraction, idI
110 + else:
111 + idInteraction = hashInt[keyInteraction]
112 + return idInteraction, idI
113 +
114 +def saveFiles(filename, hashE, hashI, s, effect):
115 + if effect:
116 + outputPath = os.path.join(options.outputPath, "complete-ris")
117 + else:
118 + outputPath = os.path.join(options.outputPath, "incomplete-ris")
119 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File:
120 + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
121 + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
122 + aList = k.split()
123 + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
124 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File:
125 + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
126 + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
127 + aList = k.split()
128 + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
129 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile:
130 + txtFile.write(s)
131 +
132 +def loadFileEntities(filename, outputPath, hashTemp):
133 + #print("Start loadFileEntities")
134 + idE = 1
135 + try:
136 + f = filename[:filename.rfind(".")] + ".a1"
137 + # print("file entities: {}".format(f))
138 + with open(os.path.join(outputPath, f), mode="r") as a1File:
139 + for line in a1File:
140 + line = line.strip('\n')
141 + listLine1 = line.split('\t')
142 + listLine2 = listLine1[1].split(' ')
143 + etype = listLine2[0]
144 + entityPosStart = listLine2[1]
145 + entityPosEnd = listLine2[2]
146 + entity = listLine1[2]
147 + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
148 + idEntity = listLine1[0]
149 + if keyEntity not in hashTemp:
150 + hashTemp[keyEntity] = idEntity
151 + if int(idEntity[1:]) > idE:
152 + idE = int(idEntity[1:])
153 + except IOError:
154 + print("IOError file: {}".format(os.path.join(outputPath, f)))
155 + # idE = 1
156 + return idE
157 +
158 +def loadFileInteractions(filename, outputPath, hashTemp):
159 + #print("Start loadFileInteractions")
160 + idI = 1
161 + try:
162 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File:
163 + for line in a2File:
164 + #print("Line a2: {}".format(line))
165 + line = line.strip('\n')
166 + listLine1 = line.split('\t')
167 + listLine2 = listLine1[1].split(' ')
168 + regulator = listLine2[2]
169 + regulator = regulator[regulator.find(":") + 1:]
170 + regulated = listLine2[1]
171 + regulated = regulated[regulated.find(":") + 1:]
172 + effect = listLine2[0]
173 + effect = effect[effect.find(".") + 1:]
174 + idInteraction = listLine1[0]
175 + keyInteraction = "{} {} {}".format(regulator, regulated, effect)
176 + if keyInteraction not in hashTemp:
177 + hashTemp[keyInteraction] = idInteraction
178 + if int(idInteraction[1:]) > idI:
179 + idI = int(idInteraction[1:])
180 + except IOError:
181 + print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")))
182 + # idI = 1
183 + return idI
184 +
185 +def getRealPos(posStart, posEnd, lin):
186 + return (posStart, posEnd)
187 +
188 +def getRI(r, l):
189 + regulator = r.group('regulator')
190 + regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l)
191 + # regulatorStart = getRealPos(r.start('regulator'), l)
192 + # regulatorEnd = getRealPos(r.end('regulator'), l)
193 + regulated = r.group('regulated')
194 + regulatedPos = getRealPos(r.start('regulated'), r.end('regulated'), l)
195 + # regulatedStart = getRealPos(r.start('regulated'), l)
196 + # regulatedEnd = getRealPos(r.end('regulated'), l)
197 + effect = r.group('effect')
198 + effectPos = getRealPos(r.start('effect'), r.end('effect'), l)
199 + # effectStart = getRealPos(r.start('effect'), l)
200 + # effectEnd = getRealPos(r.end('effect'), l)
201 + #print("Regulator {}, start {}, end {}".format(regulator, regulatorPos[0], regulatorPos[1]))
202 + #print("Regulated {}, start {}, end {}".format(regulated, regulatedPos[0], regulatedPos[1]))
203 + #print("Effect {}, start {}, end {}".format(effect, effectPos[0], effectPos[1]))
204 + return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
205 + regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
206 + effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
207 +
208 +if __name__ == "__main__":
209 + # Parameter definition
210 + # python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py
211 + # --inputPath $(dirname ${file})
212 + # --inputFile $(basename ${file})
213 + # --outputPath $OUTPUT_PATH
214 + # --diccPath $DICC_PATH
215 + # --diccEffect normalized_Effects.json
216 + parser = OptionParser()
217 + parser.add_option("--inputPath", dest="inputPath",
218 + help="Input path", metavar="PATH")
219 + parser.add_option("--inputFile", dest="inputFile",
220 + help="Input file", metavar="FILE")
221 + parser.add_option("--outputPath", dest="outputPath",
222 + help="Output path", metavar="PATH")
223 + parser.add_option("--diccPath", dest="diccPath",
224 + help="Path to read dictionaries", metavar="PATH")
225 + # parser.add_option("--diccFile", dest="diccFile",
226 + # help="JSON file with entity dictionaries", metavar="FILE")
227 + parser.add_option("--diccEffect", dest="diccEffect",
228 + help="File with normalized effects", metavar="FILE")
229 +
230 + # parser.add_option("--format", dest="format",
231 + # help="Output format: standoff", metavar="TEXT")
232 + # parser.add_option("--diccEPAth", dest="diccEPAth",
233 + # help="File with normalized effects", metavar="FILE")
234 +
235 + (options, args) = parser.parse_args()
236 + #if len(args) > 0:
237 + # parser.error("None parameter entered.")
238 + # sys.exit(1)
239 +
240 + # Printing parameter values
241 + print('-------------------------------- PARAMETERS --------------------------------')
242 + print("Input path: " + str(options.inputPath))
243 + print("Input file: " + str(options.inputFile))
244 + print("Output path: " + str(options.outputPath))
245 + print("Path to read dictionaries: " + str(options.diccPath))
246 + # print("JSON file with entity dictionaries: " + str(options.diccFile))
247 + print("File with normalized effects: " + str(options.diccEffect))
248 + # print("Output format: " + str(options.format))
249 + # print("Path to read normalized effects: " + str(options.diccEPAth))
250 +
251 + # regularWords = words.words('en')
252 +
253 + # print('Loading dictionaries...')
254 + # with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
255 + # hashDicc = json.load(diccFile)
256 +
257 + # hashTermFiles = hashDicc["hashTermFiles"]
258 + # hashTerms = hashDicc["hashTerms"]
259 +
260 + # for key in hashTermFiles.keys():
261 + # for f in hashTermFiles[key]:
262 + # # print('File: ' + f)
263 + # with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile:
264 + # for line in iFile:
265 + # line = line.strip('\n')
266 + # line = line.replace(' ', '-')
267 + # if line not in hashTerms[key]:
268 + # hashTerms[key].append(line)
269 + # # if options.termLower:
270 + # # hashTerms[key].append(line.lower())
271 + # # if options.termCapitalize:
272 + # # hashTerms[key].append(line.capitalize())
273 + # print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
274 +
275 + # Loading normalized effects
276 + print('Loading normalized effects ending with -d...')
277 + hashNormalizedEffects = {}
278 + with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
279 + hashNormalizedEffects = json.load(diccFile)
280 + listEffects = []
281 + for eff in hashNormalizedEffects.keys():
282 + if eff.endswith('d'):
283 + listEffects.append(eff)
284 + listEffects.append("dependent")
285 + effects = "|".join(listEffects)
286 + #print("Effects: {}".format(effects))
287 +
288 + files = {}
289 + hashEntities = {}
290 + hashInteractions = {}
291 + hashInteractionsEffect = {}
292 + idEntities = 1
293 + idInteractions = 1
294 + idInteractionsEffect = 1
295 +
296 + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)\s([^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
297 + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+( [^ ]+)')
298 + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))\s([^|]+\|[^|]+\|(CC|,))?)+ ([^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
299 + # regexAttRILeft = re.compile(r'(?:([^|\s]+\|[^|]+\|(?:GENE|TU))\s(?:[^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
300 + # regexAttRILeft = re.compile(r'(?=([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)')
301 + # regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)')
302 + # regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+) ([^ ]+ )+(?P<regulator>[^|]+\|[^|]+\|TF)')
303 + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>' + r'(' + effects + ')\|[^|]+\|TF) [^|]+\|gene')
304 +
305 + # reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
306 + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(regulated|repressed)\|[^|]+\|TF) [^|]+\|gene')
307 + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ ){,5}(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene')
308 + # CMC 2018-11-07: regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene')
309 + regexAttRILeft = re.compile(
310 + r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF)')
311 + # regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ ){,5}(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
312 + # CMC 2018-11-07: regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ )+(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
313 + regexAttRIRight = re.compile(
314 + r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) ([^ ]+ )*(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
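+ # Illustrative matches on transformed input (word|lemma|tag tokens), assuming "regulated" is
+ # among the normalized effects loaded above:
+ #   left:  "argP|argP|GENE ... ArgP-regulated|ArgP-regulated|TF"
+ #          -> regulated = "argP|argP|GENE", regulator = "ArgP-regulated|ArgP-regulated|TF", effect = "regulated"
+ #   right: "ArgP-regulated|ArgP-regulated|TF ... argP|argP|GENE" (same groups, TF first)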
315 +
316 + filename = options.inputFile
317 + hashEntities = {}
318 + hashInteractions = {}
319 + hashInteractionsEffect = {}
320 + idEntities = 1
321 + idInteractions = 1
322 + idInteractionsEffect = 1
323 + outputPath = os.path.join(options.outputPath, "complete-ris")
324 + idEntities = loadFileEntities(filename, outputPath, hashEntities)
325 + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
326 + outputPath = os.path.join(options.outputPath, "incomplete-ris")
327 + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
328 +
329 + listRIs = []
330 +
331 + with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
332 + for line in iFile:
333 + line = line.rstrip('\n')
334 + # Search to the left
335 + #print("Searching <<")
336 + result = regexAttRILeft.search(line)
337 + #print("result: {}".format(result))
338 + lineTemp = line
339 + # print("lineTemp: {}".format(lineTemp))
340 + while result:
341 + #print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect')))
342 + listRIs.append(getRI(result, line))
343 + #print("listRIs: {}".format(listRIs))
344 + lineTemp = lineTemp.replace(result.group('regulated'), '')
345 + #print("lineTemp for: {}".format(lineTemp))
346 + result = regexAttRILeft.search(lineTemp)
347 + #print("result: {}".format(result))
348 +
349 + # Search to the right
350 + #print("Searching >>")
351 + result = regexAttRIRight.search(line)
352 + #print("result: {}".format(result))
353 + lineTemp = line
354 + # print("lineTemp: {}".format(lineTemp))
355 + while result:
356 + #print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect')))
357 + listRIs.append(getRI(result, line))
358 + #print("listRIs: {}".format(listRIs))
359 + lineTemp = lineTemp.replace(result.group('regulated'), '')
360 + #print("lineTemp for: {}".format(lineTemp))
361 + result = regexAttRIRight.search(lineTemp)
362 + #print("result: {}".format(result))
363 +
364 + # result = regexAttRIRight.finditer(line)
365 + # lineTemp = line
366 + # while result:
367 + # listRIs.append(getRI(result, line))
368 + # lineTemp = lineTemp.replace(result.group('regulated'), '')
369 + # result = regexAttRIRight.finditer(lineTemp)
370 +
371 + # return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
372 + # regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
373 + # effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
374 + for ri in listRIs:
375 + #print("ri: {}".format(ri))
376 + if len(ri) != 4:
377 + print("WARNING! corrupted list")
378 + exit()
379 + regulator = ri[0]
380 + regulated = ri[1]
381 + effect = ri[2]
382 + line = ri[3]
383 +
384 + listElem = regulator.split('|')
385 + regulatorWord = listElem[0]
386 + regulatorType = listElem[2]
387 + regulatorStart = listElem[3]
388 + regulatorEnd = listElem[4]
389 +
390 + listElem = regulated.split('|')
391 + regulatedWord = listElem[0]
392 + regulatedType = listElem[2]
393 + regulatedStart = listElem[3]
394 + regulatedEnd = listElem[4]
395 +
396 + listElem = effect.split('|')
397 + effectWord = listElem[0]
398 + effectType = "EFFECT"
399 + effectStart = listElem[1]
400 + effectEnd = listElem[2]
401 +
402 + idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
403 + if regulatedType == "GENE":
404 + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
405 + elif regulatedType == "TU":
406 + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "TU", idEntities)
407 + else:
408 + print("WARNING! Unknown entity type")
409 + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
410 + idInteractions, hashInteractions)
411 + idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
412 + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
413 + idInteractionsEffect,
414 + hashInteractionsEffect)
415 +
416 + saveFiles(filename, hashEntities, hashInteractions, line, effect=False)
417 + saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True)
1 +# -*- coding: UTF-8 -*-
2 +from optparse import OptionParser
3 +import sys
4 +import os
5 +import json
6 +import operator
7 +import re
8 +from general_functions import getTypeRegulation
9 +from nltk.corpus import words
10 +
11 +__author__ = 'CMendezC'
12 +
13 +
14 +# Objective: obtain predicted ris from autoregulation sentences,
15 +# such as ArgP protein represses its own synthesis
16 +# Input format: transformed format.
17 +# WARNING: Only one sentence per line
18 +
19 +# Parameters:
20 +# 1) --inputPath Input path
21 +# 2) --inputFile Input file
22 +# 3) --outputPath Output path
23 +# 4) --diccPath Dictionary path
24 +# 5) --diccEffect File with normalized effects
25 +
+# Not used by this script:
26 +# --diccFile JSON file with entity dictionaries
27 +# --diccEPAth Dictionary path diccEffect
28 +# --format Output format: standoff, tabs
29 +
30 +# Output:
31 +# 1) File with predicted ris combined with existing files.
32 +# Format standoff:
33 +# T1 TF 0 0 ArgP
34 +# T2 GENE 0 0 Argp --> argP
35 +# R1 Interaction.activator Target:T3 Agent:T1
36 +# Sentence ArgP protein represses its own synthesis
37 +# The FimZ transcription factor activates this promoter directly ,
38 +# and it also positively regulates the transcription of its own gene
39 +# FimZ is known to regulate the expression of its own gene positively
40 +# FimZ also positively regulates its own transcription
41 +# ArgP protein represses its own synthesis
42 +# ArgP both represses its own transcription
43 +# ArgP protein represses its own synthesis
44 +# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
45 +# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
46 +
47 +# Execution
48 +# python3 ri-autoregulation-extraction-v01.py
49 +# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
50 +# --inputFile dataSet_OnlyRI_sentences.auto.1017.txt
51 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
52 +# --diccPath /home/cmendezc/terminologicalResources
53 +# --diccEffect normalized_Effects.json
54 +# python3 ri-autoregulation-extraction-v01.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences --inputFile dataSet_OnlyRI_sentences.auto.1017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json
55 +
56 +###########################################################
57 +# MAIN PROGRAM #
58 +###########################################################
59 +
60 +def getPosWord(wordPos, endPos, text, termList):
61 + offsetStart = 0
62 + wordNum = 0
63 + listText = text.split()
64 + for w in listText:
65 + # if filenameBefore.find('000-2') > -1:
66 + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
67 + if wordNum >= int(wordPos):
68 + # for tok in word.split():
69 + for t in termList:
70 + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
71 + if w == t:
72 + return [w, offsetStart, offsetStart + len(w) - 1]
73 + #else:
74 + wordNum += 1
75 + offsetStart += len(w) + 1
76 + if wordNum > int(endPos):
77 + return None
78 + return None
79 +
80 +def getIdEntity(aList, etype, idE):
81 + entity = aList[0]
82 + if etype == "EFFECT":
83 + normalizedEffect = entity
84 + #print("EFFECT: {}".format(entity))
85 + if entity in hashNormalizedEffects:
86 + normalizedEffect = hashNormalizedEffects[entity]
87 + etype += "." + normalizedEffect
88 + #print("etype: {}".format(etype))
89 + entityPosStart = aList[1]
90 + entityPosEnd = aList[2]
91 + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
92 + #print("keyEntity: {}".format(keyEntity))
93 + if keyEntity not in hashEntities:
94 + idE += 1
95 + idEntity = "T{}".format(idE)
96 + hashEntities[keyEntity] = idEntity
97 + #print("New entity {}: {}".format(idEntity, keyEntity))
98 + return idEntity, idE
99 + else:
100 + idEntity = hashEntities[keyEntity]
101 + return idEntity, idE
102 +
103 +def getIdInteraction(regulator, regulated, effect, idI, hashInt):
104 + #print("hashInt: {}".format(hashInt))
105 + keyInteraction = "{} {} {}".format(regulator, regulated, effect)
106 + if keyInteraction not in hashInt:
107 + idI += 1
108 + idInteraction = "R{}".format(idI)
109 + hashInt[keyInteraction] = idInteraction
110 + #print("New interaction {}: {}".format(idInteraction, keyInteraction))
111 + #return idInteraction, idI
112 + else:
113 + idInteraction = hashInt[keyInteraction]
114 + return idInteraction, idI
115 +
116 +def saveFiles(filename, hashE, hashI, s, effect):
117 + if effect:
118 + outputPath = os.path.join(options.outputPath, "complete-ris")
119 + else:
120 + outputPath = os.path.join(options.outputPath, "incomplete-ris")
121 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File:
122 + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
123 + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
124 + aList = k.split()
125 + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
126 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File:
127 + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
128 + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
129 + aList = k.split()
130 + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
131 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile:
132 + txtFile.write(s)
133 +
134 +def loadFileEntities(filename, outputPath, hashTemp):
135 + #print("Start loadFileEntities")
136 + idE = 1
137 + try:
138 + f = filename[:filename.rfind(".")] + ".a1"
139 + # print("file entities: {}".format(f))
140 + with open(os.path.join(outputPath, f), mode="r") as a1File:
141 + for line in a1File:
142 + line = line.strip('\n')
143 + listLine1 = line.split('\t')
144 + listLine2 = listLine1[1].split(' ')
145 + etype = listLine2[0]
146 + entityPosStart = listLine2[1]
147 + entityPosEnd = listLine2[2]
148 + entity = listLine1[2]
149 + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
150 + idEntity = listLine1[0]
151 + if keyEntity not in hashTemp:
152 + hashTemp[keyEntity] = idEntity
153 + if int(idEntity[1:]) > idE:
154 + idE = int(idEntity[1:])
155 + except IOError:
156 + print("IOError file: {}".format(os.path.join(outputPath, f)))
157 + # idE = 1
158 + return idE
159 +
160 +def loadFileInteractions(filename, outputPath, hashTemp):
161 + #print("Start loadFileInteractions")
162 + idI = 1
163 + try:
164 + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File:
165 + for line in a2File:
166 + #print("Line a2: {}".format(line))
167 + line = line.strip('\n')
168 + listLine1 = line.split('\t')
169 + listLine2 = listLine1[1].split(' ')
170 + regulator = listLine2[2]
171 + regulator = regulator[regulator.find(":") + 1:]
172 + regulated = listLine2[1]
173 + regulated = regulated[regulated.find(":") + 1:]
174 + effect = listLine2[0]
175 + effect = effect[effect.find(".") + 1:]
176 + idInteraction = listLine1[0]
177 + keyInteraction = "{} {} {}".format(regulator, regulated, effect)
178 + if keyInteraction not in hashTemp:
179 + hashTemp[keyInteraction] = idInteraction
180 + if int(idInteraction[1:]) > idI:
181 + idI = int(idInteraction[1:])
182 + except IOError:
183 + print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")))
184 + # idI = 1
185 + return idI
186 +
187 +'''
188 +def getTypeRegulation(effect_group, posini, sent, type_sent):
189 + # To change regulation effect in such as:
190 + # negative regulator --> repressor
191 + # positively regulates --> activator
192 + effect_ret = effect_group
193 + #listEff = effect_ret.split('|')
194 +
195 + if type_sent == "tra":
196 + regexTypeEffectPosi = re.compile(r'(?<=positive\|(RB|JJ) )' + effect_ret)
197 + regexTypeEffectNega = re.compile(r'(?<=negative\|(RB|JJ) )' + effect_ret)
198 + if regexTypeEffectPosi.search(sent, posini - 12):
199 + # Creo que no es necesario: effect_ret = "activator|{}|{}".format(listEff[1], listEff[2])
200 + effect_ret = "activator"
201 + print("Change regulation effect: {}".format(sent))
202 + elif regexTypeEffectNega.search(sent, posini - 12):
203 + # Creo que no es necesario: effect_ret = "repressor|{}|{}".format(listEff[1], listEff[2])
204 + effect_ret = "repressor"
205 + print("Change regulation effect: {}".format(sent))
206 + return effect_ret
207 +'''
208 +
209 +def getRealPos(posStart, posEnd, lin):
210 + return (posStart, posEnd)
211 +
212 +def getRI(r, l):
213 + regulator = r.group('regulator')
214 + regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l)
215 + # We change TF name to GENE name
216 + listRegulator = regulator.split('|')
217 + regulatorWord = listRegulator[0]
218 + regulated = regulatorWord[0].lower()+regulatorWord[1:]
219 + regulated += "|{}|GENE".format(regulated)
220 + regulatedPos = getRealPos(0, 0, l)
221 + effect = r.group('effect')
222 + # print("effect from group: {}".format(effect))
223 + effectPos = getRealPos(r.start('effect'), r.end('effect'), l)
224 +
225 + # To change regulation effect in:
226 + # negative regulator --> repressor
227 + # positively regulates --> activator
228 + effect = getTypeRegulation(effect, r.start('effect'), l, "tra")
229 +
230 + return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
231 + regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
232 + effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
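+# Sketch (illustrative): for regulator "ArgP|ArgP|TF" the regulated entity is derived from the
+# TF name itself, "argP|argP|GENE" at dummy positions 0|0, since autoregulation sentences do not
+# mention the target gene as a separate token.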
233 +
234 +if __name__ == "__main__":
235 + # Parameter definition
236 + parser = OptionParser()
237 + parser.add_option("--inputPath", dest="inputPath",
238 + help="Input path", metavar="PATH")
239 + parser.add_option("--inputFile", dest="inputFile",
240 + help="Input file", metavar="FILE")
241 + parser.add_option("--outputPath", dest="outputPath",
242 + help="Output path", metavar="PATH")
243 + parser.add_option("--diccPath", dest="diccPath",
244 + help="Path to read dictionaries", metavar="PATH")
245 + parser.add_option("--diccEffect", dest="diccEffect",
246 + help="File with normalized effects", metavar="FILE")
247 +
248 + (options, args) = parser.parse_args()
249 + #if len(args) > 0:
250 + # parser.error("None parameter entered.")
251 + # sys.exit(1)
252 +
253 + # Printing parameter values
254 + print('-------------------------------- PARAMETERS --------------------------------')
255 + print("Input path: " + str(options.inputPath))
256 + print("Input file: " + str(options.inputFile))
257 + print("Output path: " + str(options.outputPath))
258 + print("Path to read dictionaries: " + str(options.diccPath))
259 + print("File with normalized effects: " + str(options.diccEffect))
260 +
261 + # Loading normalized effects
262 + print('Loading normalized effects (all)...')
263 + hashNormalizedEffects = {}
264 + with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
265 + hashNormalizedEffects = json.load(diccFile)
266 + listEffects = []
267 + for eff in hashNormalizedEffects.keys():
268 + listEffects.append(eff)
269 + effects = "|".join(listEffects)
270 + #print("Effects: {}".format(effects))
271 +
272 + files = {}
273 + hashEntities = {}
274 + hashInteractions = {}
275 + hashInteractionsEffect = {}
276 + idEntities = 1
277 + idInteractions = 1
278 + idInteractionsEffect = 1
279 +
280 + # The FimZ transcription factor activates this promoter directly ,
281 + # and it also positively regulates the transcription of its own gene
282 + # FimZ is known to regulate the expression of its own gene positively
283 + # FimZ also positively regulates its own transcription
284 + # ArgP protein represses its own synthesis
285 + # ArgP both represses its own transcription
286 + # ArgP protein represses its own synthesis
287 + # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
288 + # of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
289 + regexAutoRI = re.compile(
290 + # r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]\s){,4}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
291 + r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
292 + #r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^(TF)\s]+\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
293 + #r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^T][^F]\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT')
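+ # Illustrative match (the tagged example sentence from the header above), assuming "regulator"
+ # is among the normalized effects:
+ #   "OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN"
+ #   -> regulator = "OxyR|OxyR|TF", effect = "regulator"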
294 +
295 + filename = options.inputFile
296 + hashEntities = {}
297 + hashInteractions = {}
298 + hashInteractionsEffect = {}
299 + idEntities = 1
300 + idInteractions = 1
301 + idInteractionsEffect = 1
302 + outputPath = os.path.join(options.outputPath, "complete-ris")
303 + idEntities = loadFileEntities(filename, outputPath, hashEntities)
304 + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
305 + outputPath = os.path.join(options.outputPath, "incomplete-ris")
306 + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
307 +
308 + listRIs = []
309 + # print("Read autoregulation file")
310 + with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
311 + for line in iFile:
312 + line = line.rstrip('\n')
313 + print("Buscando autoregulation")
314 + result = regexAutoRI.search(line)
315 + #print("result: {}".format(result))
316 + if result:
317 + lineTemp = result.string[result.end('regulator'):result.end(0)]
318 + # print("lineTemp: {}".format(lineTemp))
319 + result2 = regexAutoRI.search(lineTemp)
320 + if result2:
321 + print("Regulator {} regulated {} effect {}".format(result2.group('regulator'), result2.group('regulator'), result2.group('effect')))
322 + listRIs.append(getRI(result2, line))
323 + print("listRIs: {}".format(listRIs))
324 + elif result:
325 + print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulator'), result.group('effect')))
326 + listRIs.append(getRI(result, line))
327 + print("listRIs: {}".format(listRIs))
328 +
329 +
330 + for ri in listRIs:
331 + #print("ri: {}".format(ri))
332 + if len(ri) != 4:
333 + print("WARNING! corrupted list")
334 + exit()
335 + regulator = ri[0]
336 + regulated = ri[1]
337 + effect = ri[2]
338 + line = ri[3]
339 +
340 + listElem = regulator.split('|')
341 + regulatorWord = listElem[0]
342 + regulatorType = listElem[2]
343 + regulatorStart = listElem[3]
344 + regulatorEnd = listElem[4]
345 +
346 + listElem = regulated.split('|')
347 + regulatedWord = listElem[0]
348 + regulatedType = listElem[2]
349 + regulatedStart = listElem[3]
350 + regulatedEnd = listElem[4]
351 +
352 + listElem = effect.split('|')
353 + effectWord = listElem[0]
354 + effectType = "EFFECT"
355 + effectStart = listElem[1]
356 + effectEnd = listElem[2]
357 +
358 + idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
359 + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
360 + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
361 + idInteractions, hashInteractions)
362 + idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
363 + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
364 + idInteractionsEffect,
365 + hashInteractionsEffect)
366 +
367 + saveFiles(filename, hashEntities, hashInteractions, line, effect=False)
368 + saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True)
1 +# -*- coding: UTF-8 -*-
2 +from optparse import OptionParser
3 +import sys
4 +import os
5 +import json
6 +import operator
7 +from general_functions import getTypeRegulation
8 +import re
9 +from nltk.corpus import words
10 +
11 +__author__ = 'CMendezC'
12 +
13 +
14 +# Objective: obtain predicted ris from triplets extracted by OpenIE Stanford CoreNLP
15 +# Input format:
16 +# WARNING: Only one sentence per line
17 +
18 +# Parameters:
19 +# 1) --inputPath Input path
20 +# 2) --inputFile Input file
21 +# 3) --outputPath Output path
22 +# 4) --diccPath Dictionary path
23 +# 5) --diccFile JSON file with entity dictionaries
24 +# 6) --diccEffect File with normalized effects
25 +# 7) --format Output format: standoff, tabs
26 +# 8) --diccEPAth Dictionary path diccEffect
27 +
28 +# Output:
29 +# 1) File with predicted ris.
30 +# Format standoff:
31 +# T1 TF 0 0 MetR
32 +# T2 TU 0 0 metH
33 +# T3 GENE 0 0 metH
34 +# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors
35 +# T2 Growth_condition 150 179 switch rich to minimal medium
36 +# R1 Interaction.activator Target:T3 Agent:T1
37 +# R2 Interaction.activator Target:T2 Agent:T1
38 +
39 +# Execution
40 +# python3.4 ri-openie-extraction.py
41 +# --inputFile /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris/predicted-ris.reverb
42 +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/processing-ris
43 +# --diccPath /home/cmendezc/terminologicalResources
44 +# --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json
45 +# --diccEffect normalized_Effects_Type.json
46 +# --format standoff
47 +
48 +###########################################################
49 +# MAIN PROGRAM #
50 +###########################################################
51 +
52 +def getPosWord(wordPos, endPos, text, termList, type_entity=""):
53 + #print("GETPOSWORD wordPOs {}".format(wordPos))
54 + offsetStart = 0
55 + wordNum = 0
56 + listText = text.split()
57 + for w in listText:
58 + # if filenameBefore.find('000-2') > -1:
59 + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
60 + if wordNum >= int(wordPos):
61 + # for tok in word.split():
62 + for t in termList:
63 + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
64 + if w == t:
65 + if type_entity == "EFFECT":
66 + # To change regulation effect in:
67 + # negative regulator --> repressor
68 + # positively regulates --> activator
69 + print("text: {}".format(text))
70 + new_w = getTypeRegulation(w, int(wordPos), text, "word")
71 + return [new_w, offsetStart, offsetStart + len(w) - 1]
72 + else:
73 + return [w, offsetStart, offsetStart + len(w) - 1]
74 + #else:
75 + wordNum += 1
76 + offsetStart += len(w) + 1
77 + if wordNum > int(endPos):
78 + return None
79 + return None
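+# Hedged note on the EFFECT branch above: getTypeRegulation (imported from general_functions;
+# a commented-out variant appears in ri-autoregulation-extraction-v01.py) presumably rewrites
+# an effect by its polarity context, e.g. "negative regulator" -> "repressor" and
+# "positively regulates" -> "activator"; other effects are returned unchanged.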
80 +
81 +
82 +def getIdEntity(aList, etype, idE):
83 + entity = aList[0]
84 + if etype == "EFFECT":
85 + normalizedEffect = entity
86 + # print("EFFECT: {}".format(entity))
87 + if entity in hashEffects:
88 + normalizedEffect = hashEffects[entity]
89 + etype += "." + normalizedEffect
90 + # print("EFFECT: {}".format(entity))
91 + entityPosStart = aList[1]
92 + entityPosEnd = aList[2]
93 + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
94 + #if filenameBefore.find('061-02') > -1:
95 + # print("keyEntity: {}".format(keyEntity))
96 + # print("idE: {}".format(idE))
97 + # print("hashEntities: {}".format(hashEntities))
98 + if keyEntity not in hashEntities:
99 + idE += 1
100 + idEntity = "T{}".format(idE)
101 + #if filenameBefore.find('061-02') > -1:
102 + # print("idEntity not in hashEntities: {}".format(keyEntity))
103 + # print("idE not in hashEntities: {}".format(idE))
104 + hashEntities[keyEntity] = idEntity
105 + #print("New entity {}: {}".format(idEntity, keyEntity))
106 + return idEntity, idE
107 + else:
108 + idEntity = hashEntities[keyEntity]
109 + return idEntity, idE
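+# Example (hypothetical values): getIdEntity(["MetR", 0, 3], "TF", 0)
+# builds the key "TF 0 3 MetR"; if the key is new it is stored in the
+# global hashEntities as "T1" and ("T1", 1) is returned, otherwise the
+# previously assigned id is returned unchanged.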
110 +
111 +
112 +def getIdInteraction(regulator, regulated, effect, idI, hashInt):
113 + #print("hashInt: {}".format(hashInt))
114 + keyInteraction = "{} {} {}".format(regulator, regulated, effect)
115 + if keyInteraction not in hashInt:
116 + idI += 1
117 + idInteraction = "R{}".format(idI)
118 + hashInt[keyInteraction] = idInteraction
119 + #print("New interaction {}: {}".format(idInteraction, keyInteraction))
120 + #return idInteraction, idI
121 + else:
122 + idInteraction = hashInt[keyInteraction]
123 + return idInteraction, idI
124 +
125 +
126 +def saveFiles(filename, hashE, hashI, s, effect):
127 + if effect:
128 + outputPath = os.path.join(options.outputPath, "complete-ris")
129 + else:
130 + outputPath = os.path.join(options.outputPath, "incomplete-ris")
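+    # NOTE: 'file' below is the module-level variable holding the full path
+    # from the current input line, so filename[:file.find(".")] trims
+    # 'filename' with an index computed on that full path, not on 'filename'.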
131 + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="w") as a1File:
132 + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
133 + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
134 + aList = k.split()
135 + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
136 + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="w") as a2File:
137 + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
138 + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
139 + aList = k.split()
140 + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
141 + with open(os.path.join(outputPath, filename[:file.find(".")] + ".txt"), mode="w") as txtFile:
142 + txtFile.write(s)
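+# For instance (hypothetical ids), a hashE entry {"TF 0 3 MetR": "T1"} is
+# written to the .a1 file as "T1\tTF 0 3\tMetR", and a hashI entry
+# {"T1 T2 regulator": "R1"} becomes
+# "R1\tInteraction.regulator Target:T2 Agent:T1" in the .a2 file.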
143 +
144 +def loadFileEntities(filename, outputPath, hashTemp):
145 + idE = 1
146 + try:
147 + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="r") as a1File:
148 + for line in a1File:
149 + line = line.strip('\n')
150 + listLine1 = line.split('\t')
151 + listLine2 = listLine1[1].split(' ')
152 + etype = listLine2[0]
153 + entityPosStart = listLine2[1]
154 + entityPosEnd = listLine2[2]
155 + entity = listLine1[2]
156 + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
157 + idEntity = listLine1[0]
158 + if keyEntity not in hashTemp:
159 + hashTemp[keyEntity] = idEntity
160 + if int(idEntity[1:]) > idE:
161 + idE = int(idEntity[1:])
162 + except IOError:
163 + print("IOError file, idEntity starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a1")))
164 + # idE = 1
165 + return idE
166 +
167 +def loadFileInteractions(filename, outputPath, hashTemp):
168 + idI = 1
169 + try:
170 + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="r") as a2File:
171 + for line in a2File:
172 + #print("Line a2: {}".format(line))
173 + line = line.strip('\n')
174 + listLine1 = line.split('\t')
175 + listLine2 = listLine1[1].split(' ')
176 + regulator = listLine2[2]
177 + regulator = regulator[regulator.find(":") + 1:]
178 + regulated = listLine2[1]
179 + regulated = regulated[regulated.find(":") + 1:]
180 + effect = listLine2[0]
181 + effect = effect[effect.find(".") + 1:]
182 + idInteraction = listLine1[0]
183 + keyInteraction = "{} {} {}".format(regulator, regulated, effect)
184 + if keyInteraction not in hashTemp:
185 + hashTemp[keyInteraction] = idInteraction
186 + if int(idInteraction[1:]) > idI:
187 + idI = int(idInteraction[1:])
188 + except IOError:
189 + print("IOError file, idInteraction starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a2")))
190 + # idI = 1
191 + return idI
192 +
193 +if __name__ == "__main__":
194 + # Parameter definition
195 + parser = OptionParser()
196 + parser.add_option("--inputPath", dest="inputPath",
197 + help="Input path", metavar="PATH")
198 + parser.add_option("--inputFile", dest="inputFile",
199 + help="Input file", metavar="FILE")
200 + parser.add_option("--outputPath", dest="outputPath",
201 + help="Output path", metavar="PATH")
202 + #parser.add_option("--outputFile", dest="outputFile",
203 + #help="Output file", metavar="FILE")
204 + parser.add_option("--diccPath", dest="diccPath",
205 + help="Path to read dictionaries", metavar="PATH")
206 + parser.add_option("--diccFile", dest="diccFile",
207 + help="JSON file with entity dictionaries", metavar="FILE")
208 + parser.add_option("--diccEffect", dest="diccEffect",
209 + help="File with normalized effects", metavar="FILE")
210 + parser.add_option("--format", dest="format",
211 + help="Output format: standoff", metavar="TEXT")
212 +    parser.add_option("--diccEPAth", dest="diccEPAth",
213 +                      help="Path to read normalized effects", metavar="PATH")
214 +
215 + (options, args) = parser.parse_args()
216 + if len(args) > 0:
217 +        parser.error("No positional arguments are expected.")
218 + sys.exit(1)
219 +
220 + # Printing parameter values
221 + print('-------------------------------- PARAMETERS --------------------------------')
222 + print("Input path: " + str(options.inputPath))
223 + print("Input file: " + str(options.inputFile))
224 + print("Output path: " + str(options.outputPath))
225 + #print("Output file: " + str(options.outputFile))
226 + print("Path to read dictionaries: " + str(options.diccPath))
227 + print("JSON file with entity dictionaries: " + str(options.diccFile))
228 + print("Path to read normalized effects: " + str(options.diccEPAth))
229 + print("File with normalized effects: " + str(options.diccEffect))
230 + print("Output format: " + str(options.format))
231 +
232 + regularWords = words.words('en')
233 +
234 + print('Loading dictionaries...')
235 + with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
236 + hashDicc = json.load(diccFile)
237 +
238 + # hashTermFiles = hashDicc["hashTermFiles"]
239 + # hashTerms = hashDicc["hashTerms"]
240 +
241 + # for key in hashTermFiles.keys():
242 + # for f in hashTermFiles[key]:
243 + # # print('File: ' + f)
244 + # with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile:
245 + # for line in iFile:
246 + # line = line.strip('\n')
247 + # line = line.replace(' ', '-')
248 + # if line not in hashTerms[key]:
249 + # hashTerms[key].append(line)
250 + # # if options.termLower:
251 + # # hashTerms[key].append(line.lower())
252 + # # if options.termCapitalize:
253 + # # hashTerms[key].append(line.capitalize())
254 + # print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
255 +
256 + # Loading normalized effects
257 + print('Loading normalized effects...')
258 + with open(os.path.join(options.diccEPAth, options.diccEffect)) as diccFile:
259 + hashEffects = json.load(diccFile)
260 +
261 + files = {}
262 + hashEntities = {}
263 + hashInteractions = {}
264 + hashInteractionsEffect = {}
265 + idEntities = 1
266 + idInteractions = 1
267 + idInteractionsEffect = 1
268 + filenameBefore = ''
269 + regexNumFile = re.compile(r'_([0-9]+)[.-]')
270 + numFile = ""
271 + inumFile = 0
272 + hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []}
273 +
274 + with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
275 + for line in iFile:
276 + line = line.rstrip('\n')
277 + listLine = line.split('\t')
278 + file = listLine[0]
279 + filename = file.split("/")[-1]
280 + filename = filename[:-4]
281 + if filename not in files:
282 + # New file, that is, new sentence
283 + files[filename] = 1
284 + if len(files) > 1:
285 + if len(hashEntities) > 0:
286 + #if filenameBefore.find('061-02') > -1:
287 + # print("filenameBefore: {}".format(filenameBefore))
288 + # print("Save hashEntities: {}".format(hashEntities))
289 + # print("Save hashInteractions: {}".format(hashInteractions))
290 + # print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect))
291 + saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
292 + saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
293 + filenameBefore = filename
294 + hashEntities = {}
295 + hashInteractions = {}
296 + hashInteractionsEffect = {}
297 + idEntities = 1
298 + idInteractions = 1
299 + idInteractionsEffect = 1
300 + outputPath = os.path.join(options.outputPath, "complete-ris")
301 + idEntities = loadFileEntities(filename, outputPath, hashEntities)
302 + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
303 + outputPath = os.path.join(options.outputPath, "incomplete-ris")
304 + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
305 + result = regexNumFile.search(filenameBefore)
306 + if result:
307 + inumFile = int(result.group(1))
308 + numFile = str(inumFile)
309 + print("Numfile: {}".format(numFile))
310 + else:
311 + print("WARNING: numfile not found in filename")
312 + hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []}
313 + if numFile in hashDicc:
314 + hashTemp = hashDicc[numFile]
315 + #print("hashDicc[numFile]: {}".format(hashTemp))
316 +                    for k, v in hashTemp.items():
317 +                        # All four entity types are pre-initialized in hashTerms,
318 +                        # so a single lookup replaces the per-type if/elif chain
319 +                        if v in hashTerms:
320 +                            hashTerms[v].append(k)
321 +                        else:
322 +                            print("WARNING: entity not found in dictionaries")
341 + else:
342 + print("WARNING: numfile not found in dictionaries")
343 + #if filename.find('061-02') > -1:
344 + # print("filename: {}".format(filename))
345 + # print("Load hashEntities: {}".format(hashEntities))
346 + # print("Load hashInteractions: {}".format(hashInteractions))
347 + # print("Load hashInteractionsEffect: {}".format(hashInteractionsEffect))
348 +
349 + wordA = listLine[2]
350 + wordB = listLine[3]
351 + wordC = listLine[4]
352 + startA = listLine[5]
353 + endA = listLine[6]
354 + startB = listLine[7]
355 + endB = listLine[8]
356 + startC = listLine[9]
357 + endC = listLine[10]
358 + sent = listLine[12]
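+            # NOTE: the lemma variables below simply reuse the word columns (2-4)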
359 + lemmaA = listLine[2]
360 + lemmaB = listLine[3]
361 + lemmaC = listLine[4]
362 +
363 + # Return [tok, offsetStart, offsetEnd ]
364 + # print("hashTerms[TF]: {}".format(hashTerms["TF"]))
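+            # The four branches below try both argument orders of the OpenIE
+            # triplet: (1) A = TF regulator, C = regulated GENE; (2) A = TF,
+            # C = TU; (3) C = TF regulator, A = regulated GENE; (4) C = TF,
+            # A = TU. In every case the middle argument B is checked against
+            # EFFECT terms; when an effect is found, a typed interaction is
+            # also recorded in hashInteractionsEffect (saved under complete-ris).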
365 + listRegulator = getPosWord(startA, endA, sent, hashTerms["TF"])
366 + if listRegulator is not None:
367 + #if filenameBefore.find('061-02') > -1:
368 + # print(">> Regulator found: {}".format(listRegulator[0]))
369 + listRegulated = getPosWord(startC, endC, sent, hashTerms["GENE"])
370 + if listRegulated is not None:
371 + #if filenameBefore.find('061-02') > -1:
372 + # print(">> Regulated GENE found: {}".format(listRegulated[0]))
373 + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
374 + idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities)
375 + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
376 + #print("Review EFFECT")
377 + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
378 + if listEffect is not None:
379 + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
380 + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
381 + else:
382 + listRegulated = getPosWord(startC, endC, sent, hashTerms["TU"])
383 + if listRegulated is not None:
384 + #if filenameBefore.find('061-02') > -1:
385 + # print(">> Regulated TU found: {}".format(listRegulated[0]))
386 + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
387 + idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities)
388 + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
389 + #print("Review EFFECT")
390 + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
391 + if listEffect is not None:
392 + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
393 + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
394 + else:
395 + listRegulator = getPosWord(startC, endC, sent, hashTerms["TF"])
396 + if listRegulator is not None:
397 + #if filenameBefore.find('061-02') > -1:
398 + # print(">> Regulator found: {}".format(listRegulator[0]))
399 + listRegulated = getPosWord(startA, endA, sent, hashTerms["GENE"])
400 + if listRegulated is not None:
401 + #if filenameBefore.find('061-02') > -1:
402 + # print(">> Regulated GENE found: {}".format(listRegulated[0]))
403 + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
404 + idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities)
405 + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
406 + #print("Review EFFECT")
407 + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
408 + if listEffect is not None:
409 + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
410 + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
411 + else:
412 + listRegulated = getPosWord(startA, endA, sent, hashTerms["TU"])
413 + if listRegulated is not None:
414 + #if filenameBefore.find('061-02') > -1:
415 + # print(">> Regulated TU found: {}".format(listRegulated[0]))
416 + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
417 + idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities)
418 + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
419 + #print("Review EFFECT")
420 + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
421 + if listEffect is not None:
422 + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
423 + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
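+        # End of input: flush the entities/interactions of the last file seen.
+        # As with the in-loop save, nothing is written when only a single
+        # sentence file was processed (len(files) == 1).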
424 + if len(files) > 1:
425 + if len(hashEntities) > 0:
426 + #print("filenameBefore: {}".format(filenameBefore))
427 + #print("Save hashEntities: {}".format(hashEntities))
428 + #print("Save hashInteractions: {}".format(hashInteractions))
429 + #print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect))
430 + saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
431 + saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
1 +#!/bin/bash
2 +
3 +###### Automatic extraction of TRN from several files ######
4 +
5 +BRIES_HOME=/myhome/bries
6 +PMIDS_HOME=/myhome/preprocessed-files
7 +# We don't use REFERENCE_HOME because we don't evaluate. Path /reference-data-set doesn't exist. File no-reference.txt doesn't exist.
8 +REFERENCE_HOME=/myhome/reference-data-set
9 +
10 +for f in $PMIDS_HOME/original/text/*.*
11 +do
12 + FILE_NAME=$(basename "$f")
13 + FILE_NAME="${FILE_NAME%.*}"
14 + echo "File: $FILE_NAME"
15 + ./automatic-extraction-ris-gcs.sh $PMIDS_HOME/features/$FILE_NAME.tra.word.txt $PMIDS_HOME/transformed/$FILE_NAME.tra.txt $BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt $BRIES_HOME/predicted-ris-gcs Y Y FILT1 $REFERENCE_HOME no-reference.txt $BRIES_HOME/evaluation-reports no-evaluation.txt diccionario-SYNONYMS.json $PMIDS_HOME/original/tsv 1>uno-$FILE_NAME.txt 2>dos-$FILE_NAME.txt
16 +done
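+# Expected layout (inferred from the paths used above, not verified):
+#   $PMIDS_HOME/original/text/  one raw text file per article
+#   $PMIDS_HOME/features/       <name>.tra.word.txt token files
+#   $PMIDS_HOME/transformed/    <name>.tra.txt word|lemma|pos files
+#   $PMIDS_HOME/original/tsv    <name>.pre.tsv sentence/section tables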
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +import json
8 +import re
9 +import pandas as pd
10 +
11 +__author__ = 'CMendezC'
12 +
13 +
14 +# Objective: Filter sentences with specific entities.
15 +# Also extract attributive sentences: effect-TF
16 +# And autoregulation: regulates its own gene
17 +# CFMC 2022-03-08: We added an update of the tsv file (.pre.tsv) with idsentence,
18 +# sentence, and section to mark which sentences were filtered out.
19 +
20 +# Parameters:
21 +# 1) --inputFileWord Path and filename to read feature word file.
22 +# 2) --inputFileTrans Path and filename to read transformed file.
23 +# 3) --outputPath Path to place output file.
24 +# 4) --outputFile Output file.
25 +# 5) --filter FILT1: (GENE OR TU) AND TF
26 +# FILT2: (GENE OR TU) AND EFFECT AND TF
27 +# 6) --attrPath Path for attributive cases: ArgP-regulated genes
28 +# 7) --dicPath Path for dictionary
29 +# 8) --dicFile Dictionary file normalized_Effects.json
30 +# 9) --autoPath Path for autoregulation cases: regulates its own gene
31 +#    /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
+# 10) --tsvPath Path to tsv file with section, id sentence, sentence (extracted from jsonpdf)
32 +
33 +# Output:
34 +# 1) Filtered sentences.
35 +# 2) Attributive sentences
36 +# 3) Autoregulation sentences
37 +
38 +
39 +###########################################################
40 +# MAIN PROGRAM #
41 +###########################################################
42 +
43 +def getEntities(tline, filt):
44 + # FILT1: (GENE OR TU) AND TF
45 + # FILT2: (GENE OR TU) AND EFFECT AND TF
46 + entities = {}
47 + tline = tline.rstrip('\n\r ')
48 + for token in tline.split(" "):
49 + # print("Token: {}".format(token))
50 + listElem = token.split("|")
51 + w = listElem[0]
52 + l = listElem[1]
53 + t = listElem[2]
54 + if filt == "FILT1" or filt == "FILT2":
55 + if t in ["GENE", "TU", "TF", "EFFECT"]:
56 + if w not in entities:
57 + entities[w] = t
58 + # if filt == "FILT2":
59 + # if t in ["GENE", "TU", "TF", "EFFECT"]:
60 + # if w not in entities:
61 + # entities[w] = t
62 + return entities
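+# Example (hypothetical tokens): for the transformed line
+#   "MetR|MetR|TF activates|activate|EFFECT metH|metH|GENE"
+# getEntities(tline, "FILT1") returns
+#   {"MetR": "TF", "activates": "EFFECT", "metH": "GENE"}.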
63 +
64 +if __name__ == "__main__":
65 + # Parameter definition
66 + parser = OptionParser()
67 +
68 + parser.add_option("--inputFileWord", dest="inputFileWord",
69 + help="Path and filename to read feature word file", metavar="PATH")
70 + parser.add_option("--inputFileTrans", dest="inputFileTrans",
71 + help="Path and filename to read transformed file", metavar="PATH")
72 + parser.add_option("--outputPath", dest="outputPath",
73 + help="Output path", metavar="PATH")
74 + parser.add_option("--outputFile", dest="outputFile",
75 + help="Output file", metavar="FILE")
76 + parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None,
77 + help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT")
78 + parser.add_option("--attrPath", dest="attrPath",
79 + help="Output path attributive sentences", metavar="PATH")
80 + parser.add_option("--dicPath", dest="dicPath",
81 + help="Output path dictionary", metavar="PATH")
82 + parser.add_option("--dicFile", dest="dicFile",
83 + help="Output file dictionary normalized_Effects.json", metavar="FILE")
84 + parser.add_option("--autoPath", dest="autoPath",
85 + help="Output path autoregulation sentences", metavar="PATH")
86 + parser.add_option("--tsvPath", dest="tsvPath",
87 + help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
88 +
89 + (options, args) = parser.parse_args()
90 + if len(args) > 0:
91 +        parser.error("No positional arguments are expected.")
92 + sys.exit(1)
93 +
94 + # Printing parameter values
95 + print('-------------------------------- PARAMETERS --------------------------------')
96 + print("Path and filename to read feature word file: " + str(options.inputFileWord))
97 + print("Path and filename to read transformed file: " + str(options.inputFileTrans))
98 + print("Output path: " + str(options.outputPath))
99 + print("Output file: " + str(options.outputFile))
100 + print("Filter: " + str(options.filter))
101 + print("Output path attributive sentences: " + str(options.attrPath))
102 + print("Output path autoregulation sentences: " + str(options.autoPath))
103 + print("Output path dictionary: " + str(options.dicPath))
104 + print("Output file dictionary normalized_Effects.json: " + str(options.dicFile))
105 + print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))
106 +
107 + # Loading normalized effects
108 + # print('Loading normalized effects...')
109 + hashNormalizedEffects = {}
110 + with open(os.path.join(options.dicPath, options.dicFile)) as diccFile:
111 + hashNormalizedEffects = json.load(diccFile)
112 + listEffects = []
113 + for eff in hashNormalizedEffects.keys():
114 + if eff.endswith('d'):
115 + listEffects.append(eff)
116 + listEffects.append("dependent")
117 + effects = "|".join(listEffects)
118 + print("Effects: {}".format(effects))
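+    # 'effects' is now a regex alternation such as "activated|repressed|...|dependent"
+    # (hypothetical contents; only effects ending in 'd' are kept, plus "dependent"),
+    # interpolated into the attributive-sentence pattern below.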
119 +
120 + t0 = time()
121 + count = 0
122 + hashEntities = {}
123 + hashAttrSent = {}
124 + hashAutoSent = {}
125 +    # Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
126 +    # We decided to extract all sentences containing effect-TF, because we observed some patterns
127 +    # in which "gene" does not appear; to recover those examples we employ a more general rule
128 +    # to separate attributive sentences.
129 +    reAttrSent = re.compile('(' + effects + r')\|[^|]+\|TF')
130 + # We decided to extract all sentences containing autoregulation
131 + # The FimZ transcription factor activates this promoter directly ,
132 + # and it also positively regulates the transcription of its own gene
133 + # FimZ is known to regulate the expression of its own gene positively
134 + # FimZ also positively regulates its own transcription
135 + # ArgP protein represses its own synthesis
136 + # ArgP both represses its own transcription
137 + # ArgP protein represses its own synthesis
138 + # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
139 + # of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
140 + reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ')
141 + aFilter = options.filter
142 + print(" Processing file...{}".format(options.inputFileTrans))
143 + with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile:
144 + with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile:
145 + # CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf)
146 + file = options.inputFileTrans[options.inputFileTrans.rfind("/")+1:]
147 + file_tsv = file.replace(".tra.txt", ".pre.tsv")
148 + tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
149 + print("tsv_file.shape: {}".format(tsv_file.shape))
150 + tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
151 + print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
152 + # print(tsv_file_filtered.head(10))
153 + tsv_file_new = tsv_file_filtered.reset_index(drop=True)
154 + # print(tsv_file_new.shape)
155 + # print(tsv_file_new.head(10))
156 + i = 0
157 + for tLine, wLine in zip(tFile, wFile):
158 + # FILT1: (GENE OR TU) AND TF
159 + # FILT2: (GENE OR TU) AND EFFECT AND TF
160 + if aFilter is not None:
161 + reGENETU = re.compile(r'(\|GENE|\|TU)')
162 + reEFFECT = re.compile(r'\|EFFECT')
163 + reTF = re.compile(r'\|TF')
164 + tCount = str(count)
165 + if aFilter == "FILT1":
166 + if not (reGENETU.search(tLine) and reTF.search(tLine)):
167 + #print("NOT FOUND")
168 + # CFMC 2022-03-08
169 + tsv_file_new.at[i, 'status'] = 0
170 + i += 1
171 + continue
172 + else:
173 + #print("FOUND")
174 + oFile.write(wLine)
175 + if tCount not in hashEntities:
176 + hashEntities[tCount] = getEntities(tLine, aFilter)
177 + if reAttrSent.search(tLine):
178 + #print("ATTRIBUTIVE SENTENCE: {}".format(tLine))
179 + if tCount not in hashAttrSent:
180 + hashAttrSent[tCount] = tLine
181 + # Autoregulation sentences
182 + if reAutoSent.search(tLine):
183 +                                # print("AUTOREGULATION SENTENCE: {}".format(tLine))
184 + if tCount not in hashAutoSent:
185 + hashAutoSent[tCount] = tLine
186 + #print(tLine)
187 +                elif aFilter == "FILT2":
188 +                    if not (reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine)):
189 +                        # CFMC 2022-03-08 (fix: these two updates were unreachable after 'continue')
190 +                        tsv_file_new.at[i, 'status'] = 0
191 +                        i += 1
192 +                        continue
193 + else:
194 + oFile.write(wLine)
195 + if tCount not in hashEntities:
196 + hashEntities[tCount] = getEntities(tLine, aFilter)
197 + if reAttrSent.search(tLine):
198 + if tCount not in hashAttrSent:
199 + hashAttrSent[tCount] = tLine
200 + if reAutoSent.search(tLine):
201 + if tCount not in hashAutoSent:
202 + hashAutoSent[tCount] = tLine
203 + count += 1
204 + i += 1
205 +
206 + merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left')
207 + # print(merged.shape)
208 + # print(merged.head(10))
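+    # Carry the filter decisions back to the full table: rows present in
+    # tsv_file_new take their (possibly updated) status_y, while rows absent
+    # from the merge (status_y is NaN) keep their original status.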
209 + tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int)
210 + tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
211 + print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
212 + # print(tsv_file_filtered.head(10))
213 + tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t')
214 +
215 + with open(os.path.join(options.outputPath, options.outputFile.replace(".txt", ".ents.json")), "w", encoding="utf-8",
216 + errors="replace") as eFile:
217 + json.dump(hashEntities, eFile)
218 +
219 + for f, sent in hashAttrSent.items():
220 + listPath = options.inputFileTrans.split('/')
221 + fileName = listPath[-1]
222 + fileName = fileName.replace('.tra.', '.att.' + f + '.')
223 + print("Save file {}".format(fileName))
224 + with open(os.path.join(options.attrPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
225 + aFile.write(sent)
226 +
227 + for f, sent in hashAutoSent.items():
228 + listPath = options.inputFileTrans.split('/')
229 + fileName = listPath[-1]
230 + fileName = fileName.replace('.tra.', '.auto.' + f + '.')
231 + print("Save file {}".format(fileName))
232 + with open(os.path.join(options.autoPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
233 + aFile.write(sent)
234 +
235 +    print("Sentences filtered in: %fs" % (time() - t0))
1 +#!/bin/bash
2 +
3 +#Validate arguments
4 +if [[ "$#" -ne 3 ]]; then
5 +    echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>'
6 +    exit 1
7 +fi
8 +
9 +SCRIPT_PATH=$(cd `dirname $0` && pwd)
10 +# Define here the keyword of the group of sentences to simplify.
11 +INPUT_PATH=$1
12 +OUTPUT_INDEX_FILE_PATH=$2
13 +ISIMP_PATH=$3
14 +cd $SCRIPT_PATH
15 +
16 +
17 +
18 +
19 +# RUN ISIMP ANALYSIS
20 +echo "Analysing in iSimp..."
21 +# Clear any previous iSimp output
22 +if [ -n "$(ls -A ./iSimp_sentences/)" ]; then
23 +    rm ./iSimp_sentences/*
24 +fi
26 +#cd $INPUT_PATH
27 +for j in $INPUT_PATH/*
28 +do
29 + echo $j
30 +    #echo "++++simp input: $j simp output: $SCRIPT_PATH/iSimp_sentences/$(basename $j)"
31 + $ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
32 +done
33 +cd $SCRIPT_PATH
34 +
35 +# CREATE INDEX OF SIMPLIFIED FILES
36 +#touch $SCRIPT_PATH/index.txt
37 +>| $OUTPUT_INDEX_FILE_PATH
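+# The index file accumulates one output (.alg) file path per line; each path
+# is appended by simplifier.py for every simplified sentence it writes.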
38 +
39 +# FEED TO THE SIMPLIFICATION ALGORITHM
40 +echo "Analysing in Algorithm..."
41 +# Clear any previous algorithm output
42 +if [ -n "$(ls -A ./algorithm_sentences/)" ]; then
43 +    rm ./algorithm_sentences/*
44 +fi
46 +#cd ./iSimp_sentences
47 +for k in $SCRIPT_PATH/iSimp_sentences/*
48 +do
49 + echo $k
50 +    #echo "input: $k output: $SCRIPT_PATH/algorithm_sentences/$(basename $k) index: $OUTPUT_INDEX_FILE_PATH"
51 + python2 $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
52 +done
53 +cd $SCRIPT_PATH
1 +import copy
2 +import sys
4 +
5 +class Simp(object):
6 + def __init__(self):
7 + self.TYPE=""
8 + self.TYPEx=0
9 + self.TYPEy=0
10 + self.TEXT=""
11 + self.COMP=[]
12 + def agregarTYPE(self,Type):
13 + self.TYPE=Type
14 + def agregarTEXT(self,text):
15 + self.TEXT=text
16 + def agregarCOMP(self,comp):
17 + self.COMP.append(comp)
18 +
19 +class Frase(object):
20 + def __init__(self):
21 + self.TYPE=""
22 + self.TEXT=""
23 + self.POS=""
24 + self.TREE=""
25 + self.SIMP=[]
26 + def agregarTYPE(self,Type):
27 + self.TYPE=Type
28 + def agregarTEXT(self,text):
29 + self.TEXT=text
30 + def agregarPOS(self,Pos):
31 + self.POS=Pos
32 + def agregarTREE(self,Tree):
33 + self.TREE=Tree
34 + def agregarSIMP(self):
35 + self.SIMP.append(Simp())
36 +
37 +class Sentence(object):
38 + def __init__(self):
39 + self.FLAG=True
40 + self.TEXT=""
41 + self.TREE=""
42 + self.SIMP=[]
43 + def agregarTEXT(self,text):
44 + self.TEXT=text
45 + def agregarTREE(self,Tree):
46 + self.TREE=Tree
47 + def agregarSIMP(self):
48 + self.SIMP.append(Simp())
49 +
50 +
51 +MEMORIAB = []
53 +
54 +
55 +#---- read data from the input file
56 +arch = sys.argv[1]
57 +f = open(arch)
58 +dato = f.read().splitlines()
59 +f.close()
60 +frase=Frase()
61 +for i in range(len(dato)):
62 + if 'TYPE: ' in dato[i][0:6]:
63 + frase.agregarTYPE(dato[i][6:])
64 + elif 'TEXT: ' in dato[i][0:6]:
65 + frase.agregarTEXT(dato[i][6:])
66 + elif 'POS : ' in dato[i][0:6]:
67 + frase.agregarPOS(dato[i][6:])
68 + elif 'TREE: ' in dato[i][0:6]:
69 + frase.agregarTREE(dato[i][6:])
70 + elif 'SIMP:' in dato[i]:
71 + frase.agregarSIMP()
72 + elif ' TYPE: ' in dato[i][0:8]:
73 + frase.SIMP[-1].agregarTYPE(dato[i][8:])
74 + elif ' TEXT: ' in dato[i][0:8]:
75 + frase.SIMP[-1].agregarTEXT(dato[i][8:])
76 + elif ' COMP: ' in dato[i]:
77 + frase.SIMP[-1].agregarCOMP(dato[i][8:])
78 +#------------
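+# Example iSimp input fragment as expected by the parser above
+# (hypothetical sentence and offsets):
+#   TYPE: sentence [0..30]
+#   TEXT: FimZ activates fimA and fimB .
+#   POS : FimZ_NNP activates_VBZ fimA_NN and_CC fimB_NN ._.
+#   TREE: (S ...)
+#   SIMP:
+#     TYPE: coordination [15..28]
+#     TEXT: fimA and fimB
+#     COMP: conjunct [15..19]
+#     COMP: conjunct [24..28]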
79 +
80 +
81 +#------- Main program
82 +# Algorithm v4
83 +
84 +
85 +if (frase.TYPE.find('sentence') != -1) and (frase.SIMP != []) and (frase.SIMP[0].TYPE != ''):
86 +    SIMPworkspace = []
87 +    # copy TREE and each SIMP into Sentence1
90 + Sentence1=Sentence()
91 + Sentence1.TREE=copy.deepcopy(frase.TREE)
92 + Sentence1.TEXT=copy.deepcopy(frase.TEXT)
93 + for i in range(len(frase.SIMP)):
94 + #Sentence1.SIMP.append(Simp())
95 + #Sentence1.SIMP[i]=copy.deepcopy(frase.SIMP[i])
96 + SIMPworkspace.append(Simp())
97 + SIMPworkspace[i]=copy.deepcopy(frase.SIMP[i])
98 +
99 +## SORT THE SIMPs
100 + for i in range(len(SIMPworkspace)):
101 + #print SIMPworkspace[i].TEXT
102 + #print SIMPworkspace[i].TYPE
103 + SIMPworkspace[i].TYPEx = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('[')+1:SIMPworkspace[i].TYPE.find('..')])
104 + SIMPworkspace[i].TYPEy = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('..')+2:SIMPworkspace[i].TYPE.find(']')])
105 + if 'parenthesis' in SIMPworkspace[i].TYPE:
106 + SIMPworkspace[i].TYPEy = SIMPworkspace[i].TYPEy + 2
107 + #print SIMPworkspace[i].TYPEx
108 + #print SIMPworkspace[i].TYPEy
109 +
110 +
111 + SIMPworkspace.sort(key=lambda x: x.TYPEy, reverse=True)
112 + SIMPworkspace.sort(key=lambda x: x.TYPEx)
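+    # Two stable sorts: first by end offset (TYPEy) descending, then by start
+    # offset (TYPEx) ascending, so SIMPs end up ordered by start offset, with
+    # enclosing (longer) constructs before the ones they contain.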
113 +
114 +
115 + # for i in range(len(SIMPworkspace)):
116 + # print "\nSIMP " + str(i) + " :"
117 + # print SIMPworkspace[i].TYPE
118 + # print SIMPworkspace[i].TYPEx
119 + # print SIMPworkspace[i].TYPEy
120 + # print "\n"
121 +
122 + for i in range(len(SIMPworkspace)):
123 + Sentence1.SIMP.append(Simp())
124 + Sentence1.SIMP[i]=copy.deepcopy(SIMPworkspace[i])
125 +
126 +
127 +    # Add the original sentence Sentence1 to memory as the first object to be analyzed
128 + MEMORIAB.append(Sentence())
129 + MEMORIAB[0]=copy.deepcopy(Sentence1)
130 +
131 +
132 +
133 +    # One entry into loop A for each distinct SIMP in Sentence1
134 + numSimp=len(Sentence1.SIMP)
135 + s = 0
136 +    # loop A
137 +    while s < numSimp :
138 +        #print "\nEntering loop A, pass " + str(s)
139 +        #print "Analyzing all SIMPs of type: " + MEMORIAB[0].SIMP[s].TYPE
140 +        # Enter loop B as many times as there are elements in MEMORIAB
141 + numMEM = len(MEMORIAB)
142 + t = 0
143 +        # loop B
144 +        while t < numMEM :
145 +            #print "Entering loop B, pass " + str(t)
146 +            # Enter if the sentence has not been analyzed before (FLAG==True) and the SIMP text is present in the sentence.
147 +            #print "CONDITIONS:"
148 + #print "SIMP " + MEMORIAB[0].SIMP[s].TEXT
149 + #print "SIMP " + MEMORIAB[0].SIMP[s].TYPE
150 + #print "MEMB " + str(MEMORIAB[t].FLAG)
151 + #print "MEMB " + MEMORIAB[t].TEXT
152 + if ( MEMORIAB[0].SIMP[s].TEXT in MEMORIAB[t].TEXT ) and ( MEMORIAB[t].FLAG == True ):
153 + MEMORIAB[t].FLAG = False
154 + #print "False to: " + MEMORIAB[t].TEXT
155 +                #print "Entered the conditional"
156 +                # Simplification rules
157 +                if ( 'coordination' in MEMORIAB[t].SIMP[s].TYPE ) and ( not ('sentence coordination' in MEMORIAB[t].SIMP[s].TYPE ) ) :
158 +                    #print "Applying coordination rule"
159 + TEMPORALES = []
160 + c = len(MEMORIAB[t].SIMP[s].COMP)
161 +                    #print "There are " + str(c) + " COMPs in this SIMP"
162 + tt = 0
163 + while c > 0 :
164 + c = c - 1
165 + if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) :
166 + TEMPORALES.append(Sentence())
167 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
168 + replaced = MEMORIAB[0].SIMP[s].TEXT
169 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
170 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
171 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
172 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
173 + tt = tt + 1
174 +                    # copy simplifications from temporary memory to MEMORIAB
175 + indtempamem = 0
176 + while indtempamem < len(TEMPORALES) :
177 + MEMORIAB.append(Sentence())
178 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
179 + MEMORIAB[-1].FLAG = True
180 + #print MEMORIAB[-1].TEXT
181 + indtempamem = indtempamem + 1
182 + elif 'parenthesis' in MEMORIAB[t].SIMP[s].TYPE:
183 +                    #print "Applying parenthesis rule"
184 + TEMPORALES = []
185 + c = len(MEMORIAB[t].SIMP[s].COMP)
186 +                    #print "There are " + str(c) + " COMPs in this SIMP"
187 + tt = 0
188 + while c > 0 :
189 +                        #print "entered the parenthesis while"
190 + c = c - 1
191 + TEMPORALES.append(Sentence())
192 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
193 + replaced = MEMORIAB[0].SIMP[s].TEXT + ' )'
194 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
195 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
196 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
197 + #print "replaced: " + replaced
198 + #print "replacer: " + replacer
199 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
200 + tt = tt + 1
201 +                    # copy simplifications from temporary memory to MEMORIAB
202 + indtempamem = 0
203 + while indtempamem < len(TEMPORALES) :
204 + MEMORIAB.append(Sentence())
205 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
206 + MEMORIAB[-1].FLAG = True
207 + #print MEMORIAB[-1].TEXT
208 + indtempamem = indtempamem + 1
209 + elif 'apposition' in MEMORIAB[t].SIMP[s].TYPE:
210 +                    #print "Applying apposition rule"
211 + TEMPORALES = []
212 + c = len(MEMORIAB[t].SIMP[s].COMP)
213 +                    #print "There are " + str(c) + " COMPs in this SIMP"
214 + tt = 0
215 + while c > 0 :
216 +                        #print "entered the apposition while"
217 + c = c - 1
218 + TEMPORALES.append(Sentence())
219 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
220 + replaced = MEMORIAB[0].SIMP[s].TEXT
221 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
222 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
223 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
224 + #print "replaced: " + replaced
225 + #print "replacer: " + replacer
226 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
227 + tt = tt + 1
228 +                    # copy simplifications from temporary memory to MEMORIAB
229 + indtempamem = 0
230 + while indtempamem < len(TEMPORALES) :
231 + MEMORIAB.append(Sentence())
232 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
233 + MEMORIAB[-1].FLAG = True
234 +                        #print "Copied to memory: " + MEMORIAB[-1].TEXT
235 + indtempamem = indtempamem + 1
236 + elif 'member-collection' in MEMORIAB[t].SIMP[s].TYPE:
237 +                    #print "Applying member-collection rule"
238 + TEMPORALES = []
239 + c = len(MEMORIAB[t].SIMP[s].COMP)
240 +                    #print "There are " + str(c) + " COMPs in this SIMP"
241 + tt = 0
242 + while c > 0 :
243 +                        #print "entered the member-collection while"
244 + c = c - 1
245 + TEMPORALES.append(Sentence())
246 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
247 + replaced = MEMORIAB[0].SIMP[s].TEXT
248 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
249 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
250 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
251 + #print "replaced: " + replaced
252 + #print "replacer: " + replacer
253 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
254 + tt = tt + 1
255 +                    # copy simplifications from temporary memory to MEMORIAB
256 + indtempamem = 0
257 + while indtempamem < len(TEMPORALES) :
258 + MEMORIAB.append(Sentence())
259 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
260 + MEMORIAB[-1].FLAG = True
261 +                        #print "Copied to memory: " + MEMORIAB[-1].TEXT
262 + indtempamem = indtempamem + 1
263 + elif 'sentence coordination' in MEMORIAB[t].SIMP[s].TYPE:
264 +                    #print "Applying sentence-coordination rule"
265 + TEMPORALES = []
266 + c = len(MEMORIAB[t].SIMP[s].COMP)
267 +                    #print "There are " + str(c) + " COMPs in this SIMP"
268 + tt = 0
269 + while c > 0 :
270 + c = c - 1
271 + if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) :
272 + TEMPORALES.append(Sentence())
273 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
274 +                            # replace the whole content of the TEMPORAL TEXT/TREE with the content of the coordinated sentence
275 + #replaced = MEMORIAB[0].SIMP[s].TEXT
276 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
277 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
278 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
279 + #print replacer
280 + TEMPORALES[tt].TEXT = replacer
281 +                            ## if the sentence does not end in a period or !
282 + tt = tt + 1
283 +                    # copy simplifications from temporary memory to MEMORIAB
284 + indtempamem = 0
285 + while indtempamem < len(TEMPORALES) :
286 + MEMORIAB.append(Sentence())
287 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
288 + MEMORIAB[-1].FLAG = True
289 + #print MEMORIAB[-1].TEXT
290 + indtempamem = indtempamem + 1
291 + elif 'full relative clause' in MEMORIAB[t].SIMP[s].TYPE:
292 +                    #print "Applying full-relative-clause rule"
293 + TEMPORALES = []
294 + c = 0
295 + tt = 0
296 + while c < 2 :
297 + if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] :
298 + TEMPORALES.append(Sentence())
299 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
300 + if MEMORIAB[0].TEXT[MEMORIAB[0].TEXT.index(TEMPORALES[tt].SIMP[s].TEXT)+len(TEMPORALES[tt].SIMP[s].TEXT)-1] == ',':
301 +                                replaced = MEMORIAB[0].SIMP[s].TEXT + ',' # possible error; if so, try ' ,'
302 + else:
303 + replaced = MEMORIAB[0].SIMP[s].TEXT
304 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
305 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
306 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
307 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
308 + indice3 = indice1
309 + indice4 = indice2
310 + if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] :
311 + TEMPORALES.append(Sentence())
312 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
313 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
314 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
315 + TEMPORALES[tt].TEXT = copy.deepcopy(MEMORIAB[0].TEXT[indice3:indice4]+' '+MEMORIAB[0].TEXT[indice1:indice2] ) ##
316 + cad3 = MEMORIAB[0].TEXT[indice1:indice2]
317 + cad4 = cad3.split()
318 + if (cad4[0]+'_WDT') in frase.POS:
319 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(' '+cad4[0],'')
320 + tt = tt + 1
321 + c = c + 1
322 +                    # copy simplifications from temporary memory to MEMORIAB
323 + indtempamem = 0
324 + while indtempamem < len(TEMPORALES) :
325 + MEMORIAB.append(Sentence())
326 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
327 + MEMORIAB[-1].FLAG = True
328 + #print MEMORIAB[-1].TEXT
329 + indtempamem = indtempamem + 1
330 + elif 'reduced relative clause' in MEMORIAB[t].SIMP[s].TYPE:
331 +                    #print "Applying reduced-relative-clause rule"
332 + TEMPORALES = []
333 + c = 0
334 + tt = 0
335 + while c < 2 :
336 + if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] :
337 + TEMPORALES.append(Sentence())
338 + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
339 + replaced = MEMORIAB[0].SIMP[s].TEXT
340 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
341 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
342 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
343 + #subj = MEMORIAB[0].TEXT[indice1:(indice2+1)]
344 + subj = MEMORIAB[0].TEXT[indice1:(indice2)]
345 + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
346 + if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] :
347 + TEMPORALES.append(Sentence())
348 +                            TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) # the referent must come before the clause to keep the correct order
349 + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
350 + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
351 + replacer = MEMORIAB[0].TEXT[indice1:indice2]
352 +                            TEMPORALES[tt].TEXT = subj + " _ " + replacer # to insert a copula here you would need number and tense information
353 + tt = tt + 1
354 + c = c + 1
355 +                    # copy simplifications from temporary memory to MEMORIAB
356 + indtempamem = 0
357 + while indtempamem < len(TEMPORALES) :
358 + MEMORIAB.append(Sentence())
359 + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
360 + MEMORIAB[-1].FLAG = True
361 + #print MEMORIAB[-1].TEXT
362 + indtempamem = indtempamem + 1
363 + elif 'hypernymy' in MEMORIAB[t].SIMP[s].TYPE:
364 + print "**hypernymy detected**"
365 + #print "True to: " + MEMORIAB[t].TEXT
366 + MEMORIAB[t].FLAG = True
367 + else:
368 + print "Error: Unknown simplification construct detected."
369 + #print "True to: " + MEMORIAB[t].TEXT
370 + MEMORIAB[t].FLAG = True
371 + t = t + 1
372 + s = s + 1
373 +
374 +    # CONDITIONS FOR WRITING SIMPLIFICATIONS TO THE TEXT FILE
375 +    #print "Sentence simplified. New sentences generated:"
376 + for i in range(len(MEMORIAB)):
377 +        # the FLAG is reused to mark the final sentences
378 + MEMORIAB[i].FLAG = True
379 + for j in range(len(MEMORIAB[0].SIMP)):
380 +            # NOTE: if a new simplifiable construct is added, also add it to this list:
381 + if ( ('member-collection' in MEMORIAB[0].SIMP[j].TYPE) or ('apposition' in MEMORIAB[0].SIMP[j].TYPE) or ('coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('parenthesis' in MEMORIAB[0].SIMP[j].TYPE) or ('sentence coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('full relative clause' in MEMORIAB[0].SIMP[j].TYPE) or ('reduced relative clause' in MEMORIAB[0].SIMP[j].TYPE) ) and (MEMORIAB[0].SIMP[j].TEXT in MEMORIAB[i].TEXT) :
382 + MEMORIAB[i].FLAG = False
383 +
384 +    ## fix the numbering of output files, e.g. 011
385 + arcsalnum = 0
386 + for i in range(len(MEMORIAB)):
387 + if MEMORIAB[i].FLAG == True:
388 + arcsalnum = arcsalnum + 1
389 + length = len(str(arcsalnum))
390 + #print('{:03d}'.format(arcsalnum)) # python >= 2.7 + python3
391 +# >>> n = '4'
392 +#>>> print n.zfill(3)
393 + arcsalnum = 0
394 + for i in range(len(MEMORIAB)):
395 + if MEMORIAB[i].FLAG == True:
396 + arcsalnum = arcsalnum + 1
397 +            print MEMORIAB[i].TEXT  # output
398 + archSalNombre = sys.argv[2]
399 + archSalNombre=archSalNombre[:-4] + "-" + (str(arcsalnum)).zfill(length) + '.alg'
400 + archivoSalida=open(archSalNombre,"w")
401 + archivoSalida.write(MEMORIAB[i].TEXT+"\n")##
402 + archivoSalida.close()
403 + #WRITE OUTPUT FILE PATH TO INDEX (Arg 3)
404 + index_name = sys.argv[3]
405 + index = open(index_name, "a+")
406 + archSalNombreforIndex=archSalNombre + "\n"
407 + index.write(archSalNombreforIndex)
408 + index.close()
409 +else:
410 +    print frase.TEXT  #---- output when there were no simplifiable constructs
411 + archSalNombre = sys.argv[2]
412 + archSalNombre = archSalNombre[:-4] + ".alg"
413 + archivoSalida = open(archSalNombre,"a+")
414 + archivoSalida.write(frase.TEXT+"\n")##
415 + archivoSalida.close()
416 + #WRITE OUTPUT FILE PATH TO INDEX (Arg 3)
417 + index_name = sys.argv[3]
418 + index = open(index_name, "a+")
419 + archSalNombreforIndex=archSalNombre + "\n"
420 + index.write(archSalNombreforIndex)
421 + index.close()
422 +
423 +
424 +# END