Showing 31 changed files with 4658 additions and 0 deletions
README.md
0 → 100644
# Bacterial regulatory interaction extraction system

## Prerequisites
1. The input file must be tokenized and sentence-split, as in the example below.
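
A tokenized, sentence-split input holds one sentence per line with space-separated tokens (the line below is illustrative only; the exact tokenization depends on your preprocessing):

```
The FNR protein activates the expression of the dmsABC operon .
```

The transformed input contains the same sentences in word|lemma|pos format (see the parameter comments in automatic-extraction-ris-gcs.sh).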

## Run
### Several files
Set the filenames and paths in run-several-files.sh. A single file can also be processed directly with the main script, as sketched below.
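
A minimal direct invocation of the main script (all paths here are illustrative placeholders; the 13 positional arguments follow the script's usage message):

```bash
# 1 word file, 2 transformed file, 3 OpenIE work file, 4 output path,
# 5 simplify (Y/N), 6 separate deverbal (Y/N), 7 entity filter,
# 8-9 reference path and file, 10-11 evaluation path and file,
# 12 TF-synonyms dictionary, 13 tsv file with sections and sentences
./automatic-extraction-ris-gcs.sh \
  features/article.tra.word.txt \
  transformed/article.tra.txt \
  ri-openie-extraction/ris.txt \
  predicted-ris-gcs \
  Y Y FILT1 \
  reference unused.txt \
  evaluation-reports unused.txt \
  diccionario-STM-LT2-v7.0.SYNONYMS.json \
  sentences.tsv
```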

## Acknowledgments
This work was supported by UNAM-PAPIIT IA203420.
attributive-sentences/.gitignore
0 → 100644
automatic-extraction-ris-gcs.sh
0 → 100755
#!/bin/bash
# Main script for automatic extraction of regulatory interactions

# Parameters
# 1: Path and name of the file with preprocessed sentences in token (word) format
# 2: Path and name of the file with preprocessed sentences in transformed format (word|lemma|pos)
# 3: Path and name of the file for OpenIE processing
# 4: Output path for the a1 and a2 files with RIs and GCs
# 5: Simplify Y/N?
# 6: Separate verbal and deverbal Y/N?
# 7: Filter for sentences containing entities. FILT1 = (GENE OR TU) AND TF
# 8: Path with the reference a1 and a2 files (true RIs and GCs)
# 9: Reference file (true RIs and GCs)
# 10: Path to save the evaluation file
# 11: File to save the results of the evaluation against the reference
# 12: File with TF synonyms

# RUN EXTRACTION FOR L&C STM
# ./automatic-extraction-ris-gcs.sh
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# Y Y FILT1
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference
# unused.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports
# unused.txt
# diccionario-STM-LT2-v7.0.SYNONYMS.json
# 1>uno-STM-LC.txt
# 2>dos-STM-LC.txt
# ./automatic-extraction-ris-gcs.sh /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs Y Y FILT1 /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference unused.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports unused.txt diccionario-STM-LT2-v7.0.SYNONYMS.json 1>uno-STM-LC.txt 2>dos-STM-LC.txt

# Some help
# Filename without path: filename=$(basename "$fullfile")
# Filename extension: extension="${filename##*.}"
# Filename without extension: filename="${filename%.*}"
# For the "argument list too long" error with many files: find . -print0 | xargs -0 grep AcrR

PATH_TO_CORENLP=/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09
DICC_PATH=/home/cmendezc/terminologicalResources
ISIMP_PATH=/home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/sentence-simplification/isimp_v2

SCRIPT_PATH=$(cd `dirname $0` && pwd)
INPUT_PATH=$1
INPUT_PATH_TRANS=$2
OUTPUT_FILE=$3
OUTPUT_PATH=$4
INPUT_NAME_EXT=$(basename "$INPUT_PATH")
INPUT_NAME="${INPUT_NAME_EXT%.*}"
# Simplify sentences?
SIMPLIFY=$5
# Separate sentences with deverbal effect?
DEVERBAL_SEPARATOR=$6
FILT=$7
TRUE_PATH=$8
TRUE_FILE=$9
PATH_EVAL=${10}
FILE_EVAL=${11}
DICC_SYNON=${12}
# CFMC 2022-03-09: tsv file with section, sentence id, sentence (extracted from jsonpdf)
TSV_PATH=${13}

# Validate arguments
if [[ ! ("$#" == 13 ) ]]; then
    echo 'Usage: ./automatic-extraction-ris-gcs.sh <inputPath_wordFile>
    <inputPath_taggedFile> <outputFile> <outputPath> <simplify?> <deverbal_detector?>
    <filter> <true_path> <true_file> <path_evaluation_report> <file_evaluation_report>
    <dictionary_TFs_synonyms> <path_tsv_file>'
    exit 1
fi

echo "********** SELECTED PARAMETERS **********"
echo "INPUT PATH: $INPUT_PATH"
echo "INPUT PATH TRANSFORMED FILE: $INPUT_PATH_TRANS"
echo "OUTPUT FILE: $OUTPUT_FILE"
echo "OUTPUT PATH: $OUTPUT_PATH"
echo "SIMPLIFY SENTENCES? $SIMPLIFY"
echo "SEPARATE DEVERBAL SENTENCES? $DEVERBAL_SEPARATOR"
echo "FILTER SENTENCES WITH ENTITIES? $FILT"
echo "REFERENCE (TRUE) PATH: $TRUE_PATH"
echo "REFERENCE (TRUE) FILE: $TRUE_FILE"
echo "PATH EVALUATION REPORT: $PATH_EVAL"
echo "FILE EVALUATION REPORT: $FILE_EVAL"
echo "DICTIONARY OF SYNONYMS OF TFS: $DICC_SYNON"

echo "********** SELECTED PROCESSES **********"
CLEAN_OUTPUT=FALSE
echo " Clean output paths: $CLEAN_OUTPUT"

FILTER=TRUE
echo " Filter sentences: $FILTER"

CLEAN=TRUE
echo " Clean sentences for iSimp: $CLEAN"

SEPARATE=TRUE
echo " Separate sentences for iSimp: $SEPARATE"

SIMPLI=TRUE
echo " Simplify sentences: $SIMPLI"

DEVERBAL=TRUE
echo " Separate deverbal and verbal sentences: $DEVERBAL"

DEVTAG=TRUE # Needs DEVERBAL=TRUE
echo " Tag sentences to separate deverbal and verbal sentences: $DEVTAG"

DEVSEPAR=TRUE # Needs DEVERBAL=TRUE
echo " Do separate deverbal and verbal sentences: $DEVSEPAR"

EXTDEVERBAL=TRUE
echo " Extract RI deverbal: $EXTDEVERBAL"

OPENIE=TRUE
echo " OpenIE triplet extraction: $OPENIE"

EXTOPENIE=TRUE
echo " Extract RI verbal: $EXTOPENIE"

EXTATTRIB=TRUE
echo " Extract RI attributive: $EXTATTRIB"

EXTAUTOREG=TRUE
echo " Extract RI autoregulation: $EXTAUTOREG"

EXTGC=FALSE
echo " Extract growth conditions: $EXTGC"

EVAL=FALSE
echo " Evaluate extraction: $EVAL"

EVALGC=FALSE
echo " Evaluate growth condition extraction: $EVALGC"

#########################
# Cleaning output paths #
#########################
if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
    if [ -z "$(ls -A $OUTPUT_PATH/complete-ris/)" ]; then :
    else
        #echo "Not Empty"
        # Original: rm $OUTPUT_PATH/complete-ris/*
        find $OUTPUT_PATH/complete-ris -maxdepth 1 -name '*.*' -delete
    fi
    if [ -z "$(ls -A $OUTPUT_PATH/incomplete-ris/)" ]; then :
    else
        #echo "Not Empty"
        # Original: rm $OUTPUT_PATH/incomplete-ris/*
        find $OUTPUT_PATH/incomplete-ris -maxdepth 1 -name '*.*' -delete
    fi
fi # if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
#################
# Preliminaries #
#################
# Clone and update the simplification pipeline
#if [ ! -d "./sentence-simplification" ]
#  then
#    echo Downloading sentence simplifier...
#    git clone https://github.com/ezojg/sentence-simplification
#  else
#    cd ./sentence-simplification
#    git pull origin master
#    cd ..
#fi
# Check for iSimp
#if [ ! -d "./sentence-simplification/isimp_v2" ]
#  then
#    echo ERROR: ./sentence-simplification/isimp_v2 not found. Please manually copy iSimp to that path.
#    exit 1
#fi

if [ "$FILTER" = "TRUE" ]; then
echo "********** FILTER SENTENCES **********"
###################################################
# Filter sentences with entities of interest      #
# and collect attributive examples ArgP-regulated #
###################################################
# INPUT:
# 1) --inputFileWord $INPUT_PATH input file of feature 'word'
# 2) --inputFileTrans $INPUT_PATH_TRANS transformed input file
# 3) --outputPath $SCRIPT_PATH/filtered-sentences
# 4) --outputFile filtered-sentences.txt output file
# 5) --filter $FILT
#    FILT1: (GENE OR TU) AND TF
#    FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath $SCRIPT_PATH/attributive-sentences path for attributive cases: ArgP-regulated genes
# 7) --attrFile attributive-sentences.txt file for attributive cases: ArgP-regulated genes
# $DICC_PATH/normalized_Effects.json

cd $SCRIPT_PATH
if [ -z "$(ls -A ./filtered-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./filtered-sentences/*
fi
if [ -z "$(ls -A ./attributive-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./attributive-sentences/*
fi
if [ -z "$(ls -A ./autoregulation-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./autoregulation-sentences/*
fi
# CFMC 2022-03-09: To update tsv file with filtered sentences
# python3.4 $SCRIPT_PATH/sentence-filter.py --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
python3.4 $SCRIPT_PATH/sentence-filter_v02.py --tsvPath $TSV_PATH --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
fi # if [ "$FILTER" = "TRUE" ]; then

if [ "$CLEAN" = "TRUE" ]; then
echo "********** CLEAN SENTENCES **********"
#############################
# Clean sentences for iSimp #
#############################
# INPUT - PREVIOUS OUTPUT: filtered sentences $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt
# output path and file $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
if [ -z "$(ls -A ./format/sanitized_sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./format/sanitized_sentences/*
fi
# Original Daniel: python2 $SCRIPT_PATH/format/regex-before.py $INPUT_PATH $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
python2 $SCRIPT_PATH/format/regex.py $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
fi # if [ "$CLEAN" = "TRUE" ]; then

if [ "$SEPARATE" = "TRUE" ]; then
echo "********** SEPARATE SENTENCES **********"
################################
# Separate sentences for iSimp #
################################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/sanitized_sentences/$l
# output path and file $SCRIPT_PATH/format/split_sentences/$BARE_NAME
cd $SCRIPT_PATH
if [ -z "$(ls -A ./format/split_sentences/)" ]; then :
else
    rm ./format/split_sentences/*
fi
cd ./format/sanitized_sentences
for l in $(\ls $INPUT_NAME*)
do
    # echo $l
    BARE_NAME=$(echo $l | cut -f 1 -d '.')
    BARE_NAME+="_"
    # Use the digit count of the file's line total as the numeric-suffix width for split
    LENGTH="$(wc -l < $l)"
    LENGTH="$(echo "${#LENGTH}")"
    # One sentence (line) per .spt file
    split -a $LENGTH -d -l 1 --additional-suffix=.spt $SCRIPT_PATH/format/sanitized_sentences/$l $SCRIPT_PATH/format/split_sentences/$BARE_NAME
done
fi # if [ "$SEPARATE" = "TRUE" ]; then

if [ "$SIMPLI" = "TRUE" ]; then
echo "********** SIMPLIFY SENTENCES **********"
######################
# Simplify sentences #
######################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/split_sentences
# output file $OUTPUT_FILE
# path to iSimp $ISIMP_PATH
# CALL: ./sentence-simplification/sentence-simplification-main.sh
# CALL: $ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
# CALL: $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
# $OUTPUT_INDEX_FILE_PATH = $OUTPUT_FILE
# OUTPUT: simplified sentences in path ./algorithm_sentences

# while true; do
#   read -p "Do you wish to simplify sentences? [Y/N]: " yn
#   case $yn in
#     [Yy]* ) SIMP=1; break;;
#     [Nn]* ) SIMP=0; break;;
#     * ) echo "Please answer yes [Y] or no [N].";;
#   esac
# done
case $SIMPLIFY in
    [Yy]* )
        SIMP=1
        ;;
    [Nn]* )
        SIMP=0
        ;;
    * )
        # Default: simplify
        SIMP=1
        ;;
esac
cd $SCRIPT_PATH
if [ $SIMP == 1 ]
    then # WITH SIMPLIFICATION
        echo "********** YES SIMPLIFY SENTENCES **********"
        # Copy file to sentence-simplification
        #FILE_NAME=$(basename "$INPUT_PATH")
        # Call the simplification pipeline AND create a file with the paths of the simplified sentences
        ./sentence-simplification/sentence-simplification-main.sh $SCRIPT_PATH/format/split_sentences $OUTPUT_FILE $ISIMP_PATH
        #echo "input: $SCRIPT_PATH/format/split_sentences -- output: $OUTPUT_FILE"
        #echo "Sentences simplified. Paths to simplified sentences saved in $OUTPUT_FILE"
    else # WITHOUT SIMPLIFICATION
        echo "********** NO SIMPLIFY SENTENCES **********"
        if [ -z "$(ls -A ./sentence-simplification/algorithm_sentences/)" ]; then :
        else
            #echo "Not Empty"
            rm ./sentence-simplification/algorithm_sentences/*
        fi
        ls $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE
        cp $SCRIPT_PATH/format/split_sentences/* $SCRIPT_PATH/sentence-simplification/algorithm_sentences
        #echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi
fi # if [ "$SIMPLI" = "TRUE" ]; then

if [ "$DEVERBAL" = "TRUE" ]; then
echo "********** SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
######################
# Deverbal separator #
######################
# $PATH_TO_CORENLP
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/sentence-simplification/algorithm_sentences
# output path $SCRIPT_PATH/deverbal-separator/separated_sentences
# $DICC_PATH
# $DEVTAG POS tagging sentences
# $DEVSEPAR Do separate sentences
# CALL: java -cp "$PATH_TO_CORENLP/*"
# $SCRIPT_PATH/filter.py
# OUTPUT: sentences separated into two paths according to verbal/deverbal effect

case $DEVERBAL_SEPARATOR in
    [Yy]* )
        DEVSEP=1
        ;;
    [Nn]* )
        DEVSEP=0
        ;;
    * )
        # Default: use the deverbal separator
        DEVSEP=1
        ;;
esac
if [ $DEVSEP == 1 ]
    then # WITH DEVERBAL SEPARATOR

        #if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/)" ]; then :
        #else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/*
            # find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -maxdepth 1 -name '*.vrb' -delete
        #fi
        #if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/)" ]; then :
        #else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*
            # find $SCRIPT_PATH/deverbal-separator/separated_sentences/dev -maxdepth 1 -name '*.dev' -delete
        #fi

        echo "********** YES SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
        # Original Daniel 2018-12-06: ./deverbal-separator/separator.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
        ./deverbal-separator/separator-v02.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
    else # WITHOUT DEVERBAL SEPARATOR
        echo "********** NO SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
        ls $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE
        #echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi # [ $DEVSEP == 1 ]
fi # if [ "$DEVERBAL" = "TRUE" ]; then

if [ "$EXTDEVERBAL" = "TRUE" ]; then
echo "********** EXTRACT RI DEVERBAL **********"
#######################
# Extract RI deverbal #
#######################
# INPUT: deverbal files $(dirname ${file}) $(basename ${file})
# output path $OUTPUT_PATH $(basename ${file%.*})
# $DICC_PATH/names_EFFECT_ONTOGENE.txt
# $DICC_PATH/names_GENE.txt
# $DICC_PATH/names_GENE_ONTOGENE.txt
# $DICC_PATH/names_GENE_SYN.txt
# $DICC_PATH/names_TU.txt
# $DICC_PATH/names_TU_ONTOGENE.txt
# $DICC_PATH/names_TF_1grams.txt
# $DICC_PATH/names_TF_2grams.txt
# $DICC_PATH/names_TF_3grams.txt
# $DICC_PATH/names_TF_4grams.txt
# $DICC_PATH/names_TF_5Moregrams.txt
# $DICC_PATH/names_TF_ONTOGENE.txt
# $DICC_PATH/normalized_Effects.json
# OUTPUT: standoff files with RIs
# PATH ALREADY TAGGED ENTITIES: $SCRIPT_PATH/filtered-sentences
# FILE ALREADY TAGGED ENTITIES: filtered-sentences.ents.json
    for file in $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*.*
    do
        #python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-OriginalDaniel.py $file $OUTPUT_PATH/$(basename ${file%.*}) $DICC_PATH/names_EFFECT_ONTOGENE.txt $DICC_PATH/names_GENE.txt $DICC_PATH/names_GENE_ONTOGENE.txt $DICC_PATH/names_GENE_SYN.txt $DICC_PATH/names_TU.txt $DICC_PATH/names_TU_ONTOGENE.txt $DICC_PATH/names_TF_1grams.txt $DICC_PATH/names_TF_2grams.txt $DICC_PATH/names_TF_3grams.txt $DICC_PATH/names_TF_4grams.txt $DICC_PATH/names_TF_5Moregrams.txt $DICC_PATH/names_TF_ONTOGENE.txt
        #echo "Dir file: $(dirname ${file})"
        #echo "File: $(basename ${file})"
        #echo "OUTPUT_PATH: $OUTPUT_PATH"
        #echo "File: $(basename ${file%.*})"
        echo "Dir and files: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        #python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v02.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
        python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v03.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
    done
fi # if [ "$EXTDEVERBAL" = "TRUE" ]; then

if [ "$OPENIE" = "TRUE" ]; then
echo "********** OPENIE TRIPLET EXTRACTION **********"
    ########################
    # OpenIE RI extraction #
    ########################
    # Join the verbal sentences into one file list for OpenIE extraction
    # Error: /bin/ls: Argument list too long: ls $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE
    echo " Join verbal sentences into file for OpenIE extraction"
    find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -type f -name '*' > $OUTPUT_FILE
    #echo "Deverbal sentences separated. Paths to verbal sentences saved in $OUTPUT_FILE"

    echo " CoreNLP OpenIE..."
    java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.naturalli.OpenIE -filelist $OUTPUT_FILE -triple.strict false -triple.all_nominals true -format reverb > $OUTPUT_FILE.reverb
fi # if [ "$OPENIE" = "TRUE" ]; then

if [ "$EXTOPENIE" = "TRUE" ]; then
    echo "********** OPENIE RI EXTRACTION **********"
    ########################
    # OpenIE RI extraction #
    ########################
    # Replaced Nacho's oie_compress with a program written by CMC to analyze the triplets
    # and keep those that suggest the participants and the effect
    # Paste input and output for fancy printing
    # Original Nacho: echo " Fancy printing..."
    # Original Nacho: > $OUTPUT_FILE.fuzzy
    # Original Nacho: python3 oie_compress.py --oies $OUTPUT_FILE.reverb --op fuzzy --ris $DICC_PATH/normalized_Effects.json --out $OUTPUT_FILE.fuzzy
    #
    # --inputFile $OUTPUT_FILE.reverb file obtained with CoreNLP
    # --outputPath $OUTPUT_PATH
    # --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
    # --diccFile Before: termFilesTag_RIE_GCE_SYSTEM_ECCO.json
    # --diccEffect normalized_Effects.json
    # --format standoff
    # --diccEPAth $DICC_PATH
    # OUTPUT: standoff files with RIs

    # python3.4 $SCRIPT_PATH/ri-openie-extraction.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json --diccEffect normalized_Effects.json --format standoff
    python3.4 $SCRIPT_PATH/ri-openie-extraction-v02.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $SCRIPT_PATH/filtered-sentences --diccFile filtered-sentences.ents.json --diccEffect normalized_Effects.json --diccEPAth $DICC_PATH --format standoff

    # Join into a single file
    # Sort fuzzy
    # Original Nacho: echo " Sort fuzzy..."
    # Gets the effect type
    # Original Nacho: sort $OUTPUT_FILE.fuzzy -o $OUTPUT_FILE.fuzzy
    # Concatenate
    # CMC removed the following lines because simplification was
    # already decided earlier
    #if [ $SIMP == 1 ]
    #then # WITH SIMPLIFICATION
    #  ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    #  awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    #  cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    #  paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    #else # WITHOUT SIMPLIFICATION
    #  ls -l $SCRIPT_PATH/format/split_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    #  awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    #  cat $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE.als
    #  paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    #fi
    # Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    # Original Nacho: echo " Creating ils, fls and als files..."
    # Original Nacho: if [ $DEVSEP == 1 ]
    # Original Nacho: then # WITH DEVERBAL SEPARATOR
    # Original Nacho:   ls -l $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho:   awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho:   cat $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE.als
    # Original Nacho: else # WITHOUT DEVERBAL SEPARATOR
    # Original Nacho:   ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho:   awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho:   cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    # Original Nacho: fi
    # Original Nacho: echo " Paste merger..."
    # Original Nacho: paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    # Original Nacho: echo " Create dsp file..."
    # Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
    # rm $(dirname "$OUTPUT_FILE")/*.fls
    # rm $(dirname "$OUTPUT_FILE")/*.ils
    # rm $(dirname "$OUTPUT_FILE")/*.als
    #rm $SCRIPT_PATH/*.merger
    #rm $SCRIPT_PATH/*.reverb
    #rm $SCRIPT_PATH/*.fuzzy
fi # if [ "$EXTOPENIE" = "TRUE" ]; then

if [ "$EXTATTRIB" = "TRUE" ]; then
    echo "********** ATTRIBUTIVE RI EXTRACTION **********"
    #############################
    # Attributive RI extraction #
    #############################
    # Attributive RI extraction, e.g., "ArgP-regulated genes argP, argT"
    #
    # --inputPath $SCRIPT_PATH/attributive-sentences
    # --outputPath $OUTPUT_PATH
    # --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
    # --diccEffect normalized_Effects.json
    # OUTPUT: standoff files with RIs

    for file in $SCRIPT_PATH/attributive-sentences/*.*
    do
        echo "Dir file: $(dirname ${file})"
        echo "File: $(basename ${file})"
        # echo "OUTPUT_PATH: $OUTPUT_PATH"
        # echo "File: $(basename ${file%.*})"
        # echo "All: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        # If the glob matched nothing, the literal pattern "*.*" comes through
        if [ "$(basename ${file})" = "*.*" ]; then
            echo "No attributive sentences found"
        else
            python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
        fi
    done

fi # if [ "$EXTATTRIB" = "TRUE" ]; then

if [ "$EXTAUTOREG" = "TRUE" ]; then
    echo "********** AUTOREGULATION RI EXTRACTION **********"
    ################################
    # Autoregulation RI extraction #
    ################################
    # Autoregulation RI extraction, e.g., "ArgP protein represses its own synthesis"
    #
    # --inputPath $SCRIPT_PATH/autoregulation-sentences
    # --outputPath $OUTPUT_PATH
    # --diccPath $DICC_PATH
    # --diccEffect normalized_Effects.json
    # OUTPUT: standoff files with RIs

    for file in $SCRIPT_PATH/autoregulation-sentences/*.*
    do
        echo "Dir file: $(dirname ${file})"
        echo "File: $(basename ${file})"
        # echo "OUTPUT_PATH: $OUTPUT_PATH"
        # echo "File: $(basename ${file%.*})"
        # echo "All: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        # If the glob matched nothing, the literal pattern "*.*" comes through
        if [ "$(basename ${file})" = "*.*" ]; then
            echo "No autoregulation sentences found"
        else
            python3 $SCRIPT_PATH/ri-autoregulation-extraction-v01.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
        fi
    done

fi # if [ "$EXTAUTOREG" = "TRUE" ]; then

if [ "$EXTGC" = "TRUE" ]; then
    echo "********** EXTRACT GROWTH CONDITIONS **********"
    #############################
    # Extract growth conditions #
    #############################
    python3.4 $SCRIPT_PATH/extract-gcs/extract-gcs-regex.py --inputPath $OUTPUT_PATH/complete-ris --outputPath $OUTPUT_PATH/complete-ris --termPath /home/cmendezc/terminologicalResources
    #python3 ./GCs-regex-before.py ./ejemplo_11.spt
    #/home/elwe/Documents/prueba3/RIE_reordenado/RI-searcher/GC/ejemplo_11.spt ./ejemplo_11.a2
    #./names_GC_ECCO_1grams.txt ./names_GC_ECCO_2grams.txt ./names_GC_ECCO_3grams.txt
    #./names_GC_ECCO_4grams.txt ./names_GC_ECCO_5Moregrams.txt
fi # if [ "$EXTGC" = "TRUE" ]; then

if [ "$EVAL" = "TRUE" ]; then
    echo "********** EVALUATE EXTRACTION **********"
    if [ "$EVALGC" = "TRUE" ]; then
        echo "********** EVALUATE GROWTH CONDITION EXTRACTION **********"
        python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON --evaluateGCs
    else
        echo "********** EVALUATE WITHOUT GROWTH CONDITION EXTRACTION **********"
        python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON
    fi # if [ "$EVALGC" = "TRUE" ]; then
fi # if [ "$EVAL" = "TRUE" ]; then
autoregulation-sentences/deleteme.txt
0 → 100644
File mode changed
deverbal-separator/filter-v03.py
0 → 100644
# import fileinput
# import regex as re
# from regex import finditer
import sys
import json

if ( len( sys.argv ) != 3 ):
    # Original Daniel: sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <EFFs_dictionary> \n" )
    sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <normalized_Effects> \n")
    sys.stderr.flush();

#   exit( 2 );

# READ INPUT FILE
text_file = open( sys.argv[1], "r" )
dato = text_file.read()
text_file.close()

# READ DICTIONARY

# Loading normalized effects
# print('Loading normalized effects...')
with open(sys.argv[2]) as diccFile:
    hashNormalizedEffects = json.load(diccFile)
DICC = list(hashNormalizedEffects.keys())

# Original Daniel: text_file = open( sys.argv[2], "r" )
# Original Daniel: DICC = text_file.read().splitlines()
# Original Daniel: text_file.close()


# Declare variables
is_dev = False
is_vrb = False


# DICC
# 2018-11-30 CMC: We separated nouns and only the past participle for deverbal processing,
# and all other verb forms as verbal
# VRB: VB  verb, base form: think
# VRB: VBZ verb, 3rd person singular present: she thinks
# VRB: VBP verb, non-3rd person singular present: I think
# VRB: VBD verb, past tense: they thought
# DEV: VBN verb, past participle: a sunken ship
# VRB: VBG verb, gerund or present participle: thinking is fun
# Tokens come as word/TAG, e.g. extend/VBP
for i in range(len(DICC)):
    # print(DICC[i])
    for token in dato.split():
        word = token[:token.find("/")]
        tag = token[token.find("/")+1:]
        # print("word: {}".format(word))
        # print("tag: {}".format(tag))
        if (DICC[i] in word) and (("NN" in tag) or ("VBN" == tag)):
            is_dev = True
            # print("deverbal: " + word)
        # Note: "VB" in tag also matches VBN, so a VBN effect marks the sentence
        # as both deverbal and verbal (it is then copied to both dev/ and vrb/)
        if (DICC[i] in word) and ("VB" in tag):
            is_vrb = True
            # print("verbal: " + word)

# Exit codes signal the result to the calling shell script:
# 11 = both deverbal and verbal, 12 = deverbal only, 13 = verbal only, 10 = neither
if is_dev and is_vrb:
    sys.exit(11)
elif is_dev:
    sys.exit(12)
elif is_vrb:
    sys.exit(13)
else:
    sys.exit(10)
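
The exit codes above communicate the classification to the calling shell; a minimal sketch of a consumer (file names are hypothetical; separator-v02.sh below is the real caller):

```bash
python3 filter-v03.py tagged-line/sentence_01.spt normalized_Effects.json
case $? in
  11) echo "deverbal and verbal" ;;
  12) echo "deverbal only" ;;
  13) echo "verbal only" ;;
  10) echo "neither" ;;
esac
```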
deverbal-separator/separator-v02.sh
0 → 100755
#!/bin/bash
# Separates sentences into deverbal (.dev) and verbal (.vrb)

# Original Daniel: PATH_TO_CORENLP=/home/elwe/Documents/temporal/CoreNLP

# Validate arguments
if [[ ! ("$#" == 6 ) ]]; then
    echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
    exit 1
fi

SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Original Daniel: INPUT_PATH=$1 # folder containing the files to separate
# Original Daniel: OUTPUT_PATH=$2
PATH_TO_CORENLP=$1
INPUT_PATH=$2 # folder containing the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
# Tag sentences to separate deverbal and verbal sentences: $DEVTAG
TAG=$5
# Do separate deverbal and verbal sentences: $DEVSEPAR
SEP=$6

if [ $TAG == "TRUE" ]
    then # TAG WITH STANFORD CORENLP

    if [ -z "$(ls -A $SCRIPT_PATH/tagged/)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged/*
        find $SCRIPT_PATH/tagged -maxdepth 1 -name '*.conll' -delete
    fi

    # Added by CMC
    if [ -z "$(ls -A $SCRIPT_PATH/tagged-line/)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged-line/*
        find $SCRIPT_PATH/tagged-line -maxdepth 1 -name '*.spt' -delete
    fi

    for j in $INPUT_PATH/*
    do
        #echo $j
        # Original Daniel: java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.parser.lexparser.LexicalizedParser -writeOutputFiles -retainTMPSubcategories -outputFormat "wordsAndTags" $SCRIPT_PATH/englishPCFG.ser.gz $j
        # Command line: java -cp "/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file datos_0.spt -outputDirectory tagged
        # java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
        # With parse: java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
        java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
    done

    # Original Daniel: mv $INPUT_PATH/*.stp $SCRIPT_PATH/tagged/
    for j in $SCRIPT_PATH/tagged/*
    do
        # Original Daniel: awk 'NF {print $2 "/" $4}' tagged/$j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${j%.spt}"
        filename=$(basename "$j")
        #filename="${filename%.*}"
        awk 'NF {print $2 "/" $4}' $j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${filename%.*}.spt"
        # Original Daniel: mv "$j" "${j%.stp}"
    done
fi # if [ $TAG == "TRUE" ]
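
# Note on the awk conversion above (illustrative row): in the CoNLL output the second
# column is the word and the fourth is the POS tag, so a row such as
# "2  regulates  _  VBZ  ..." becomes the token "regulates/VBZ", and paste joins each
# sentence's tokens into a single line under tagged-line/.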

if [ $SEP == "TRUE" ]
    then # SEPARATE FILES

    # Original Daniel: if [ -z "$(ls -A $OUTPUT_PATH)" ]; then :
    # Modified by Carlos Méndez
    if [ -z "$(ls -A $OUTPUT_PATH/dev)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/dev/*
        find $OUTPUT_PATH/dev -maxdepth 1 -name '*.dev' -delete
    fi

    if [ -z "$(ls -A $OUTPUT_PATH/vrb)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/vrb/*
        find $OUTPUT_PATH/vrb -maxdepth 1 -name '*.vrb' -delete
    fi

    for j in $SCRIPT_PATH/tagged-line/*
    do
        # Original Daniel: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/names_EFFECT_ONTOGENE.txt
        # CMC 2018-12-04: Without separating verbal forms: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/normalized_Effects.json
        # CMC 2018-12-11: With separating verbal forms: python3 $SCRIPT_PATH/filter-v02.py $j $DICC_PATH/normalized_Effects.json
        # CMC 2018-12-11: Considering only the passive verbal form as deverbal: VBN verb, past participle
        python3 $SCRIPT_PATH/filter-v03.py $j $DICC_PATH/normalized_Effects.json
        VAR=$?
        # filename=${j##*/}
        # inputfile=${filename%.spt}
        # exit

        if [ $VAR == 11 ]; then :
            # contains dev and vrb
            # Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            # Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            #echo "Deverbal and verbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
        elif [ $VAR == 12 ]; then :
            # contains dev
            #echo "Deverbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
        elif [ $VAR == 13 ]; then :
            # contains vrb
            #echo "Verbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
        elif [ $VAR == 10 ]; then :
            # appears to contain neither dev nor vrb
            echo "Neither deverbal nor verbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
        fi
    done
fi # if [ $SEP == "TRUE" ]
deverbal-separator/tagged-line/.gitignore
0 → 100644
deverbal-separator/tagged/.gitignore
0 → 100644
evaluate-ris-gcs-standoff-v04.py
0 → 100644
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re

__author__ = 'CMendezC'


# Objective: evaluate predicted interactions in standoff format
# versus true interactions in tab format
# v04: add synonyms of TFs

# Parameters:
# 1) --truePath Path for true interactions
# 2) --trueFile File for true interactions
# 3) --predictedPath Path for predicted interactions
# 4) --outputPath Output path
# 5) --outputFile File for saving results
# 6) --evaluateGCs Evaluate with GCs
# 7) --diccPath Dictionary path
# 8) --diccSynon File with synonyms of TFs

# Output:
# 1) File with TP, FP, FN and the scores Precision, Recall, F1

# Execution:
# python3.4 evaluate-ris-gcs-standoff.py
# --truePath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/analysis-validation-data-sets
# --trueFile ris-analysis-reference.txt
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris-gcs
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/evaluation-reports
# --outputFile evaluation-riegce-system-ris-analysis.txt
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --evaluateGCs

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

def updateHashPredicted(pr, hashP, pm, sF, ef):
    if pr not in hashP:
        hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
        hashP[pr] = hashTemp
    else:
        hashTemp = hashP[pr]
        if pm in hashTemp["pmids"]:
            hashP[pr]["pmids"][pm].append(sF)
        else:
            hashP[pr]["pmids"][pm] = [sF]

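# Illustrative shape of the structure built by updateHashPredicted, keyed by the
# tab-separated RI string (values below are hypothetical): "pmids" maps each PMID to
# the list of sentence files supporting the RI, and "orieff" keeps the original effect.
# hashP = {"ArgP\targO\tactivator": {"orieff": "activates",
#                                    "pmids": {"12345678": ["sent_01.vrb", "sent_07.vrb"]}}}
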
def getSummary(r, hashTemp):
    pmids = 0
    sentences = 0
    orieff = ""
    if r in hashTemp:
        # print("r: {}".format(r))
        orieff = hashTemp[r]["orieff"]
        for pmid in hashTemp[r]["pmids"]:
            pmids += 1
            # print("PMID with sentences: {}".format(pmid))
            for sent in hashTemp[r]["pmids"][pmid]:
                sentences += 1
    else:
        return "WARNING: no data available!"
    return "Articles: {}\tSentences: {}\tOriginal effect: {}".format(pmids, sentences, orieff)


def getDetail(r, hashTemp):
    return_text = ""
    aHash = {}
    if r in hashTemp:
        for pmid in hashTemp[r]["pmids"]:
            # Number of sentences for this PMID
            sentences = len(hashTemp[r]["pmids"][pmid])
            if pmid not in aHash:
                aHash[pmid] = sentences
            else:
                return "WARNING: PMID duplicated!"
    else:
        return "WARNING: no data available!"
    for p, s in sorted(aHash.items(), key=operator.itemgetter(1), reverse=True):
        return_text += "\tPMID {}: {} sentences\n".format(p, s)

    return return_text


def get_standard_name(regSynon):
    reg = ""
    if regSynon in hashSynon:
        reg = hashSynon[regSynon]
    else:
        for syn, std in hashSynon.items():
            if regSynon.startswith(syn):
                reg = regSynon.replace(syn, std, 1)
                break
    return reg

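# Illustrative use (hypothetical synonym entry): with hashSynon = {"Fnr": "FNR"},
# get_standard_name("Fnr") returns "FNR", and get_standard_name("Fnr-dependent")
# returns "FNR-dependent" through the startswith branch.
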
def isCorrect(ripr, listT, rtype):
    # Checks a predicted RI (tab-separated regulator, regulated[, effect]) against the
    # reference list and returns (matched RI, match type):
    #   'Full' - regulator (or its standard name), regulated, and effect coincide
    #   'Start' - the predicted regulator starts with the true one, e.g. ArgP-regulated
    #   'Regulator' - the predicted effect is the generic "regulator" while the
    #                 reference has activator or repressor
    list_ripr = ripr.split('\t')
    regulator = list_ripr[0]
    regulatorStdName = ""
    if use_synonyms:
        regulatorStdName = get_standard_name(regulator)
    for rit in listT:
        # print("RI TRUE: {}".format(rit))
        listRT = rit.split('\t')
        regulatorT = listRT[0]
        regexRegulatorStarts = re.compile(r'(' + regulatorT + r').+')
        if rtype == "ri":
            regulated = list_ripr[1]
            regulatedT = listRT[1]
            if (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT:
                return (rit, 'Full')
            # For cases where the regulator is part of a word, such as ArgP-regulated
            result = regexRegulatorStarts.match(regulator)
            if result:
                # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                regulator = result.group(1)
                if regulator == regulatorT and regulated == regulatedT:
                    return (rit, 'Start')
            else:
                if use_synonyms:
                    result = regexRegulatorStarts.match(regulatorStdName)
                    if result:
                        # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                        regulator = result.group(1)
                        if regulator == regulatorT and regulated == regulatedT:
                            return (rit, 'Start')
        elif rtype == "rief":
            effect = list_ripr[2]
            regulated = list_ripr[1]
            effectT = listRT[2]
            regulatedT = listRT[1]
            # if ripr == "ArgP\ttargets\tregulator":
            #     print("RI-PREDICT: ArgP\ttargets\tregulator")
            #     print(" PREDICT: regulator {} effect {} regulated {}".format(regulator, effect, regulated))
            #     print(" TRUE: regulator {} effect {} regulated {}".format(regulatorT, effectT, regulatedT))
            if (regulator == regulatorT or regulatorStdName == regulatorT) and effect == effectT and regulated == regulatedT:
                return (rit, 'Full')
            elif (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT and effect == "regulator" and (
                    effectT == "activator" or effectT == "repressor"):
                # if ripr == "ArgP\ttargets\tregulator":
                #     print(" Correct RI with regulator: {}".format(ripr))
                # return rit CMC 20181014: I think it should be the predicted one, because otherwise I lose, in the output lists, whether it was correct or not
                return (ripr, 'Regulator')
            else:
                # For cases where the regulator is part of a word, such as ArgP-regulated
                result = regexRegulatorStarts.match(regulator)
                if result:
                    # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                    regulator = result.group(1)
                    if regulator == regulatorT and effect == effectT and regulated == regulatedT:
                        return (rit, 'Start')
                    elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
                            effectT == "activator" or effectT == "repressor"):
                        # return rit CMC 20181014: I think it should be the predicted one, because otherwise I lose, in the output lists, whether it was correct or not
                        # except that in this case I use only the regulator
                        # return rit
                        return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
                else:
                    if use_synonyms:
                        result = regexRegulatorStarts.match(regulatorStdName)
                        if result:
                            # Mirror the "ri" branch: compare against the matched prefix of the standard name
                            regulator = result.group(1)
                            if regulator == regulatorT and effect == effectT and regulated == regulatedT:
                                return (rit, 'Start')
                            elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
                                    effectT == "activator" or effectT == "repressor"):
                                # return rit CMC 20181014: I think it should be the predicted one, because otherwise I lose, in the output lists, whether it was correct or not
                                # except that in this case I use only the regulator
                                # return rit
                                return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')

    # CMC 2018-10-14: Review riefgc because it has not been updated
    # elif rtype == "riefgc":
    #     effect = list_ripr[2]
    #     regulated = list_ripr[1]
    #     gc = list_ripr[3]
    #     effectT = listRT[2]
    #     regulatedT = listRT[1]
    #     gcT = listRT[3]
    #     if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
    #         return rit
    #     elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
    #         return rit
    #     else:
    #         # For cases where the regulator is part of a word, such as ArgP-regulated
    #         result = regexRegulatorStarts.match(regulator)
    #         if result:
    #             #print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
    #             regulator = result.group(1)
    #             if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
    #                 return rit
    #             elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
    #                 return rit
    return ('', '')

214 | + | ||
215 | +def get_scores_rules(listTrue, listPredicted, hashTemp, title, ri_type): | ||
216 | + print("Evaluation") | ||
217 | + # print(listPredicted) | ||
218 | + # Precision = Extraídos correctos / Predichos | ||
219 | + # Recall = Extraídos correctos / Referencia | ||
220 | + # F - 1 = 2 * ((Precision * Recall) / (Precision + Recall)) | ||
221 | + correct = 0 | ||
222 | + incorrect = 0 | ||
223 | + # For registering correct and incorrect RIs | ||
224 | + hashPredicted = {} | ||
225 | + # To print output RIs | ||
226 | + hashOutputRIs = {} | ||
227 | + # For registering unrecovered RIs | ||
228 | + hashUnrecovered = {} | ||
229 | + | ||
230 | + predicted = len(listPredicted) | ||
231 | + print("len(listPredicted): {}".format(predicted)) | ||
232 | + reference = len(listTrue) | ||
233 | + # print("Reference: {}".format(reference)) | ||
234 | + | ||
235 | + listRecovered = [] | ||
236 | + for ri_pred in listPredicted: | ||
237 | + print("ri_pred: {}".format(ri_pred)) | ||
238 | + # if ri_pred in hashPredicted: | ||
239 | + # print("WARNING: RI predicted {} duplicated {}".format(ri_pred, hashPredicted[ri_pred])) | ||
240 | + # else: | ||
241 | + # First all predicted RIs are incorrect | ||
242 | + # hashPredicted[ri_pred] = "incorrect" | ||
243 | + # if ri_pred in listTrue: | ||
244 | + # hashPredicted[ri_pred] = "correct" | ||
245 | + # listRecovered.append(ri_pred) | ||
246 | + # correct += 1 | ||
247 | + # continue | ||
248 | + riTrue = '' | ||
249 | + result = isCorrect(ri_pred, listTrue, ri_type) | ||
250 | + riResult = result[0] | ||
251 | + matchType = result[1] | ||
252 | + if riResult != '': | ||
253 | + if riResult not in hashOutputRIs: | ||
254 | + hashOutputRIs[riResult] = "Correct" | ||
255 | + if ri_pred not in hashPredicted: | ||
256 | + hashPredicted[ri_pred] = "correct" | ||
257 | + print("ri_pred {} correct".format(ri_pred)) | ||
258 | + correct += 1 | ||
259 | + # Complete matching or the predicted regulator starts with entity | ||
260 | + if matchType == 'Full' or matchType == 'Start': | ||
261 | + # ri_pred matches with ri_true | ||
262 | + if riResult in listRecovered: | ||
263 | + print("WARNING: riResult {} already in listRecovered".format(riResult)) | ||
264 | + else: | ||
265 | + listRecovered.append(riResult) | ||
266 | + else: | ||
267 | + incorrect += 1 | ||
268 | + if riResult not in hashOutputRIs: | ||
269 | + hashOutputRIs[riResult] = "Incorrect" | ||
270 | + if ri_pred not in hashPredicted: | ||
271 | + hashPredicted[ri_pred] = "incorrect" | ||
272 | + print("ri_pred {} incorrect".format(ri_pred)) | ||
273 | + | ||
274 | + if len(hashPredicted) != predicted: | ||
275 | + print("ERROR: number of predicted RIs mismatch") | ||
276 | + # return | ||
277 | + print("Predicted: {}".format(predicted)) | ||
278 | + print("len(hashPredicted): {}".format(len(hashPredicted))) | ||
279 | + | ||
280 | + cor = 0 | ||
281 | + inc = 0 | ||
282 | + for r, v in hashPredicted.items(): | ||
283 | + if v == "correct": | ||
284 | + cor += 1 | ||
285 | + elif v == "incorrect": | ||
286 | + inc += 1 | ||
287 | + if cor != correct: | ||
288 | + print("ERROR: number of correct RIs mismatch") | ||
289 | + # return | ||
290 | + if inc != incorrect: | ||
291 | + print("ERROR: number of incorrect RIs mismatch") | ||
292 | + # return | ||
293 | + print("Correct: {}".format(correct)) | ||
294 | + print("Incorrect: {}".format(incorrect)) | ||
295 | + | ||
296 | + unrecovered = 0 | ||
297 | + recovered = 0 # Only when coincide with reference | ||
298 | + # without considering Regulator correct when Activator or Repressor appears in reference | ||
299 | + listRecovered2 = [] | ||
300 | + listUnrecovered = [] | ||
301 | + for ri in listTrue: | ||
302 | + if ri not in listRecovered: | ||
303 | + if ri in listUnrecovered: | ||
304 | + print("WARNING: ri {} already in listUnrecovered".format(ri)) | ||
305 | + else: | ||
306 | + listUnrecovered.append(ri) | ||
307 | + unrecovered += 1 | ||
308 | + else: | ||
309 | + if ri in listRecovered2: | ||
310 | + print("WARNING: ri {} already in listRecovered2".format(ri)) | ||
311 | + else: | ||
312 | + listRecovered2.append(ri) | ||
313 | + recovered += 1 | ||
314 | + | ||
315 | + print("Len listRecovered: {}".format(len(listRecovered))) | ||
316 | + print("Len listRecovered2: {}".format(len(listRecovered2))) | ||
317 | + print("Len listUnrecovered: {}".format(len(listUnrecovered))) | ||
318 | + # if (unrecovered + correct) != reference: | ||
319 | + # print("ERROR: number of unrecovered {} + correct {} and reference {} RIs mismatch".format(unrecovered, correct, reference)) | ||
320 | + # return | ||
321 | + | ||
322 | + print("{}".format(title)) | ||
323 | + print("Predicted: {}".format(predicted)) | ||
324 | + print("Reference: {}".format(reference)) | ||
325 | + print("Unrecovered: {}".format(unrecovered)) | ||
326 | + print("Recovered: {}".format(recovered)) | ||
327 | + | ||
328 | + precision = correct / predicted | ||
329 | + print("Precision = correct / predicted: {}".format(precision)) | ||
330 | + # recall = correct / reference | ||
331 | +    # We calculate recall as a recovery rate, because correct instances are calculated | ||
332 | +    # considering a Regulator correct when an Activator or Repressor appears in the reference | ||
333 | + recall = recovered / reference | ||
334 | + print("Recall = recovered / reference: {}".format(recall)) | ||
335 | + f1 = 2 * ((precision * recall) / (precision + recall)) | ||
336 | + print("F1: {}".format(f1)) | ||
337 | + | ||
338 | + with open(os.path.join(options.outputPath, options.outputFile), mode="a", errors="replace") as oFile: | ||
339 | + oFile.write("{}\n".format(title)) | ||
340 | + oFile.write("Predicted: {}\n".format(predicted)) | ||
341 | + oFile.write("Reference: {}\n".format(reference)) | ||
342 | + oFile.write("Correct: {}\n".format(correct)) | ||
343 | + oFile.write("Incorrect: {}\n".format(incorrect)) | ||
344 | + oFile.write("Unrecovered: {}\n".format(unrecovered)) | ||
345 | + oFile.write("Recovered: {}\n".format(recovered)) | ||
346 | + oFile.write("Precision = correct / predicted: {}\n".format(precision)) | ||
347 | + oFile.write("Recall = recovered / reference: {}\n".format(recall)) | ||
348 | + oFile.write("F1: {}\n".format(f1)) | ||
349 | + oFile.write("Unrecovered instances:\n") | ||
350 | + for r in sorted(listUnrecovered): | ||
351 | + oFile.write("\tUnrecovered: {}\n".format(r)) | ||
352 | + oFile.write("Recovered instances:\n") | ||
353 | + for r in sorted(listRecovered): | ||
354 | + oFile.write("\tRecovered: {}\n".format(r)) | ||
355 | + oFile.write("Incorrect instances:\n") | ||
356 | + for r, v in sorted(hashPredicted.items()): | ||
357 | + if v == "incorrect": | ||
358 | + oFile.write("\tIncorrect: {}\n".format(r)) | ||
359 | + oFile.write("Correct instances:\n") | ||
360 | + for r, v in sorted(hashPredicted.items()): | ||
361 | + if v == "correct": | ||
362 | + oFile.write("\tCorrect: {}\n".format(r)) | ||
363 | + # oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp))) | ||
364 | + # oFile.write("\t{}\n".format(getDetail(r, hashTemp))) | ||
365 | + | ||
366 | + | ||
367 | +def get_scores(listTrue, listPredicted, hashTemp, title): | ||
368 | +    # Precision = correct extracted / extracted | ||
369 | +    # Recall = correct extracted / reference | ||
370 | +    # F1 = 2 * ((Precision * Recall) / (Precision + Recall)) | ||
371 | + print("{}".format(title)) | ||
372 | + # print("listTrue: {}".format(listTrue)) | ||
373 | + # print("listPredicted: {}".format(listPredicted)) | ||
374 | + print("Predicted: {}".format(len(listPredicted))) | ||
375 | + print("Reference: {}".format(len(listTrue))) | ||
376 | + correct = set(listTrue) & set(listPredicted) | ||
377 | + print("Correct: {} ({})".format(len(correct), len(correct) / len(listPredicted))) | ||
378 | + incorrect = set(listPredicted) - set(listTrue) | ||
379 | + print("Incorrect: {} ({})".format(len(incorrect), len(incorrect) / len(listPredicted))) | ||
380 | + unrecovered = set(listTrue) - set(listPredicted) | ||
381 | + print("Unrecovered: {} ()".format(len(unrecovered), len(unrecovered) / len(listTrue))) | ||
382 | + precision = len(correct) / len(listPredicted) | ||
383 | + print("Precision: {}".format(precision)) | ||
384 | + recall = len(correct) / len(listTrue) | ||
385 | + print("Recall: {}".format(recall)) | ||
386 | + f1 = 2 * ((precision * recall) / (precision + recall)) | ||
387 | + print("F1: {}".format(f1)) | ||
388 | + | ||
389 | + with open(os.path.join(options.outputPath, options.outputFile), mode="a") as oFile: | ||
390 | + oFile.write("{}\n".format(title)) | ||
391 | + oFile.write("Predicted: {}\n".format(len(listPredicted))) | ||
392 | + oFile.write("Reference: {}\n".format(len(listTrue))) | ||
393 | + oFile.write("Correct: {}\n".format(len(correct))) | ||
394 | + oFile.write("Incorrect: {}\n".format(len(incorrect))) | ||
395 | + oFile.write("Unrecovered: {}\n".format(len(unrecovered))) | ||
396 | + oFile.write("Precision: {}\n".format(precision)) | ||
397 | + oFile.write("Recall: {}\n".format(recall)) | ||
398 | + oFile.write("F1: {}\n".format(f1)) | ||
399 | + oFile.write("Correct instances:\n") | ||
400 | + for r in sorted(correct): | ||
401 | + oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp))) | ||
402 | + oFile.write("\t{}\n".format(getDetail(r, hashTemp))) | ||
403 | + oFile.write("Incorrect instances:\n") | ||
404 | + for r in sorted(incorrect): | ||
405 | + oFile.write("\t{}\n".format(r)) | ||
406 | + oFile.write("Unrecovered instances:\n") | ||
407 | + for r in sorted(unrecovered): | ||
408 | + oFile.write("\t{}\n".format(r)) | ||
409 | + | ||
410 | + | ||
411 | +if __name__ == "__main__": | ||
412 | + # Parameter definition | ||
413 | + parser = OptionParser() | ||
414 | + parser.add_option("--truePath", dest="truePath", | ||
415 | + help="Path true ris gcs", metavar="PATH") | ||
416 | + parser.add_option("--trueFile", dest="trueFile", | ||
417 | + help="File true ris gcs", metavar="FILE") | ||
418 | + parser.add_option("--predictedPath", dest="predictedPath", | ||
419 | + help="Path predicted ris gcs", metavar="PATH") | ||
420 | + parser.add_option("--outputPath", dest="outputPath", | ||
421 | + help="Output path", metavar="PATH") | ||
422 | + parser.add_option("--outputFile", dest="outputFile", | ||
423 | + help="File for saving results", metavar="FILE") | ||
424 | + parser.add_option("--evaluateGCs", default=False, | ||
425 | + action="store_true", dest="evaluateGCs", | ||
426 | + help="Evaluate GCs?") | ||
427 | + parser.add_option("--diccPath", dest="diccPath", | ||
428 | + help="Path to dictionary", metavar="PATH") | ||
429 | + parser.add_option("--diccSynon", dest="diccSynon", | ||
430 | + help="File with synonyms", metavar="FILE") | ||
431 | + | ||
432 | + (options, args) = parser.parse_args() | ||
433 | + if len(args) > 0: | ||
434 | +        parser.error("Unexpected positional arguments.") | ||
435 | + sys.exit(1) | ||
436 | + | ||
437 | + # Printing parameter values | ||
438 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
439 | + print("Path true ris gcs: " + str(options.truePath)) | ||
440 | + print("File true ris gcs: " + str(options.trueFile)) | ||
441 | + print("Path predicted ris gcs: " + str(options.predictedPath)) | ||
442 | + print("Output path: " + str(options.outputPath)) | ||
443 | + print("File for saving results: " + str(options.outputFile)) | ||
444 | + print("Evaluate GCs: " + str(options.evaluateGCs)) | ||
445 | + print("Path to dictionary: " + str(options.diccPath)) | ||
446 | + print("File with synonyms: " + str(options.diccSynon)) | ||
447 | + | ||
448 | + use_synonyms = False | ||
449 | + hashSynon = {} | ||
450 | +    if options.diccPath is not None and options.diccSynon != "no-synonyms": | ||
451 | + print("***** Using synonyms *****") | ||
452 | + use_synonyms = True | ||
453 | + print('Loading dictionary of synonyms...') | ||
454 | + with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon: | ||
455 | + hashSynon = json.load(diccSynon) | ||
456 | + print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon))) | ||
457 | + | ||
458 | +    listTrueRI = []  # Without effect or gc | ||
459 | +    listTrueRIEF = []  # With effect but without gc | ||
460 | + if options.evaluateGCs: | ||
461 | + listTrueRIEFGC = [] # With effect and gc | ||
462 | + # Read and process Reference | ||
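463 | +    # Reference lines are tab-separated; when a line has more than four columns, | ||
464 | +    # the first two are skipped and regulator/regulated/effect(/gc) are read from | ||
465 | +    # columns 2-5; otherwise they are read from columns 0-3 | ||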
463 | + with open(os.path.join(options.truePath, options.trueFile), mode="r", encoding="utf-8") as iFile: | ||
464 | + for line in iFile: | ||
465 | + line = line.strip('\n') | ||
466 | + if line.startswith("#"): | ||
467 | + continue | ||
468 | + listElem = line.split('\t') | ||
469 | + if len(listElem) > 4: | ||
470 | + regulator = listElem[2] | ||
471 | + regulated = listElem[3] | ||
472 | + effect = listElem[4] | ||
473 | + if options.evaluateGCs: | ||
474 | + gc = listElem[5] | ||
475 | + else: | ||
476 | + regulator = listElem[0] | ||
477 | + regulated = listElem[1] | ||
478 | + effect = listElem[2] | ||
479 | + if options.evaluateGCs: | ||
480 | + gc = listElem[3] | ||
481 | + if effect == "binding": | ||
482 | + effect = "regulator" | ||
483 | + ri = "{}\t{}".format(regulator, regulated) | ||
484 | + if ri not in listTrueRI: | ||
485 | + listTrueRI.append(ri) | ||
486 | + rief = "{}\t{}\t{}".format(regulator, regulated, effect) | ||
487 | + if rief not in listTrueRIEF: | ||
488 | + listTrueRIEF.append(rief) | ||
489 | + if options.evaluateGCs: | ||
490 | + riefgc = "{}\t{}\t{}\t{}".format(regulator, regulated, effect, gc) | ||
491 | + if riefgc not in listTrueRIEFGC: | ||
492 | + listTrueRIEFGC.append(riefgc) | ||
493 | + print(" RIs en referencia antes regulators: {}".format(len(listTrueRI))) | ||
494 | + print(" RIEFs en referencia antes regulators: {}".format(len(listTrueRIEF))) | ||
495 | + if options.evaluateGCs: | ||
496 | + print(" RIEFGCs en referencia antes regulators: {}".format(len(listTrueRIEFGC))) | ||
497 | + | ||
498 | +    # Drop RIs whose effect is "regulator" when the same pair also appears with "activator" or "repressor" | ||
499 | + listRITemp = [] | ||
500 | + for ri in listTrueRIEF: | ||
501 | + listRI = ri.split('\t') | ||
502 | + regulator = listRI[0] | ||
503 | + regulated = listRI[1] | ||
504 | + effect = listRI[2] | ||
505 | + if effect == "regulator": | ||
506 | + tempRIA = "{}\t{}\t{}".format(regulator, regulated, "activator") | ||
507 | + tempRIR = "{}\t{}\t{}".format(regulator, regulated, "repressor") | ||
508 | + if tempRIA in listTrueRIEF or tempRIR in listTrueRIEF: | ||
509 | + pass | ||
510 | + # print("RI regulator matchs RI activator/repressor: {}".format(ri)) | ||
511 | + # listTrueRIEF.remove(ri) | ||
512 | + else: | ||
513 | + # print("Len before: {}".format(len(listRITemp))) | ||
514 | + listRITemp.append(ri) | ||
515 | + # print("Len after: {}".format(len(listRITemp))) | ||
516 | + else: | ||
517 | + listRITemp.append(ri) | ||
518 | + listTrueRIEF = listRITemp | ||
519 | + | ||
520 | + print(" RIEFs en referencia después regulators: {}".format(len(listTrueRIEF))) | ||
521 | + if options.evaluateGCs: | ||
522 | +        for ri in list(listTrueRIEFGC):  # iterate over a copy; calling remove() while iterating skips elements | ||
523 | + listRI = ri.split('\t') | ||
524 | + regulator = listRI[0] | ||
525 | + regulated = listRI[1] | ||
526 | + effect = listRI[2] | ||
527 | + gc = listRI[3] | ||
528 | + if effect == "regulator": | ||
529 | + tempRIGCA = "{}\t{}\t{}\t{}".format(regulator, regulated, "activator", gc) | ||
530 | + tempRIGCR = "{}\t{}\t{}\t{}".format(regulator, regulated, "repressor", gc) | ||
531 | + if tempRIGCA in listTrueRIEFGC or tempRIGCR in listTrueRIEFGC: | ||
532 | + listTrueRIEFGC.remove(ri) | ||
533 | + print(" RIEFGCs en referencia después regulators: {}".format(len(listTrueRIEFGC))) | ||
534 | + | ||
535 | + listPredictedRI = [] | ||
536 | + hashPredictedRI = {} | ||
537 | + listPredictedRIEF = [] | ||
538 | + hashPredictedRIEF = {} | ||
539 | + if options.evaluateGCs: | ||
540 | + listPredictedRIEFGC = [] | ||
541 | + hashPredictedRIEFGC = {} | ||
542 | + hashFiles = {} | ||
543 | + for path, dirs, files in os.walk(options.predictedPath): | ||
544 | + for file in files: | ||
545 | + if file.endswith(".a1"): | ||
546 | + filename = file[:-3] | ||
547 | + if filename not in hashFiles: | ||
548 | + hashFiles[filename] = 1 | ||
549 | + else: | ||
550 | + hashFiles[filename] += 1 | ||
551 | + print("Files: {}".format(len(hashFiles))) | ||
552 | + | ||
553 | + hashEntities = {} | ||
554 | + processedFiles = 0 | ||
555 | + for file in sorted(hashFiles.keys()): | ||
556 | + print("File: {}".format(file)) | ||
557 | + pmid = file[:file.find("_")] | ||
558 | + # print("pmid {}".format(pmid)) | ||
559 | + sentenceFile = file[:file.find("-", file.find("_"))] + ".txt" | ||
560 | + hashEntities = {} | ||
561 | + hashOriginalEffect = {} | ||
562 | + with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File: | ||
563 | + for line in a1File: | ||
564 | + line = line.strip('\n') | ||
565 | + listLine1 = line.split('\t') | ||
566 | + listLine2 = listLine1[1].split(' ') | ||
567 | + entity = listLine2[0] | ||
568 | + idEntity = listLine1[0] | ||
569 | + originalEffect = listLine1[2] | ||
570 | + if entity.startswith("EFFECT"): | ||
571 | + entity = entity[entity.find(".") + 1:] | ||
572 | + print("Entity: {}".format(entity)) | ||
573 | + entity = entity.replace("_dev", "") | ||
574 | + print("Entity without _dev: {}".format(entity)) | ||
575 | + if idEntity not in hashOriginalEffect: | ||
576 | + hashOriginalEffect[idEntity] = originalEffect | ||
577 | + else: | ||
578 | + entity = listLine1[2] | ||
579 | + if idEntity not in hashEntities: | ||
580 | + hashEntities[idEntity] = entity | ||
581 | + print("hashEntities: {}".format(hashEntities)) | ||
582 | + | ||
583 | + with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File: | ||
584 | + for line in a2File: | ||
585 | + # print("Line a2: {}".format(line)) | ||
586 | + # R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4 | ||
587 | + line = line.strip('\n') | ||
588 | + listLine1 = line.split('\t') | ||
589 | + listLine2 = listLine1[1].split(' ') | ||
590 | + regulator = listLine2[2] | ||
591 | + regulator = regulator[regulator.find(":") + 1:] | ||
592 | + regulated = listLine2[1] | ||
593 | + regulated = regulated[regulated.find(":") + 1:] | ||
594 | + effect = listLine2[0] | ||
595 | + effect = effect[effect.find(".") + 1:] | ||
596 | + # print("effect: {}".format(hashEntities[effect])) | ||
597 | + # if hashEntities[effect] == "binding": | ||
598 | + # continue | ||
599 | + if options.evaluateGCs: | ||
600 | + gc = listLine2[3] | ||
601 | + gc = gc[gc.find(":") + 1:] | ||
602 | + | ||
603 | + pri = "{}\t{}".format(hashEntities[regulator], hashEntities[regulated]) | ||
604 | + if pri not in listPredictedRI: | ||
605 | + listPredictedRI.append(pri) | ||
606 | + updateHashPredicted(pri, hashPredictedRI, pmid, sentenceFile, None) | ||
607 | + | ||
608 | + prief = "{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], hashEntities[effect]) | ||
609 | + print("prief: {}".format(prief)) | ||
610 | + if prief not in listPredictedRIEF: | ||
611 | + listPredictedRIEF.append(prief) | ||
612 | + updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect]) | ||
613 | + | ||
614 | + if options.evaluateGCs: | ||
615 | + priefgc = "{}\t{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], | ||
616 | + hashEntities[effect], hashEntities[gc]) | ||
617 | + if priefgc not in listPredictedRIEFGC: | ||
618 | + listPredictedRIEFGC.append(priefgc) | ||
619 | + updateHashPredicted(priefgc, hashPredictedRIEFGC, pmid, sentenceFile, hashOriginalEffect[effect]) | ||
620 | + processedFiles += 1 | ||
621 | + | ||
622 | + print("Processed files: {}".format(processedFiles)) | ||
623 | + with open(os.path.join(options.outputPath, options.outputFile), mode="w") as oFile: | ||
624 | + pass | ||
625 | + get_scores_rules(listTrueRIEF, listPredictedRIEF, hashPredictedRIEF, | ||
626 | + "Scores regulator-regulated-effect (without gc)", "rief") | ||
627 | + get_scores_rules(listTrueRI, listPredictedRI, hashPredictedRI, "Scores regulator-regulated (without effect nor gc)", | ||
628 | + "ri") | ||
629 | + if options.evaluateGCs: | ||
630 | + get_scores_rules(listTrueRIEFGC, listPredictedRIEFGC, hashPredictedRIEFGC, | ||
631 | + "Scores regulator-regulated-effect-gc", "riefgc") |
extract-ris-deverbal/EFF_DVB-regex-v03.py
0 → 100644
1 | +import fileinput | ||
2 | +#import regex as re | ||
3 | +#from regex import finditer | ||
4 | +# We use Python 3, so we had to drop the overlapped option from finditer. | ||
5 | +# Daniel wrote this script for Python 2.7 with the third-party regex package, | ||
6 | +# whose finditer supports overlapped=True; the standard re module does not. | ||
7 | +import re | ||
8 | +from re import finditer | ||
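9 | +# A minimal sketch (illustration only, not used below) of emulating overlapped | ||
10 | +# matching with plain re via a zero-width lookahead: | ||
11 | +#   for m in re.finditer(r'(?=(PATTERN))', text): | ||
12 | +#       start, end = m.span(1) | ||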
9 | +import sys | ||
10 | +import os | ||
11 | +import json | ||
12 | + | ||
13 | +if len(sys.argv) != 8: | ||
14 | +    sys.stderr.write("E: usage: " + sys.argv[0] + | ||
15 | +                     " <input_path> <input_file> <output_path> <output_file> <normalized_Effects> <entity_path> <entity_file>\n") | ||
16 | +    sys.stderr.flush() | ||
17 | +    exit(2) | ||
18 | + | ||
19 | +# READ INPUT FILE | ||
20 | +# Original Daniel: text_file = open( sys.argv[1], "r" ) | ||
21 | +# Original Daniel: dato = text_file.read() | ||
22 | +# Original Daniel: text_file.close() | ||
23 | +filename = sys.argv[2] | ||
24 | +input_file = open(os.path.join(sys.argv[1], filename), "r") | ||
25 | +#print("Input file: {}".format(os.path.join(sys.argv[1], sys.argv[2]))) | ||
26 | +dato = input_file.read() | ||
27 | +input_file.close() | ||
28 | + | ||
29 | +# Loading normalized effects | ||
30 | +# print('Loading normalized effects...') | ||
31 | +with open(os.path.join(sys.argv[5])) as diccFile: | ||
32 | + hashNormalizedEffects = json.load(diccFile) | ||
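33 | +# e.g. a hypothetical entry {"activation": "activator"} makes a matched word | ||
34 | +# "activation" be tagged below as "EFFECT.activator" | ||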
33 | + | ||
34 | +# USING ALREADY TAGGED ENTITIES OF THE FILE (in filter sentence step) | ||
35 | +#<entity_path> <entity_file> | ||
36 | +# READ DICTIONARY WITH ALREADY TAGGED ENTITIES | ||
37 | +entity_path = sys.argv[6] | ||
38 | +entity_file = sys.argv[7] | ||
39 | +print('Loading dictionaries with already tagged entities...') | ||
40 | +with open(os.path.join(entity_path, entity_file)) as entFile: | ||
41 | + hashDicc = json.load(entFile) | ||
42 | +print(' Loading dictionaries with already tagged entities... Done!') | ||
43 | +# CREATE LISTS WITH ALREADY TAGGED ENTITIES OF THE FILE | ||
44 | +regexNumFile = re.compile(r'_([0-9]+)[.-]') | ||
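45 | +# e.g. a hypothetical filename "article_123.word.txt" yields numFile "123" | ||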
45 | +result = regexNumFile.search(filename) | ||
46 | +numFile = "" | ||
47 | +inumFile = 0 | ||
48 | +if result: | ||
49 | + inumFile = int(result.group(1)) | ||
50 | + numFile = str(inumFile) | ||
51 | + print("Numfile: {}".format(numFile)) | ||
52 | +else: | ||
53 | + print("WARNING: numfile not found in filename") | ||
54 | + | ||
55 | +ATEREG1 = [] | ||
56 | +PTEREG1GENE = [] | ||
57 | +PTEREG1TU = [] | ||
58 | +listEffects = [] | ||
59 | + | ||
60 | +if numFile in hashDicc: | ||
61 | + hashTemp = hashDicc[numFile] | ||
62 | + # print("hashDicc[numFile]: {}".format(hashTemp)) | ||
63 | + for k, v in hashTemp.items(): | ||
64 | + if v == "TF": | ||
65 | + # print("Verifiying TF") | ||
66 | + if k not in ATEREG1: | ||
67 | + # print(" TF {}".format(k)) | ||
68 | + ATEREG1.append(k) | ||
69 | + elif v == "GENE": | ||
70 | + if k not in PTEREG1GENE: | ||
71 | + PTEREG1GENE.append(k) | ||
72 | + elif v == "TU": | ||
73 | + if k not in PTEREG1TU: | ||
74 | + PTEREG1TU.append(k) | ||
75 | + elif v == "EFFECT": | ||
76 | + if k not in listEffects: | ||
77 | + listEffects.append(k) | ||
78 | + else: | ||
79 | + print("WARNING: entity not found in dictionaries") | ||
80 | +else: | ||
81 | + print("WARNING: numfile not found in dictionaries") | ||
82 | + | ||
83 | +# STRIP THE EXTENSION FROM THE FILE NAME | ||
84 | +# Original Daniel: split_line = sys.argv[2] | ||
85 | +output_path = sys.argv[3] | ||
86 | +# Original Daniel: split_line = split_line[:-4] | ||
87 | +# Original Daniel: file_name = split_line + ".a2" | ||
88 | +input_file_name = sys.argv[2] | ||
89 | +# Original Daniel: open( file_name , 'w').close() | ||
90 | +file_name_entities_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a1") | ||
91 | +file_name_interactions_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a2") | ||
92 | +file_name_entities_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a1") | ||
93 | +file_name_interactions_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a2") | ||
94 | + | ||
95 | +file_name_text_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".txt") | ||
96 | +file_name_text_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".txt") | ||
97 | + | ||
98 | +open(file_name_entities_complete, 'w').close() | ||
99 | +open(file_name_interactions_complete, 'w').close() | ||
100 | +# Original Daniel: open( file_name , 'w').close() | ||
101 | +open(file_name_entities_incomplete, 'w').close() | ||
102 | +open(file_name_interactions_incomplete, 'w').close() | ||
103 | + | ||
104 | +# declare variables | ||
105 | +# Original Daniel: impresion = [] | ||
106 | +impresionEntities = [] | ||
107 | +impresionInteractionsComplete = [] | ||
108 | +impresionInteractionsIncomplete = [] | ||
109 | +salida_a2 = [] | ||
110 | +salida_a2_trimmed = [] | ||
111 | +salida_a2_str = [] | ||
112 | +q2line = () | ||
113 | +listadeRIs = [] | ||
114 | +posiblesminimos = [[], []] | ||
115 | +posiblesmaximos = [[], []] | ||
116 | +listasecundaria = [] | ||
117 | +listasecundaria_trimmed = [] | ||
121 | + | ||
122 | +# Effects | ||
123 | +for i in range(len(listEffects)): | ||
124 | + if listEffects[i] in dato: | ||
125 | +        for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at|for)\b)', dato):  # "of", "for" or "at" to the right of EFF | ||
126 | + # Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at)\b)', dato, | ||
127 | + # Original Daniel: overlapped=True): # "of" o "at" a la derecha de EFF | ||
128 | + spantup = match.span(1) | ||
129 | + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1)) | ||
130 | + if match.group(1).lower() in hashNormalizedEffects: | ||
131 | + effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()] | ||
132 | + else: | ||
133 | + effect = "EFFECT." + "deverbal_effect" | ||
134 | + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1)) | ||
135 | +            a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))  # end offset stored inclusive (hence the -1) | ||
136 | + #print("Append effect a2line: {}".format(a2line)) | ||
137 | + salida_a2.append(a2line) | ||
138 | +        for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato):  # "by" to the right of EFF | ||
139 | + # Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato, | ||
140 | + # Original Daniel: overlapped=True): # "by" a la derecha de EFF | ||
141 | + spantup = match.span(1) | ||
142 | + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1)) | ||
143 | + if match.group(1).lower() in hashNormalizedEffects: | ||
144 | + effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()] | ||
145 | + else: | ||
146 | + effect = "EFFECT." + "deverbal_effect" | ||
147 | + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1)) | ||
148 | + a2line = (effect, spantup[0], spantup[1] - 1, match.group(1)) | ||
149 | + salida_a2.append(a2line) | ||
150 | + #print("Append effect a2line: {}".format(a2line)) | ||
151 | +        for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato):  # "is the" plus 0-1 words to the left of EFF | ||
152 | + # Original Daniel: for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato, | ||
153 | + # Original Daniel: overlapped=True): # "is the" 0-1 palabras a la izquierda de EFF | ||
154 | + spantup = match.span(3) | ||
155 | + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(3)) | ||
156 | +            if match.group(3).lower() in hashNormalizedEffects:  # group(3) is the effect word; group(1) is the "is the ..." prefix | ||
157 | +                effect = "EFFECT." + hashNormalizedEffects[match.group(3).lower()] | ||
158 | + else: | ||
159 | + effect = "EFFECT." + "deverbal_effect" | ||
160 | + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(3)) | ||
161 | + a2line = (effect, spantup[0], spantup[1] - 1, match.group(3)) | ||
162 | + salida_a2.append(a2line) | ||
163 | + #print("Append effect a2line: {}".format(a2line)) | ||
164 | +#print("Efectos salida_a2: {}".format(salida_a2)) | ||
165 | + | ||
166 | +# PTEREG1GENE regulated (patient) GENE entities | ||
167 | +for i in range(len(PTEREG1GENE)): | ||
168 | + if PTEREG1GENE[i] in dato: | ||
169 | + # print(PTEREG1GENE[i]) | ||
170 | +        for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato):  # "of", "for" or "at" plus 0-1 words to the left of the regulated entity | ||
171 | + # Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato, | ||
172 | + # Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado | ||
173 | + spantup = match.span(3) | ||
174 | + # print("match {} spantup {}".format(match.group(3), match.span(3))) | ||
175 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3)) | ||
176 | + a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(3)) | ||
177 | + salida_a2.append(a2line) | ||
178 | + # print("Append regulados a2line: {}".format(a2line)) | ||
179 | +        for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato):  # regulated entities without a pattern | ||
180 | + # Original Daniel: for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato, overlapped=True): # regulados sin patron | ||
181 | + spantup = match.span(1) | ||
182 | + # print("match {} spantup {}".format(match.group(1), match.span(1))) | ||
183 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1)) | ||
184 | + a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(1)) | ||
185 | + listasecundaria.append(a2line) | ||
186 | +#print("Efectos regulados gene listasecundaria: {}".format(listasecundaria)) | ||
187 | + | ||
188 | +# CMC: ADDED TO SEPARATE REGULATED GENE AND TU | ||
189 | +# PTEREG1TU regulated (patient) TU entities | ||
190 | +for i in range(len(PTEREG1TU)): | ||
191 | + if PTEREG1TU[i] in dato: | ||
192 | + # print(PTEREG1TU[i]) | ||
193 | +        for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato):  # "of", "for" or "at" plus 0-1 words to the left of the regulated entity | ||
194 | + # Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato, | ||
195 | + # Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado | ||
196 | + spantup = match.span(3) | ||
197 | + # print("match: " + match.group(3)) | ||
198 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3)) | ||
199 | + a2line = ('TU', spantup[0], spantup[1] - 1, match.group(3)) | ||
200 | + salida_a2.append(a2line) | ||
201 | + # print("Append regulados a2line: {}".format(a2line)) | ||
202 | +        for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato):  # regulated entities without a pattern | ||
203 | + # for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato, overlapped=True): # regulados sin patron | ||
204 | + spantup = match.span(1) | ||
205 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1)) | ||
206 | + a2line = ('TU', spantup[0], spantup[1] - 1, match.group(1)) | ||
207 | + listasecundaria.append(a2line) | ||
208 | +#print("Efectos regulados tu listasecundaria: {}".format(listasecundaria)) | ||
209 | + | ||
210 | +# ATEREG1 regulator (agent) entities | ||
211 | +for i in range(len(ATEREG1)): | ||
212 | + if ATEREG1[i] in dato: | ||
213 | + # print(ATEREG1[i]) | ||
214 | +        for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato):  # "by" plus 0-1 words before the regulator | ||
215 | + # Original Daniel: for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato, | ||
216 | + # Original Daniel: overlapped=True): # "by" 0-1 palabras a la izq de regulado | ||
217 | + spantup = match.span(2) | ||
218 | + # print("match: " + match.group(2)) | ||
219 | + # print("match {} spantup {}".format(match.group(2), match.span(2))) | ||
220 | + # Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(2)) | ||
221 | + a2line = ('TF', spantup[0], spantup[1] - 1, match.group(2)) | ||
222 | + salida_a2.append(a2line) | ||
223 | + #print("Append regulator a2line: {}".format(a2line)) | ||
224 | +        for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato):  # regulators without a pattern | ||
225 | + # for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato, overlapped=True): # reguladores sin patron | ||
226 | + spantup = match.span(1) | ||
227 | + # print("match {} spantup {}".format(match.group(1), match.span(1))) | ||
228 | + # Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(1)) | ||
229 | + a2line = ('TF', spantup[0], spantup[1] - 1, match.group(1)) | ||
230 | + listasecundaria.append(a2line) | ||
231 | + #print("Append regulator a2line: {}".format(a2line)) | ||
232 | +#print("Reguladores agentes salida_a2: {}".format(salida_a2)) | ||
233 | +#print("Reguladores agentes listasecundaria: {}".format(listasecundaria)) | ||
234 | + | ||
235 | +# Remove duplicate tags and tags contained inside others | ||
236 | +if salida_a2: | ||
237 | + salida_a2.sort(key=lambda tup: tup[1]) | ||
238 | + salida_a2_trimmed.append(salida_a2[0]) | ||
239 | + for i in range(len(salida_a2)): | ||
240 | + copiar = True | ||
241 | + for j in range(len(salida_a2_trimmed)): | ||
242 | + if ((salida_a2[i][1] >= salida_a2_trimmed[j][1]) and (salida_a2[i][2] <= salida_a2_trimmed[j][2])): | ||
243 | + copiar = False | ||
244 | + if copiar: | ||
245 | + salida_a2_trimmed.append(salida_a2[i]) | ||
246 | +if listasecundaria: | ||
247 | + listasecundaria.sort(key=lambda tup: tup[1]) | ||
248 | + listasecundaria_trimmed.append(listasecundaria[0]) | ||
249 | + for i in range(len(listasecundaria)): | ||
250 | + copiar = True | ||
251 | + for j in range(len(listasecundaria_trimmed)): | ||
252 | + if ((listasecundaria[i][1] >= listasecundaria_trimmed[j][1]) and ( | ||
253 | + listasecundaria[i][2] <= listasecundaria_trimmed[j][2])): | ||
254 | + copiar = False | ||
255 | + if copiar: | ||
256 | + listasecundaria_trimmed.append(listasecundaria[i]) | ||
257 | +# print("Sin repeticiones salida_a2_trimmed: {}".format(salida_a2_trimmed)) | ||
258 | +#print("Sin repeticiones listasecundaria_trimmed: {}".format(listasecundaria_trimmed)) | ||
259 | + | ||
260 | +# Assign identifiers (TX) to entities (effect, regulator, regulated) | ||
261 | +lastID = 0 | ||
262 | +for i in range(len(salida_a2_trimmed)): | ||
263 | + # if sys.argv[2].find('355') > -1: | ||
264 | + # print("i : {}".format(i)) | ||
265 | + salida_a2_trimmed[i] = list(salida_a2_trimmed[i]) | ||
266 | + ID = "T" + str(i + 1) | ||
267 | + salida_a2_trimmed[i].insert(0, ID) | ||
268 | + lastID = i + 1 | ||
269 | + # if sys.argv[2].find('355') > -1: | ||
270 | + # print("lastID : {}".format(lastID)) | ||
271 | + | ||
272 | +for i in range(len(listasecundaria_trimmed)): | ||
273 | + # if sys.argv[2].find('355') > -1: | ||
274 | + # print("i : {}".format(i)) | ||
275 | + # print("lastID : {}".format(lastID)) | ||
276 | + listasecundaria_trimmed[i] = list(listasecundaria_trimmed[i]) | ||
277 | + ID = "T" + str(i + 1 + lastID) | ||
278 | + listasecundaria_trimmed[i].insert(0, ID) | ||
279 | + | ||
280 | +# print("Con identificadores salida_a2_trimmed: {}".format(salida_a2_trimmed)) | ||
281 | +#print("Con identificadores listasecundaria_trimmed: {}".format(listasecundaria_trimmed)) | ||
282 | + | ||
283 | +#print("salida_a2_trimmed") ######################### | ||
284 | +#print(salida_a2_trimmed) ######################### | ||
285 | +#print("listasecundaria_trimmed") | ||
286 | +#print(listasecundaria_trimmed) | ||
287 | + | ||
288 | +# Build regulatory interactions | ||
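289 | +# For each EFFECT entity: link the nearest pattern-matched GENE/TU (searching to | ||
290 | +# the right first, then to the left) as the Target, and the nearest pattern-matched | ||
291 | +# TF (right first, then left) as the Agent. | ||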
289 | +i = 0 | ||
290 | +while i < int(len(salida_a2_trimmed)): | ||
291 | + if "EFFECT" in salida_a2_trimmed[i][1]: | ||
292 | +        # SEARCH FOR THE REGULATED ENTITY TO THE RIGHT | ||
293 | +        nuevaRI = [salida_a2_trimmed[i][0], "", ""]  # effect, theme, cause | ||
294 | + ref = "" | ||
295 | + posiblesminimos = [[], []] | ||
296 | + j = 0 | ||
297 | + while j < int(len(salida_a2_trimmed)): | ||
298 | + # Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
299 | + if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
300 | + posiblesminimos[0].append(salida_a2_trimmed[j][2]) | ||
301 | + posiblesminimos[1].append(salida_a2_trimmed[j][0]) | ||
302 | + j = j + 1 | ||
303 | + if posiblesminimos[0]: | ||
304 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
305 | + ref = posiblesminimos[1][refpointer] | ||
306 | +        # if none found, SEARCH FOR THE REGULATED ENTITY TO THE LEFT | ||
307 | + if not ref: | ||
308 | + posiblesmaximos = [[], []] | ||
309 | + j = 0 | ||
310 | + while j < int(len(salida_a2_trimmed)): | ||
311 | + # Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
312 | + if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
313 | + posiblesmaximos[0].append(salida_a2_trimmed[j][3]) | ||
314 | + posiblesmaximos[1].append(salida_a2_trimmed[j][0]) | ||
315 | + j = j + 1 | ||
316 | + if posiblesmaximos[0]: | ||
317 | + refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0])) | ||
318 | + ref = posiblesmaximos[1][refpointer] | ||
319 | + nuevaRI[1] = ref | ||
320 | +        # SEARCH FOR THE REGULATOR TO THE RIGHT | ||
321 | + ref = "" | ||
322 | + posiblesminimos = [[], []] | ||
323 | + j = 0 | ||
324 | + while j < int(len(salida_a2_trimmed)): | ||
325 | + # Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
326 | + if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
327 | + posiblesminimos[0].append(salida_a2_trimmed[j][2]) | ||
328 | + posiblesminimos[1].append(salida_a2_trimmed[j][0]) | ||
329 | + j = j + 1 | ||
330 | + if posiblesminimos[0]: | ||
331 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
332 | + ref = posiblesminimos[1][refpointer] | ||
333 | +        # if none found, SEARCH FOR THE REGULATOR TO THE LEFT | ||
334 | + if not ref: | ||
335 | + posiblesmaximos = [[], []] | ||
336 | + j = 0 | ||
337 | + while j < int(len(salida_a2_trimmed)): | ||
338 | + # Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
339 | + if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
340 | + posiblesmaximos[0].append(salida_a2_trimmed[j][3]) | ||
341 | + posiblesmaximos[1].append(salida_a2_trimmed[j][0]) | ||
342 | + j = j + 1 | ||
343 | + if posiblesmaximos[0]: | ||
344 | + refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0])) | ||
345 | + ref = posiblesmaximos[1][refpointer] | ||
346 | + nuevaRI[2] = ref | ||
347 | + listadeRIs.append(nuevaRI) | ||
348 | + i = i + 1 | ||
349 | + | ||
350 | +# SECOND PHASE: SEARCH FOR MISSING REGULATORS AND REGULATED ENTITIES IN THE SECONDARY LIST | ||
351 | +i = 0 | ||
352 | +while i < int(len(listadeRIs)): | ||
353 | +    if not listadeRIs[i][1]:  # no regulated entity yet | ||
354 | + ref = "" | ||
355 | + posiblesminimos = [[], []] | ||
356 | +        # SEARCH FOR THE REGULATED ENTITY TO THE RIGHT | ||
357 | + j = 0 | ||
358 | + while j < int(len(listasecundaria_trimmed)): | ||
359 | + for k in range(len(salida_a2_trimmed)): | ||
360 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
361 | + ind = k | ||
362 | + # Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
363 | + if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
364 | + posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3])) | ||
365 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
366 | + j = j + 1 | ||
367 | +        # SEARCH FOR THE REGULATED ENTITY TO THE LEFT | ||
368 | + j = 0 | ||
369 | + while j < int(len(listasecundaria_trimmed)): | ||
370 | + for k in range(len(salida_a2_trimmed)): | ||
371 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
372 | + ind = k | ||
373 | + # Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
374 | + if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
375 | + posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3])) | ||
376 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
377 | + j = j + 1 | ||
378 | +        # CHOOSE THE CLOSEST REGULATED ENTITY | ||
379 | + if posiblesminimos[0]: | ||
380 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
381 | + ref = posiblesminimos[1][refpointer] | ||
382 | + # print(ref) | ||
383 | + listadeRIs[i][1] = ref | ||
384 | +    if not listadeRIs[i][2]:  # no regulator yet | ||
385 | + ref = "" | ||
386 | + posiblesminimos = [[], []] | ||
387 | +        # SEARCH FOR THE REGULATOR TO THE RIGHT | ||
388 | + j = 0 | ||
389 | + while j < int(len(listasecundaria_trimmed)): | ||
390 | + for k in range(len(salida_a2_trimmed)): | ||
391 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
392 | + ind = k | ||
393 | + # Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
394 | + if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
395 | + posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3])) | ||
396 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
397 | + j = j + 1 | ||
398 | +        # SEARCH FOR THE REGULATOR TO THE LEFT | ||
399 | + j = 0 | ||
400 | + while j < int(len(listasecundaria_trimmed)): | ||
401 | + for k in range(len(salida_a2_trimmed)): | ||
402 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
403 | + ind = k | ||
404 | + # Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
405 | + if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
406 | + posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3])) | ||
407 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
408 | + j = j + 1 | ||
409 | +        # CHOOSE THE CLOSEST REGULATOR | ||
410 | + if posiblesminimos[0]: | ||
411 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
412 | + ref = posiblesminimos[1][refpointer] | ||
413 | + # print(ref) | ||
414 | + listadeRIs[i][2] = ref | ||
415 | + i = i + 1 | ||
416 | +#print("ListadeRIs: {}".format(listadeRIs)) | ||
417 | + | ||
418 | +# Pick the regulators and regulated entities from the secondary list to be printed | ||
419 | +setmem = [] | ||
420 | +k = 0 | ||
421 | +while k < int(len(listadeRIs)): | ||
422 | + j = 0 | ||
423 | + copysec = False | ||
424 | + #while j < int(len(listasecundaria_trimmed)): | ||
425 | + while j < len(listasecundaria_trimmed): | ||
426 | + # print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs)) | ||
427 | + # Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]: | ||
428 | + if listasecundaria_trimmed[j][0] == listadeRIs[k][2]: | ||
429 | + # print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][2] {}".format(listasecundaria_trimmed[j][0], | ||
430 | + # listadeRIs[k][2])) | ||
431 | + copysec = True | ||
432 | + # print("j: {}".format(j)) | ||
433 | + indj = j | ||
434 | + j = j + 1 | ||
435 | + if copysec: | ||
436 | + setmem.append(listasecundaria_trimmed[indj]) | ||
437 | + # print("setmen: {}".format(setmem)) | ||
438 | + | ||
439 | +    #### CMC: I ADDED THIS CODE TO SEARCH FOR REGULATED ENTITIES, SINCE THE CODE ABOVE SEARCHES FOR REGULATORS | ||
440 | + j = 0 | ||
441 | + copysec = False | ||
442 | + #while j < int(len(listasecundaria_trimmed)): | ||
443 | + while j < len(listasecundaria_trimmed): | ||
444 | + # print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs)) | ||
445 | + # Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]: | ||
446 | + if listasecundaria_trimmed[j][0] == listadeRIs[k][1]: | ||
447 | + # print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][1] {}".format(listasecundaria_trimmed[j][0], | ||
448 | + # listadeRIs[k][1])) | ||
449 | + copysec = True | ||
450 | + # print("j: {}".format(j)) | ||
451 | + indj = j | ||
452 | + j = j + 1 | ||
453 | + if copysec: | ||
454 | + setmem.append(listasecundaria_trimmed[indj]) | ||
455 | + # print("setmen: {}".format(setmem)) | ||
456 | + | ||
457 | + k = k + 1 | ||
458 | +setmem = sorted(setmem) | ||
459 | +# print("setmen: {}".format(setmem)) | ||
460 | +dedup = [setmem[i] for i in range(len(setmem)) if i == 0 or setmem[i] != setmem[i - 1]] | ||
461 | +# print("dedup: {}".format(dedup)) | ||
462 | +salida_a2_trimmed.extend(dedup) | ||
463 | +#print("salida_a2_trimmed after listasecundaria_trimmed: {}".format(salida_a2_trimmed)) | ||
464 | + | ||
465 | +# Assign identifiers (RX) to events (RIs) | ||
466 | +for i in range(len(listadeRIs)): | ||
467 | + # Original Daniel: ID = "E" + str(i+1) | ||
468 | + ID = "R" + str(i + 1) | ||
469 | + listadeRIs[i].insert(0, ID) | ||
470 | +#print("Con identificadores ListadeRIs: {}".format(listadeRIs)) | ||
471 | + | ||
472 | +# CREATE THE LIST OF EVENTS (RX) AND ENTITIES (TX) IN PRINT FORMAT | ||
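473 | +# e.g. an entity line "T1 EFFECT.activator 10 19 activation" and an event line | ||
474 | +# "R1 Interaction.T1 Target:T2 Agent:T3" (hypothetical IDs and offsets) | ||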
473 | +for i in range(len(salida_a2_trimmed)): | ||
474 | + linea = str(salida_a2_trimmed[i][0]) + ' ' + str(salida_a2_trimmed[i][1]) + ' ' + str( | ||
475 | + salida_a2_trimmed[i][2]) + ' ' + str(salida_a2_trimmed[i][3]) + ' ' + str(salida_a2_trimmed[i][4]) | ||
476 | + # Original Daniel: impresion.append(linea) | ||
477 | + impresionEntities.append(linea) | ||
478 | + | ||
479 | +for i in range(len(listadeRIs)): | ||
480 | + if listadeRIs[i][2] and listadeRIs[i][3]: | ||
481 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) + ' ' + 'Cause:' + str(listadeRIs[i][3]) | ||
482 | + linea = str(listadeRIs[i][0]) + ' ' + "Interaction." + str(listadeRIs[i][1]) + ' ' + 'Target:' + str( | ||
483 | + listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3]) | ||
484 | + # Original Daniel: elif listadeRIs[i][2]: | ||
485 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) | ||
486 | + # Original Daniel: elif listadeRIs[i][3]: | ||
487 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Cause:' + str(listadeRIs[i][3]) | ||
488 | + # Original Daniel: else: | ||
489 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) | ||
490 | + # Original Daniel: impresion.append(linea) | ||
491 | + impresionInteractionsComplete.append(linea) | ||
492 | + #print("Interaction complete: {}".format(linea)) | ||
493 | + linea = str(listadeRIs[i][0]) + ' ' + "Interaction.regulator" + ' ' + 'Target:' + str( | ||
494 | + listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3]) | ||
495 | + impresionInteractionsIncomplete.append(linea) | ||
496 | + | ||
497 | +#print("Entities: {}".format(impresionEntities)) | ||
498 | + | ||
499 | +# Write entities of complete interactions to .a1 | ||
500 | +# Original Daniel: save_file = open( file_name, "a" ) | ||
501 | +with open(file_name_entities_complete, "a") as save_file: | ||
502 | +    for line in impresionEntities: | ||
503 | +        save_file.write(line) | ||
504 | +        save_file.write("\n") | ||
506 | + | ||
507 | +# Write entities of incomplete interactions to .a1 | ||
508 | +with open(file_name_entities_incomplete, "a") as save_file: | ||
509 | +    for line in impresionEntities: | ||
510 | +        save_file.write(line) | ||
511 | +        save_file.write("\n") | ||
514 | + | ||
515 | +# Write complete interactions (regulator, effect, regulated) | ||
516 | +# print("InteractionsComplete: {}".format(impresionInteractionsComplete)) | ||
517 | +with open(file_name_interactions_complete, "a") as save_file: | ||
518 | +    for line in impresionInteractionsComplete: | ||
519 | +        save_file.write(line) | ||
520 | +        save_file.write("\n") | ||
523 | + | ||
524 | +# Write incomplete interactions (regulator, "regulator", regulated) | ||
525 | +# print("InteractionsIncomplete: {}".format(impresionInteractionsIncomplete)) | ||
526 | +with open(file_name_interactions_incomplete, "a") as save_file: | ||
527 | +    for line in impresionInteractionsIncomplete: | ||
528 | +        save_file.write(line) | ||
529 | +        save_file.write("\n") | ||
532 | + | ||
533 | +with open(file_name_text_complete, mode="w") as txtFile: | ||
534 | + txtFile.write(dato) | ||
535 | +with open(file_name_text_incomplete, mode="w") as txtFile: | ||
536 | + txtFile.write(dato) |
filtered-sentences/.gitignore
0 → 100644
1 | + |
format/regex.py
0 → 100644
1 | +import fileinput | ||
2 | +import re | ||
3 | +import sys | ||
4 | + | ||
5 | +if ( len( sys.argv ) < 3 ): | ||
6 | + sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <output_file> \n" ) | ||
7 | + sys.stderr.flush(); | ||
8 | + | ||
9 | + exit( 2 ); | ||
10 | +else: | ||
11 | + print("Ok.") | ||
12 | + | ||
13 | +#READ INPUT FILE | ||
14 | +text_file = open( sys.argv[1], "r" ) | ||
15 | +dato = text_file.read().splitlines() | ||
16 | +text_file.close() | ||
17 | + | ||
18 | + | ||
19 | +#STRIP THE EXTENSION FROM THE FILE NAME | ||
20 | +split_line = sys.argv[2] | ||
21 | +split_line = split_line[:-4] | ||
22 | +file_name="" | ||
23 | +file_name = split_line + ".san" | ||
24 | +open( file_name , 'w').close() | ||
25 | + | ||
26 | +#APPLY THE CLEANUP REGEXES AND WRITE THE RESULT TO ARGV 2 | ||
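27 | +# e.g. a hypothetical input line "CRP activates acrAB ( Fig . 2 ) under stress ." | ||
28 | +# comes out as "CRP activates acrAB under stress ." | ||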
27 | +for line in dato: | ||
28 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NNNNa_) | ||
29 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NNNNa_] | ||
30 | +    line = re.sub('[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NN,NN,NN_) | ||
31 | +    line = re.sub('[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NN,NN,NN_] | ||
32 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num_) | ||
33 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num.num_) | ||
34 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num-num_) | ||
35 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num_] | ||
36 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num.num_] | ||
37 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num-num_] | ||
38 | +    line = re.sub('[\(]\s[a-zA-Z]{1}\s[\)]', '', line.rstrip()) #removes (_alpha_) | ||
39 | +    line = re.sub('[\[]\s[a-zA-Z]{1}\s[\]]', '', line.rstrip()) #removes [_alpha_] | ||
40 | +    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman_) | ||
41 | +    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman-Roman_) | ||
42 | +    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman_) | ||
43 | +    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman-roman_) | ||
44 | +    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman_] | ||
45 | +    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman-Roman_] | ||
46 | +    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman_] | ||
47 | +    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman-roman_] | ||
48 | +    line = re.sub('[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.|table)\s[^\(|^\)]*[\)]', '', line.rstrip(), flags=re.I) #removes citation parentheticals (_fig./figure/see/i.e./e.g./tab./table_) | ||
49 | +    line = re.sub('  ', ' ', line.rstrip()) #collapses the double spaces left by the removals | ||
50 | + #print(line) | ||
51 | + | ||
52 | + | ||
53 | + save_file = open( file_name, "a" ) | ||
54 | + save_file.write(line) | ||
55 | + save_file.write("\n") | ||
56 | + save_file.close() |
format/sanitized_sentences/.gitignore
0 → 100644
1 | + |
format/split_sentences/.gitignore
0 → 100644
1 | + |
get-TRN-Organism-v1.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +import operator | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +import json | ||
7 | +import re | ||
8 | +import pandas as pd | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | + | ||
13 | +# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to the TRN table | ||
14 | + | ||
15 | +# Parameters: | ||
16 | +# 1) --trnPath Path to TRN detail table | ||
17 | +# 2) --trnFile File of TRN detail table | ||
18 | +# 3) --outputPath Output path | ||
19 | +# 4) --organismPath Path to Organism annotation table | ||
20 | +# 5) --organismFile File of Organism annotation table | ||
21 | + | ||
22 | +# Output: | ||
23 | +# 1) Tsv file detail with: | ||
24 | +# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence | ||
25 | +# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel | ||
26 | +# OrganismScore = { | ||
27 | +# If only salmonella or only non identified organism = 1, | ||
28 | +# If (startswith salmonella or non identified organism) and other organisms = 0.5 | ||
29 | +# If only other organisms = 0 | ||
30 | +# } | ||
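31 | +#   e.g. ['Salmonella enterica'] -> 1, ['Salmonella enterica', 'Escherichia coli'] -> 0.5, | ||
32 | +#   ['Escherichia coli'] -> 0 (hypothetical organism lists) | ||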
31 | + | ||
32 | +# Execution: | ||
33 | +# python3.4 get-TRN-Organism-v1.py | ||
34 | + | ||
35 | +# Local | ||
36 | +# python get-TRN-Organism-v1.py | ||
37 | +# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" | ||
38 | +# --trnFile STMTRN_all.detail.tsv | ||
39 | +# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" | ||
40 | +# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results | ||
41 | +# --organismFile annotations_STMTRN_all.sentences.csv | ||
42 | +# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv | ||
43 | + | ||
44 | +########################################################### | ||
45 | +# MAIN PROGRAM # | ||
46 | +########################################################### | ||
47 | + | ||
48 | +def only_salmonella_or_non_identified_organism(list_temp): | ||
49 | + non_identified_organisms = [ | ||
50 | + 'unidentified plasmid', | ||
51 | + 'unidentified', | ||
52 | + 'bacterium', | ||
53 | + 'bacterium IFAM-3211', | ||
54 | + 'bacterium IFAM-2074', | ||
55 | + 'bacterium IFAM-1493', | ||
56 | + 'bacterium IFAM-3215', | ||
57 | + 'bacterium IFAM-3359', | ||
58 | + 'hybrid', | ||
59 | + 'Vector pMC1403', | ||
60 | + 'Transposon Tn10', | ||
61 | + 'unidentified cloning vector', | ||
62 | + 'Plasmid F', | ||
63 | + 'Cloning vector pUC19' | ||
64 | + ] | ||
65 | + matches = 0 | ||
66 | + for o in list_temp: | ||
67 | + if o.lower().startswith("salmonella") or o in non_identified_organisms: | ||
68 | + matches += 1 | ||
69 | +    return matches == len(list_temp)  # every organism is Salmonella or non-identified | ||
73 | + | ||
74 | +def salmonella_or_non_identified_and_other_organisms(list_temp): | ||
75 | + non_identified_organisms = [ | ||
76 | + 'unidentified plasmid', | ||
77 | + 'unidentified', | ||
78 | + 'bacterium', | ||
79 | + 'bacterium IFAM-3211', | ||
80 | + 'bacterium IFAM-2074', | ||
81 | + 'bacterium IFAM-1493', | ||
82 | + 'bacterium IFAM-3215', | ||
83 | + 'bacterium IFAM-3359', | ||
84 | + 'hybrid', | ||
85 | + 'Vector pMC1403', | ||
86 | + 'Transposon Tn10', | ||
87 | + 'unidentified cloning vector', | ||
88 | + 'Plasmid F', | ||
89 | + 'Cloning vector pUC19' | ||
90 | + ] | ||
91 | + matches = 0 | ||
92 | + for o in list_temp: | ||
93 | + if o.lower().startswith("salmonella") or o in non_identified_organisms: | ||
94 | + matches += 1 | ||
95 | +    return 0 < matches < len(list_temp)  # a mix of Salmonella/non-identified and other organisms | ||
99 | + | ||
100 | +def only_other_organisms(list_temp): | ||
101 | + non_identified_organisms = [ | ||
102 | + 'unidentified plasmid', | ||
103 | + 'unidentified', | ||
104 | + 'bacterium', | ||
105 | + 'bacterium IFAM-3211', | ||
106 | + 'bacterium IFAM-2074', | ||
107 | + 'bacterium IFAM-1493', | ||
108 | + 'bacterium IFAM-3215', | ||
109 | + 'bacterium IFAM-3359', | ||
110 | + 'hybrid', | ||
111 | + 'Vector pMC1403', | ||
112 | + 'Transposon Tn10', | ||
113 | + 'unidentified cloning vector', | ||
114 | + 'Plasmid F', | ||
115 | + 'Cloning vector pUC19' | ||
116 | + ] | ||
117 | + matches = 0 | ||
118 | + for o in list_temp: | ||
119 | + if o.lower().startswith("salmonella") or o in non_identified_organisms: | ||
120 | + matches += 1 | ||
121 | +    return matches == 0  # only other organisms | ||
125 | + | ||
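126 | +# The three predicates above differ only in how the Salmonella/non-identified | ||
127 | +# match count compares with len(list_temp); a minimal consolidation sketch (not | ||
128 | +# used below, assuming non_identified_organisms is hoisted to module level): | ||
129 | +#   def count_matches(orgs): | ||
130 | +#       return sum(1 for o in orgs | ||
131 | +#                  if o.lower().startswith("salmonella") or o in non_identified_organisms) | ||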
126 | +if __name__ == "__main__": | ||
127 | + # Parameter definition | ||
128 | + parser = OptionParser() | ||
129 | + parser.add_option("--trnPath", dest="trnPath", | ||
130 | + help="Path to TRN detail table", metavar="PATH") | ||
131 | + parser.add_option("--trnFile", dest="trnFile", | ||
132 | + help="File of TRN detail table", metavar="FILE") | ||
133 | + parser.add_option("--outputPath", dest="outputPath", | ||
134 | + help="Output path", metavar="PATH") | ||
135 | + parser.add_option("--organismPath", dest="organismPath", | ||
136 | + help="Path to organism annotation table", metavar="PATH") | ||
137 | + parser.add_option("--organismFile", dest="organismFile", | ||
138 | + help="File of organism annotation table", metavar="FILE") | ||
139 | + | ||
140 | + (options, args) = parser.parse_args() | ||
141 | + if len(args) > 0: | ||
142 | +        parser.error("Unexpected positional arguments.") | ||
143 | + sys.exit(1) | ||
144 | + | ||
145 | + # Printing parameter values | ||
146 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
147 | + print("Path to TRN detail table: " + str(options.trnPath)) | ||
148 | + print("File of TRN detail table: " + str(options.trnFile)) | ||
149 | + print("Output path: " + str(options.outputPath)) | ||
150 | + print("Path to organism annotation table: " + str(options.organismPath)) | ||
151 | + print("File of organism annotation table: " + str(options.organismFile)) | ||
152 | + | ||
153 | + # Load organism annotation table | ||
154 | + print("Loading organism annotation table") | ||
155 | + df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',') | ||
156 | + print("Total de frases anotadas con organism: {}".format(df_organisms.shape[0])) | ||
157 | + | ||
158 | + # Load TRN detail table | ||
159 | + print("Loading TRN detail table") | ||
160 | + df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t') | ||
161 | + print("Total de frases en TRN: {}".format(df_detail.shape[0])) | ||
162 | + | ||
163 | +    # Fix column name: an older get-TRN output wrote "Organism"; this was corrected in get-TRN-v2.py | ||
164 | + df_detail = df_detail.rename(columns={"Organism": "Organisms"}) | ||
165 | + df_detail['OrganismScore'] = 1.00 | ||
166 | + print(df_detail.columns) | ||
167 | + #print(df_detail['Sentence'].head(15)) | ||
168 | + | ||
169 | + for idx in df_organisms.index: | ||
170 | + organisms = df_organisms['Organisms'][idx] | ||
171 | + SentenceNumberInFile = df_organisms['SentenceNumberInFile'][idx] | ||
172 | +        SentenceNumberInFile = SentenceNumberInFile - 2  # align sentence number with the 0-based detail-table row index | ||
173 | + # print("Organisms before: {}".format(df_detail.Organisms[SentenceNumberInFile])) | ||
174 | +        df_detail.loc[SentenceNumberInFile, 'Organisms'] = organisms | ||
175 | + # print("Organisms assigned: {}".format(df_detail.Organisms[SentenceNumberInFile])) | ||
176 | + | ||
177 | +        # OrganismScore = { | ||
178 | +        #   1.0 if only Salmonella and/or non-identified organisms are mentioned, | ||
179 | +        #   0.5 if Salmonella or a non-identified organism co-occurs with other organisms, | ||
180 | +        #   0.0 if only other organisms are mentioned | ||
181 | +        # } | ||
182 | + list_organisms = organisms.split(';') | ||
183 | + # print(" OrganismScore before: {}".format(df_detail.OrganismScore[SentenceNumberInFile])) | ||
184 | +        if only_salmonella_or_non_identified_organism(list_organisms): | ||
185 | +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 1.00 | ||
186 | +        elif salmonella_or_non_identified_and_other_organisms(list_organisms): | ||
187 | +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.50 | ||
188 | +        elif only_other_organisms(list_organisms): | ||
189 | +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.00 | ||
190 | + # print(" OrganismScore assigned: {}".format(df_detail.OrganismScore[SentenceNumberInFile])) | ||
191 | + | ||
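To make the three-way rule concrete, a few hypothetical mention lists and the branch they would take (assuming the first predicate, defined earlier in this file, returns True when every mention matches):

```python
# Invented organism-mention lists exercising the scoring predicates above.
assert only_salmonella_or_non_identified_organism(
    ["Salmonella enterica", "unidentified plasmid"])            # score 1.00
assert salmonella_or_non_identified_and_other_organisms(
    ["Salmonella enterica", "Escherichia coli"])                # score 0.50
assert only_other_organisms(["Escherichia coli"])               # score 0.00
```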
192 | + hashPredictedRIs = {} | ||
193 | + hashPredictedRIsCount = {} | ||
194 | + hashPredictedRIsCountVer = {} | ||
195 | + hashPredictedRIsCountDev = {} | ||
196 | + hashPredictedRIsCountAtt = {} | ||
197 | + hashPredictedRIsCountAuto = {} | ||
198 | + hashPredictedRIsScore = {} | ||
199 | + hashPredictedRIsRI = {} | ||
200 | + for idx in df_detail.index: | ||
201 | + tf = df_detail['TF'][idx] | ||
202 | + TypeRegulated = df_detail['TypeRegulated'][idx] | ||
203 | + Regulated = df_detail['Regulated'][idx] | ||
204 | + Effect = df_detail['Effect'][idx] | ||
205 | + pmid = df_detail['PMID'][idx] | ||
206 | + numsent = df_detail['NumSentence'][idx] | ||
207 | + type_sent = df_detail['TypeSentence'][idx] | ||
208 | + sentence = df_detail['Sentence'][idx] | ||
209 | + original_idsentence = df_detail['OriginalIdSentence'][idx] | ||
210 | + original_sentence = df_detail['OriginalSentence'][idx] | ||
211 | + section_num = df_detail['SectionNum'][idx] | ||
212 | + section_name = df_detail['SectionName'][idx] | ||
213 | + organisms = df_detail['Organisms'][idx] | ||
214 | + organism_score = df_detail['OrganismScore'][idx] | ||
215 | + llave = "{}\t{}\t{}\t{}".format(tf, TypeRegulated, Regulated, Effect) | ||
216 | + if organism_score == 0: | ||
217 | + continue | ||
218 | + if llave in hashPredictedRIs: | ||
219 | + hashPredictedRIs[llave].append( | ||
220 | + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence, | ||
221 | + original_sentence, section_num, section_name, organisms, | ||
222 | + organism_score, "", "", "", "", "", "")) | ||
223 | + hashPredictedRIsCount[llave] += 1 | ||
224 | + if type_sent == "ver/dev": | ||
225 | + hashPredictedRIsCountVer[llave] += 1 | ||
226 | + elif type_sent == "dev": | ||
227 | + hashPredictedRIsCountDev[llave] += 1 | ||
228 | + elif type_sent == "att": | ||
229 | + hashPredictedRIsCountAtt[llave] += 1 | ||
230 | + elif type_sent == "auto": | ||
231 | + hashPredictedRIsCountAuto[llave] += 1 | ||
232 | + # if organism_score == 0.5: | ||
233 | + # We penalize RI | ||
234 | + # hashPredictedRIsScore[llave] -= 0.05 | ||
235 | + | ||
236 | + else: | ||
237 | + hashPredictedRIs[llave] = [ | ||
238 | + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence, | ||
239 | + original_sentence, section_num, section_name, organisms, | ||
240 | + organism_score, "", "", "", "", "", "")] | ||
241 | + hashPredictedRIsCount[llave] = 1 | ||
242 | + hashPredictedRIsCountVer[llave] = 0 | ||
243 | + hashPredictedRIsCountDev[llave] = 0 | ||
244 | + hashPredictedRIsCountAtt[llave] = 0 | ||
245 | + hashPredictedRIsCountAuto[llave] = 0 | ||
246 | + hashPredictedRIsScore[llave] = 1 | ||
247 | + if type_sent == "ver/dev": | ||
248 | + hashPredictedRIsCountVer[llave] = 1 | ||
249 | + elif type_sent == "dev": | ||
250 | + hashPredictedRIsCountDev[llave] = 1 | ||
251 | + elif type_sent == "att": | ||
252 | + hashPredictedRIsCountAtt[llave] = 1 | ||
253 | + elif type_sent == "auto": | ||
254 | + hashPredictedRIsCountAuto[llave] = 1 | ||
255 | + # if organism_score == 0.5: | ||
256 | + # We penalize RI | ||
257 | + # hashPredictedRIsScore[llave] -= 0.05 | ||
258 | + | ||
259 | + print("Total RIs en TRN con organismo: {}".format(len(hashPredictedRIs))) | ||
260 | + with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org")), mode="w") as oFile: | ||
261 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
262 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n") | ||
263 | + for k,v in hashPredictedRIs.items(): | ||
264 | + RI_value = "True" | ||
265 | + # if hashPredictedRIsScore[k] < 1: | ||
266 | + # RI_value = "Possible" | ||
267 | + oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], | ||
268 | + hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], | ||
269 | + hashPredictedRIsScore[k], RI_value)) | ||
270 | + with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org")), mode="w") as oFile: | ||
271 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
272 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n") | ||
273 | + i = 0 | ||
274 | + for k,v in hashPredictedRIs.items(): | ||
275 | + for s in v: | ||
276 | + oFile.write("{}\t{}\n".format(k, s)) | ||
277 | + i += 1 | ||
278 | + print("Total de frases en TRN organismo: {}".format(i)) | ||
279 | + |
get-TRN-v2.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +import operator | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +import json | ||
7 | +import re | ||
8 | +import pandas as pd | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | + | ||
13 | +# Objective: generate TRN | ||
14 | +# CFMC 2022-03-11: We added: | ||
15 | +# 1) Section info (number and name) for output sentences | ||
16 | +# 2) Original sentence id and original sentence | ||
17 | + | ||
18 | +# Parameters: | ||
19 | +# 1) --predictedPath Path for predicted interactions | ||
20 | +# 2) --outputPath Output path | ||
21 | +# 3) --outputFile Prefix file for saving TRN | ||
22 | +# 4) --diccPath Dictionary path | ||
23 | +# 5) --diccSynon File with synonyms of TFs | ||
24 | +# 6) --tsvPath Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf | ||
25 | +# 7) --jsonpdfPath Path to read jsonpdf file to extract section name | ||
26 | + | ||
27 | +# Output: | ||
28 | +# 1) Tsv file detail with: | ||
29 | +# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence | ||
30 | +# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel | ||
31 | + | ||
32 | +# 1) Tsv file summary with: | ||
33 | +# TF TypeRegulated Regulated Effect SentCount Ver/Dev Att Auto Score RI (True/False) | ||
34 | + | ||
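A hypothetical summary row, to make the tab-separated layout concrete (values are invented; SentCount is the sum of the per-type counts):

```
TF	TypeRegulated	Regulated	Effect	SentCount	Ver/Dev	Att	Auto	Score	RI
ArgP	gene	argO	activator	3	2	1	0	1	True
```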
35 | +# Execution: | ||
36 | +# Version 2 TRN Salmonella | ||
37 | +# python3.4 get-TRN-v2.py | ||
38 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
39 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
40 | +# --outputFile STMTRN_v2 | ||
41 | +# --diccPath /home/cmendezc/terminologicalResources | ||
42 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
43 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv | ||
44 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf | ||
45 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_v2 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf | ||
46 | + | ||
47 | +# articulos_sal_4 | ||
48 | +# python3.4 get-TRN-v2.py | ||
49 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
50 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
51 | +# --outputFile STMTRN_articulos_sal_4 | ||
52 | +# --diccPath /home/cmendezc/terminologicalResources | ||
53 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
54 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv | ||
55 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf | ||
56 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_4 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf | ||
57 | + | ||
58 | +# articulos_sal_1 | ||
59 | +# python3.4 get-TRN-v2.py | ||
60 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
61 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
62 | +# --outputFile STMTRN_articulos_sal_1 | ||
63 | +# --diccPath /home/cmendezc/terminologicalResources | ||
64 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
65 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv | ||
66 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf | ||
67 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_1 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf | ||
68 | + | ||
69 | +# all = articulos_sal_1 + articulos_sal_2 + articulos_sal_3 + articulos_sal_4 | ||
70 | +# python3.4 get-TRN-v2.py | ||
71 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
72 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
73 | +# --outputFile STMTRN_all | ||
74 | +# --diccPath /home/cmendezc/terminologicalResources | ||
75 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
76 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv | ||
77 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf | ||
78 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_all --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf | ||
79 | + | ||
80 | +#### | ||
81 | +# python3.4 get-TRN-v1.py | ||
82 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
83 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
84 | +# --outputFile STMTRN | ||
85 | +# --diccPath /home/cmendezc/terminologicalResources | ||
86 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
87 | +# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
88 | + | ||
89 | +# Con dataset automatic-extraction-STM-RIs-dataset | ||
90 | +# python3.4 get-TRN-v1.py | ||
91 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
92 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
93 | +# --outputFile STM-RIs-dataset | ||
94 | +# --diccPath /home/cmendezc/terminologicalResources | ||
95 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
96 | +# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STM-RIs-dataset --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
97 | + | ||
98 | +########################################################### | ||
99 | +# MAIN PROGRAM # | ||
100 | +########################################################### | ||
101 | + | ||
102 | +def updateHashPredicted(pr, hashP, pm, sF, ef): | ||
103 | + # updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect]) | ||
104 | + if pr not in hashP: | ||
105 | + hashTemp = {"pmids": {pm: [sF]}, "orieff": ef} | ||
106 | + hashP[pr] = hashTemp | ||
107 | + else: | ||
108 | + hashTemp = hashP[pr] | ||
109 | + if pm in hashTemp["pmids"]: | ||
110 | + hashP[pr]["pmids"][pm].append(sF) | ||
111 | + else: | ||
112 | + hashP[pr]["pmids"][pm] = [sF] | ||
113 | + | ||
114 | +def get_standard_name(regSynon): | ||
115 | + reg = regSynon | ||
116 | + if regSynon in hashSynon: | ||
117 | + reg = hashSynon[regSynon] | ||
118 | + else: | ||
119 | + for syn, std in hashSynon.items(): | ||
120 | + if regSynon.startswith(syn): | ||
121 | + reg = regSynon.replace(syn, std, 1) | ||
122 | + break | ||
123 | + return reg | ||
124 | + | ||
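A usage sketch for `get_standard_name` with an invented synonym entry (real entries are loaded from diccionario-STM-LT2-v7.0.SYNONYMS.json into the module-level `hashSynon`):

```python
hashSynon = {"Hns": "H-NS"}       # hypothetical synonym -> standard name
print(get_standard_name("Hns"))   # H-NS   (exact match)
print(get_standard_name("HnsB"))  # H-NSB  (matching prefix replaced once)
print(get_standard_name("ArgP"))  # ArgP   (no synonym: returned unchanged)
```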
125 | +if __name__ == "__main__": | ||
126 | + # Parameter definition | ||
127 | + parser = OptionParser() | ||
128 | + parser.add_option("--predictedPath", dest="predictedPath", | ||
129 | + help="Path predicted ris gcs", metavar="PATH") | ||
130 | + parser.add_option("--outputPath", dest="outputPath", | ||
131 | + help="Output path", metavar="PATH") | ||
132 | + parser.add_option("--outputFile", dest="outputFile", | ||
133 | + help="Preffix file for saving results", metavar="FILE") | ||
134 | + parser.add_option("--diccPath", dest="diccPath", | ||
135 | + help="Path to dictionary", metavar="PATH") | ||
136 | + parser.add_option("--diccSynon", dest="diccSynon", | ||
137 | + help="File with synonyms", metavar="FILE") | ||
138 | + parser.add_option("--tsvPath", dest="tsvPath", | ||
139 | + help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH") | ||
140 | + parser.add_option("--jsonpdfPath", dest="jsonpdfPath", | ||
141 | + help="Path to read jsonpdf file to extract section name", metavar="PATH") | ||
142 | + | ||
143 | + (options, args) = parser.parse_args() | ||
144 | + if len(args) > 0: | ||
145 | +        parser.error("Unexpected positional arguments.") | ||
146 | + sys.exit(1) | ||
147 | + | ||
148 | + # Printing parameter values | ||
149 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
150 | + print("Path predicted ris gcs: " + str(options.predictedPath)) | ||
151 | + print("Output path: " + str(options.outputPath)) | ||
152 | + print("Preffix file for saving results: " + str(options.outputFile)) | ||
153 | + print("Path to dictionary: " + str(options.diccPath)) | ||
154 | + print("File with synonyms: " + str(options.diccSynon)) | ||
155 | + print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath)) | ||
156 | + print("Path to read jsonpdf file to extract section name: " + str(options.jsonpdfPath)) | ||
157 | + | ||
158 | + use_synonyms = False | ||
159 | + hashSynon = {} | ||
160 | +    if options.diccPath is not None and options.diccSynon != "no-synonyms": | ||
161 | + print("***** Using synonyms *****") | ||
162 | + use_synonyms = True | ||
163 | + print('Loading dictionary of synonyms...') | ||
164 | + with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon: | ||
165 | + hashSynon = json.load(diccSynon) | ||
166 | + print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon))) | ||
167 | + | ||
168 | + hashPredictedRIs = {} | ||
169 | + hashPredictedRIsCount = {} | ||
170 | + hashPredictedRIsCountVer = {} | ||
171 | + hashPredictedRIsCountDev = {} | ||
172 | + hashPredictedRIsCountAtt = {} | ||
173 | + hashPredictedRIsCountAuto = {} | ||
174 | + hashFiles = {} | ||
175 | + for path, dirs, files in os.walk(options.predictedPath): | ||
176 | + for file in files: | ||
177 | + if file.endswith(".a1"): | ||
178 | + filename = file[:-3] | ||
179 | + if filename not in hashFiles: | ||
180 | + hashFiles[filename] = 1 | ||
181 | + else: | ||
182 | + hashFiles[filename] += 1 | ||
183 | + print("Files: {}".format(len(hashFiles))) | ||
184 | + | ||
185 | + processedFiles = 0 | ||
186 | + id_ri = 1 | ||
187 | + regex_att_auto = re.compile(r"(\.att\.|\.auto\.)[0-9]*$") | ||
188 | + for file in sorted(hashFiles.keys()): | ||
189 | + print("File: {}".format(file)) | ||
190 | + type_sent = "ver/dev" | ||
191 | + if file.find("dataSet_OnlyRI_sentences") > -1: | ||
192 | + pmid = "000000" | ||
193 | + if file.find("dataSet_OnlyRI_sentences.") > -1: | ||
194 | + if file.find(".att.") > -1: | ||
195 | + numsent = file[file.find("att.") + 4:] | ||
196 | + type_sent = "att" | ||
197 | + if pmid.find(".auto.") > -1: | ||
198 | + numsent = file[file.find("auto.") + 5:] | ||
199 | + type_sent = "auto" | ||
200 | + else: | ||
201 | + numsent = file[file.find("_", file.find("_", file.find("_") + 1) + 1) + 1:file.find("-")] | ||
202 | + numsent = numsent.replace(".al", "") | ||
203 | + print("dataSet_OnlyRI_sentences numsent: {}".format(numsent)) | ||
204 | + print("dataSet_OnlyRI_sentences pmid: {}".format(pmid)) | ||
205 | + else: | ||
206 | + pmid = file[:file.find("_")] | ||
207 | + # print("pmid: {}".format(pmid)) | ||
208 | + numsent = file[file.find("_")+1:file.find("-")] | ||
209 | + numsent = numsent.replace(".al", "") | ||
210 | + if pmid.find(".att.") > -1: | ||
211 | +                # CFMC 2022-03-11: Fix error in pmid | ||
212 | + # CFMC 2022-03-11 Original: pmid = pmid.replace(".att.", "") | ||
213 | + pmid = regex_att_auto.sub("", pmid) | ||
214 | + numsent = file[file.find("att.")+4:] | ||
215 | + type_sent = "att" | ||
216 | + if pmid.find(".auto.") > -1: | ||
217 | +                # CFMC 2022-03-11: Fix error in pmid | ||
218 | + # CFMC 2022-03-11 Original: pmid = pmid.replace(".auto.", "") | ||
219 | + pmid = regex_att_auto.sub("", pmid) | ||
220 | + numsent = file[file.find("auto.") + 5:] | ||
221 | + type_sent = "auto" | ||
222 | + # numsent = file[file.find("_"):file.find("-")] | ||
223 | + # print("pmid {}".format(pmid)) | ||
224 | + # print("numsent: {}".format(numsent)) | ||
225 | + | ||
226 | + sentenceFile = file[:file.find("-", file.find("_"))] + ".txt" | ||
227 | + hashEntitiesGenes = {} | ||
228 | + hashEntitiesTUs = {} | ||
229 | + hashEntitiesTFs = {} | ||
230 | + hashEntitiesEffects = {} | ||
231 | + hashOriginalEffect = {} | ||
232 | + regex_fix_regulator = re.compile(r'(Regulated|Binds|Bind|deverbal_effect|Regulate)') | ||
233 | + regex_fix_repressor = re.compile(r'(Repressing|Represses)') | ||
234 | + with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File: | ||
235 | + for line in a1File: | ||
236 | + line = line.strip('\n') | ||
237 | + listLine1 = line.split('\t') | ||
238 | + listLine2 = listLine1[1].split(' ') | ||
239 | + entity = listLine2[0] | ||
240 | + entity_type = listLine2[0] | ||
241 | + idEntity = listLine1[0] | ||
242 | + originalEffect = listLine1[2] | ||
243 | + if entity.startswith("EFFECT"): | ||
244 | + entity = entity[entity.find(".") + 1:] | ||
245 | + # print("Entity: {}".format(entity)) | ||
246 | + if pmid.find("_dev") > -1: | ||
247 | + type_sent = "dev" | ||
248 | + entity = entity.replace("_dev", "") | ||
249 | + # print("Entity without _dev: {}".format(entity)) | ||
250 | + if idEntity not in hashOriginalEffect: | ||
251 | + hashOriginalEffect[idEntity] = originalEffect | ||
252 | + if idEntity not in hashEntitiesEffects: | ||
253 | +                        # We fix some wrong effects here for the TRN, but this must also be fixed in the upstream script where the error is produced | ||
254 | + if regex_fix_regulator.match(entity): | ||
255 | + print("WARNING EFFECT: {}".format(entity)) | ||
256 | + entity = regex_fix_regulator.sub("regulator", entity) | ||
257 | + print("WARNING EFFECT after: {}".format(entity)) | ||
258 | + if regex_fix_repressor.match(entity): | ||
259 | + print("WARNING EFFECT: {}".format(entity)) | ||
260 | + entity = regex_fix_repressor.sub("repressor", entity) | ||
261 | + print("WARNING EFFECT after: {}".format(entity)) | ||
262 | + hashEntitiesEffects[idEntity] = entity | ||
263 | + else: | ||
264 | + entity = listLine1[2] | ||
265 | + if entity_type == "GENE": | ||
266 | + if idEntity not in hashEntitiesGenes: | ||
267 | + hashEntitiesGenes[idEntity] = entity | ||
268 | + elif entity_type == "TU": | ||
269 | + if idEntity not in hashEntitiesTUs: | ||
270 | + hashEntitiesTUs[idEntity] = entity | ||
271 | + elif entity_type == "TF": | ||
272 | + if idEntity not in hashEntitiesTFs: | ||
273 | + hashEntitiesTFs[idEntity] = entity | ||
274 | + | ||
275 | + # print("hashEntities: {}".format(hashEntitiesGenes)) | ||
276 | + # print("hashEntities: {}".format(hashEntitiesTUs)) | ||
277 | + # print("hashEntities: {}".format(hashEntitiesTFs)) | ||
278 | + | ||
279 | + with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File: | ||
280 | + sentence = '' | ||
281 | + with open(os.path.join(options.predictedPath, file + ".txt"), mode="r") as txtFile: | ||
282 | + sentence = txtFile.read() | ||
283 | + listTokens = [token.split('|')[0] for token in sentence.split()] | ||
284 | + sentence = ' '.join(listTokens) | ||
285 | + | ||
286 | + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence | ||
287 | + # Open jsonpdf file | ||
288 | + hash_sections = {} | ||
289 | + sentences = {} | ||
290 | + print('Loading jsonpdf file...') | ||
291 | + with open(os.path.join(options.jsonpdfPath, pmid + ".jsonpdf"), "r", encoding="utf-8", errors="replace") as jsonpdfFile: | ||
292 | + text_file = jsonpdfFile.read() | ||
293 | + if file.startswith("26781240"): | ||
294 | + text_file = text_file.replace(" \\ ", " \\\\ ") | ||
295 | + elif file.startswith("26249345"): | ||
296 | + text_file = text_file.replace('}], ', '}],"sections": {}') | ||
297 | + try: | ||
298 | + hash_jsonpdf = json.loads(text_file) | ||
299 | +                print(' Loading jsonpdf file... done!') | ||
300 | + except Exception as e: | ||
301 | + print(e) | ||
302 | + print(" Loading jsonpdf file failed: {}".format(file)) | ||
303 | + hash_sections = hash_jsonpdf["sections"] | ||
304 | + # print("Sections: {}".format(hash_sections)) | ||
305 | + sentences = hash_jsonpdf["sentences"] | ||
306 | + # Open tsv file | ||
307 | + print('Loading tsv file...') | ||
308 | + file_tsv = pmid + ".pre.fil.tsv" | ||
309 | + tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv)) | ||
310 | + # print("tsv_file.shape: {}".format(tsv_file.shape)) | ||
311 | + tsv_file_filtered = tsv_file[tsv_file['status'] == 1] | ||
312 | + # print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape)) | ||
313 | + tsv_file_new = tsv_file_filtered.reset_index(drop=True) | ||
314 | + # print(tsv_file_new.head(10)) | ||
315 | + print(' Loading tsv file... done!') | ||
316 | + numsent_int = int(numsent) | ||
317 | + original_sentence = tsv_file_new.at[numsent_int, 'sentence'] | ||
318 | + section_num = tsv_file_new.at[numsent_int, 'section'] | ||
319 | + # print("type(section_num): {}".format(type(section_num))) | ||
320 | + original_idsentence = tsv_file_new.at[numsent_int, 'idsentence'] | ||
321 | + section_num_str = str(section_num) | ||
322 | + if section_num_str in hash_sections: | ||
323 | + section_name = hash_sections[section_num_str] | ||
324 | + else: | ||
325 | + section_name = "Unknown" | ||
326 | + | ||
327 | + for line in a2File: | ||
328 | + # print("Line a2: {}".format(line)) | ||
329 | + # R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4 | ||
330 | + line = line.strip('\n') | ||
331 | + listLine1 = line.split('\t') | ||
332 | + listLine2 = listLine1[1].split(' ') | ||
333 | + regulator = listLine2[2] | ||
334 | + regulator = regulator[regulator.find(":") + 1:] | ||
335 | + regulated = listLine2[1] | ||
336 | + regulated = regulated[regulated.find(":") + 1:] | ||
337 | + effect = listLine2[0] | ||
338 | + effect = effect[effect.find(".") + 1:] | ||
339 | + | ||
340 | + tf = hashEntitiesTFs[regulator] | ||
341 | + if tf.endswith("ed"): | ||
342 | + tf = tf[:tf.find("-")] | ||
343 | + #else: | ||
344 | +                    # Clean TF names by expressions seen in TRN output file | ||
345 | + tf = re.sub(r"(/absence|controlle|activation|‐regulate|‐mediate|mediate|-regulate|regulate|ˉ|-like|-mutant|-type|-independent|-dependent|dependent|-dependant|-binding|-and|-family|-bound|-deficient|-indepen-dent|-inducing|-green|-overproducing|-or|-depletion|-repressible|-dual|-box)", "", tf) | ||
346 | + # Clean false TF names - 2329 | ||
347 | + result = re.match(r"(cyclic|RHONDA|Crawford|Hulett|Rhodobacter|Danino|Huang|Neisseria|Huang|HUGHES1|Robbe-Saule|Danchin|Roberts|Furer|Hunter|Furue|Humphreys|Nacional)", tf) | ||
348 | + if result: | ||
349 | + break | ||
350 | +                    # Map TF synonym to its standard name | ||
351 | + tf = get_standard_name(tf) | ||
352 | + | ||
353 | + # print("numsent: {}".format(numsent)) | ||
354 | + # For L&C do not increment 1 | ||
355 | + # CFMC 2022-03-11 Original: numsent_int = int(numsent) | ||
356 | + | ||
357 | + if regulated in hashEntitiesGenes: | ||
358 | + type_regulated = "Gene" | ||
359 | + llave = "{}\t{}\t{}\t{}".format(tf, "gene", hashEntitiesGenes[regulated], | ||
360 | + hashEntitiesEffects[effect]) | ||
361 | + elif regulated in hashEntitiesTUs: | ||
362 | + type_regulated ="TU" | ||
363 | + llave = "{}\t{}\t{}\t{}".format(tf, "TU", hashEntitiesTUs[regulated], | ||
364 | + hashEntitiesEffects[effect]) | ||
365 | + else: | ||
366 | + print("ERROR: Regulated did not found!") | ||
367 | + # Clean false cases | ||
368 | + if llave.startswith("Hu"): | ||
369 | + break | ||
370 | + | ||
371 | + if llave in hashPredictedRIs: | ||
372 | + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence | ||
373 | +                    hashPredictedRIs[llave].append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")) | ||
374 | + hashPredictedRIsCount[llave] += 1 | ||
375 | + if type_sent == "ver/dev": | ||
376 | + # if llave in hashPredictedRIsCountVer: | ||
377 | + hashPredictedRIsCountVer[llave] += 1 | ||
378 | + # else: | ||
379 | + # hashPredictedRIsCountVer[llave] = 1 | ||
380 | + elif type_sent == "dev": | ||
381 | + # if llave in hashPredictedRIsCountVer: | ||
382 | + hashPredictedRIsCountDev[llave] += 1 | ||
383 | + # else: | ||
384 | + # hashPredictedRIsCountDev[llave] = 1 | ||
385 | + elif type_sent == "att": | ||
386 | + # if llave in hashPredictedRIsCountVer: | ||
387 | + hashPredictedRIsCountAtt[llave] += 1 | ||
388 | + # else: | ||
389 | + # hashPredictedRIsCountAtt[llave] = 1 | ||
390 | + elif type_sent == "auto": | ||
391 | + # if llave in hashPredictedRIsCountVer: | ||
392 | + hashPredictedRIsCountAuto[llave] += 1 | ||
393 | + # else: | ||
394 | + # hashPredictedRIsCountAuto[llave] = 1 | ||
395 | + else: | ||
396 | + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence | ||
397 | +                    hashPredictedRIs[llave] = ["{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")] | ||
398 | + hashPredictedRIsCount[llave] = 1 | ||
399 | + hashPredictedRIsCountVer[llave] = 0 | ||
400 | + hashPredictedRIsCountDev[llave] = 0 | ||
401 | + hashPredictedRIsCountAtt[llave] = 0 | ||
402 | + hashPredictedRIsCountAuto[llave] = 0 | ||
403 | + if type_sent == "ver/dev": | ||
404 | + hashPredictedRIsCountVer[llave] = 1 | ||
405 | + elif type_sent == "dev": | ||
406 | + hashPredictedRIsCountDev[llave] = 1 | ||
407 | + elif type_sent == "att": | ||
408 | + hashPredictedRIsCountAtt[llave] = 1 | ||
409 | + elif type_sent == "auto": | ||
410 | + hashPredictedRIsCountAuto[llave] = 1 | ||
411 | + | ||
412 | + id_ri += 1 | ||
413 | + processedFiles += 1 | ||
414 | + | ||
415 | + print("Processed files: {}".format(processedFiles)) | ||
416 | + with open(os.path.join(options.outputPath, options.outputFile + ".summary.tsv"), mode="w") as oFile: | ||
417 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
418 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n") | ||
419 | + for k,v in hashPredictedRIs.items(): | ||
420 | + oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], | ||
421 | + hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], "1", "True")) | ||
422 | + #oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], hashPredictedRIsCountDev[k], hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], v)) | ||
423 | + with open(os.path.join(options.outputPath, options.outputFile + ".detail.tsv"), mode="w") as oFile: | ||
424 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
425 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tConfirmationLevel\n") | ||
426 | + for k,v in hashPredictedRIs.items(): | ||
427 | + for s in v: | ||
428 | + oFile.write("{}\t{}\n".format(k, s)) | ||
429 | + |
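For reference, get-TRN-v2.py consumes standoff (.a1/.a2) pairs line by line: tabs separate the id, the annotation, and the text; spaces separate the annotation fields. A self-contained sketch with invented example lines (for complete RIs, the suffix after `Interaction.` is the id of an EFFECT entity in the .a1 file):

```python
a1_line = "T1\tTF 0 13\tArgP-regulated"
id_entity, annotation, text = a1_line.split('\t')
entity_type, start, end = annotation.split(' ')     # 'TF', '0', '13'

a2_line = "R1\tInteraction.T3 Target:T2 Agent:T1"
id_ri, annotation = a2_line.split('\t')
effect_ref, target, agent = annotation.split(' ')
effect_id = effect_ref[effect_ref.find('.') + 1:]   # 'T3'
regulated_id = target[target.find(':') + 1:]        # 'T2'
regulator_id = agent[agent.find(':') + 1:]          # 'T1'
```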
predicted-ris-gcs/complete-ris/.gitignore
0 → 100644
1 | + |
predicted-ris-gcs/incomplete-ris/.gitignore
0 → 100644
1 | + |
ri-attributive-extraction-v02.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +from optparse import OptionParser | ||
3 | +import sys | ||
4 | +import os | ||
5 | +import json | ||
6 | +import operator | ||
7 | +import re | ||
8 | +from nltk.corpus import words | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | + | ||
13 | +# Objective: obtain predicted ris from attributive sentences, such as ArgP-regulated gene argP | ||
14 | +# Input format: transformed format (word|lemma|tag tokens). | ||
15 | +# WARNING: Only one sentence per line | ||
16 | + | ||
17 | +# Parameters: | ||
18 | +# 1) --inputPath Input path | ||
19 | +# 2) --inputFile Input file | ||
20 | +# 3) --outputPath Output path | ||
21 | +# 4) --diccPath Dictionary path | ||
22 | +# 5) --diccEffect File with normalized effects | ||
23 | + | ||
24 | +# Unused: --diccFile JSON file with entity dictionaries | ||
25 | +# Unused: --diccEPAth Path to read normalized effects | ||
26 | +# Unused: --format Output format: standoff, tabs | ||
27 | + | ||
28 | +# Output: | ||
29 | +# 1) File with predicted ris combined with existing files. | ||
30 | +# Format standoff: | ||
31 | +# T1 TF 0 0 ArgP-regulated | ||
32 | +# T2 GENE 0 0 argP | ||
33 | +# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors | ||
34 | +# R1 Interaction.activator Target:T3 Agent:T1 | ||
35 | + | ||
36 | +# Execution | ||
37 | +# C:\anaconda3\python ri-attributive-extraction.py | ||
38 | +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences | ||
39 | +# --inputFile ris-sentences-analysis.att.017.txt | ||
40 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs | ||
41 | +# --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources | ||
42 | +# --diccEffect normalized_Effects.json | ||
43 | +# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json | ||
44 | +# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.286.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json | ||
45 | + | ||
46 | +# python3 ri-attributive-extraction.py | ||
47 | +# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences | ||
48 | +# --inputFile ris-sentences-analysis.att.017.txt | ||
49 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs | ||
50 | +# --diccPath /home/cmendezc/terminologicalResources | ||
51 | +# --diccEffect normalized_Effects.json | ||
52 | +# python3 ri-attributive-extraction.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json | ||
53 | + | ||
54 | +########################################################### | ||
55 | +# MAIN PROGRAM # | ||
56 | +########################################################### | ||
57 | + | ||
58 | +def getPosWord(wordPos, endPos, text, termList): | ||
59 | + offsetStart = 0 | ||
60 | + wordNum = 0 | ||
61 | + listText = text.split() | ||
62 | + for w in listText: | ||
63 | + # if filenameBefore.find('000-2') > -1: | ||
64 | + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos)) | ||
65 | + if wordNum >= int(wordPos): | ||
66 | + # for tok in word.split(): | ||
67 | + for t in termList: | ||
68 | + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords): | ||
69 | + if w == t: | ||
70 | + return [w, offsetStart, offsetStart + len(w) - 1] | ||
71 | + #else: | ||
72 | + wordNum += 1 | ||
73 | + offsetStart += len(w) + 1 | ||
74 | + if wordNum > int(endPos): | ||
75 | + return None | ||
76 | + return None | ||
77 | + | ||
78 | +def getIdEntity(aList, etype, idE): | ||
79 | + entity = aList[0] | ||
80 | + if etype == "EFFECT": | ||
81 | + normalizedEffect = entity | ||
82 | + #print("EFFECT: {}".format(entity)) | ||
83 | + if entity in hashNormalizedEffects: | ||
84 | + normalizedEffect = hashNormalizedEffects[entity] | ||
85 | + etype += "." + normalizedEffect | ||
86 | + #print("etype: {}".format(etype)) | ||
87 | + entityPosStart = aList[1] | ||
88 | + entityPosEnd = aList[2] | ||
89 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
90 | + #print("keyEntity: {}".format(keyEntity)) | ||
91 | + if keyEntity not in hashEntities: | ||
92 | + idE += 1 | ||
93 | + idEntity = "T{}".format(idE) | ||
94 | + hashEntities[keyEntity] = idEntity | ||
95 | + #print("New entity {}: {}".format(idEntity, keyEntity)) | ||
96 | + return idEntity, idE | ||
97 | + else: | ||
98 | + idEntity = hashEntities[keyEntity] | ||
99 | + return idEntity, idE | ||
100 | + | ||
101 | +def getIdInteraction(regulator, regulated, effect, idI, hashInt): | ||
102 | + #print("hashInt: {}".format(hashInt)) | ||
103 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
104 | + if keyInteraction not in hashInt: | ||
105 | + idI += 1 | ||
106 | + idInteraction = "R{}".format(idI) | ||
107 | + hashInt[keyInteraction] = idInteraction | ||
108 | + #print("New interaction {}: {}".format(idInteraction, keyInteraction)) | ||
109 | + #return idInteraction, idI | ||
110 | + else: | ||
111 | + idInteraction = hashInt[keyInteraction] | ||
112 | + return idInteraction, idI | ||
113 | + | ||
114 | +def saveFiles(filename, hashE, hashI, s, effect): | ||
115 | + if effect: | ||
116 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
117 | + else: | ||
118 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
119 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File: | ||
120 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File: | ||
121 | + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)): | ||
122 | + aList = k.split() | ||
123 | + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3])) | ||
124 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File: | ||
125 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File: | ||
126 | + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)): | ||
127 | + aList = k.split() | ||
128 | + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0])) | ||
129 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile: | ||
130 | + txtFile.write(s) | ||
131 | + | ||
132 | +def loadFileEntities(filename, outputPath, hashTemp): | ||
133 | + #print("Start loadFileEntities") | ||
134 | + idE = 1 | ||
135 | + try: | ||
136 | + f = filename[:filename.rfind(".")] + ".a1" | ||
137 | + # print("file entities: {}".format(f)) | ||
138 | + with open(os.path.join(outputPath, f), mode="r") as a1File: | ||
139 | + for line in a1File: | ||
140 | + line = line.strip('\n') | ||
141 | + listLine1 = line.split('\t') | ||
142 | + listLine2 = listLine1[1].split(' ') | ||
143 | + etype = listLine2[0] | ||
144 | + entityPosStart = listLine2[1] | ||
145 | + entityPosEnd = listLine2[2] | ||
146 | + entity = listLine1[2] | ||
147 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
148 | + idEntity = listLine1[0] | ||
149 | + if keyEntity not in hashTemp: | ||
150 | + hashTemp[keyEntity] = idEntity | ||
151 | + if int(idEntity[1:]) > idE: | ||
152 | + idE = int(idEntity[1:]) | ||
153 | + except IOError: | ||
154 | + print("IOError file: {}".format(os.path.join(outputPath, f))) | ||
155 | + # idE = 1 | ||
156 | + return idE | ||
157 | + | ||
158 | +def loadFileInteractions(filename, outputPath, hashTemp): | ||
159 | + #print("Start loadFileInteractions") | ||
160 | + idI = 1 | ||
161 | + try: | ||
162 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File: | ||
163 | + for line in a2File: | ||
164 | + #print("Line a2: {}".format(line)) | ||
165 | + line = line.strip('\n') | ||
166 | + listLine1 = line.split('\t') | ||
167 | + listLine2 = listLine1[1].split(' ') | ||
168 | + regulator = listLine2[2] | ||
169 | + regulator = regulator[regulator.find(":") + 1:] | ||
170 | + regulated = listLine2[1] | ||
171 | + regulated = regulated[regulated.find(":") + 1:] | ||
172 | + effect = listLine2[0] | ||
173 | + effect = effect[effect.find(".") + 1:] | ||
174 | + idInteraction = listLine1[0] | ||
175 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
176 | + if keyInteraction not in hashTemp: | ||
177 | + hashTemp[keyInteraction] = idInteraction | ||
178 | + if int(idInteraction[1:]) > idI: | ||
179 | + idI = int(idInteraction[1:]) | ||
180 | + except IOError: | ||
181 | + print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"))) | ||
182 | + # idI = 1 | ||
183 | + return idI | ||
184 | + | ||
185 | +def getRealPos(posStart, posEnd, lin):  # pass-through: positions are already relative to the line | ||
186 | + return (posStart, posEnd) | ||
187 | + | ||
188 | +def getRI(r, l): | ||
189 | + regulator = r.group('regulator') | ||
190 | + regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l) | ||
191 | + # regulatorStart = getRealPos(r.start('regulator'), l) | ||
192 | + # regulatorEnd = getRealPos(r.end('regulator'), l) | ||
193 | + regulated = r.group('regulated') | ||
194 | + regulatedPos = getRealPos(r.start('regulated'), r.end('regulated'), l) | ||
195 | + # regulatedStart = getRealPos(r.start('regulated'), l) | ||
196 | + # regulatedEnd = getRealPos(r.end('regulated'), l) | ||
197 | + effect = r.group('effect') | ||
198 | + effectPos = getRealPos(r.start('effect'), r.end('effect'), l) | ||
199 | + # effectStart = getRealPos(r.start('effect'), l) | ||
200 | + # effectEnd = getRealPos(r.end('effect'), l) | ||
201 | + #print("Regulator {}, start {}, end {}".format(regulator, regulatorPos[0], regulatorPos[1])) | ||
202 | + #print("Regulated {}, start {}, end {}".format(regulated, regulatedPos[0], regulatedPos[1])) | ||
203 | + #print("Effect {}, start {}, end {}".format(effect, effectPos[0], effectPos[1])) | ||
204 | + return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]), | ||
205 | + regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]), | ||
206 | + effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l] | ||
207 | + | ||
208 | +if __name__ == "__main__": | ||
209 | + # Parameter definition | ||
210 | + # python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py | ||
211 | + # --inputPath $(dirname ${file}) | ||
212 | + # --inputFile $(basename ${file}) | ||
213 | + # --outputPath $OUTPUT_PATH | ||
214 | + # --diccPath $DICC_PATH | ||
215 | + # --diccEffect normalized_Effects.json | ||
216 | + parser = OptionParser() | ||
217 | + parser.add_option("--inputPath", dest="inputPath", | ||
218 | + help="Input path", metavar="PATH") | ||
219 | + parser.add_option("--inputFile", dest="inputFile", | ||
220 | + help="Input file", metavar="FILE") | ||
221 | + parser.add_option("--outputPath", dest="outputPath", | ||
222 | + help="Output path", metavar="PATH") | ||
223 | + parser.add_option("--diccPath", dest="diccPath", | ||
224 | + help="Path to read dictionaries", metavar="PATH") | ||
225 | + # parser.add_option("--diccFile", dest="diccFile", | ||
226 | + # help="JSON file with entity dictionaries", metavar="FILE") | ||
227 | + parser.add_option("--diccEffect", dest="diccEffect", | ||
228 | + help="File with normalized effects", metavar="FILE") | ||
229 | + | ||
230 | + # parser.add_option("--format", dest="format", | ||
231 | + # help="Output format: standoff", metavar="TEXT") | ||
232 | + # parser.add_option("--diccEPAth", dest="diccEPAth", | ||
233 | + # help="File with normalized effects", metavar="FILE") | ||
234 | + | ||
235 | + (options, args) = parser.parse_args() | ||
236 | + #if len(args) > 0: | ||
237 | + # parser.error("None parameter entered.") | ||
238 | + # sys.exit(1) | ||
239 | + | ||
240 | + # Printing parameter values | ||
241 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
242 | + print("Input path: " + str(options.inputPath)) | ||
243 | + print("Input file: " + str(options.inputFile)) | ||
244 | + print("Output path: " + str(options.outputPath)) | ||
245 | + print("Path to read dictionaries: " + str(options.diccPath)) | ||
246 | + # print("JSON file with entity dictionaries: " + str(options.diccFile)) | ||
247 | + print("File with normalized effects: " + str(options.diccEffect)) | ||
248 | + # print("Output format: " + str(options.format)) | ||
249 | + # print("Path to read normalized effects: " + str(options.diccEPAth)) | ||
250 | + | ||
251 | + # regularWords = words.words('en') | ||
252 | + | ||
253 | + # print('Loading dictionaries...') | ||
254 | + # with open(os.path.join(options.diccPath, options.diccFile)) as diccFile: | ||
255 | + # hashDicc = json.load(diccFile) | ||
256 | + | ||
257 | + # hashTermFiles = hashDicc["hashTermFiles"] | ||
258 | + # hashTerms = hashDicc["hashTerms"] | ||
259 | + | ||
260 | + # for key in hashTermFiles.keys(): | ||
261 | + # for f in hashTermFiles[key]: | ||
262 | + # # print('File: ' + f) | ||
263 | + # with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile: | ||
264 | + # for line in iFile: | ||
265 | + # line = line.strip('\n') | ||
266 | + # line = line.replace(' ', '-') | ||
267 | + # if line not in hashTerms[key]: | ||
268 | + # hashTerms[key].append(line) | ||
269 | + # # if options.termLower: | ||
270 | + # # hashTerms[key].append(line.lower()) | ||
271 | + # # if options.termCapitalize: | ||
272 | + # # hashTerms[key].append(line.capitalize()) | ||
273 | + # print(' Terms read {} size: {}'.format(key, len(hashTerms[key]))) | ||
274 | + | ||
275 | + # Loading normalized effects | ||
276 | + print('Loading normalized effects ending with -d...') | ||
277 | + hashNormalizedEffects = {} | ||
278 | + with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile: | ||
279 | + hashNormalizedEffects = json.load(diccFile) | ||
280 | + listEffects = [] | ||
281 | + for eff in hashNormalizedEffects.keys(): | ||
282 | + if eff.endswith('d'): | ||
283 | + listEffects.append(eff) | ||
284 | + listEffects.append("dependent") | ||
285 | + effects = "|".join(listEffects) | ||
286 | + #print("Effects: {}".format(effects)) | ||
287 | + | ||
288 | + files = {} | ||
289 | + hashEntities = {} | ||
290 | + hashInteractions = {} | ||
291 | + hashInteractionsEffect = {} | ||
292 | + idEntities = 1 | ||
293 | + idInteractions = 1 | ||
294 | + idInteractionsEffect = 1 | ||
295 | + | ||
296 | + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)\s([^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF') | ||
297 | + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+( [^ ]+)') | ||
298 | + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))\s([^|]+\|[^|]+\|(CC|,))?)+ ([^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF') | ||
299 | + # regexAttRILeft = re.compile(r'(?:([^|\s]+\|[^|]+\|(?:GENE|TU))\s(?:[^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF') | ||
300 | + # regexAttRILeft = re.compile(r'(?=([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)') | ||
301 | + # regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)') | ||
302 | + # regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+) ([^ ]+ )+(?P<regulator>[^|]+\|[^|]+\|TF)') | ||
303 | + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>' + r'(' + effects + ')\|[^|]+\|TF) [^|]+\|gene') | ||
304 | + | ||
305 | + # reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene') | ||
306 | + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(regulated|repressed)\|[^|]+\|TF) [^|]+\|gene') | ||
307 | + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ ){,5}(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene') | ||
308 | + # CMC 2018-11-07: regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene') | ||
309 | + regexAttRILeft = re.compile( | ||
310 | + r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF)') | ||
311 | + # regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ ){,5}(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))') | ||
312 | + # CMC 2018-11-07: regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ )+(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))') | ||
313 | + regexAttRIRight = re.compile( | ||
314 | + r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) ([^ ]+ )*(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))') | ||
315 | + | ||
316 | + filename = options.inputFile | ||
317 | + hashEntities = {} | ||
318 | + hashInteractions = {} | ||
319 | + hashInteractionsEffect = {} | ||
320 | + idEntities = 1 | ||
321 | + idInteractions = 1 | ||
322 | + idInteractionsEffect = 1 | ||
323 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
324 | + idEntities = loadFileEntities(filename, outputPath, hashEntities) | ||
325 | + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect) | ||
326 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
327 | + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions) | ||
328 | + | ||
329 | + listRIs = [] | ||
330 | + | ||
331 | + with open(os.path.join(options.inputPath, options.inputFile)) as iFile: | ||
332 | + for line in iFile: | ||
333 | + line = line.rstrip('\n') | ||
334 | +            # Search to the left (regulated ... regulator) | ||
335 | +            #print("Searching <<") | ||
336 | + result = regexAttRILeft.search(line) | ||
337 | + #print("result: {}".format(result)) | ||
338 | + lineTemp = line | ||
339 | + # print("lineTemp: {}".format(lineTemp)) | ||
340 | + while result: | ||
341 | + #print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect'))) | ||
342 | + listRIs.append(getRI(result, line)) | ||
343 | + #print("listRIs: {}".format(listRIs)) | ||
344 | + lineTemp = lineTemp.replace(result.group('regulated'), '') | ||
345 | + #print("lineTemp for: {}".format(lineTemp)) | ||
346 | + result = regexAttRILeft.search(lineTemp) | ||
347 | + #print("result: {}".format(result)) | ||
348 | + | ||
349 | +            # Search to the right (regulator ... regulated) | ||
350 | +            #print("Searching >>") | ||
351 | + result = regexAttRIRight.search(line) | ||
352 | + #print("result: {}".format(result)) | ||
353 | + lineTemp = line | ||
354 | + # print("lineTemp: {}".format(lineTemp)) | ||
355 | + while result: | ||
356 | + #print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect'))) | ||
357 | + listRIs.append(getRI(result, line)) | ||
358 | + #print("listRIs: {}".format(listRIs)) | ||
359 | + lineTemp = lineTemp.replace(result.group('regulated'), '') | ||
360 | + #print("lineTemp for: {}".format(lineTemp)) | ||
361 | + result = regexAttRIRight.search(lineTemp) | ||
362 | + #print("result: {}".format(result)) | ||
363 | + | ||
364 | + # result = regexAttRIRight.finditer(line) | ||
365 | + # lineTemp = line | ||
366 | + # while result: | ||
367 | + # listRIs.append(getRI(result, line)) | ||
368 | + # lineTemp = lineTemp.replace(result.group('regulated'), '') | ||
369 | + # result = regexAttRIRight.finditer(lineTemp) | ||
370 | + | ||
371 | + # return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]), | ||
372 | + # regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]), | ||
373 | + # effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l] | ||
374 | + for ri in listRIs: | ||
375 | + #print("ri: {}".format(ri)) | ||
376 | + if len(ri) != 4: | ||
377 | +            print("WARNING! Malformed RI (expected 4 elements)") | ||
378 | + exit() | ||
379 | + regulator = ri[0] | ||
380 | + regulated = ri[1] | ||
381 | + effect = ri[2] | ||
382 | + line = ri[3] | ||
383 | + | ||
384 | + listElem = regulator.split('|') | ||
385 | + regulatorWord = listElem[0] | ||
386 | + regulatorType = listElem[2] | ||
387 | + regulatorStart = listElem[3] | ||
388 | + regulatorEnd = listElem[4] | ||
389 | + | ||
390 | + listElem = regulated.split('|') | ||
391 | + regulatedWord = listElem[0] | ||
392 | + regulatedType = listElem[2] | ||
393 | + regulatedStart = listElem[3] | ||
394 | + regulatedEnd = listElem[4] | ||
395 | + | ||
396 | + listElem = effect.split('|') | ||
397 | + effectWord = listElem[0] | ||
398 | + effectType = "EFFECT" | ||
399 | + effectStart = listElem[1] | ||
400 | + effectEnd = listElem[2] | ||
401 | + | ||
402 | + idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities) | ||
403 | + if regulatedType == "GENE": | ||
404 | + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities) | ||
405 | + elif regulatedType == "TU": | ||
406 | + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "TU", idEntities) | ||
407 | + else: | ||
408 | + print("WARNING! Unknown entity type") | ||
409 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", | ||
410 | + idInteractions, hashInteractions) | ||
411 | + idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities) | ||
412 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, | ||
413 | + idInteractionsEffect, | ||
414 | + hashInteractionsEffect) | ||
415 | + | ||
416 | + saveFiles(filename, hashEntities, hashInteractions, line, effect=False) | ||
417 | + saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True) |
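418 | + | ||
419 | +# Usage sketch (illustration only, kept commented out so the script's behavior is unchanged): | ||
420 | +# how the left-attributive pattern matches a transformed sentence. The 'effects' | ||
421 | +# alternation here is a hypothetical three-item stand-in for the one built from | ||
422 | +# the normalized-effects dictionary. | ||
423 | +# import re | ||
424 | +# effects = "regulated|repressed|activated" | ||
425 | +# regexDemo = re.compile( | ||
426 | +#     r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+' | ||
427 | +#     r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + r'))\|[^|]+\|TF)') | ||
428 | +# line = "argO|argO|GENE is|be|VBZ an|a|DT ArgP-regulated|ArgP-regulated|TF gene|gene|NN" | ||
429 | +# m = regexDemo.search(line) | ||
430 | +# # Expected groups: regulated='argO|argO|GENE', regulator='ArgP-regulated|ArgP-regulated|TF', effect='regulated' | ||
431 | +# print(m.group('regulated'), m.group('regulator'), m.group('effect')) |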
ri-autoregulation-extraction-v01.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +from optparse import OptionParser | ||
3 | +import sys | ||
4 | +import os | ||
5 | +import json | ||
6 | +import operator | ||
7 | +import re | ||
8 | +from general_functions import getTypeRegulation | ||
9 | +from nltk.corpus import words | ||
10 | + | ||
11 | +__author__ = 'CMendezC' | ||
12 | + | ||
13 | + | ||
14 | +# Objective: obtain predicted ris from autoregulation sentences, | ||
15 | +# such as ArgP protein represses its own synthesis | ||
16 | +# Input format: transformed format. | ||
17 | +# WARNING: Only one sentence per line | ||
18 | + | ||
19 | +# Parameters: | ||
20 | +#  1) --inputPath Input path | ||
21 | +#  2) --inputFile Input file | ||
22 | +#  3) --outputPath Output path | ||
23 | +#  4) --diccPath Dictionary path | ||
24 | +#  5) --diccEffect File with normalized effects | ||
25 | + | ||
26 | +# Options not used by this script (kept for reference): | ||
27 | +#  --diccFile JSON file with entity dictionaries; --diccEPAth Path for diccEffect | ||
28 | +#  --format Output format: standoff, tabs | ||
29 | + | ||
30 | +# Output: | ||
31 | +# 1) File with predicted RIs combined with existing files. | ||
32 | +# Format standoff: | ||
33 | +# T1	TF 0 0 ArgP | ||
34 | +# T2	GENE 0 0 argP (the TF name ArgP is mapped to the gene name argP) | ||
35 | +# R1	Interaction.activator Target:T2 Agent:T1 | ||
36 | +# Sentence ArgP protein represses its own synthesis | ||
37 | +# The FimZ transcription factor activates this promoter directly , | ||
38 | +# and it also positively regulates the transcription of its own gene | ||
39 | +# FimZ is known to regulate the expression of its own gene positively | ||
40 | +# FimZ also positively regulates its own transcription | ||
41 | +# ArgP protein represses its own synthesis | ||
42 | +# ArgP both represses its own transcription | ||
43 | +# ArgP protein represses its own synthesis | ||
44 | +# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT | ||
45 | +# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN | ||
46 | + | ||
47 | +# Execution | ||
48 | +# python3 ri-autoregulation-extraction-v01.py | ||
49 | +# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences | ||
50 | +# --inputFile dataSet_OnlyRI_sentences.auto.1017.txt | ||
51 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs | ||
52 | +# --diccPath /home/cmendezc/terminologicalResources | ||
53 | +# --diccEffect normalized_Effects.json | ||
54 | +# python3 ri-autoregulation-extraction-v01.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences --inputFile dataSet_OnlyRI_sentences.auto.1017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json | ||
55 | + | ||
56 | +########################################################### | ||
57 | +# MAIN PROGRAM # | ||
58 | +########################################################### | ||
59 | + | ||
60 | +def getPosWord(wordPos, endPos, text, termList): | ||
61 | + offsetStart = 0 | ||
62 | + wordNum = 0 | ||
63 | + listText = text.split() | ||
64 | + for w in listText: | ||
65 | + # if filenameBefore.find('000-2') > -1: | ||
66 | + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos)) | ||
67 | + if wordNum >= int(wordPos): | ||
68 | + # for tok in word.split(): | ||
69 | + for t in termList: | ||
70 | + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords): | ||
71 | + if w == t: | ||
72 | + return [w, offsetStart, offsetStart + len(w) - 1] | ||
73 | + #else: | ||
74 | + wordNum += 1 | ||
75 | + offsetStart += len(w) + 1 | ||
76 | + if wordNum > int(endPos): | ||
77 | + return None | ||
78 | + return None | ||
79 | + | ||
80 | +def getIdEntity(aList, etype, idE): | ||
81 | + entity = aList[0] | ||
82 | + if etype == "EFFECT": | ||
83 | + normalizedEffect = entity | ||
84 | + #print("EFFECT: {}".format(entity)) | ||
85 | + if entity in hashNormalizedEffects: | ||
86 | + normalizedEffect = hashNormalizedEffects[entity] | ||
87 | + etype += "." + normalizedEffect | ||
88 | + #print("etype: {}".format(etype)) | ||
89 | + entityPosStart = aList[1] | ||
90 | + entityPosEnd = aList[2] | ||
91 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
92 | + #print("keyEntity: {}".format(keyEntity)) | ||
93 | + if keyEntity not in hashEntities: | ||
94 | + idE += 1 | ||
95 | + idEntity = "T{}".format(idE) | ||
96 | + hashEntities[keyEntity] = idEntity | ||
97 | + #print("New entity {}: {}".format(idEntity, keyEntity)) | ||
98 | + return idEntity, idE | ||
99 | + else: | ||
100 | + idEntity = hashEntities[keyEntity] | ||
101 | + return idEntity, idE | ||
102 | + | ||
103 | +def getIdInteraction(regulator, regulated, effect, idI, hashInt): | ||
104 | + #print("hashInt: {}".format(hashInt)) | ||
105 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
106 | + if keyInteraction not in hashInt: | ||
107 | + idI += 1 | ||
108 | + idInteraction = "R{}".format(idI) | ||
109 | + hashInt[keyInteraction] = idInteraction | ||
110 | + #print("New interaction {}: {}".format(idInteraction, keyInteraction)) | ||
111 | + #return idInteraction, idI | ||
112 | + else: | ||
113 | + idInteraction = hashInt[keyInteraction] | ||
114 | + return idInteraction, idI | ||
115 | + | ||
116 | +def saveFiles(filename, hashE, hashI, s, effect): | ||
117 | + if effect: | ||
118 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
119 | + else: | ||
120 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
121 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File: | ||
122 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File: | ||
123 | + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)): | ||
124 | + aList = k.split() | ||
125 | + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3])) | ||
126 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File: | ||
127 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File: | ||
128 | + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)): | ||
129 | + aList = k.split() | ||
130 | + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0])) | ||
131 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile: | ||
132 | + txtFile.write(s) | ||
133 | + | ||
134 | +def loadFileEntities(filename, outputPath, hashTemp): | ||
135 | + #print("Start loadFileEntities") | ||
136 | + idE = 1 | ||
137 | + try: | ||
138 | + f = filename[:filename.rfind(".")] + ".a1" | ||
139 | + # print("file entities: {}".format(f)) | ||
140 | + with open(os.path.join(outputPath, f), mode="r") as a1File: | ||
141 | + for line in a1File: | ||
142 | + line = line.strip('\n') | ||
143 | + listLine1 = line.split('\t') | ||
144 | + listLine2 = listLine1[1].split(' ') | ||
145 | + etype = listLine2[0] | ||
146 | + entityPosStart = listLine2[1] | ||
147 | + entityPosEnd = listLine2[2] | ||
148 | + entity = listLine1[2] | ||
149 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
150 | + idEntity = listLine1[0] | ||
151 | + if keyEntity not in hashTemp: | ||
152 | + hashTemp[keyEntity] = idEntity | ||
153 | + if int(idEntity[1:]) > idE: | ||
154 | + idE = int(idEntity[1:]) | ||
155 | + except IOError: | ||
156 | + print("IOError file: {}".format(os.path.join(outputPath, f))) | ||
157 | + # idE = 1 | ||
158 | + return idE | ||
159 | + | ||
160 | +def loadFileInteractions(filename, outputPath, hashTemp): | ||
161 | + #print("Start loadFileInteractions") | ||
162 | + idI = 1 | ||
163 | + try: | ||
164 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File: | ||
165 | + for line in a2File: | ||
166 | + #print("Line a2: {}".format(line)) | ||
167 | + line = line.strip('\n') | ||
168 | + listLine1 = line.split('\t') | ||
169 | + listLine2 = listLine1[1].split(' ') | ||
170 | + regulator = listLine2[2] | ||
171 | + regulator = regulator[regulator.find(":") + 1:] | ||
172 | + regulated = listLine2[1] | ||
173 | + regulated = regulated[regulated.find(":") + 1:] | ||
174 | + effect = listLine2[0] | ||
175 | + effect = effect[effect.find(".") + 1:] | ||
176 | + idInteraction = listLine1[0] | ||
177 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
178 | + if keyInteraction not in hashTemp: | ||
179 | + hashTemp[keyInteraction] = idInteraction | ||
180 | + if int(idInteraction[1:]) > idI: | ||
181 | + idI = int(idInteraction[1:]) | ||
182 | + except IOError: | ||
183 | + print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"))) | ||
184 | + # idI = 1 | ||
185 | + return idI | ||
186 | + | ||
187 | +''' | ||
188 | +def getTypeRegulation(effect_group, posini, sent, type_sent): | ||
189 | + # To change regulation effect in such as: | ||
190 | + # negative regulator --> repressor | ||
191 | + # positively regulates --> activator | ||
192 | + effect_ret = effect_group | ||
193 | + #listEff = effect_ret.split('|') | ||
194 | + | ||
195 | + if type_sent == "tra": | ||
196 | + regexTypeEffectPosi = re.compile(r'(?<=positive\|(RB|JJ) )' + effect_ret) | ||
197 | + regexTypeEffectNega = re.compile(r'(?<=negative\|(RB|JJ) )' + effect_ret) | ||
198 | + if regexTypeEffectPosi.search(sent, posini - 12): | ||
199 | +            # Probably unnecessary: effect_ret = "activator|{}|{}".format(listEff[1], listEff[2]) | ||
200 | + effect_ret = "activator" | ||
201 | + print("Change regulation effect: {}".format(sent)) | ||
202 | + elif regexTypeEffectNega.search(sent, posini - 12): | ||
203 | +            # Probably unnecessary: effect_ret = "repressor|{}|{}".format(listEff[1], listEff[2]) | ||
204 | + effect_ret = "repressor" | ||
205 | + print("Change regulation effect: {}".format(sent)) | ||
206 | + return effect_ret | ||
207 | +''' | ||
208 | + | ||
209 | +def getRealPos(posStart, posEnd, lin): | ||
210 | +    return (posStart, posEnd)  # placeholder: token offsets are passed through unchanged | ||
211 | + | ||
212 | +def getRI(r, l): | ||
213 | + regulator = r.group('regulator') | ||
214 | + regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l) | ||
215 | + # We change TF name to GENE name | ||
216 | + listRegulator = regulator.split('|') | ||
217 | + regulatorWord = listRegulator[0] | ||
218 | + regulated = regulatorWord[0].lower()+regulatorWord[1:] | ||
219 | + regulated += "|{}|GENE".format(regulated) | ||
220 | + regulatedPos = getRealPos(0, 0, l) | ||
221 | + effect = r.group('effect') | ||
222 | + # print("effect from group: {}".format(effect)) | ||
223 | + effectPos = getRealPos(r.start('effect'), r.end('effect'), l) | ||
224 | + | ||
225 | + # To change regulation effect in: | ||
226 | + # negative regulator --> repressor | ||
227 | + # positively regulates --> activator | ||
228 | + effect = getTypeRegulation(effect, r.start('effect'), l, "tra") | ||
229 | + | ||
230 | + return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]), | ||
231 | + regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]), | ||
232 | + effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l] | ||
233 | + | ||
234 | +if __name__ == "__main__": | ||
235 | + # Parameter definition | ||
236 | + parser = OptionParser() | ||
237 | + parser.add_option("--inputPath", dest="inputPath", | ||
238 | + help="Input path", metavar="PATH") | ||
239 | + parser.add_option("--inputFile", dest="inputFile", | ||
240 | + help="Input file", metavar="FILE") | ||
241 | + parser.add_option("--outputPath", dest="outputPath", | ||
242 | + help="Output path", metavar="PATH") | ||
243 | + parser.add_option("--diccPath", dest="diccPath", | ||
244 | + help="Path to read dictionaries", metavar="PATH") | ||
245 | + parser.add_option("--diccEffect", dest="diccEffect", | ||
246 | + help="File with normalized effects", metavar="FILE") | ||
247 | + | ||
248 | + (options, args) = parser.parse_args() | ||
249 | + #if len(args) > 0: | ||
250 | + # parser.error("None parameter entered.") | ||
251 | + # sys.exit(1) | ||
252 | + | ||
253 | + # Printing parameter values | ||
254 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
255 | + print("Input path: " + str(options.inputPath)) | ||
256 | + print("Input file: " + str(options.inputFile)) | ||
257 | + print("Output path: " + str(options.outputPath)) | ||
258 | + print("Path to read dictionaries: " + str(options.diccPath)) | ||
259 | + print("File with normalized effects: " + str(options.diccEffect)) | ||
260 | + | ||
261 | + # Loading normalized effects | ||
262 | + print('Loading normalized effects (all)...') | ||
263 | + hashNormalizedEffects = {} | ||
264 | + with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile: | ||
265 | + hashNormalizedEffects = json.load(diccFile) | ||
266 | + listEffects = [] | ||
267 | + for eff in hashNormalizedEffects.keys(): | ||
268 | + listEffects.append(eff) | ||
269 | + effects = "|".join(listEffects) | ||
270 | + #print("Effects: {}".format(effects)) | ||
271 | + | ||
272 | + files = {} | ||
273 | + hashEntities = {} | ||
274 | + hashInteractions = {} | ||
275 | + hashInteractionsEffect = {} | ||
276 | + idEntities = 1 | ||
277 | + idInteractions = 1 | ||
278 | + idInteractionsEffect = 1 | ||
279 | + | ||
280 | + # The FimZ transcription factor activates this promoter directly , | ||
281 | + # and it also positively regulates the transcription of its own gene | ||
282 | + # FimZ is known to regulate the expression of its own gene positively | ||
283 | + # FimZ also positively regulates its own transcription | ||
284 | + # ArgP protein represses its own synthesis | ||
285 | + # ArgP both represses its own transcription | ||
286 | + # ArgP protein represses its own synthesis | ||
287 | + # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT | ||
288 | + # of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN | ||
289 | + regexAutoRI = re.compile( | ||
290 | + # r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]\s){,4}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
291 | + r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
292 | + #r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^(TF)\s]+\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
293 | + #r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^T][^F]\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT') | ||
294 | + | ||
295 | + filename = options.inputFile | ||
296 | + hashEntities = {} | ||
297 | + hashInteractions = {} | ||
298 | + hashInteractionsEffect = {} | ||
299 | + idEntities = 1 | ||
300 | + idInteractions = 1 | ||
301 | + idInteractionsEffect = 1 | ||
302 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
303 | + idEntities = loadFileEntities(filename, outputPath, hashEntities) | ||
304 | + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect) | ||
305 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
306 | + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions) | ||
307 | + | ||
308 | + listRIs = [] | ||
309 | + # print("Read autoregulation file") | ||
310 | + with open(os.path.join(options.inputPath, options.inputFile)) as iFile: | ||
311 | + for line in iFile: | ||
312 | + line = line.rstrip('\n') | ||
313 | +            print("Searching for autoregulation") | ||
314 | + result = regexAutoRI.search(line) | ||
315 | + #print("result: {}".format(result)) | ||
316 | + if result: | ||
317 | + lineTemp = result.string[result.end('regulator'):result.end(0)] | ||
318 | + # print("lineTemp: {}".format(lineTemp)) | ||
319 | + result2 = regexAutoRI.search(lineTemp) | ||
320 | + if result2: | ||
321 | +                    print("Autoregulation: regulator {} effect {}".format(result2.group('regulator'), result2.group('effect'))) | ||
322 | + listRIs.append(getRI(result2, line)) | ||
323 | + print("listRIs: {}".format(listRIs)) | ||
324 | +                else: | ||
325 | +                    print("Autoregulation: regulator {} effect {}".format(result.group('regulator'), result.group('effect'))) | ||
326 | +                    listRIs.append(getRI(result, line)) | ||
327 | +                    print("listRIs: {}".format(listRIs)) | ||
328 | + | ||
329 | + | ||
330 | + for ri in listRIs: | ||
331 | + #print("ri: {}".format(ri)) | ||
332 | + if len(ri) != 4: | ||
333 | +            print("WARNING! Malformed RI (expected 4 elements)") | ||
334 | +            sys.exit(1) | ||
335 | + regulator = ri[0] | ||
336 | + regulated = ri[1] | ||
337 | + effect = ri[2] | ||
338 | + line = ri[3] | ||
339 | + | ||
340 | + listElem = regulator.split('|') | ||
341 | + regulatorWord = listElem[0] | ||
342 | + regulatorType = listElem[2] | ||
343 | + regulatorStart = listElem[3] | ||
344 | + regulatorEnd = listElem[4] | ||
345 | + | ||
346 | + listElem = regulated.split('|') | ||
347 | + regulatedWord = listElem[0] | ||
348 | + regulatedType = listElem[2] | ||
349 | + regulatedStart = listElem[3] | ||
350 | + regulatedEnd = listElem[4] | ||
351 | + | ||
352 | + listElem = effect.split('|') | ||
353 | + effectWord = listElem[0] | ||
354 | + effectType = "EFFECT" | ||
355 | + effectStart = listElem[1] | ||
356 | + effectEnd = listElem[2] | ||
357 | + | ||
358 | + idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities) | ||
359 | + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities) | ||
360 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", | ||
361 | + idInteractions, hashInteractions) | ||
362 | + idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities) | ||
363 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, | ||
364 | + idInteractionsEffect, | ||
365 | + hashInteractionsEffect) | ||
366 | + | ||
367 | + saveFiles(filename, hashEntities, hashInteractions, line, effect=False) | ||
368 | + saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True) |
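369 | + | ||
370 | +# Usage sketch (illustration only, kept commented out so the script's behavior is unchanged): | ||
371 | +# the autoregulation pattern applied to the OxyR example from the header. The | ||
372 | +# 'effects' alternation is a hypothetical one-item stand-in for the dictionary-built one. | ||
373 | +# import re | ||
374 | +# effects = "regulator" | ||
375 | +# regexDemo = re.compile( | ||
376 | +#     r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + r'))\|[^|]+\|EFFECT' | ||
377 | +#     r'\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
378 | +# line = ("OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT " | ||
379 | +#         "of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN") | ||
380 | +# m = regexDemo.search(line) | ||
381 | +# # Expected groups: regulator='OxyR|OxyR|TF', effect='regulator' | ||
382 | +# print(m.group('regulator'), m.group('effect')) |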
ri-openie-extraction-v02.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +from optparse import OptionParser | ||
3 | +import sys | ||
4 | +import os | ||
5 | +import json | ||
6 | +import operator | ||
7 | +from general_functions import getTypeRegulation | ||
8 | +import re | ||
9 | +from nltk.corpus import words | ||
10 | + | ||
11 | +__author__ = 'CMendezC' | ||
12 | + | ||
13 | + | ||
14 | +# Objective: obtain predicted ris from triplets extracted by OpenIE Stanford CoreNLP | ||
15 | +# Input format: tab-separated OpenIE triplet lines (see the column-layout note at the end of this file). | ||
16 | +# WARNING: Only one sentence per line | ||
17 | + | ||
18 | +# Parameters: | ||
19 | +#  1) --inputPath Input path | ||
20 | +#  2) --inputFile Input file | ||
21 | +#  3) --outputPath Output path | ||
22 | +#  4) --diccPath Dictionary path | ||
23 | +#  5) --diccFile JSON file with entity dictionaries | ||
24 | +#  6) --diccEffect File with normalized effects | ||
25 | +#  7) --format Output format: standoff, tabs | ||
26 | +#  8) --diccEPAth Path to read normalized effects | ||
27 | + | ||
28 | +# Output: | ||
29 | +# 1) File with predicted ris. | ||
30 | +# Format standoff: | ||
31 | +# T1 TF 0 0 MetR | ||
32 | +# T2 TU 0 0 metH | ||
33 | +# T3 GENE 0 0 metH | ||
34 | +# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors | ||
35 | +# T2 Growth_condition 150 179 switch rich to minimal medium | ||
36 | +# R1 Interaction.activator Target:T3 Agent:T1 | ||
37 | +# R2 Interaction.activator Target:T2 Agent:T1 | ||
38 | + | ||
39 | +# Execution | ||
40 | +# python3.4 ri-openie-extraction.py | ||
41 | +# --inputFile /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris/predicted-ris.reverb | ||
42 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/processing-ris | ||
43 | +# --diccPath /home/cmendezc/terminologicalResources | ||
44 | +# --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json | ||
45 | +# --diccEffect normalized_Effects_Type.json | ||
46 | +# --format standoff | ||
47 | + | ||
48 | +########################################################### | ||
49 | +# MAIN PROGRAM # | ||
50 | +########################################################### | ||
51 | + | ||
52 | +def getPosWord(wordPos, endPos, text, termList, type_entity=""): | ||
53 | + #print("GETPOSWORD wordPOs {}".format(wordPos)) | ||
54 | + offsetStart = 0 | ||
55 | + wordNum = 0 | ||
56 | + listText = text.split() | ||
57 | + for w in listText: | ||
58 | + # if filenameBefore.find('000-2') > -1: | ||
59 | + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos)) | ||
60 | + if wordNum >= int(wordPos): | ||
61 | + # for tok in word.split(): | ||
62 | + for t in termList: | ||
63 | + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords): | ||
64 | + if w == t: | ||
65 | + if type_entity == "EFFECT": | ||
66 | + # To change regulation effect in: | ||
67 | + # negative regulator --> repressor | ||
68 | + # positively regulates --> activator | ||
69 | + print("text: {}".format(text)) | ||
70 | + new_w = getTypeRegulation(w, int(wordPos), text, "word") | ||
71 | + return [new_w, offsetStart, offsetStart + len(w) - 1] | ||
72 | + else: | ||
73 | + return [w, offsetStart, offsetStart + len(w) - 1] | ||
74 | + #else: | ||
75 | + wordNum += 1 | ||
76 | + offsetStart += len(w) + 1 | ||
77 | + if wordNum > int(endPos): | ||
78 | + return None | ||
79 | + return None | ||
80 | + | ||
81 | + | ||
82 | +def getIdEntity(aList, etype, idE): | ||
83 | + entity = aList[0] | ||
84 | + if etype == "EFFECT": | ||
85 | + normalizedEffect = entity | ||
86 | + # print("EFFECT: {}".format(entity)) | ||
87 | + if entity in hashEffects: | ||
88 | + normalizedEffect = hashEffects[entity] | ||
89 | + etype += "." + normalizedEffect | ||
90 | + # print("EFFECT: {}".format(entity)) | ||
91 | + entityPosStart = aList[1] | ||
92 | + entityPosEnd = aList[2] | ||
93 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
94 | + #if filenameBefore.find('061-02') > -1: | ||
95 | + # print("keyEntity: {}".format(keyEntity)) | ||
96 | + # print("idE: {}".format(idE)) | ||
97 | + # print("hashEntities: {}".format(hashEntities)) | ||
98 | + if keyEntity not in hashEntities: | ||
99 | + idE += 1 | ||
100 | + idEntity = "T{}".format(idE) | ||
101 | + #if filenameBefore.find('061-02') > -1: | ||
102 | + # print("idEntity not in hashEntities: {}".format(keyEntity)) | ||
103 | + # print("idE not in hashEntities: {}".format(idE)) | ||
104 | + hashEntities[keyEntity] = idEntity | ||
105 | + #print("New entity {}: {}".format(idEntity, keyEntity)) | ||
106 | + return idEntity, idE | ||
107 | + else: | ||
108 | + idEntity = hashEntities[keyEntity] | ||
109 | + return idEntity, idE | ||
110 | + | ||
111 | + | ||
112 | +def getIdInteraction(regulator, regulated, effect, idI, hashInt): | ||
113 | + #print("hashInt: {}".format(hashInt)) | ||
114 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
115 | + if keyInteraction not in hashInt: | ||
116 | + idI += 1 | ||
117 | + idInteraction = "R{}".format(idI) | ||
118 | + hashInt[keyInteraction] = idInteraction | ||
119 | + #print("New interaction {}: {}".format(idInteraction, keyInteraction)) | ||
120 | + #return idInteraction, idI | ||
121 | + else: | ||
122 | + idInteraction = hashInt[keyInteraction] | ||
123 | + return idInteraction, idI | ||
124 | + | ||
125 | + | ||
126 | +def saveFiles(filename, hashE, hashI, s, effect): | ||
127 | + if effect: | ||
128 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
129 | + else: | ||
130 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
131 | +    with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="w") as a1File:  # note: 'file' is the global path set in the main loop | ||
132 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File: | ||
133 | + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)): | ||
134 | + aList = k.split() | ||
135 | + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3])) | ||
136 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="w") as a2File: | ||
137 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File: | ||
138 | + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)): | ||
139 | + aList = k.split() | ||
140 | + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0])) | ||
141 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".txt"), mode="w") as txtFile: | ||
142 | + txtFile.write(s) | ||
143 | + | ||
144 | +def loadFileEntities(filename, outputPath, hashTemp): | ||
145 | + idE = 1 | ||
146 | + try: | ||
147 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="r") as a1File: | ||
148 | + for line in a1File: | ||
149 | + line = line.strip('\n') | ||
150 | + listLine1 = line.split('\t') | ||
151 | + listLine2 = listLine1[1].split(' ') | ||
152 | + etype = listLine2[0] | ||
153 | + entityPosStart = listLine2[1] | ||
154 | + entityPosEnd = listLine2[2] | ||
155 | + entity = listLine1[2] | ||
156 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
157 | + idEntity = listLine1[0] | ||
158 | + if keyEntity not in hashTemp: | ||
159 | + hashTemp[keyEntity] = idEntity | ||
160 | + if int(idEntity[1:]) > idE: | ||
161 | + idE = int(idEntity[1:]) | ||
162 | + except IOError: | ||
163 | + print("IOError file, idEntity starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a1"))) | ||
164 | + # idE = 1 | ||
165 | + return idE | ||
166 | + | ||
167 | +def loadFileInteractions(filename, outputPath, hashTemp): | ||
168 | + idI = 1 | ||
169 | + try: | ||
170 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="r") as a2File: | ||
171 | + for line in a2File: | ||
172 | + #print("Line a2: {}".format(line)) | ||
173 | + line = line.strip('\n') | ||
174 | + listLine1 = line.split('\t') | ||
175 | + listLine2 = listLine1[1].split(' ') | ||
176 | + regulator = listLine2[2] | ||
177 | + regulator = regulator[regulator.find(":") + 1:] | ||
178 | + regulated = listLine2[1] | ||
179 | + regulated = regulated[regulated.find(":") + 1:] | ||
180 | + effect = listLine2[0] | ||
181 | + effect = effect[effect.find(".") + 1:] | ||
182 | + idInteraction = listLine1[0] | ||
183 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
184 | + if keyInteraction not in hashTemp: | ||
185 | + hashTemp[keyInteraction] = idInteraction | ||
186 | + if int(idInteraction[1:]) > idI: | ||
187 | + idI = int(idInteraction[1:]) | ||
188 | + except IOError: | ||
189 | + print("IOError file, idInteraction starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a2"))) | ||
190 | + # idI = 1 | ||
191 | + return idI | ||
192 | + | ||
193 | +if __name__ == "__main__": | ||
194 | + # Parameter definition | ||
195 | + parser = OptionParser() | ||
196 | + parser.add_option("--inputPath", dest="inputPath", | ||
197 | + help="Input path", metavar="PATH") | ||
198 | + parser.add_option("--inputFile", dest="inputFile", | ||
199 | + help="Input file", metavar="FILE") | ||
200 | + parser.add_option("--outputPath", dest="outputPath", | ||
201 | + help="Output path", metavar="PATH") | ||
202 | + #parser.add_option("--outputFile", dest="outputFile", | ||
203 | + #help="Output file", metavar="FILE") | ||
204 | + parser.add_option("--diccPath", dest="diccPath", | ||
205 | + help="Path to read dictionaries", metavar="PATH") | ||
206 | + parser.add_option("--diccFile", dest="diccFile", | ||
207 | + help="JSON file with entity dictionaries", metavar="FILE") | ||
208 | + parser.add_option("--diccEffect", dest="diccEffect", | ||
209 | + help="File with normalized effects", metavar="FILE") | ||
210 | + parser.add_option("--format", dest="format", | ||
211 | + help="Output format: standoff", metavar="TEXT") | ||
212 | +    parser.add_option("--diccEPAth", dest="diccEPAth", | ||
213 | +                      help="Path to read normalized effects", metavar="PATH") | ||
214 | + | ||
215 | + (options, args) = parser.parse_args() | ||
216 | + if len(args) > 0: | ||
217 | +        parser.error("No positional arguments expected.") | ||
218 | + sys.exit(1) | ||
219 | + | ||
220 | + # Printing parameter values | ||
221 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
222 | + print("Input path: " + str(options.inputPath)) | ||
223 | + print("Input file: " + str(options.inputFile)) | ||
224 | + print("Output path: " + str(options.outputPath)) | ||
225 | + #print("Output file: " + str(options.outputFile)) | ||
226 | + print("Path to read dictionaries: " + str(options.diccPath)) | ||
227 | + print("JSON file with entity dictionaries: " + str(options.diccFile)) | ||
228 | + print("Path to read normalized effects: " + str(options.diccEPAth)) | ||
229 | + print("File with normalized effects: " + str(options.diccEffect)) | ||
230 | + print("Output format: " + str(options.format)) | ||
231 | + | ||
232 | + regularWords = words.words('en') | ||
233 | + | ||
234 | + print('Loading dictionaries...') | ||
235 | + with open(os.path.join(options.diccPath, options.diccFile)) as diccFile: | ||
236 | + hashDicc = json.load(diccFile) | ||
237 | + | ||
238 | + # hashTermFiles = hashDicc["hashTermFiles"] | ||
239 | + # hashTerms = hashDicc["hashTerms"] | ||
240 | + | ||
241 | + # for key in hashTermFiles.keys(): | ||
242 | + # for f in hashTermFiles[key]: | ||
243 | + # # print('File: ' + f) | ||
244 | + # with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile: | ||
245 | + # for line in iFile: | ||
246 | + # line = line.strip('\n') | ||
247 | + # line = line.replace(' ', '-') | ||
248 | + # if line not in hashTerms[key]: | ||
249 | + # hashTerms[key].append(line) | ||
250 | + # # if options.termLower: | ||
251 | + # # hashTerms[key].append(line.lower()) | ||
252 | + # # if options.termCapitalize: | ||
253 | + # # hashTerms[key].append(line.capitalize()) | ||
254 | + # print(' Terms read {} size: {}'.format(key, len(hashTerms[key]))) | ||
255 | + | ||
256 | + # Loading normalized effects | ||
257 | + print('Loading normalized effects...') | ||
258 | + with open(os.path.join(options.diccEPAth, options.diccEffect)) as diccFile: | ||
259 | + hashEffects = json.load(diccFile) | ||
260 | + | ||
261 | + files = {} | ||
262 | + hashEntities = {} | ||
263 | + hashInteractions = {} | ||
264 | + hashInteractionsEffect = {} | ||
265 | + idEntities = 1 | ||
266 | + idInteractions = 1 | ||
267 | + idInteractionsEffect = 1 | ||
268 | + filenameBefore = '' | ||
269 | + regexNumFile = re.compile(r'_([0-9]+)[.-]') | ||
270 | + numFile = "" | ||
271 | + inumFile = 0 | ||
272 | + hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []} | ||
273 | + | ||
274 | + with open(os.path.join(options.inputPath, options.inputFile)) as iFile: | ||
275 | + for line in iFile: | ||
276 | + line = line.rstrip('\n') | ||
277 | + listLine = line.split('\t') | ||
278 | + file = listLine[0] | ||
279 | + filename = file.split("/")[-1] | ||
280 | + filename = filename[:-4] | ||
281 | + if filename not in files: | ||
282 | + # New file, that is, new sentence | ||
283 | + files[filename] = 1 | ||
284 | + if len(files) > 1: | ||
285 | + if len(hashEntities) > 0: | ||
286 | + #if filenameBefore.find('061-02') > -1: | ||
287 | + # print("filenameBefore: {}".format(filenameBefore)) | ||
288 | + # print("Save hashEntities: {}".format(hashEntities)) | ||
289 | + # print("Save hashInteractions: {}".format(hashInteractions)) | ||
290 | + # print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect)) | ||
291 | + saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False) | ||
292 | + saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True) | ||
293 | + filenameBefore = filename | ||
294 | + hashEntities = {} | ||
295 | + hashInteractions = {} | ||
296 | + hashInteractionsEffect = {} | ||
297 | + idEntities = 1 | ||
298 | + idInteractions = 1 | ||
299 | + idInteractionsEffect = 1 | ||
300 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
301 | + idEntities = loadFileEntities(filename, outputPath, hashEntities) | ||
302 | + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect) | ||
303 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
304 | + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions) | ||
305 | + result = regexNumFile.search(filenameBefore) | ||
306 | + if result: | ||
307 | + inumFile = int(result.group(1)) | ||
308 | + numFile = str(inumFile) | ||
309 | + print("Numfile: {}".format(numFile)) | ||
310 | + else: | ||
311 | + print("WARNING: numfile not found in filename") | ||
312 | + hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []} | ||
313 | + if numFile in hashDicc: | ||
314 | + hashTemp = hashDicc[numFile] | ||
315 | + #print("hashDicc[numFile]: {}".format(hashTemp)) | ||
316 | + for k, v in hashTemp.items(): | ||
317 | + if v == "TF": | ||
318 | + # print("Verifiying TF") | ||
319 | + if "TF" in hashTerms: | ||
320 | + # print(" TF {}".format(k)) | ||
321 | + hashTerms["TF"].append(k) | ||
322 | + else: | ||
323 | + hashTerms["TF"] = [k] | ||
324 | + elif v == "GENE": | ||
325 | + if "GENE" in hashTerms: | ||
326 | + hashTerms["GENE"].append(k) | ||
327 | + else: | ||
328 | + hashTerms["GENE"] = [k] | ||
329 | + elif v == "TU": | ||
330 | + if "TU" in hashTerms: | ||
331 | + hashTerms["TU"].append(k) | ||
332 | + else: | ||
333 | + hashTerms["TU"] = [k] | ||
334 | + elif v == "EFFECT": | ||
335 | + if "EFFECT" in hashTerms: | ||
336 | + hashTerms["EFFECT"].append(k) | ||
337 | + else: | ||
338 | + hashTerms["EFFECT"] = [k] | ||
339 | + else: | ||
340 | + print("WARNING: entity not found in dictionaries") | ||
341 | + else: | ||
342 | + print("WARNING: numfile not found in dictionaries") | ||
343 | + #if filename.find('061-02') > -1: | ||
344 | + # print("filename: {}".format(filename)) | ||
345 | + # print("Load hashEntities: {}".format(hashEntities)) | ||
346 | + # print("Load hashInteractions: {}".format(hashInteractions)) | ||
347 | + # print("Load hashInteractionsEffect: {}".format(hashInteractionsEffect)) | ||
348 | + | ||
349 | + wordA = listLine[2] | ||
350 | + wordB = listLine[3] | ||
351 | + wordC = listLine[4] | ||
352 | + startA = listLine[5] | ||
353 | + endA = listLine[6] | ||
354 | + startB = listLine[7] | ||
355 | + endB = listLine[8] | ||
356 | + startC = listLine[9] | ||
357 | + endC = listLine[10] | ||
358 | + sent = listLine[12] | ||
359 | + lemmaA = listLine[2] | ||
360 | + lemmaB = listLine[3] | ||
361 | + lemmaC = listLine[4] | ||
362 | + | ||
363 | + # Return [tok, offsetStart, offsetEnd ] | ||
364 | + # print("hashTerms[TF]: {}".format(hashTerms["TF"])) | ||
365 | + listRegulator = getPosWord(startA, endA, sent, hashTerms["TF"]) | ||
366 | + if listRegulator is not None: | ||
367 | + #if filenameBefore.find('061-02') > -1: | ||
368 | + # print(">> Regulator found: {}".format(listRegulator[0])) | ||
369 | + listRegulated = getPosWord(startC, endC, sent, hashTerms["GENE"]) | ||
370 | + if listRegulated is not None: | ||
371 | + #if filenameBefore.find('061-02') > -1: | ||
372 | + # print(">> Regulated GENE found: {}".format(listRegulated[0])) | ||
373 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
374 | + idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities) | ||
375 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
376 | + #print("Review EFFECT") | ||
377 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
378 | + if listEffect is not None: | ||
379 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
380 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
381 | + else: | ||
382 | + listRegulated = getPosWord(startC, endC, sent, hashTerms["TU"]) | ||
383 | + if listRegulated is not None: | ||
384 | + #if filenameBefore.find('061-02') > -1: | ||
385 | + # print(">> Regulated TU found: {}".format(listRegulated[0])) | ||
386 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
387 | + idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities) | ||
388 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
389 | + #print("Review EFFECT") | ||
390 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
391 | + if listEffect is not None: | ||
392 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
393 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
394 | + else: | ||
395 | + listRegulator = getPosWord(startC, endC, sent, hashTerms["TF"]) | ||
396 | + if listRegulator is not None: | ||
397 | + #if filenameBefore.find('061-02') > -1: | ||
398 | + # print(">> Regulator found: {}".format(listRegulator[0])) | ||
399 | + listRegulated = getPosWord(startA, endA, sent, hashTerms["GENE"]) | ||
400 | + if listRegulated is not None: | ||
401 | + #if filenameBefore.find('061-02') > -1: | ||
402 | + # print(">> Regulated GENE found: {}".format(listRegulated[0])) | ||
403 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
404 | + idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities) | ||
405 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
406 | + #print("Review EFFECT") | ||
407 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
408 | + if listEffect is not None: | ||
409 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
410 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
411 | + else: | ||
412 | + listRegulated = getPosWord(startA, endA, sent, hashTerms["TU"]) | ||
413 | + if listRegulated is not None: | ||
414 | + #if filenameBefore.find('061-02') > -1: | ||
415 | + # print(">> Regulated TU found: {}".format(listRegulated[0])) | ||
416 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
417 | + idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities) | ||
418 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
419 | + #print("Review EFFECT") | ||
420 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
421 | + if listEffect is not None: | ||
422 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
423 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
424 | + if len(files) > 1: | ||
425 | + if len(hashEntities) > 0: | ||
426 | + #print("filenameBefore: {}".format(filenameBefore)) | ||
427 | + #print("Save hashEntities: {}".format(hashEntities)) | ||
428 | + #print("Save hashInteractions: {}".format(hashInteractions)) | ||
429 | + #print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect)) | ||
430 | + saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False) | ||
431 | + saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True) |
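432 | + | ||
433 | +# Input layout note (added for clarity; columns 1 and 11 are read but never used, | ||
434 | +# so no meaning is assumed for them). Each line of the OpenIE triplet file is | ||
435 | +# tab-separated and consumed above as follows: | ||
436 | +#   col 0        path of the per-sentence file (its basename names the output files) | ||
437 | +#   cols 2-4     triplet words A, B, C (B is matched against EFFECT terms; | ||
438 | +#                A and C against TF and GENE/TU terms, in either order) | ||
439 | +#   cols 5-10    start/end word positions of A, B, and C | ||
440 | +#   col 12       the sentence itself; lemmaA/B/C above reuse columns 2-4 |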
ri-openie-extraction/.gitignore
0 → 100644
1 | + |
run-several-files.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | + | ||
3 | +###### Automatic extraction of TRN from several files ###### | ||
4 | + | ||
5 | +BRIES_HOME=/myhome/bries | ||
6 | +PMIDS_HOME=/myhome/preprocessed-files | ||
7 | +# REFERENCE_HOME is unused (no evaluation is run), so neither the /reference-data-set path nor the no-reference.txt file needs to exist. | ||
8 | +REFERENCE_HOME=/myhome/reference-data-set | ||
9 | + | ||
10 | +for f in $PMIDS_HOME/original/text/*.* | ||
11 | +do | ||
12 | + FILE_NAME=$(basename "$f") | ||
13 | + FILE_NAME="${FILE_NAME%.*}" | ||
14 | + echo "File: $FILE_NAME" | ||
15 | + ./automatic-extraction-ris-gcs.sh $PMIDS_HOME/features/$FILE_NAME.tra.word.txt $PMIDS_HOME/transformed/$FILE_NAME.tra.txt $BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt $BRIES_HOME/predicted-ris-gcs Y Y FILT1 $REFERENCE_HOME no-reference.txt $BRIES_HOME/evaluation-reports no-evaluation.txt diccionario-SYNONYMS.json $PMIDS_HOME/original/tsv 1>uno-$FILE_NAME.txt 2>dos-$FILE_NAME.txt | ||
16 | +done |
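17 | + | ||
18 | +# Note: for each input file, stdout is captured in uno-<file>.txt and stderr in | ||
19 | +# dos-<file>.txt in the current directory. |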
sentence-filter_v02.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +from time import time | ||
7 | +import json | ||
8 | +import re | ||
9 | +import pandas as pd | ||
10 | + | ||
11 | +__author__ = 'CMendezC' | ||
12 | + | ||
13 | + | ||
14 | +# Objective: Filter sentences that contain specific entities. | ||
15 | +# It also extracts attributive sentences (effect-TF) | ||
16 | +# and autoregulation sentences (e.g., "regulates its own gene"). | ||
17 | +# CFMC 2022-03-08: We added updating of the tsv file with idsentence, sentence, and section (.pre.tsv) | ||
18 | +# to mark filtered sentences. | ||
19 | + | ||
20 | +# Parameters: | ||
21 | +#  1) --inputFileWord Path and filename to read feature word file. | ||
22 | +#  2) --inputFileTrans Path and filename to read transformed file. | ||
23 | +#  3) --outputPath Path to place output file. | ||
24 | +#  4) --outputFile Output file. | ||
25 | +#  5) --filter FILT1: (GENE OR TU) AND TF | ||
26 | +#             FILT2: (GENE OR TU) AND EFFECT AND TF | ||
27 | +#  6) --attrPath Path for attributive cases: ArgP-regulated genes | ||
28 | +#  7) --dicPath Path for dictionary | ||
29 | +#  8) --dicFile Dictionary file normalized_Effects.json | ||
30 | +#  9) --autoPath Path for autoregulation cases: regulates its own gene | ||
31 | +# 10) --tsvPath Path to tsv file with section, id sentence, sentence (extracted from jsonpdf) | ||
32 | + | ||
33 | +# Output: | ||
34 | +# 1) Filtered sentences. | ||
35 | +# 2) Attributive sentences | ||
36 | +# 3) Autoregulation sentences | ||
37 | + | ||
38 | + | ||
39 | +########################################################### | ||
40 | +# MAIN PROGRAM # | ||
41 | +########################################################### | ||
42 | + | ||
43 | +def getEntities(tline, filt): | ||
44 | + # FILT1: (GENE OR TU) AND TF | ||
45 | + # FILT2: (GENE OR TU) AND EFFECT AND TF | ||
46 | + entities = {} | ||
47 | + tline = tline.rstrip('\n\r ') | ||
48 | + for token in tline.split(" "): | ||
49 | + # print("Token: {}".format(token)) | ||
50 | + listElem = token.split("|") | ||
51 | + w = listElem[0] | ||
52 | + l = listElem[1] | ||
53 | + t = listElem[2] | ||
54 | + if filt == "FILT1" or filt == "FILT2": | ||
55 | + if t in ["GENE", "TU", "TF", "EFFECT"]: | ||
56 | + if w not in entities: | ||
57 | + entities[w] = t | ||
58 | + # if filt == "FILT2": | ||
59 | + # if t in ["GENE", "TU", "TF", "EFFECT"]: | ||
60 | + # if w not in entities: | ||
61 | + # entities[w] = t | ||
62 | + return entities | ||
63 | + | ||
64 | +if __name__ == "__main__": | ||
65 | + # Parameter definition | ||
66 | + parser = OptionParser() | ||
67 | + | ||
68 | + parser.add_option("--inputFileWord", dest="inputFileWord", | ||
69 | + help="Path and filename to read feature word file", metavar="PATH") | ||
70 | + parser.add_option("--inputFileTrans", dest="inputFileTrans", | ||
71 | + help="Path and filename to read transformed file", metavar="PATH") | ||
72 | + parser.add_option("--outputPath", dest="outputPath", | ||
73 | + help="Output path", metavar="PATH") | ||
74 | + parser.add_option("--outputFile", dest="outputFile", | ||
75 | + help="Output file", metavar="FILE") | ||
76 | + parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None, | ||
77 | + help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT") | ||
78 | + parser.add_option("--attrPath", dest="attrPath", | ||
79 | + help="Output path attributive sentences", metavar="PATH") | ||
80 | + parser.add_option("--dicPath", dest="dicPath", | ||
81 | + help="Output path dictionary", metavar="PATH") | ||
82 | + parser.add_option("--dicFile", dest="dicFile", | ||
83 | + help="Output file dictionary normalized_Effects.json", metavar="FILE") | ||
84 | + parser.add_option("--autoPath", dest="autoPath", | ||
85 | + help="Output path autoregulation sentences", metavar="PATH") | ||
86 | + parser.add_option("--tsvPath", dest="tsvPath", | ||
87 | + help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH") | ||
88 | + | ||
89 | + (options, args) = parser.parse_args() | ||
90 | + if len(args) > 0: | ||
91 | + parser.error("None parameters indicated.") | ||
92 | +        parser.error("No positional arguments expected.") | ||
93 | + | ||
94 | + # Printing parameter values | ||
95 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
96 | + print("Path and filename to read feature word file: " + str(options.inputFileWord)) | ||
97 | + print("Path and filename to read transformed file: " + str(options.inputFileTrans)) | ||
98 | + print("Output path: " + str(options.outputPath)) | ||
99 | + print("Output file: " + str(options.outputFile)) | ||
100 | + print("Filter: " + str(options.filter)) | ||
101 | + print("Output path attributive sentences: " + str(options.attrPath)) | ||
102 | + print("Output path autoregulation sentences: " + str(options.autoPath)) | ||
103 | + print("Output path dictionary: " + str(options.dicPath)) | ||
104 | + print("Output file dictionary normalized_Effects.json: " + str(options.dicFile)) | ||
105 | + print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath)) | ||
106 | + | ||
107 | + # Loading normalized effects | ||
108 | + # print('Loading normalized effects...') | ||
109 | + hashNormalizedEffects = {} | ||
110 | + with open(os.path.join(options.dicPath, options.dicFile)) as diccFile: | ||
111 | + hashNormalizedEffects = json.load(diccFile) | ||
112 | + listEffects = [] | ||
113 | + for eff in hashNormalizedEffects.keys(): | ||
114 | + if eff.endswith('d'): | ||
115 | + listEffects.append(eff) | ||
116 | + listEffects.append("dependent") | ||
117 | + effects = "|".join(listEffects) | ||
118 | + print("Effects: {}".format(effects)) | ||
119 | + | ||
120 | + t0 = time() | ||
121 | + count = 0 | ||
122 | + hashEntities = {} | ||
123 | + hashAttrSent = {} | ||
124 | + hashAutoSent = {} | ||
125 | + # Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene') | ||
126 | +    # We decided to extract all sentences containing effect-TF because we observed some patterns where | ||
127 | +    # "gene" does not appear; to recover those examples we employ a more general rule to separate | ||
128 | +    # attributive sentences. | ||
129 | + reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF') | ||
130 | + # We decided to extract all sentences containing autoregulation | ||
131 | + # The FimZ transcription factor activates this promoter directly , | ||
132 | + # and it also positively regulates the transcription of its own gene | ||
133 | + # FimZ is known to regulate the expression of its own gene positively | ||
134 | + # FimZ also positively regulates its own transcription | ||
135 | + # ArgP protein represses its own synthesis | ||
136 | + # ArgP both represses its own transcription | ||
137 | + # ArgP protein represses its own synthesis | ||
138 | + # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT | ||
139 | + # of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN | ||
140 | + reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ') | ||
141 | + aFilter = options.filter | ||
142 | + print(" Processing file...{}".format(options.inputFileTrans)) | ||
143 | + with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile: | ||
144 | + with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile: | ||
145 | + # CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf) | ||
146 | + file = options.inputFileTrans[options.inputFileTrans.rfind("/")+1:] | ||
147 | + file_tsv = file.replace(".tra.txt", ".pre.tsv") | ||
148 | + tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv)) | ||
149 | + print("tsv_file.shape: {}".format(tsv_file.shape)) | ||
150 | + tsv_file_filtered = tsv_file[tsv_file['status'] == 1] | ||
151 | + print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape)) | ||
152 | + # print(tsv_file_filtered.head(10)) | ||
153 | + tsv_file_new = tsv_file_filtered.reset_index(drop=True) | ||
154 | + # print(tsv_file_new.shape) | ||
155 | + # print(tsv_file_new.head(10)) | ||
156 | + i = 0 | ||
157 | + for tLine, wLine in zip(tFile, wFile): | ||
158 | + # FILT1: (GENE OR TU) AND TF | ||
159 | + # FILT2: (GENE OR TU) AND EFFECT AND TF | ||
160 | + if aFilter is not None: | ||
161 | + reGENETU = re.compile(r'(\|GENE|\|TU)') | ||
162 | + reEFFECT = re.compile(r'\|EFFECT') | ||
163 | + reTF = re.compile(r'\|TF') | ||
164 | + tCount = str(count) | ||
165 | + if aFilter == "FILT1": | ||
166 | + if not (reGENETU.search(tLine) and reTF.search(tLine)): | ||
167 | + #print("NOT FOUND") | ||
168 | + # CFMC 2022-03-08 | ||
169 | + tsv_file_new.at[i, 'status'] = 0 | ||
170 | + i += 1 | ||
171 | + continue | ||
172 | + else: | ||
173 | + #print("FOUND") | ||
174 | + oFile.write(wLine) | ||
175 | + if tCount not in hashEntities: | ||
176 | + hashEntities[tCount] = getEntities(tLine, aFilter) | ||
177 | + if reAttrSent.search(tLine): | ||
178 | + #print("ATTRIBUTIVE SENTENCE: {}".format(tLine)) | ||
179 | + if tCount not in hashAttrSent: | ||
180 | + hashAttrSent[tCount] = tLine | ||
181 | + # Autoregulation sentences | ||
182 | + if reAutoSent.search(tLine): | ||
183 | +                            # print("AUTOREGULATION SENTENCE: {}".format(tLine)) | ||
184 | + if tCount not in hashAutoSent: | ||
185 | + hashAutoSent[tCount] = tLine | ||
186 | + #print(tLine) | ||
187 | + elif aFilter == "FILT2": | ||
188 | +                        if not (reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine)): | ||
189 | +                            # CFMC 2022-03-08: flag the filtered-out sentence before skipping it | ||
190 | +                            tsv_file_new.at[i, 'status'] = 0 | ||
191 | +                            i += 1 | ||
192 | +                            continue | ||
193 | + else: | ||
194 | + oFile.write(wLine) | ||
195 | + if tCount not in hashEntities: | ||
196 | + hashEntities[tCount] = getEntities(tLine, aFilter) | ||
197 | + if reAttrSent.search(tLine): | ||
198 | + if tCount not in hashAttrSent: | ||
199 | + hashAttrSent[tCount] = tLine | ||
200 | + if reAutoSent.search(tLine): | ||
201 | + if tCount not in hashAutoSent: | ||
202 | + hashAutoSent[tCount] = tLine | ||
203 | + count += 1 | ||
204 | + i += 1 | ||
205 | + | ||
206 | + merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left') | ||
207 | + # print(merged.shape) | ||
208 | + # print(merged.head(10)) | ||
209 | + tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int) | ||
210 | + tsv_file_filtered = tsv_file[tsv_file['status'] == 1] | ||
211 | + print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape)) | ||
212 | + # print(tsv_file_filtered.head(10)) | ||
213 | + tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t') | ||
214 | + | ||
215 | + with open(os.path.join(options.outputPath, options.outputFile.replace(".txt", ".ents.json")), "w", encoding="utf-8", | ||
216 | + errors="replace") as eFile: | ||
217 | + json.dump(hashEntities, eFile) | ||
218 | + | ||
219 | + for f, sent in hashAttrSent.items(): | ||
220 | + listPath = options.inputFileTrans.split('/') | ||
221 | + fileName = listPath[-1] | ||
222 | + fileName = fileName.replace('.tra.', '.att.' + f + '.') | ||
223 | + print("Save file {}".format(fileName)) | ||
224 | + with open(os.path.join(options.attrPath, fileName), "w", encoding="utf-8", errors="replace") as aFile: | ||
225 | + aFile.write(sent) | ||
226 | + | ||
227 | + for f, sent in hashAutoSent.items(): | ||
228 | + listPath = options.inputFileTrans.split('/') | ||
229 | + fileName = listPath[-1] | ||
230 | + fileName = fileName.replace('.tra.', '.auto.' + f + '.') | ||
231 | + print("Save file {}".format(fileName)) | ||
232 | + with open(os.path.join(options.autoPath, fileName), "w", encoding="utf-8", errors="replace") as aFile: | ||
233 | + aFile.write(sent) | ||
234 | + | ||
235 | + print("Files split in: %fs" % (time() - t0)) |
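
The FILT1/FILT2 checks above boil down to three regular-expression searches over the entity-tagged line. A minimal, illustrative sketch of that logic; the tagged-line format and the `keep_sentence` helper are assumptions for illustration, not part of the script:

```python
import re

reGENETU = re.compile(r'(\|GENE|\|TU)')
reEFFECT = re.compile(r'\|EFFECT')
reTF = re.compile(r'\|TF')

def keep_sentence(t_line, a_filter):
    """Return True if the tagged sentence passes the entity filter."""
    if a_filter == "FILT1":   # (GENE OR TU) AND TF
        return bool(reGENETU.search(t_line) and reTF.search(t_line))
    if a_filter == "FILT2":   # (GENE OR TU) AND EFFECT AND TF
        return bool(reGENETU.search(t_line) and reEFFECT.search(t_line)
                    and reTF.search(t_line))
    return True               # no filter: keep everything

# Hypothetical tagged line in word|lemma|tag format:
line = "GadX|gadx|TF activates|activate|EFFECT gadA|gada|GENE"
print(keep_sentence(line, "FILT1"))  # True
print(keep_sentence(line, "FILT2"))  # True
```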
sentence-simplification/sentence-simplification-main.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | + | ||
3 | +#Validate arguments | ||
4 | +if [[ "$#" -ne 3 ]]; then | ||
5 | + echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>' | ||
6 | + exit 1 | ||
7 | +fi | ||
8 | + | ||
9 | +SCRIPT_PATH=$(cd `dirname $0` && pwd) | ||
10 | +#Set here the keyword of the sentence group to simplify. | ||
11 | +INPUT_PATH=$1 | ||
12 | +OUTPUT_INDEX_FILE_PATH=$2 | ||
13 | +ISIMP_PATH=$3 | ||
14 | +cd $SCRIPT_PATH | ||
15 | + | ||
16 | + | ||
17 | + | ||
18 | + | ||
19 | +#ANALYZE WITH iSimp | ||
20 | +echo "Analyzing with iSimp..." | ||
21 | +# clear any previous iSimp output | ||
22 | +if [ -n "$(ls -A ./iSimp_sentences/)" ]; then | ||
23 | + rm ./iSimp_sentences/* | ||
24 | +fi | ||
26 | +#cd $INPUT_PATH | ||
27 | +for j in "$INPUT_PATH"/* | ||
28 | +do | ||
29 | + echo "$j" | ||
30 | + #echo "++++simp input: $j simp output: $SCRIPT_PATH/iSimp_sentences/$(basename $j)" | ||
31 | + "$ISIMP_PATH"/simplify.sh "$j" "$SCRIPT_PATH/iSimp_sentences/$(basename "$j")" | ||
32 | +done | ||
33 | +cd $SCRIPT_PATH | ||
34 | + | ||
35 | +#CREATE THE INDEX OF SIMPLIFIED FILES (truncated if it already exists) | ||
36 | +#touch $SCRIPT_PATH/index.txt | ||
37 | +>| "$OUTPUT_INDEX_FILE_PATH" | ||
38 | + | ||
39 | +#FEED TO THE SIMPLIFICATION ALGORITHM | ||
40 | +echo "Running simplification algorithm..." | ||
41 | +# clear any previous algorithm output | ||
42 | +if [ -n "$(ls -A ./algorithm_sentences/)" ]; then | ||
43 | + rm ./algorithm_sentences/* | ||
44 | +fi | ||
46 | +#cd ./iSimp_sentences | ||
47 | +for k in "$SCRIPT_PATH"/iSimp_sentences/* | ||
48 | +do | ||
49 | + echo "$k" | ||
50 | + #echo "input: $k output: $SCRIPT_PATH/algorithm_sentences/$(basename $k) index: $OUTPUT_INDEX_FILE_PATH" | ||
51 | + python2 "$SCRIPT_PATH"/simplifier.py "$k" "$SCRIPT_PATH/algorithm_sentences/$(basename "$k")" "$OUTPUT_INDEX_FILE_PATH" | ||
52 | +done | ||
53 | +cd $SCRIPT_PATH |
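
A hypothetical invocation of this script, with placeholder paths (the third argument must point at an iSimp installation containing `simplify.sh`):

```bash
./sentence-simplification-main.sh ./input_sentences ./simplified-index.txt /opt/iSimp
```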
sentence-simplification/simplifier.py
0 → 100644
1 | +import copy | ||
2 | +import sys | ||
4 | + | ||
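5 | +# Simp: one iSimp simplification construct, i.e. its TYPE (carrying a [start..end] span), surface TEXT, and COMP components | ||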
5 | +class Simp(object): | ||
6 | + def __init__(self): | ||
7 | + self.TYPE="" | ||
8 | + self.TYPEx=0 | ||
9 | + self.TYPEy=0 | ||
10 | + self.TEXT="" | ||
11 | + self.COMP=[] | ||
12 | + def agregarTYPE(self,Type): | ||
13 | + self.TYPE=Type | ||
14 | + def agregarTEXT(self,text): | ||
15 | + self.TEXT=text | ||
16 | + def agregarCOMP(self,comp): | ||
17 | + self.COMP.append(comp) | ||
18 | + | ||
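19 | +# Frase ("sentence"): one parsed iSimp record, i.e. type, text, POS tags, parse tree, and its SIMP constructs | ||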
19 | +class Frase(object): | ||
20 | + def __init__(self): | ||
21 | + self.TYPE="" | ||
22 | + self.TEXT="" | ||
23 | + self.POS="" | ||
24 | + self.TREE="" | ||
25 | + self.SIMP=[] | ||
26 | + def agregarTYPE(self,Type): | ||
27 | + self.TYPE=Type | ||
28 | + def agregarTEXT(self,text): | ||
29 | + self.TEXT=text | ||
30 | + def agregarPOS(self,Pos): | ||
31 | + self.POS=Pos | ||
32 | + def agregarTREE(self,Tree): | ||
33 | + self.TREE=Tree | ||
34 | + def agregarSIMP(self): | ||
35 | + self.SIMP.append(Simp()) | ||
36 | + | ||
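37 | +# Sentence: a working copy used during simplification; FLAG marks whether it still needs to be analyzed | ||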
37 | +class Sentence(object): | ||
38 | + def __init__(self): | ||
39 | + self.FLAG=True | ||
40 | + self.TEXT="" | ||
41 | + self.TREE="" | ||
42 | + self.SIMP=[] | ||
43 | + def agregarTEXT(self,text): | ||
44 | + self.TEXT=text | ||
45 | + def agregarTREE(self,Tree): | ||
46 | + self.TREE=Tree | ||
47 | + def agregarSIMP(self): | ||
48 | + self.SIMP.append(Simp()) | ||
49 | + | ||
50 | + | ||
51 | +MEMORIAB=[] | ||
53 | + | ||
54 | + | ||
55 | +#---- read the iSimp analysis from the input file | ||
56 | +arch = sys.argv[1] | ||
57 | +f = open(arch) | ||
58 | +dato = f.read().splitlines() | ||
59 | +f.close() | ||
60 | +frase=Frase() | ||
61 | +for i in range(len(dato)): | ||
62 | + if 'TYPE: ' in dato[i][0:6]: | ||
63 | + frase.agregarTYPE(dato[i][6:]) | ||
64 | + elif 'TEXT: ' in dato[i][0:6]: | ||
65 | + frase.agregarTEXT(dato[i][6:]) | ||
66 | + elif 'POS : ' in dato[i][0:6]: | ||
67 | + frase.agregarPOS(dato[i][6:]) | ||
68 | + elif 'TREE: ' in dato[i][0:6]: | ||
69 | + frase.agregarTREE(dato[i][6:]) | ||
70 | + elif 'SIMP:' in dato[i]: | ||
71 | + frase.agregarSIMP() | ||
72 | + elif ' TYPE: ' in dato[i][0:8]: | ||
73 | + frase.SIMP[-1].agregarTYPE(dato[i][8:]) | ||
74 | + elif ' TEXT: ' in dato[i][0:8]: | ||
75 | + frase.SIMP[-1].agregarTEXT(dato[i][8:]) | ||
76 | + elif ' COMP: ' in dato[i]: | ||
77 | + frase.SIMP[-1].agregarCOMP(dato[i][8:]) | ||
78 | +#------------ | ||
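
For orientation, the record parsed above has roughly the following shape (reconstructed from the prefixes the parser matches; the sentence, spans, and fields are illustrative, and real iSimp output may differ in detail):

```text
TYPE: sentence
TEXT: GadX activates gadA and gadB .
POS : GadX_NNP activates_VBZ gadA_NN and_CC gadB_NN ._.
TREE: (S (NP GadX) (VP activates (NP gadA and gadB)) .)
SIMP:
  TYPE: coordination [15..28]
  TEXT: gadA and gadB
  COMP: conjunct [15..19]
  COMP: conjunction [20..23]
  COMP: conjunct [24..28]
```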
79 | + | ||
80 | + | ||
81 | +#------- Main program | ||
82 | +#Algorithm v4 | ||
83 | + | ||
84 | + | ||
85 | +if (frase.TYPE.find('sentence') != -1) and (frase.SIMP != []) and (frase.SIMP[0].TYPE != ''): | ||
86 | + SIMPworkspace = [] | ||
89 | + # copy TREE and each SIMP of the original sentence into Sentence1 | ||
90 | + Sentence1=Sentence() | ||
91 | + Sentence1.TREE=copy.deepcopy(frase.TREE) | ||
92 | + Sentence1.TEXT=copy.deepcopy(frase.TEXT) | ||
93 | + for i in range(len(frase.SIMP)): | ||
94 | + SIMPworkspace.append(Simp()) | ||
95 | + SIMPworkspace[i] = copy.deepcopy(frase.SIMP[i]) | ||
98 | + | ||
99 | +## SORT THE SIMPs BY SPAN | ||
100 | + for i in range(len(SIMPworkspace)): | ||
101 | + #print SIMPworkspace[i].TEXT | ||
102 | + #print SIMPworkspace[i].TYPE | ||
103 | + SIMPworkspace[i].TYPEx = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('[')+1:SIMPworkspace[i].TYPE.find('..')]) | ||
104 | + SIMPworkspace[i].TYPEy = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('..')+2:SIMPworkspace[i].TYPE.find(']')]) | ||
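105 | + # widen parenthesis spans so the ordering also covers the trailing " )" (cf. the parenthesis rule below, which replaces TEXT + ' )') | ||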
105 | + if 'parenthesis' in SIMPworkspace[i].TYPE: | ||
106 | + SIMPworkspace[i].TYPEy = SIMPworkspace[i].TYPEy + 2 | ||
107 | + #print SIMPworkspace[i].TYPEx | ||
108 | + #print SIMPworkspace[i].TYPEy | ||
109 | + | ||
110 | + | ||
111 | + SIMPworkspace.sort(key=lambda x: x.TYPEy, reverse=True) | ||
112 | + SIMPworkspace.sort(key=lambda x: x.TYPEx) | ||
113 | + | ||
114 | + | ||
115 | + # for i in range(len(SIMPworkspace)): | ||
116 | + # print "\nSIMP " + str(i) + " :" | ||
117 | + # print SIMPworkspace[i].TYPE | ||
118 | + # print SIMPworkspace[i].TYPEx | ||
119 | + # print SIMPworkspace[i].TYPEy | ||
120 | + # print "\n" | ||
121 | + | ||
122 | + for i in range(len(SIMPworkspace)): | ||
123 | + Sentence1.SIMP.append(Simp()) | ||
124 | + Sentence1.SIMP[i]=copy.deepcopy(SIMPworkspace[i]) | ||
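
The two stable sorts above order constructs by start offset ascending and, for ties, by end offset descending, so an enclosing construct is processed before anything nested inside it. A small self-contained illustration (the spans are made up):

```python
# Stable two-pass sort, as in the script: end descending, then start ascending.
spans = [(10, 20), (10, 35), (5, 40), (25, 30)]
spans.sort(key=lambda s: s[1], reverse=True)
spans.sort(key=lambda s: s[0])
print(spans)  # [(5, 40), (10, 35), (10, 20), (25, 30)] -- outer spans first
```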
125 | + | ||
126 | + | ||
127 | + # add the original sentence (Sentence1) to memory as the first object to be analyzed | ||
128 | + MEMORIAB.append(Sentence()) | ||
129 | + MEMORIAB[0]=copy.deepcopy(Sentence1) | ||
130 | + | ||
131 | + | ||
132 | + | ||
133 | + # loop A runs once for each distinct SIMP in Sentence1 | ||
134 | + numSimp = len(Sentence1.SIMP) | ||
135 | + s = 0 | ||
136 | + # loop A | ||
137 | + while s < numSimp : | ||
138 | + #print "\nEntering loop A, pass " + str(s) | ||
139 | + #print "Analyzing all SIMPs of type: " + MEMORIAB[0].SIMP[s].TYPE | ||
140 | + # loop B runs once for each element currently in MEMORIAB | ||
141 | + numMEM = len(MEMORIAB) | ||
142 | + t = 0 | ||
143 | + # loop B | ||
144 | + while t < numMEM : | ||
145 | + #print "Entering loop B, pass " + str(t) | ||
146 | + # enter only if the sentence has not been analyzed before (FLAG == True) and the SIMP text occurs in it | ||
147 | + #print "CONDITIONS:" | ||
148 | + #print "SIMP " + MEMORIAB[0].SIMP[s].TEXT | ||
149 | + #print "SIMP " + MEMORIAB[0].SIMP[s].TYPE | ||
150 | + #print "MEMB " + str(MEMORIAB[t].FLAG) | ||
151 | + #print "MEMB " + MEMORIAB[t].TEXT | ||
152 | + if ( MEMORIAB[0].SIMP[s].TEXT in MEMORIAB[t].TEXT ) and ( MEMORIAB[t].FLAG == True ): | ||
153 | + MEMORIAB[t].FLAG = False | ||
154 | + #print "False to: " + MEMORIAB[t].TEXT | ||
155 | + #print "Entered conditional" | ||
156 | + # simplification rules | ||
157 | + if ( 'coordination' in MEMORIAB[t].SIMP[s].TYPE ) and ( not ('sentence coordination' in MEMORIAB[t].SIMP[s].TYPE ) ) : | ||
158 | + #print "Applying coordination rule" | ||
159 | + TEMPORALES = [] | ||
160 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
161 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
162 | + tt = 0 | ||
163 | + while c > 0 : | ||
164 | + c = c - 1 | ||
165 | + if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) : | ||
166 | + TEMPORALES.append(Sentence()) | ||
167 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
168 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
169 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
170 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
171 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
172 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
173 | + tt = tt + 1 | ||
174 | + # copy simplifications from temporary memory into MEMORIAB | ||
175 | + indtempamem = 0 | ||
176 | + while indtempamem < len(TEMPORALES) : | ||
177 | + MEMORIAB.append(Sentence()) | ||
178 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
179 | + MEMORIAB[-1].FLAG = True | ||
180 | + #print MEMORIAB[-1].TEXT | ||
181 | + indtempamem = indtempamem + 1 | ||
182 | + elif 'parenthesis' in MEMORIAB[t].SIMP[s].TYPE: | ||
183 | + #print "Applying parenthesis rule" | ||
184 | + TEMPORALES = [] | ||
185 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
186 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
187 | + tt = 0 | ||
188 | + while c > 0 : | ||
189 | + #print "entered parenthesis while-loop" | ||
190 | + c = c - 1 | ||
191 | + TEMPORALES.append(Sentence()) | ||
192 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
193 | + replaced = MEMORIAB[0].SIMP[s].TEXT + ' )' | ||
194 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
195 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
196 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
197 | + #print "replaced: " + replaced | ||
198 | + #print "replacer: " + replacer | ||
199 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
200 | + tt = tt + 1 | ||
201 | + # copy simplifications from temporary memory into MEMORIAB | ||
202 | + indtempamem = 0 | ||
203 | + while indtempamem < len(TEMPORALES) : | ||
204 | + MEMORIAB.append(Sentence()) | ||
205 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
206 | + MEMORIAB[-1].FLAG = True | ||
207 | + #print MEMORIAB[-1].TEXT | ||
208 | + indtempamem = indtempamem + 1 | ||
209 | + elif 'apposition' in MEMORIAB[t].SIMP[s].TYPE: | ||
210 | + #print "Applying apposition rule" | ||
211 | + TEMPORALES = [] | ||
212 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
213 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
214 | + tt = 0 | ||
215 | + while c > 0 : | ||
216 | + #print "entered while-loop" | ||
217 | + c = c - 1 | ||
218 | + TEMPORALES.append(Sentence()) | ||
219 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
220 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
221 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
222 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
223 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
224 | + #print "replaced: " + replaced | ||
225 | + #print "replacer: " + replacer | ||
226 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
227 | + tt = tt + 1 | ||
228 | + # copy simplifications from temporary memory into MEMORIAB | ||
229 | + indtempamem = 0 | ||
230 | + while indtempamem < len(TEMPORALES) : | ||
231 | + MEMORIAB.append(Sentence()) | ||
232 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
233 | + MEMORIAB[-1].FLAG = True | ||
234 | + #print "Copied to memory: " + MEMORIAB[-1].TEXT | ||
235 | + indtempamem = indtempamem + 1 | ||
236 | + elif 'member-collection' in MEMORIAB[t].SIMP[s].TYPE: | ||
237 | + #print "Applying member-collection rule" | ||
238 | + TEMPORALES = [] | ||
239 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
240 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
241 | + tt = 0 | ||
242 | + while c > 0 : | ||
243 | + #print "entered member-collection while-loop" | ||
244 | + c = c - 1 | ||
245 | + TEMPORALES.append(Sentence()) | ||
246 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
247 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
248 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
249 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
250 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
251 | + #print "replaced: " + replaced | ||
252 | + #print "replacer: " + replacer | ||
253 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
254 | + tt = tt + 1 | ||
255 | + # copy simplifications from temporary memory into MEMORIAB | ||
256 | + indtempamem = 0 | ||
257 | + while indtempamem < len(TEMPORALES) : | ||
258 | + MEMORIAB.append(Sentence()) | ||
259 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
260 | + MEMORIAB[-1].FLAG = True | ||
261 | + #print "Copied to memory: " + MEMORIAB[-1].TEXT | ||
262 | + indtempamem = indtempamem + 1 | ||
263 | + elif 'sentence coordination' in MEMORIAB[t].SIMP[s].TYPE: | ||
264 | + #print "Applying sentence-coordination rule" | ||
265 | + TEMPORALES = [] | ||
266 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
267 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
268 | + tt = 0 | ||
269 | + while c > 0 : | ||
270 | + c = c - 1 | ||
271 | + if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) : | ||
272 | + TEMPORALES.append(Sentence()) | ||
273 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
274 | + # replace the whole TEMPORAL TEXT/TREE content with the content of the coordinated clause | ||
275 | + #replaced = MEMORIAB[0].SIMP[s].TEXT | ||
276 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
277 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
278 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
279 | + #print replacer | ||
280 | + TEMPORALES[tt].TEXT = replacer | ||
281 | + ## TODO: handle the case where the sentence does not end in "." or "!" | ||
282 | + tt = tt + 1 | ||
283 | + # copy simplifications from temporary memory into MEMORIAB | ||
284 | + indtempamem = 0 | ||
285 | + while indtempamem < len(TEMPORALES) : | ||
286 | + MEMORIAB.append(Sentence()) | ||
287 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
288 | + MEMORIAB[-1].FLAG = True | ||
289 | + #print MEMORIAB[-1].TEXT | ||
290 | + indtempamem = indtempamem + 1 | ||
291 | + elif 'full relative clause' in MEMORIAB[t].SIMP[s].TYPE: | ||
292 | + #print "Applying full-relative-clause rule" | ||
293 | + TEMPORALES = [] | ||
294 | + c = 0 | ||
295 | + tt = 0 | ||
296 | + while c < 2 : | ||
297 | + if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
298 | + TEMPORALES.append(Sentence()) | ||
299 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok | ||
300 | + if MEMORIAB[0].TEXT[MEMORIAB[0].TEXT.index(TEMPORALES[tt].SIMP[s].TEXT)+len(TEMPORALES[tt].SIMP[s].TEXT)-1] == ',': | ||
301 | + replaced = MEMORIAB[0].SIMP[s].TEXT + ',' # possible error; if so, try ' ,' instead | ||
302 | + else: | ||
303 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
304 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
305 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
306 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
307 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
308 | + indice3 = indice1 | ||
309 | + indice4 = indice2 | ||
310 | + if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
311 | + TEMPORALES.append(Sentence()) | ||
312 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok | ||
313 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
314 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
315 | + TEMPORALES[tt].TEXT = copy.deepcopy(MEMORIAB[0].TEXT[indice3:indice4]+' '+MEMORIAB[0].TEXT[indice1:indice2] ) ## | ||
316 | + cad3 = MEMORIAB[0].TEXT[indice1:indice2] | ||
317 | + cad4 = cad3.split() | ||
318 | + if (cad4[0]+'_WDT') in frase.POS: | ||
319 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(' '+cad4[0],'') | ||
320 | + tt = tt + 1 | ||
321 | + c = c + 1 | ||
322 | + # copy simplifications from temporary memory into MEMORIAB | ||
323 | + indtempamem = 0 | ||
324 | + while indtempamem < len(TEMPORALES) : | ||
325 | + MEMORIAB.append(Sentence()) | ||
326 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
327 | + MEMORIAB[-1].FLAG = True | ||
328 | + #print MEMORIAB[-1].TEXT | ||
329 | + indtempamem = indtempamem + 1 | ||
330 | + elif 'reduced relative clause' in MEMORIAB[t].SIMP[s].TYPE: | ||
331 | + #print "Applying reduced-relative-clause rule" | ||
332 | + TEMPORALES = [] | ||
333 | + c = 0 | ||
334 | + tt = 0 | ||
335 | + while c < 2 : | ||
336 | + if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
337 | + TEMPORALES.append(Sentence()) | ||
338 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok | ||
339 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
340 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
341 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
342 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
343 | + #subj = MEMORIAB[0].TEXT[indice1:(indice2+1)] | ||
344 | + subj = MEMORIAB[0].TEXT[indice1:(indice2)] | ||
345 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
346 | + if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
347 | + TEMPORALES.append(Sentence()) | ||
348 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #the referent must precede the clause to keep the correct order | ||
349 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
350 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
351 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
352 | + TEMPORALES[tt].TEXT = subj + " _ " + replacer #inserting a copula here would require number and tense information | ||
353 | + tt = tt + 1 | ||
354 | + c = c + 1 | ||
355 | + # copy simplifications from temporary memory into MEMORIAB | ||
356 | + indtempamem = 0 | ||
357 | + while indtempamem < len(TEMPORALES) : | ||
358 | + MEMORIAB.append(Sentence()) | ||
359 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
360 | + MEMORIAB[-1].FLAG = True | ||
361 | + #print MEMORIAB[-1].TEXT | ||
362 | + indtempamem = indtempamem + 1 | ||
363 | + elif 'hypernymy' in MEMORIAB[t].SIMP[s].TYPE: | ||
364 | + print "**hypernymy detected**" | ||
365 | + #print "True to: " + MEMORIAB[t].TEXT | ||
366 | + MEMORIAB[t].FLAG = True | ||
367 | + else: | ||
368 | + print "Error: Unknown simplification construct detected." | ||
369 | + #print "True to: " + MEMORIAB[t].TEXT | ||
370 | + MEMORIAB[t].FLAG = True | ||
371 | + t = t + 1 | ||
372 | + s = s + 1 | ||
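
Every rule above re-derives the character offsets from the COMP string with the same `find('[')` / `find('..')` / `find(']')` slicing. A hypothetical helper capturing that repeated idiom (a refactoring sketch, not part of the script):

```python
def span_of(comp):
    """Parse the [start..end] offsets out of an iSimp COMP or TYPE string."""
    start = int(comp[comp.find('[') + 1:comp.find('..')])
    end = int(comp[comp.find('..') + 2:comp.find(']')])
    return start, end

print(span_of('conjunct [15..19]'))  # (15, 19)
```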
373 | + | ||
374 | + # CONDITIONS FOR WRITING THE SIMPLIFICATIONS TO TEXT FILES | ||
375 | + #print "Sentence simplified. New sentences generated:" | ||
376 | + for i in range(len(MEMORIAB)): | ||
377 | + # FLAG is reused here to mark the final (fully simplified) sentences | ||
378 | + MEMORIAB[i].FLAG = True | ||
379 | + for j in range(len(MEMORIAB[0].SIMP)): | ||
380 | + # NOTE: if a new simplifiable construct is added, also add it to this list: | ||
381 | + if ( ('member-collection' in MEMORIAB[0].SIMP[j].TYPE) or ('apposition' in MEMORIAB[0].SIMP[j].TYPE) or ('coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('parenthesis' in MEMORIAB[0].SIMP[j].TYPE) or ('sentence coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('full relative clause' in MEMORIAB[0].SIMP[j].TYPE) or ('reduced relative clause' in MEMORIAB[0].SIMP[j].TYPE) ) and (MEMORIAB[0].SIMP[j].TEXT in MEMORIAB[i].TEXT) : | ||
382 | + MEMORIAB[i].FLAG = False | ||
383 | + | ||
384 | + ## zero-pad the numbering of output files, e.g. 011 | ||
385 | + arcsalnum = 0 | ||
386 | + for i in range(len(MEMORIAB)): | ||
387 | + if MEMORIAB[i].FLAG == True: | ||
388 | + arcsalnum = arcsalnum + 1 | ||
389 | + # width for zero-padding the output index, e.g. '4'.zfill(3) -> '004' | ||
390 | + length = len(str(arcsalnum)) | ||
391 | + arcsalnum = 0 | ||
394 | + for i in range(len(MEMORIAB)): | ||
395 | + if MEMORIAB[i].FLAG == True: | ||
396 | + arcsalnum = arcsalnum + 1 | ||
397 | + print MEMORIAB[i].TEXT # output | ||
398 | + archSalNombre = sys.argv[2] | ||
399 | + archSalNombre=archSalNombre[:-4] + "-" + (str(arcsalnum)).zfill(length) + '.alg' | ||
400 | + archivoSalida=open(archSalNombre,"w") | ||
401 | + archivoSalida.write(MEMORIAB[i].TEXT+"\n")## | ||
402 | + archivoSalida.close() | ||
403 | + #WRITE OUTPUT FILE PATH TO INDEX (Arg 3) | ||
404 | + index_name = sys.argv[3] | ||
405 | + index = open(index_name, "a+") | ||
406 | + archSalNombreforIndex=archSalNombre + "\n" | ||
407 | + index.write(archSalNombreforIndex) | ||
408 | + index.close() | ||
409 | +else: | ||
410 | + print frase.TEXT #---- output when there were no simplifiable constructs | ||
411 | + archSalNombre = sys.argv[2] | ||
412 | + archSalNombre = archSalNombre[:-4] + ".alg" | ||
413 | + archivoSalida = open(archSalNombre,"a+") | ||
414 | + archivoSalida.write(frase.TEXT+"\n")## | ||
415 | + archivoSalida.close() | ||
416 | + #WRITE OUTPUT FILE PATH TO INDEX (Arg 3) | ||
417 | + index_name = sys.argv[3] | ||
418 | + index = open(index_name, "a+") | ||
419 | + archSalNombreforIndex=archSalNombre + "\n" | ||
420 | + index.write(archSalNombreforIndex) | ||
421 | + index.close() | ||
422 | + | ||
423 | + | ||
424 | +#END | ||
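
The net effect of, for example, the coordination rule: each conjunct replaces the whole coordinated span in a fresh copy of the sentence. A minimal sketch with made-up offsets:

```python
# Coordination rule in miniature: substitute each conjunct for the full SIMP text.
text = "GadX activates gadA and gadB ."
simp_text = "gadA and gadB"        # SIMP TEXT (the coordinated span)
conjuncts = [(15, 19), (24, 28)]   # COMP conjunct offsets into text
for start, end in conjuncts:
    print(text.replace(simp_text, text[start:end]))
# GadX activates gadA .
# GadX activates gadB .
```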
trn/empty-file.txt
0 → 100644
1 | +Delete me | ||
... | \ No newline at end of file | ... | \ No newline at end of file |