# Bacterial regulatory interaction extraction system
## Prerequisites
1. The input file must be tokenized and sentence-split (see the format sketch below).
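The pipeline expects two preprocessed views of the same sentences: a token (word) file and a transformed file with word|lemma|pos triples (parameters 1 and 2 of automatic-extraction-ris-gcs.sh). A minimal sketch of one sentence in both formats, assuming one sentence per line and space-separated tokens (the example values are illustrative):

```
# word file (tokens only)
ArgP represses the argP gene .

# transformed file (word|lemma|pos)
ArgP|ArgP|NNP represses|repress|VBZ the|the|DT argP|argP|NN gene|gene|NN .|.|.
```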
## Run
### Several files
Set filenames and paths in run-several-files.sh
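### Single file
For one article, automatic-extraction-ris-gcs.sh can be called directly. Below is a sketch of the invocation with placeholder paths, assuming the argument order documented in the header of the script (it mirrors the commented example call inside the script, plus the TSV path as the thirteenth argument):

```bash
# 1: word file, 2: transformed (word|lemma|pos) file, 3: working file for OpenIE,
# 4: output path for a1/a2 files, 5: simplify (Y/N), 6: separate deverbal (Y/N), 7: entity filter,
# 8-9: reference path and file, 10-11: evaluation report path and file,
# 12: TF synonyms dictionary, 13: path of the TSV file (section, sentence id, sentence)
./automatic-extraction-ris-gcs.sh \
    /path/to/features/article.tra.word.txt \
    /path/to/transformed/article.tra.txt \
    /path/to/ri-openie-extraction/ris.txt \
    /path/to/predicted-ris-gcs \
    Y Y FILT1 \
    /path/to/reference reference.txt \
    /path/to/evaluation-reports evaluation.txt \
    diccionario-STM-LT2-v7.0.SYNONYMS.json \
    /path/to/tsv-files \
    1> out.log 2> err.log
```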
## Acknowledgments
This work was supported by UNAM-PAPIIT IA203420.
#!/bin/bash
# Main script for automatic extraction of regulatory interactions
#Parameters
#1: Path and filename of the preprocessed sentences in token (word) format
#2: Path and filename of the preprocessed sentences in transformed format (word|lemma|pos)
#3: Path and filename for OpenIE processing
#4: Output path for the a1 and a2 files with RIs and GCs
#5: Simplify sentences? Y/N
#6: Separate verbal and deverbal sentences? Y/N
#7: Filter for sentences containing entities. FILT1 = (GENE OR TU) AND TF
#8: Path with the reference a1 and a2 files (true RIs and GCs)
#9: Reference file (true RIs and GCs)
#10: Path to save the evaluation file
#11: File to save the results of the evaluation against the reference
#12: File with TF synonyms
#13: Path of the TSV file with section, sentence id, and sentence (extracted from jsonpdf)
# RUN EXTRACTION FOR L&C STM
# ./automatic-extraction-ris-gcs.sh
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# Y Y FILT1
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference
# unused.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports
# unused.txt
# diccionario-STM-LT2-v7.0.SYNONYMS.json
# 1>uno-STM-LC.txt
# 2>dos-STM-LC.txt
# ./automatic-extraction-ris-gcs.sh /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs Y Y FILT1 /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference unused.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports unused.txt diccionario-STM-LT2-v7.0.SYNONYMS.json 1>uno-STM-LC.txt 2>dos-STM-LC.txt
# Some help
# Filename without path: filename=$(basename "$fullfile")
# Filename extension: extension="${filename##*.}"
# Filename without extension: filename="${filename%.*}"
# To work around the "argument list too long" error with many files: find . -print0 | xargs -0 grep AcrR
PATH_TO_CORENLP=/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09
DICC_PATH=/home/cmendezc/terminologicalResources
ISIMP_PATH=/home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/sentence-simplification/isimp_v2
SCRIPT_PATH=$(cd `dirname $0` && pwd)
INPUT_PATH=$1
INPUT_PATH_TRANS=$2
OUTPUT_FILE=$3
OUTPUT_PATH=$4
INPUT_NAME_EXT=$(basename "$INPUT_PATH")
INPUT_NAME="${INPUT_NAME_EXT%.*}"
# Simplify sentences?
SIMPLIFY=$5
# Separate sentences with deverbal effect?
DEVERBAL_SEPARATOR=$6
FILT=$7
TRUE_PATH=$8
TRUE_FILE=$9
PATH_EVAL=${10}
FILE_EVAL=${11}
DICC_SYNON=${12}
# CFMC 2022-03-09: TSV file with section, sentence id, and sentence (extracted from jsonpdf)
TSV_PATH=${13}
#Validate arguments
if [[ ! ("$#" == 13 ) ]]; then
echo 'Usage: ./automatic-extraction-ris-gcs.sh <inputPath_wordFile>
<inputPath_taggedFile> <outputPath_file> <outputPath_ris_gcs> <simplify?> <deverbal_detector?>
<filter> <true_path> <true_file> <path_evaluation_report> <file_evaluation_report>
<dictionary_TFs_synonyms> <path_tsv_file>'
exit 1
fi
echo "********** SELECTED PARAMETERS **********"
echo "INPUT PATH: $INPUT_PATH"
echo "INPUT PATH TRANSFORMED FILE $INPUT_PATH_TRANS"
echo "OUTPUT FILE: $OUTPUT_FILE"
echo "OUTPUT PATH: $OUTPUT_PATH"
echo "SIMPLIFY SENTENCES? $SIMPLIFY"
echo "SEPARATE DEVERBAL SENTENCES? $DEVERBAL_SEPARATOR"
echo "FILTER SENTENCES WITH ENTITIES? $FILT"
echo "REFERENCE (TRUE) PATH: $TRUE_PATH"
echo "REFERENCE (TRUE) FILE: $TRUE_FILE"
echo "PATH EVALUATION REPORT: $PATH_EVAL"
echo "FILE EVALUATION REPORT: $FILE_EVAL"
echo "DICTIONARY OF SYNONYMS OF TFS: $DICC_SYNON"
echo "********** SELECTED PROCESSES **********"
CLEAN_OUTPUT=FALSE
echo " Clean output paths: $CLEAN_OUTPUT"
FILTER=TRUE
echo " Filter sentences: $FILTER"
CLEAN=TRUE
echo " Clean sentences for iSimp: $CLEAN"
SEPARATE=TRUE
echo " Separate sentences to iSimp: $SEPARATE"
SIMPLI=TRUE
echo " Simplify sentences: $SIMPLI"
DEVERBAL=TRUE
echo " Separate deverbal and verbal sentences: $DEVERBAL"
DEVTAG=TRUE # Needs DEVERBAL=TRUE
echo " Tag sentences to separate deverbal and verbal sentences: $DEVTAG"
DEVSEPAR=TRUE # Needs DEVERBAL=TRUE
echo " Do separate deverbal and verbal sentences: $DEVSEPAR"
EXTDEVERBAL=TRUE
echo " Extract RI deverbal: $EXTDEVERBAL"
OPENIE=TRUE
echo " OpenIE triplet extraction: $OPENIE"
EXTOPENIE=TRUE
echo " Extract RI verbal: $EXTOPENIE"
EXTATTRIB=TRUE
echo " Extract RI attributive: $EXTATTRIB"
EXTAUTOREG=TRUE
echo " Extract RI autoregulation: $EXTAUTOREG"
EXTGC=FALSE
echo " Extract growth conditions: $EXTGC"
EVAL=FALSE
echo " Evaluate extraction: $EVAL"
EVALGC=FALSE
echo " Evaluate growth condition extraction: $EVALGC"
#########################
# Cleaning output paths #
#########################
if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
if [ -z "$(ls -A $OUTPUT_PATH/complete-ris/)" ]; then :
else
#echo "Not Empty"
# Original: rm $OUTPUT_PATH/complete-ris/*
find $OUTPUT_PATH/complete-ris -maxdepth 1 -name '*.*' -delete
fi
if [ -z "$(ls -A $OUTPUT_PATH/incomplete-ris/)" ]; then :
else
#echo "Not Empty"
# Original: rm $OUTPUT_PATH/incomplete-ris/*
find $OUTPUT_PATH/incomplete-ris -maxdepth 1 -name '*.*' -delete
fi
fi # if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
#################
# preliminaries #
#################
#Clone and update simplification pipeline
#if [ ! -d "./sentence-simplification" ]
# then
# echo Downloading sentence simplificator...
# git clone https://github.com/ezojg/sentence-simplification
# else
# cd ./sentence-simplification
# git pull origin master
# cd ..
#fi
#Check for iSimp
#if [ ! -d "./sentence-simplification/isimp_v2" ]
# then
# echo ERROR: ./sentence-simplification/isimp_v2 not found. Please manually copy iSimp to said path.
# exit 1
#fi
if [ "$FILTER" = "TRUE" ]; then
echo "********** FILTER SENTENCES **********"
###################################################
# filter sentences with entities of interest #
# and collect attributive examples ArgP-regulated #
###################################################
# INPUT:
# 1) --inputFileWord $INPUT_PATH input file of feature 'word'
# 2) --inputFileTrans $INPUT_PATH_TRANS input file transformed (word|lemma|pos)
# 3) --outputPath $SCRIPT_PATH/filtered-sentences
# 4) --outputFile filtered-sentences.txt output File
# 5) --filter filter $FILT
# FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath $SCRIPT_PATH/attributive-sentences Path for attributive cases: ArgP-regulated genes
# 7) --attrFile attributive-sentences.txt File for attributive cases: ArgP-regulated genes
# $DICC_PATH/normalized_Effects.json
cd $SCRIPT_PATH
if [ -z "$(ls -A ./filtered-sentences/)" ]; then :
else
#echo "Not Empty"
rm ./filtered-sentences/*
fi
if [ -z "$(ls -A ./attributive-sentences/)" ]; then :
else
#echo "Not Empty"
rm ./attributive-sentences/*
fi
if [ -z "$(ls -A ./autoregulation-sentences/)" ]; then :
else
#echo "Not Empty"
rm ./autoregulation-sentences/*
fi
# CFMC 2022-03-09: To update tsv file with filtered sentences
# python3.4 $SCRIPT_PATH/sentence-filter.py --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
python3.4 $SCRIPT_PATH/sentence-filter_v02.py --tsvPath $TSV_PATH --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
fi # if [ "$PRE" = "TRUE" ]; then
if [ "$CLEAN" = "TRUE" ]; then
echo "********** CLEAN SENTENCES **********"
#################################
# Clean sentences for iSimp #
#################################
# INPUT - PREVIOUS OUTPUT: filtered sentences $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt
# output path and file $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
if [ -z "$(ls -A ./format/sanitized_sentences/)" ]; then :
else
#echo "Not Empty"
rm ./format/sanitized_sentences/*
fi
#Original Daniel: python2 $SCRIPT_PATH/format/regex-before.py $INPUT_PATH $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
python2 $SCRIPT_PATH/format/regex.py $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
fi # if [ "$CLEAN" = "TRUE" ]; then
if [ "$SEPARATE" = "TRUE" ]; then
echo "********** SEPARATE SENTENCES **********"
################################
# Separate sentences for iSimp #
################################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/sanitized_sentences/$l
# output path and file $SCRIPT_PATH/format/split_sentences/$BARE_NAME
cd $SCRIPT_PATH
if [ -z "$(ls -A ./format/split_sentences/)" ]; then :
else
rm ./format/split_sentences/*
fi
cd ./format/sanitized_sentences
for l in $(\ls $INPUT_NAME*)
do
# echo $l
BARE_NAME=$(echo $l | cut -f 1 -d '.')
BARE_NAME+="_"
LENGTH="$(wc -l < $l)"
LENGTH="$(echo "${#LENGTH}")"
split -a $LENGTH -d -l 1 --additional-suffix=.spt $SCRIPT_PATH/format/sanitized_sentences/$l $SCRIPT_PATH/format/split_sentences/$BARE_NAME
done
fi # if [ "$SEPARATE" = "TRUE" ]; then
if [ "$SIMPLI" = "TRUE" ]; then
echo "********** SIMPLIFY SENTENCES **********"
######################
# Simplify sentences #
######################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/split_sentences
# output file $OUTPUT_FILE
# path to iSimp $ISIMP_PATH
# CALL: ./sentence-simplification/sentence-simplification-main.sh
# CALL: $ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
# CALL: $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
# $OUTPUT_INDEX_FILE_PATH = $OUTPUT_FILE
# OUTPUT: simplified sentences in path ./algorithm_sentences
# while true; do
# read -p "Do you wish to simplificate sentences? [Y/N]: " yn
# case $yn in
# [Yy]* ) SIMP=1; break;;
# [Nn]* ) SIMP=0; break;;
# * ) echo "Please answer yes [Y] or no [N].";;
# esac
# done
case $SIMPLIFY in
[Yy]* )
SIMP=1
;;
[Nn]* )
SIMP=0
;;
* )
SIMP=1
;;
esac
cd $SCRIPT_PATH
if [ $SIMP == 1 ]
then #USING SIMPLIFICATION
echo "********** YES SIMPLIFY SENTENCES **********"
#Copy file to sentence-simplification
#FILE_NAME=$(basename "$INPUT_PATH")
#Call simplification pipeline AND create a file with the paths of the simplified sentences
./sentence-simplification/sentence-simplification-main.sh $SCRIPT_PATH/format/split_sentences $OUTPUT_FILE $ISIMP_PATH
#echo "input: $SCRIPT_PATH/format/split_sentences --output: $OUTPUT_FILE"
#echo "Sentences simplified. Paths to simplified sentences saved in $OUTPUT_FILE"
else #WITHOUT SIMPLIFICATION
echo "********** NO SIMPLIFY SENTENCES **********"
if [ -z "$(ls -A ./sentence-simplification/algorithm_sentences/)" ]; then :
else
#echo "Not Empty"
rm ./sentence-simplification/algorithm_sentences/*
fi
ls $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE
cp $SCRIPT_PATH/format/split_sentences/* $SCRIPT_PATH/sentence-simplification/algorithm_sentences
#echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi
fi # if [ "$SIMPLI" = "TRUE" ]; then
if [ "$DEVERBAL" = "TRUE" ]; then
echo "********** SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
######################
# Deverbal separator #
######################
# $PATH_TO_CORENLP
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/sentence-simplification/algorithm_sentences
# output path $SCRIPT_PATH/deverbal-separator/separated_sentences
# $DICC_PATH
# $DEVTAG POS tagging of sentences
# $DEVSEPAR Do separate sentences
# CALL: java -cp "$PATH_TO_CORENLP/*"
# $SCRIPT_PATH/filter.py
# OUTPUT: sentences separated in two paths according to verbal/deverbal effect
case $DEVERBAL_SEPARATOR in
[Yy]* )
DEVSEP=1
;;
[Nn]* )
DEVSEP=0
;;
* )
DEVSEP=1
;;
esac
if [ $DEVSEP == 1 ]
then #USING DEVERBAL SEPARATOR
#if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/)" ]; then :
#else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/*
# find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -maxdepth 1 -name '*.vrb' -delete
#fi
#if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/)" ]; then :
#else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*
# find $SCRIPT_PATH/deverbal-separator/separated_sentences/dev -maxdepth 1 -name '*.dev' -delete
#fi
echo "********** YES SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
# Original Daniel 2018-12-06: ./deverbal-separator/separator.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
./deverbal-separator/separator-v02.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
else #WITHOUT DEVERBAL SEPARATOR
echo "********** NO SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
ls $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE
#echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi # [ $DEVSEP == 1 ]
fi # if [ "$DEVERBAL" = "TRUE" ]; then
if [ "$EXTDEVERBAL" = "TRUE" ]; then
echo "********** EXTRACT RI DEVERBAL **********"
#######################
# Extract RI deverbal #
#######################
# INPUT: deverbal files $(dirname ${file}) $(basename ${file})
# output path $OUTPUT_PATH $(basename ${file%.*})
# $DICC_PATH/names_EFFECT_ONTOGENE.txt
# $DICC_PATH/names_GENE.txt
# $DICC_PATH/names_GENE_ONTOGENE.txt
# $DICC_PATH/names_GENE_SYN.txt
# $DICC_PATH/names_TU.txt
# $DICC_PATH/names_TU_ONTOGENE.txt
# $DICC_PATH/names_TF_1grams.txt
# $DICC_PATH/names_TF_2grams.txt
# $DICC_PATH/names_TF_3grams.txt
# $DICC_PATH/names_TF_4grams.txt
# $DICC_PATH/names_TF_5Moregrams.txt
# $DICC_PATH/names_TF_ONTOGENE.txt
# $DICC_PATH/normalized_Effects.json
# OUTPUT: standoff files with RIs
# PATH ALREADY TAGGED ENTITIES: $SCRIPT_PATH/filtered-sentences
# FILE ALREADY TAGGED ENTITIES: filtered-sentences.ents.json
for file in $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*.*
do
#python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-OriginalDaniel.py $file $OUTPUT_PATH/$(basename ${file%.*}) $DICC_PATH/names_EFFECT_ONTOGENE.txt $DICC_PATH/names_GENE.txt $DICC_PATH/names_GENE_ONTOGENE.txt $DICC_PATH/names_GENE_SYN.txt $DICC_PATH/names_TU.txt $DICC_PATH/names_TU_ONTOGENE.txt $DICC_PATH/names_TF_1grams.txt $DICC_PATH/names_TF_2grams.txt $DICC_PATH/names_TF_3grams.txt $DICC_PATH/names_TF_4grams.txt $DICC_PATH/names_TF_5Moregrams.txt $DICC_PATH/names_TF_ONTOGENE.txt
#echo "Dir file: $(dirname ${file})"
#echo "File $(basename ${file})"
#echo "OUTOUT_PATH $OUTPUT_PATH"
#echo "File $(basename ${file%.*})"
echo "Dir and files: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
#python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v02.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v03.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
done
fi # if [ "$EXTDEVERBAL" = "TRUE" ]; then
if [ "$OPENIE" = "TRUE" ]; then
echo "********** OPENIE TRIPLET EXTRACTION **********"
#########################
# OpenIE RI extraction #
#########################
# Gather the verbal sentences into a file list for OpenIE extraction
# Error: /bin/ls: Argument list too long: ls $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE
echo " Join verbal sentences into file for OpenIE extraction"
find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -type f -name '*' > $OUTPUT_FILE
#echo "Deberval sentences separated. Paths to verbal sentences saved in $OUTPUT_FILE"
echo " CoreNLP OpenIE..."
java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.naturalli.OpenIE -filelist $OUTPUT_FILE -triple.strict false -triple.all_nominals true -format reverb > $OUTPUT_FILE.reverb
fi # if [ "$OPENIE" = "TRUE" ]; then
if [ "$EXTOPENIE" = "TRUE" ]; then
echo "********** OPENIE RI EXTRACTION **********"
#########################
# OpenIE RI extraction #
#########################
# Nacho's oie_compress was replaced by a program written by CMC to analyze the triplets
# and keep those that suggest the participants and the effect
#Paste input and output for fancy printing
# Original Nacho: echo " Fancy printing..."
# Original Nacho: > $OUTPUT_FILE.fuzzy
# Original Nacho: python3 oie_compress.py --oies $OUTPUT_FILE.reverb --op fuzzy --ris $DICC_PATH/normalized_Effects.json --out $OUTPUT_FILE.fuzzy
#
# --inputFile $OUTPUT_FILE.reverb file obtained with CoreNLP
# --outputPath $OUTPUT_PATH
# --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
# --diccFile filtered-sentences.ents.json Before: termFilesTag_RIE_GCE_SYSTEM_ECCO.json
# --diccEffect normalized_Effects.json
# --format standoff
# --diccEPAth $DICC_PATH
# OUTPUT: standoff files with RIs
# python3.4 $SCRIPT_PATH/ri-openie-extraction.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json --diccEffect normalized_Effects.json --format standoff
python3.4 $SCRIPT_PATH/ri-openie-extraction-v02.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $SCRIPT_PATH/filtered-sentences --diccFile filtered-sentences.ents.json --diccEffect normalized_Effects.json --diccEPAth $DICC_PATH --format standoff
#Join into single file
#Sort fuzzy
# Original Nacho: echo " Sort fuzzy..."
# Gets the effect type
# Original Nacho: sort $OUTPUT_FILE.fuzzy -o $OUTPUT_FILE.fuzzy
#Concatenate
# CMC eliminated following lines because simplification was
#discriminated before
#if [ $SIMP == 1 ]
#then #USING SIMPLIFICATION
#ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
#awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
#cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
#paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
#else #WITHOUT SIMPLIFICACION
#ls -l $SCRIPT_PATH/format/split_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
#awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
#cat $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE.als
#paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
#fi
# Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
# Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
# Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
# Original Nacho: echo " Creating ils, fls and als files..."
# Original Nacho: if [ $DEVSEP == 1 ]
# Original Nacho: then #USING DEVERBAL SEPARATOR
# Original Nacho: ls -l $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
# Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
# Original Nacho: cat $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE.als
# Original Nacho: else #WITHOUT DEVERBAL SEPARATOR
# Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
# Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
# Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
# Original Nacho: fi
# Original Nacho: echo " Paste merger..."
# Original Nacho: paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
# Original Nacho: echo " Create dsp file..."
# Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
# Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
# rm $(dirname "$OUTPUT_FILE")/*.fls
# rm $(dirname "$OUTPUT_FILE")/*.ils
# rm $(dirname "$OUTPUT_FILE")/*.als
#rm $SCRIPT_PATH/*.merger
#rm $SCRIPT_PATH/*.reverb
#rm $SCRIPT_PATH/*.fuzzy
fi # if [ "$EXTOPENIE" = "TRUE" ]; then
if [ "$EXTATTRIB" = "TRUE" ]; then
echo "********** ATTRIBUTIVE RI EXTRACTION **********"
#########################
# Attributive RI extraction #
#########################
# Attributive RI extraction, such as ArgP-regulated genes aragP, aragT
#
# --inputPath $SCRIPT_PATH/attributive-sentences
# --outputPath $OUTPUT_PATH
# --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
# --diccEffect normalized_Effects.json
# OUTPUT: standoff files with RIs
for file in $SCRIPT_PATH/attributive-sentences/*.*
do
echo "Dir file: $(dirname ${file})"
echo "File: $(basename ${file})"
# echo "OUTOUT_PATH $OUTPUT_PATH"
# echo "File $(basename ${file%.*})"
# echo "All $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
if [ "$(basename ${file})" = "*.*" ]; then
echo "None attributive sentence found"
else
python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
fi
done
fi # if [ "$EXTATTRIB" = "TRUE" ]; then
if [ "$EXTAUTOREG" = "TRUE" ]; then
echo "********** AUTOREGULATION RI EXTRACTION **********"
#########################
# Autoregulation RI extraction #
#########################
# Autoregulation RI extraction, such as ArgP protein represses its own synthesis
#
# --inputPath $SCRIPT_PATH/autoregulation-sentences
# --outputPath $OUTPUT_PATH
# --diccPath $DICC_PATH
# --diccEffect normalized_Effects.json
# OUTPUT: standoff files with RIs
for file in $SCRIPT_PATH/autoregulation-sentences/*.*
do
echo "Dir file: $(dirname ${file})"
echo "File: $(basename ${file})"
# echo "OUTOUT_PATH $OUTPUT_PATH"
# echo "File $(basename ${file%.*})"
# echo "All $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
if [ "$(basename ${file})" = "*.*" ]; then
echo "None autoregulation sentence found"
else
python3 $SCRIPT_PATH/ri-autoregulation-extraction-v01.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
fi
done
fi # if [ "$EXTAUTOREG" = "TRUE" ]; then
if [ "$EXTGC" = "TRUE" ]; then
echo "********** EXTRACT GROWTH CONDITIONS **********"
#############################
# Extract growth conditions #
#############################
python3.4 $SCRIPT_PATH/extract-gcs/extract-gcs-regex.py --inputPath $OUTPUT_PATH/complete-ris --outputPath $OUTPUT_PATH/complete-ris --termPath /home/cmendezc/terminologicalResources
#python3 ./GCs-regex-before.py ./ejemplo_11.spt
#/home/elwe/Documents/prueba3/RIE_reordenado/RI-searcher/GC/ejemplo_11.spt ./ejemplo_11.a2
#./names_GC_ECCO_1grams.txt ./names_GC_ECCO_2grams.txt ./names_GC_ECCO_3grams.txt
#./names_GC_ECCO_4grams.txt ./names_GC_ECCO_5Moregrams.txt
fi # if [ "$EXTGC" = "TRUE" ]; then
if [ "$EVAL" = "TRUE" ]; then
echo "********** EVALUATE EXTRACTION **********"
if [ "$EVALGC" = "TRUE" ]; then
echo "********** EVALUATE GROWTH CONDITION EXTRACTION **********"
python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON --evaluateGCs
else
echo "********** EVALUATE WITHOUT GROWTH CONDITION EXTRACTION **********"
python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON
fi # if [ "$EVALGC" = "TRUE" ]; then
fi # if [ "$EVAL" = "TRUE" ]; then
# import fileinput
# import regex as re
# from regex import finditer
import sys
import json
if ( len( sys.argv ) != 3 ):
# Original Daniel: sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <EFFs_dictionary> \n" )
sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <normalized_Effects> \n")
sys.stderr.flush();
# exit( 2 );
# READ INPUT FILE
text_file = open( sys.argv[1], "r" )
dato = text_file.read()
text_file.close()
# READ DICTIONARY
# Loading normalized effects
# print('Loading normalized effects...')
with open(sys.argv[2]) as diccFile:
hashNormalizedEffects = json.load(diccFile)
DICC = list(hashNormalizedEffects.keys())
# Original Daniel: text_file = open( sys.argv[2], "r" )
# Original Daniel: DICC = text_file.read().splitlines()
# Original Daniel: text_file.close()
# declare variables
is_dev = False
is_vrb = False
# DICC
# 2018-11-30 CMC: We separated noun and only past participle for deverbal processing
# and all verb forms as verbal
# VRB: VB verb, base form think
# VRB: VBZ verb, 3rd person singular present she thinks
# VRB: VBP verb, non-3rd person singular present I think
# VRB: VBD verb, past tense they thought
# DEV: VBN verb, past participle a sunken ship
# VRB: VBG verb, gerund or present participle thinking is fun
# extend/VBP
for i in range(len(DICC)):
# print(DICC[i])
for token in dato.split():
word = token[:token.find("/")]
tag = token[token.find("/")+1:]
# print("word: {}".format(word))
# print("tag: {}".format(tag))
if (DICC[i] in word) and (("NN" in tag)
or ("VBN" == tag)
):
is_dev = True
# print("deverbal: " + word)
if (DICC[i] in word) and ("VB" in tag):
is_vrb = True
# print("verbal: " + word)
if is_dev and is_vrb:
sys.exit(11)
elif is_dev:
sys.exit(12)
elif is_vrb:
sys.exit(13)
else:
sys.exit(10)
#!/bin/bash
# Separates sentences by deverbal (.dev) and verbal (.vrb)
# Original Daniel: PATH_TO_CORENLP=/home/elwe/Documents/temporal/CoreNLP
#Validate arguments
if [[ ! ("$#" == 6 ) ]]; then
echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
exit 1
fi
SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Original Daniel: INPUT_PATH=$1 # folder containing the files to separate
# Original Daniel: OUTPUT_PATH=$2
PATH_TO_CORENLP=$1
INPUT_PATH=$2 # folder containing the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
# Tag sentences to separate deverbal and verbal sentences: $DEVTAG
TAG=$5
# Do separate deverbal and verbal sentences: $DEVSEPAR
SEP=$6
if [ $TAG == "TRUE" ]
then # ANALYZE WITH THE STANFORD PARSER
if [ -z "$(ls -A $SCRIPT_PATH/tagged/)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged/*
find $SCRIPT_PATH/tagged -maxdepth 1 -name '*.conll' -delete
fi
# Added by CMC
if [ -z "$(ls -A $SCRIPT_PATH/tagged-line/)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged-line/*
find $SCRIPT_PATH/tagged-line -maxdepth 1 -name '*.spt' -delete
fi
for j in $INPUT_PATH/*
do
#echo $j
#Original Daniel: java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.parser.lexparser.LexicalizedParser -writeOutputFiles -retainTMPSubcategories -outputFormat "wordsAndTags" $SCRIPT_PATH/englishPCFG.ser.gz $j
# Command line: java -cp "/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file datos_0.spt -outputDirectory tagged
# java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
# With parse: java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
done
# Original Daniel: mv $INPUT_PATH/*.stp $SCRIPT_PATH/tagged/
for j in $SCRIPT_PATH/tagged/*
do
# Original Daniel: awk 'NF {print $2 "/" $4}' tagged/$j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${j%.spt}"
filename=$(basename "$j")
#filename="${filename%.*}"
awk 'NF {print $2 "/" $4}' $j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${filename%.*}.spt"
# Original Daniel: mv "$j" "${j%.stp}"
done
fi # if [ $TAG == "TRUE" ]
if [ $SEP == "TRUE" ]
then # SEPARATE FILES
# Original Daniel: if [ -z "$(ls -A $OUTPUT_PATH)" ]; then :
# Modified by Carlos Méndez
if [ -z "$(ls -A $OUTPUT_PATH/dev)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/dev/*
find $OUTPUT_PATH/dev -maxdepth 1 -name '*.dev' -delete
fi
if [ -z "$(ls -A $OUTPUT_PATH/vrb)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/vrb/*
find $OUTPUT_PATH/vrb -maxdepth 1 -name '*.vrb' -delete
fi
for j in $SCRIPT_PATH/tagged-line/*
do
# Original Daniel: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/names_EFFECT_ONTOGENE.txt
# CMC 2018-12-04: Without separating verbal forms: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/normalized_Effects.json
# CMC 2018-12-11: With separating verbal forms: python3 $SCRIPT_PATH/filter-v02.py $j $DICC_PATH/normalized_Effects.json
# CMC 2018-12-11: Considering only passive verbal form as deverbal: VBN verb, past participle
python3 $SCRIPT_PATH/filter-v03.py $j $DICC_PATH/normalized_Effects.json
VAR=$?
# filename=${j##*/}
# inputfile=${filename%.spt}
# exit
if [ $VAR == 11 ]; then :
# contains both dev and vrb
#Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
#Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
#echo "Deverbal and verbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
elif [ $VAR == 12 ]; then :
# contains dev
#echo "Deverbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
# cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
elif [ $VAR == 13 ]; then :
# contains vrb
#echo "Verbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
# cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
elif [ $VAR == 10 ]; then :
# appears to contain neither dev nor vrb
echo "Neither deverbal nor verbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
# cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
fi
done
fi # if [ $SEP == "TRUE" ]
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
__author__ = 'CMendezC'
# Objective: evaluate predicted interactions in standoff format
# versus true interactions in tab format
# v04: add synonyms of TFs
# Parameters:
# 1) --truePath Path for true interactions
# 2) --trueFile File for true interactions
# 3) --predictedPath Path for predicted interactions
# 4) --outputPath Output path
# 5) --outputFile File for saving results
# 6) --evaluateGCs Evaluate with GCs
# 7) --diccPath Dictionary path
# 8) --diccSynon File with synonyms of TFs
# Output:
# 1) File with TP, FP, FN and the Precision, Recall, F1 scores
# Execution:
# python3.4 evaluate-ris-gcs-standoff.py
# --truePath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/analysis-validation-data-sets
# --trueFile ris-analysis-reference.txt
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris-gcs
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/evaluation-reports
# --outputFile evaluation-riegce-system-ris-analysis.txt
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --evaluateGCs
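# Reference (tab) file format, as parsed in the main program below: lines starting with '#'
# are skipped and the remaining lines are tab-separated. Sketch of the two accepted layouts
# (values are illustrative; the short example encodes the autoregulation "ArgP represses argP"):
#   short form: regulator<TAB>regulated<TAB>effect[<TAB>growth_condition]
#               e.g. ArgP<TAB>argP<TAB>repressor
#   long form (more than 4 tab-separated fields): two leading fields, then regulator,
#               regulated, effect and, when GCs are evaluated, the growth condition
# The effect "binding" is normalized to "regulator" when the reference is loaded.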
###########################################################
# MAIN PROGRAM #
###########################################################
def updateHashPredicted(pr, hashP, pm, sF, ef):
if pr not in hashP:
hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
hashP[pr] = hashTemp
else:
hashTemp = hashP[pr]
if pm in hashTemp["pmids"]:
hashP[pr]["pmids"][pm].append(sF)
else:
hashP[pr]["pmids"][pm] = [sF]
def getSummary(r, hashTemp):
pmids = 0
sentences = 0
orieff = ""
if r in hashTemp:
# print("r: {}".format(r))
orieff = hashTemp[r]["orieff"]
for pmid in hashTemp[r]["pmids"]:
pmids += 1
# print("PMID with sentences: {}".format(pmid))
for sent in hashTemp[r]["pmids"][pmid]:
sentences += 1
else:
return "WARNING: no data available!"
return "Artículos: {}\tFrases: {}\tOriginal effect: {}".format(pmids, sentences, orieff)
def getDetail(r, hashTemp):
return_text = ""
sentences = 0
aHash = {}
if r in hashTemp:
for pmid in hashTemp[r]["pmids"]:
for sent in hashTemp[r]["pmids"][pmid]:
sentences += 1
if pmid not in aHash:
aHash[pmid] = sentences
else:
return "WARNING: PMID duplicated!"
else:
return "WARNING: no data available!"
for p, s in sorted(aHash.items(), key=operator.itemgetter(1), reverse=True):
return_text += "\tPMID {}: {} sentences\n".format(p, s)
return return_text
def get_standard_name(regSynon):
reg = ""
if regSynon in hashSynon:
reg = hashSynon[regSynon]
else:
for syn, std in hashSynon.items():
if regSynon.startswith(syn):
reg = regSynon.replace(syn, std, 1)
break
return reg
def isCorrect(ripr, listT, rtype):
# The predicted regulator starts with entity
# Effect and regulated coincide
# Regulator coincides with activator or repressor
# We return a flag to indicate the type of matching: Full, Start, or Regulator
list_ripr = ripr.split('\t')
regulator = list_ripr[0]
regulatorStdName = ""
if use_synonyms:
regulatorStdName = get_standard_name(regulator)
for rit in listT:
# print("RI TRUE: {}".format(rit))
listRT = rit.split('\t')
regulatorT = listRT[0]
regexRegulatorStarts = re.compile(r'(' + regulatorT + r').+')
if rtype == "ri":
regulated = list_ripr[1]
regulatedT = listRT[1]
if (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT:
return (rit, 'Full')
# For cases where regulator is part of the word, such as ArgP-regulated
result = regexRegulatorStarts.match(regulator)
if result:
# print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
regulator = result.group(1)
if regulator == regulatorT and regulated == regulatedT:
return (rit, 'Start')
else:
if use_synonyms:
result = regexRegulatorStarts.match(regulatorStdName)
if result:
# print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
regulator = result.group(1)
if regulator == regulatorT and regulated == regulatedT:
return (rit, 'Start')
elif rtype == "rief":
effect = list_ripr[2]
regulated = list_ripr[1]
effectT = listRT[2]
regulatedT = listRT[1]
# if ripr == "ArgP\ttargets\tregulator":
# print("RI-PREDICT: ArgP\ttargets\tregulator")
# print(" PREDICT: regulator {} effect {} regulated {}".format(regulator, effect, regulated))
# print(" TRUE: regulator {} effect {} regulated {}".format(regulatorT, effectT, regulatedT))
if (
regulator == regulatorT or regulatorStdName == regulatorT) and effect == effectT and regulated == regulatedT:
return (rit, 'Full')
elif (
regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT and effect == "regulator" and (
effectT == "activator" or effectT == "repressor"):
# if ripr == "ArgP\ttargets\tregulator":
# print(" Correct RI with regulator: {}".format(ripr))
# return rit CMC 20181014: I think it should be the predicted RI, because otherwise the output lists lose whether it was correct or not
return (ripr, 'Regulator')
else:
# For cases where regulator is part of the word, such as ArgP-regulated
result = regexRegulatorStarts.match(regulator)
if result:
# print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
regulator = result.group(1)
if regulator == regulatorT and effect == effectT and regulated == regulatedT:
return (rit, 'Start')
elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
effectT == "activator" or effectT == "repressor"):
# if ripr == "ArgP\ttargets\tregulator":
# print(" Correct RI with regulator: {}".format(ripr))
# return rit CMC 20181014: I think it should be the predicted RI, because otherwise the output lists lose whether it was correct or not
# except that in this case only the regulator is used
# return rit
return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
else:
if use_synonyms:
result = regexRegulatorStarts.match(regulatorStdName)
if result:
if regulator == regulatorT and effect == effectT and regulated == regulatedT:
return (rit, 'Start')
elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
effectT == "activator" or effectT == "repressor"):
# if ripr == "ArgP\ttargets\tregulator":
# print(" Correct RI with regulator: {}".format(ripr))
# return rit CMC 20181014: I think it should be the predicted RI, because otherwise the output lists lose whether it was correct or not
# except that in this case only the regulator is used
# return rit
return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
# CMC 2018-10-14: Review riefgc because it has not been updated
# elif rtype == "riefgc":
# effect = list_ripr[2]
# regulated = list_ripr[1]
# gc = list_ripr[3]
# effectT = listRT[2]
# regulatedT = listRT[1]
# gcT = listRT[3]
# if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
# return rit
# elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
# return rit
# else:
# # For cases where regulator is part of the word, such as ArgP-regulated
# result = regexRegulatorStarts.match(regulator)
# if result:
# #print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
# regulator = result.group(1)
# if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
# return rit
# elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
# return rit
return ('', '')
def get_scores_rules(listTrue, listPredicted, hashTemp, title, ri_type):
print("Evaluation")
# print(listPredicted)
# Precision = correctly extracted / predicted
# Recall = correctly extracted / reference
# F1 = 2 * ((Precision * Recall) / (Precision + Recall))
correct = 0
incorrect = 0
# For registering correct and incorrect RIs
hashPredicted = {}
# To print output RIs
hashOutputRIs = {}
# For registering unrecovered RIs
hashUnrecovered = {}
predicted = len(listPredicted)
print("len(listPredicted): {}".format(predicted))
reference = len(listTrue)
# print("Reference: {}".format(reference))
listRecovered = []
for ri_pred in listPredicted:
print("ri_pred: {}".format(ri_pred))
# if ri_pred in hashPredicted:
# print("WARNING: RI predicted {} duplicated {}".format(ri_pred, hashPredicted[ri_pred]))
# else:
# First all predicted RIs are incorrect
# hashPredicted[ri_pred] = "incorrect"
# if ri_pred in listTrue:
# hashPredicted[ri_pred] = "correct"
# listRecovered.append(ri_pred)
# correct += 1
# continue
riTrue = ''
result = isCorrect(ri_pred, listTrue, ri_type)
riResult = result[0]
matchType = result[1]
if riResult != '':
if riResult not in hashOutputRIs:
hashOutputRIs[riResult] = "Correct"
if ri_pred not in hashPredicted:
hashPredicted[ri_pred] = "correct"
print("ri_pred {} correct".format(ri_pred))
correct += 1
# Complete matching or the predicted regulator starts with entity
if matchType == 'Full' or matchType == 'Start':
# ri_pred matches with ri_true
if riResult in listRecovered:
print("WARNING: riResult {} already in listRecovered".format(riResult))
else:
listRecovered.append(riResult)
else:
incorrect += 1
if riResult not in hashOutputRIs:
hashOutputRIs[riResult] = "Incorrect"
if ri_pred not in hashPredicted:
hashPredicted[ri_pred] = "incorrect"
print("ri_pred {} incorrect".format(ri_pred))
if len(hashPredicted) != predicted:
print("ERROR: number of predicted RIs mismatch")
# return
print("Predicted: {}".format(predicted))
print("len(hashPredicted): {}".format(len(hashPredicted)))
cor = 0
inc = 0
for r, v in hashPredicted.items():
if v == "correct":
cor += 1
elif v == "incorrect":
inc += 1
if cor != correct:
print("ERROR: number of correct RIs mismatch")
# return
if inc != incorrect:
print("ERROR: number of incorrect RIs mismatch")
# return
print("Correct: {}".format(correct))
print("Incorrect: {}".format(incorrect))
unrecovered = 0
recovered = 0 # Only when it coincides with the reference,
# without counting Regulator as correct when Activator or Repressor appears in the reference
listRecovered2 = []
listUnrecovered = []
for ri in listTrue:
if ri not in listRecovered:
if ri in listUnrecovered:
print("WARNING: ri {} already in listUnrecovered".format(ri))
else:
listUnrecovered.append(ri)
unrecovered += 1
else:
if ri in listRecovered2:
print("WARNING: ri {} already in listRecovered2".format(ri))
else:
listRecovered2.append(ri)
recovered += 1
print("Len listRecovered: {}".format(len(listRecovered)))
print("Len listRecovered2: {}".format(len(listRecovered2)))
print("Len listUnrecovered: {}".format(len(listUnrecovered)))
# if (unrecovered + correct) != reference:
# print("ERROR: number of unrecovered {} + correct {} and reference {} RIs mismatch".format(unrecovered, correct, reference))
# return
print("{}".format(title))
print("Predicted: {}".format(predicted))
print("Reference: {}".format(reference))
print("Unrecovered: {}".format(unrecovered))
print("Recovered: {}".format(recovered))
precision = correct / predicted
print("Precision = correct / predicted: {}".format(precision))
# recall = correct / reference
# We calculate recall as the recovery rate, because correct instances are counted
# considering Regulator correct when Activator or Repressor appears in the reference
recall = recovered / reference
print("Recall = recovered / reference: {}".format(recall))
f1 = 2 * ((precision * recall) / (precision + recall))
print("F1: {}".format(f1))
with open(os.path.join(options.outputPath, options.outputFile), mode="a", errors="replace") as oFile:
oFile.write("{}\n".format(title))
oFile.write("Predicted: {}\n".format(predicted))
oFile.write("Reference: {}\n".format(reference))
oFile.write("Correct: {}\n".format(correct))
oFile.write("Incorrect: {}\n".format(incorrect))
oFile.write("Unrecovered: {}\n".format(unrecovered))
oFile.write("Recovered: {}\n".format(recovered))
oFile.write("Precision = correct / predicted: {}\n".format(precision))
oFile.write("Recall = recovered / reference: {}\n".format(recall))
oFile.write("F1: {}\n".format(f1))
oFile.write("Unrecovered instances:\n")
for r in sorted(listUnrecovered):
oFile.write("\tUnrecovered: {}\n".format(r))
oFile.write("Recovered instances:\n")
for r in sorted(listRecovered):
oFile.write("\tRecovered: {}\n".format(r))
oFile.write("Incorrect instances:\n")
for r, v in sorted(hashPredicted.items()):
if v == "incorrect":
oFile.write("\tIncorrect: {}\n".format(r))
oFile.write("Correct instances:\n")
for r, v in sorted(hashPredicted.items()):
if v == "correct":
oFile.write("\tCorrect: {}\n".format(r))
# oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp)))
# oFile.write("\t{}\n".format(getDetail(r, hashTemp)))
def get_scores(listTrue, listPredicted, hashTemp, title):
# Precision = correctly extracted / extracted
# Recall = correctly extracted / reference
# F1 = 2 * ((Precision * Recall) / (Precision + Recall))
print("{}".format(title))
# print("listTrue: {}".format(listTrue))
# print("listPredicted: {}".format(listPredicted))
print("Predicted: {}".format(len(listPredicted)))
print("Reference: {}".format(len(listTrue)))
correct = set(listTrue) & set(listPredicted)
print("Correct: {} ({})".format(len(correct), len(correct) / len(listPredicted)))
incorrect = set(listPredicted) - set(listTrue)
print("Incorrect: {} ({})".format(len(incorrect), len(incorrect) / len(listPredicted)))
unrecovered = set(listTrue) - set(listPredicted)
print("Unrecovered: {} ()".format(len(unrecovered), len(unrecovered) / len(listTrue)))
precision = len(correct) / len(listPredicted)
print("Precision: {}".format(precision))
recall = len(correct) / len(listTrue)
print("Recall: {}".format(recall))
f1 = 2 * ((precision * recall) / (precision + recall))
print("F1: {}".format(f1))
with open(os.path.join(options.outputPath, options.outputFile), mode="a") as oFile:
oFile.write("{}\n".format(title))
oFile.write("Predicted: {}\n".format(len(listPredicted)))
oFile.write("Reference: {}\n".format(len(listTrue)))
oFile.write("Correct: {}\n".format(len(correct)))
oFile.write("Incorrect: {}\n".format(len(incorrect)))
oFile.write("Unrecovered: {}\n".format(len(unrecovered)))
oFile.write("Precision: {}\n".format(precision))
oFile.write("Recall: {}\n".format(recall))
oFile.write("F1: {}\n".format(f1))
oFile.write("Correct instances:\n")
for r in sorted(correct):
oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp)))
oFile.write("\t{}\n".format(getDetail(r, hashTemp)))
oFile.write("Incorrect instances:\n")
for r in sorted(incorrect):
oFile.write("\t{}\n".format(r))
oFile.write("Unrecovered instances:\n")
for r in sorted(unrecovered):
oFile.write("\t{}\n".format(r))
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--truePath", dest="truePath",
help="Path true ris gcs", metavar="PATH")
parser.add_option("--trueFile", dest="trueFile",
help="File true ris gcs", metavar="FILE")
parser.add_option("--predictedPath", dest="predictedPath",
help="Path predicted ris gcs", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--outputFile", dest="outputFile",
help="File for saving results", metavar="FILE")
parser.add_option("--evaluateGCs", default=False,
action="store_true", dest="evaluateGCs",
help="Evaluate GCs?")
parser.add_option("--diccPath", dest="diccPath",
help="Path to dictionary", metavar="PATH")
parser.add_option("--diccSynon", dest="diccSynon",
help="File with synonyms", metavar="FILE")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path true ris gcs: " + str(options.truePath))
print("File true ris gcs: " + str(options.trueFile))
print("Path predicted ris gcs: " + str(options.predictedPath))
print("Output path: " + str(options.outputPath))
print("File for saving results: " + str(options.outputFile))
print("Evaluate GCs: " + str(options.evaluateGCs))
print("Path to dictionary: " + str(options.diccPath))
print("File with synonyms: " + str(options.diccSynon))
use_synonyms = False
hashSynon = {}
if options.diccPath != None and options.diccSynon != "no-synonyms":
print("***** Using synonyms *****")
use_synonyms = True
print('Loading dictionary of synonyms...')
with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon:
hashSynon = json.load(diccSynon)
print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon)))
listTrueRI = [] # Without effect or gc
listTrueRIEF = [] # With effect, without gc
if options.evaluateGCs:
listTrueRIEFGC = [] # With effect and gc
# Read and process Reference
with open(os.path.join(options.truePath, options.trueFile), mode="r", encoding="utf-8") as iFile:
for line in iFile:
line = line.strip('\n')
if line.startswith("#"):
continue
listElem = line.split('\t')
if len(listElem) > 4:
regulator = listElem[2]
regulated = listElem[3]
effect = listElem[4]
if options.evaluateGCs:
gc = listElem[5]
else:
regulator = listElem[0]
regulated = listElem[1]
effect = listElem[2]
if options.evaluateGCs:
gc = listElem[3]
if effect == "binding":
effect = "regulator"
ri = "{}\t{}".format(regulator, regulated)
if ri not in listTrueRI:
listTrueRI.append(ri)
rief = "{}\t{}\t{}".format(regulator, regulated, effect)
if rief not in listTrueRIEF:
listTrueRIEF.append(rief)
if options.evaluateGCs:
riefgc = "{}\t{}\t{}\t{}".format(regulator, regulated, effect, gc)
if riefgc not in listTrueRIEFGC:
listTrueRIEFGC.append(riefgc)
print(" RIs en referencia antes regulators: {}".format(len(listTrueRI)))
print(" RIEFs en referencia antes regulators: {}".format(len(listTrueRIEF)))
if options.evaluateGCs:
print(" RIEFGCs en referencia antes regulators: {}".format(len(listTrueRIEFGC)))
# Eliminate those RIs with regulator which also have RIs with activator or repressor
listRITemp = []
for ri in listTrueRIEF:
listRI = ri.split('\t')
regulator = listRI[0]
regulated = listRI[1]
effect = listRI[2]
if effect == "regulator":
tempRIA = "{}\t{}\t{}".format(regulator, regulated, "activator")
tempRIR = "{}\t{}\t{}".format(regulator, regulated, "repressor")
if tempRIA in listTrueRIEF or tempRIR in listTrueRIEF:
pass
# print("RI regulator matchs RI activator/repressor: {}".format(ri))
# listTrueRIEF.remove(ri)
else:
# print("Len before: {}".format(len(listRITemp)))
listRITemp.append(ri)
# print("Len after: {}".format(len(listRITemp)))
else:
listRITemp.append(ri)
listTrueRIEF = listRITemp
print(" RIEFs en referencia después regulators: {}".format(len(listTrueRIEF)))
if options.evaluateGCs:
for ri in listTrueRIEFGC:
listRI = ri.split('\t')
regulator = listRI[0]
regulated = listRI[1]
effect = listRI[2]
gc = listRI[3]
if effect == "regulator":
tempRIGCA = "{}\t{}\t{}\t{}".format(regulator, regulated, "activator", gc)
tempRIGCR = "{}\t{}\t{}\t{}".format(regulator, regulated, "repressor", gc)
if tempRIGCA in listTrueRIEFGC or tempRIGCR in listTrueRIEFGC:
listTrueRIEFGC.remove(ri)
print(" RIEFGCs en referencia después regulators: {}".format(len(listTrueRIEFGC)))
listPredictedRI = []
hashPredictedRI = {}
listPredictedRIEF = []
hashPredictedRIEF = {}
if options.evaluateGCs:
listPredictedRIEFGC = []
hashPredictedRIEFGC = {}
hashFiles = {}
for path, dirs, files in os.walk(options.predictedPath):
for file in files:
if file.endswith(".a1"):
filename = file[:-3]
if filename not in hashFiles:
hashFiles[filename] = 1
else:
hashFiles[filename] += 1
print("Files: {}".format(len(hashFiles)))
hashEntities = {}
processedFiles = 0
for file in sorted(hashFiles.keys()):
print("File: {}".format(file))
pmid = file[:file.find("_")]
# print("pmid {}".format(pmid))
sentenceFile = file[:file.find("-", file.find("_"))] + ".txt"
hashEntities = {}
hashOriginalEffect = {}
with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
entity = listLine2[0]
idEntity = listLine1[0]
originalEffect = listLine1[2]
if entity.startswith("EFFECT"):
entity = entity[entity.find(".") + 1:]
print("Entity: {}".format(entity))
entity = entity.replace("_dev", "")
print("Entity without _dev: {}".format(entity))
if idEntity not in hashOriginalEffect:
hashOriginalEffect[idEntity] = originalEffect
else:
entity = listLine1[2]
if idEntity not in hashEntities:
hashEntities[idEntity] = entity
print("hashEntities: {}".format(hashEntities))
with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File:
for line in a2File:
# print("Line a2: {}".format(line))
# R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
# print("effect: {}".format(hashEntities[effect]))
# if hashEntities[effect] == "binding":
# continue
if options.evaluateGCs:
gc = listLine2[3]
gc = gc[gc.find(":") + 1:]
pri = "{}\t{}".format(hashEntities[regulator], hashEntities[regulated])
if pri not in listPredictedRI:
listPredictedRI.append(pri)
updateHashPredicted(pri, hashPredictedRI, pmid, sentenceFile, None)
prief = "{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], hashEntities[effect])
print("prief: {}".format(prief))
if prief not in listPredictedRIEF:
listPredictedRIEF.append(prief)
updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect])
if options.evaluateGCs:
priefgc = "{}\t{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated],
hashEntities[effect], hashEntities[gc])
if priefgc not in listPredictedRIEFGC:
listPredictedRIEFGC.append(priefgc)
updateHashPredicted(priefgc, hashPredictedRIEFGC, pmid, sentenceFile, hashOriginalEffect[effect])
processedFiles += 1
print("Processed files: {}".format(processedFiles))
with open(os.path.join(options.outputPath, options.outputFile), mode="w") as oFile:
pass
get_scores_rules(listTrueRIEF, listPredictedRIEF, hashPredictedRIEF,
"Scores regulator-regulated-effect (without gc)", "rief")
get_scores_rules(listTrueRI, listPredictedRI, hashPredictedRI, "Scores regulator-regulated (without effect nor gc)",
"ri")
if options.evaluateGCs:
get_scores_rules(listTrueRIEFGC, listPredictedRIEFGC, hashPredictedRIEFGC,
"Scores regulator-regulated-effect-gc", "riefgc")
import fileinput
#import regex as re
#from regex import finditer
# We use Python 3, so the 'overlapped' option of finditer had to be dropped:
# Daniel wrote this script for Python 2.7 with the third-party regex module, whose
# finditer supports overlapped matching; the standard re module used here does not.
import re
from re import finditer
import sys
import os
import json
if (len(sys.argv) != 8):
sys.stderr.write("E: usage: " + sys.argv[
0] + " <input_path> <input_file> <output_path> <output_file> <normalized_Effects> <entity_path> <entity_file>\n")
sys.stderr.flush();
exit(2);
# READ INPUT FILE
# Original Daniel: text_file = open( sys.argv[1], "r" )
# Original Daniel: dato = text_file.read()
# Original Daniel: text_file.close()
filename = sys.argv[2]
input_file = open(os.path.join(sys.argv[1], filename), "r")
#print("Input file: {}".format(os.path.join(sys.argv[1], sys.argv[2])))
dato = input_file.read()
input_file.close()
# Loading normalized effects
# print('Loading normalized effects...')
with open(os.path.join(sys.argv[5])) as diccFile:
hashNormalizedEffects = json.load(diccFile)
# USING ALREADY TAGGED ENTITIES OF THE FILE (in filter sentence step)
#<entity_path> <entity_file>
# READ DICTIONARY WITH ALREADY TAGGED ENTITIES
entity_path = sys.argv[6]
entity_file = sys.argv[7]
print('Loading dictionaries with already tagged entities...')
with open(os.path.join(entity_path, entity_file)) as entFile:
hashDicc = json.load(entFile)
print(' Loading dictionaries with already tagged entities... Done!')
# CREATE LISTS WITH ALREADY TAGGED ENTITIES OF THE FILE
regexNumFile = re.compile(r'_([0-9]+)[.-]')
result = regexNumFile.search(filename)
numFile = ""
inumFile = 0
if result:
inumFile = int(result.group(1))
numFile = str(inumFile)
print("Numfile: {}".format(numFile))
else:
print("WARNING: numfile not found in filename")
ATEREG1 = []
PTEREG1GENE = []
PTEREG1TU = []
listEffects = []
if numFile in hashDicc:
hashTemp = hashDicc[numFile]
# print("hashDicc[numFile]: {}".format(hashTemp))
for k, v in hashTemp.items():
if v == "TF":
# print("Verifiying TF")
if k not in ATEREG1:
# print(" TF {}".format(k))
ATEREG1.append(k)
elif v == "GENE":
if k not in PTEREG1GENE:
PTEREG1GENE.append(k)
elif v == "TU":
if k not in PTEREG1TU:
PTEREG1TU.append(k)
elif v == "EFFECT":
if k not in listEffects:
listEffects.append(k)
else:
print("WARNING: entity not found in dictionaries")
else:
print("WARNING: numfile not found in dictionaries")
# REMOVE EXTENSION FROM FILE NAME
# Original Daniel: split_line = sys.argv[2]
output_path = sys.argv[3]
# Original Daniel: split_line = split_line[:-4]
# Original Daniel: file_name = split_line + ".a2"
input_file_name = sys.argv[2]
# Original Daniel: open( file_name , 'w').close()
file_name_entities_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a1")
file_name_interactions_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a2")
file_name_entities_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a1")
file_name_interactions_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a2")
file_name_text_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".txt")
file_name_text_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".txt")
open(file_name_entities_complete, 'w').close()
open(file_name_interactions_complete, 'w').close()
# Original Daniel: open( file_name , 'w').close()
open(file_name_entities_incomplete, 'w').close()
open(file_name_interactions_incomplete, 'w').close()
# declare variables
# Original Daniel: impresion = []
impresionEntities = []
impresionInteractionsComplete = []
impresionInteractionsIncomplete = []
salida_a2 = []
salida_a2_trimmed = []
salida_a2_str = []
q2line = ()
listadeRIs = []
posiblesminimos = [[], []]
posiblesmaximos = [[], []]
listasecundaria = []
listasecundaria_trimmed = []
impresionEntities = []
impresionInteractionsComplete = []
impresionInteractionsIncomplete = []
# Effects
for i in range(len(listEffects)):
if listEffects[i] in dato:
for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at|for)\b)', dato): # "of", "for", or "at" to the right of EFF
# Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at)\b)', dato,
# Original Daniel: overlapped=True): # "of" o "at" a la derecha de EFF
spantup = match.span(1)
# Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1))
if match.group(1).lower() in hashNormalizedEffects:
effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()]
else:
effect = "EFFECT." + "deverbal_effect"
# Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1))
a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))
#print("Append effect a2line: {}".format(a2line))
salida_a2.append(a2line)
for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato): # "by" to the right of EFF
# Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato,
# Original Daniel: overlapped=True): # "by" a la derecha de EFF
spantup = match.span(1)
# Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1))
if match.group(1).lower() in hashNormalizedEffects:
effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()]
else:
effect = "EFFECT." + "deverbal_effect"
# Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1))
a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))
salida_a2.append(a2line)
#print("Append effect a2line: {}".format(a2line))
for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato): # "is the" plus 0-1 words to the left of EFF
# Original Daniel: for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "is the" 0-1 palabras a la izquierda de EFF
spantup = match.span(3)
# Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(3))
if match.group(3).lower() in hashNormalizedEffects:
effect = "EFFECT." + hashNormalizedEffects[match.group(3).lower()]
else:
effect = "EFFECT." + "deverbal_effect"
# Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(3))
a2line = (effect, spantup[0], spantup[1] - 1, match.group(3))
salida_a2.append(a2line)
#print("Append effect a2line: {}".format(a2line))
#print("Efectos salida_a2: {}".format(salida_a2))
# PTEREG1GENE regulated (patient) entities: GENE
for i in range(len(PTEREG1GENE)):
if PTEREG1GENE[i] in dato:
# print(PTEREG1GENE[i])
for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato): # "of", "for", or "at" plus 0-1 words to the left of the regulated entity
# Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado
spantup = match.span(3)
# print("match {} spantup {}".format(match.group(3), match.span(3)))
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3))
a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(3))
salida_a2.append(a2line)
# print("Append regulados a2line: {}".format(a2line))
for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato): # regulated entities without pattern
# Original Daniel: for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato, overlapped=True): # regulados sin patron
spantup = match.span(1)
# print("match {} spantup {}".format(match.group(1), match.span(1)))
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1))
a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(1))
listasecundaria.append(a2line)
#print("Efectos regulados gene listasecundaria: {}".format(listasecundaria))
# CMC: ADDED TO SEPARATE REGULATED GENE AND TU
# PTEREG1TU regulated (patient) entities: TU
for i in range(len(PTEREG1TU)):
if PTEREG1TU[i] in dato:
# print(PTEREG1TU[i])
for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato): # "of", "for", or "at" plus 0-1 words to the left of the regulated entity
# Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado
spantup = match.span(3)
# print("match: " + match.group(3))
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3))
a2line = ('TU', spantup[0], spantup[1] - 1, match.group(3))
salida_a2.append(a2line)
# print("Append regulados a2line: {}".format(a2line))
for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato): # regulated entities without pattern
# for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato, overlapped=True): # regulados sin patron
spantup = match.span(1)
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1))
a2line = ('TU', spantup[0], spantup[1] - 1, match.group(1))
listasecundaria.append(a2line)
#print("Efectos regulados tu listasecundaria: {}".format(listasecundaria))
# ATEREG1 regulator (agent) entities: TF
for i in range(len(ATEREG1)):
if ATEREG1[i] in dato:
# print(ATEREG1[i])
for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato): # "by" plus 0-1 words to the left of the regulator (TF)
# Original Daniel: for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "by" 0-1 palabras a la izq de regulado
spantup = match.span(2)
# print("match: " + match.group(2))
# print("match {} spantup {}".format(match.group(2), match.span(2)))
# Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(2))
a2line = ('TF', spantup[0], spantup[1] - 1, match.group(2))
salida_a2.append(a2line)
#print("Append regulator a2line: {}".format(a2line))
for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato): # regulators without pattern
# for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato, overlapped=True): # reguladores sin patron
spantup = match.span(1)
# print("match {} spantup {}".format(match.group(1), match.span(1)))
# Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(1))
a2line = ('TF', spantup[0], spantup[1] - 1, match.group(1))
listasecundaria.append(a2line)
#print("Append regulator a2line: {}".format(a2line))
#print("Reguladores agentes salida_a2: {}".format(salida_a2))
#print("Reguladores agentes listasecundaria: {}".format(listasecundaria))
# Remove repeated tags or tags contained within others
if salida_a2:
salida_a2.sort(key=lambda tup: tup[1])
salida_a2_trimmed.append(salida_a2[0])
for i in range(len(salida_a2)):
copiar = True
for j in range(len(salida_a2_trimmed)):
if ((salida_a2[i][1] >= salida_a2_trimmed[j][1]) and (salida_a2[i][2] <= salida_a2_trimmed[j][2])):
copiar = False
if copiar:
salida_a2_trimmed.append(salida_a2[i])
if listasecundaria:
listasecundaria.sort(key=lambda tup: tup[1])
listasecundaria_trimmed.append(listasecundaria[0])
for i in range(len(listasecundaria)):
copiar = True
for j in range(len(listasecundaria_trimmed)):
if ((listasecundaria[i][1] >= listasecundaria_trimmed[j][1]) and (
listasecundaria[i][2] <= listasecundaria_trimmed[j][2])):
copiar = False
if copiar:
listasecundaria_trimmed.append(listasecundaria[i])
# print("Sin repeticiones salida_a2_trimmed: {}".format(salida_a2_trimmed))
#print("Sin repeticiones listasecundaria_trimmed: {}".format(listasecundaria_trimmed))
# Assign identifiers (TX) to entities (effect, regulator, regulated)
lastID = 0
for i in range(len(salida_a2_trimmed)):
# if sys.argv[2].find('355') > -1:
# print("i : {}".format(i))
salida_a2_trimmed[i] = list(salida_a2_trimmed[i])
ID = "T" + str(i + 1)
salida_a2_trimmed[i].insert(0, ID)
lastID = i + 1
# if sys.argv[2].find('355') > -1:
# print("lastID : {}".format(lastID))
for i in range(len(listasecundaria_trimmed)):
# if sys.argv[2].find('355') > -1:
# print("i : {}".format(i))
# print("lastID : {}".format(lastID))
listasecundaria_trimmed[i] = list(listasecundaria_trimmed[i])
ID = "T" + str(i + 1 + lastID)
listasecundaria_trimmed[i].insert(0, ID)
# print("Con identificadores salida_a2_trimmed: {}".format(salida_a2_trimmed))
#print("Con identificadores listasecundaria_trimmed: {}".format(listasecundaria_trimmed))
#print("salida_a2_trimmed") #########################
#print(salida_a2_trimmed) #########################
#print("listasecundaria_trimmed")
#print(listasecundaria_trimmed)
# Build Regulatory Interactions
i = 0
while i < int(len(salida_a2_trimmed)):
if "EFFECT" in salida_a2_trimmed[i][1]:
# SEARCH FOR REGULATED ENTITY TO THE RIGHT
nuevaRI = [salida_a2_trimmed[i][0], "", ""] # effect, theme (regulated), cause (regulator)
ref = ""
posiblesminimos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
posiblesminimos[0].append(salida_a2_trimmed[j][2])
posiblesminimos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# if not found, SEARCH FOR REGULATED ENTITY TO THE LEFT
if not ref:
posiblesmaximos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
posiblesmaximos[0].append(salida_a2_trimmed[j][3])
posiblesmaximos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesmaximos[0]:
refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0]))
ref = posiblesmaximos[1][refpointer]
nuevaRI[1] = ref
# SEARCH FOR REGULATOR TO THE RIGHT
ref = ""
posiblesminimos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
posiblesminimos[0].append(salida_a2_trimmed[j][2])
posiblesminimos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# if not found, SEARCH FOR REGULATOR TO THE LEFT
if not ref:
posiblesmaximos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
posiblesmaximos[0].append(salida_a2_trimmed[j][3])
posiblesmaximos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesmaximos[0]:
refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0]))
ref = posiblesmaximos[1][refpointer]
nuevaRI[2] = ref
listadeRIs.append(nuevaRI)
i = i + 1
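# Illustrative note (editor sketch, hypothetical IDs): each entry of listadeRIs built above is
# [effect_id, regulated_id, regulator_id], e.g. ['T1', 'T3', 'T2'], meaning effect T1 is linked
# to the nearest regulated entity to its right (or, failing that, to its left) T3 and to the
# nearest TF T2. An empty string is left when no candidate was found; those gaps are filled in
# the second phase below using listasecundaria_trimmed.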
# SECOND PHASE OF SEARCH FOR REGULATORS AND REGULATED ENTITIES
i = 0
while i < int(len(listadeRIs)):
if not listadeRIs[i][1]: # no regulated entity
ref = ""
posiblesminimos = [[], []]
# SEARCH FOR REGULATED ENTITY TO THE RIGHT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# SEARCH FOR REGULATED ENTITY TO THE LEFT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# CHOOSE THE CLOSEST REGULATED ENTITY
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# print(ref)
listadeRIs[i][1] = ref
if not listadeRIs[i][2]: # no regulator
ref = ""
posiblesminimos = [[], []]
# SEARCH FOR REGULATOR TO THE RIGHT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# SEARCH FOR REGULATOR TO THE LEFT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# CHOOSE THE CLOSEST REGULATOR
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# print(ref)
listadeRIs[i][2] = ref
i = i + 1
#print("ListadeRIs: {}".format(listadeRIs))
# Choose regulators and regulated entities from listasecundaria to be printed
setmem = []
k = 0
while k < int(len(listadeRIs)):
j = 0
copysec = False
#while j < int(len(listasecundaria_trimmed)):
while j < len(listasecundaria_trimmed):
# print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs))
# Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
if listasecundaria_trimmed[j][0] == listadeRIs[k][2]:
# print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][2] {}".format(listasecundaria_trimmed[j][0],
# listadeRIs[k][2]))
copysec = True
# print("j: {}".format(j))
indj = j
j = j + 1
if copysec:
setmem.append(listasecundaria_trimmed[indj])
# print("setmen: {}".format(setmem))
#### CMC: I ADDED THIS CODE TO SEARCH FOR REGULATED ENTITIES, SINCE THE CODE ABOVE SEARCHES FOR REGULATORS
j = 0
copysec = False
#while j < int(len(listasecundaria_trimmed)):
while j < len(listasecundaria_trimmed):
# print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs))
# Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
# print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][1] {}".format(listasecundaria_trimmed[j][0],
# listadeRIs[k][1]))
copysec = True
# print("j: {}".format(j))
indj = j
j = j + 1
if copysec:
setmem.append(listasecundaria_trimmed[indj])
# print("setmen: {}".format(setmem))
k = k + 1
setmem = sorted(setmem)
# print("setmen: {}".format(setmem))
dedup = [setmem[i] for i in range(len(setmem)) if i == 0 or setmem[i] != setmem[i - 1]]
# print("dedup: {}".format(dedup))
salida_a2_trimmed.extend(dedup)
#print("salida_a2_trimmed after listasecundaria_trimmed: {}".format(salida_a2_trimmed))
# Assign identifiers (RX) to events (RIs)
for i in range(len(listadeRIs)):
# Original Daniel: ID = "E" + str(i+1)
ID = "R" + str(i + 1)
listadeRIs[i].insert(0, ID)
#print("Con identificadores ListadeRIs: {}".format(listadeRIs))
# CREATE LIST OF EVENTS (RX) AND ENTITIES (TX) IN PRINT FORMAT
for i in range(len(salida_a2_trimmed)):
linea = str(salida_a2_trimmed[i][0]) + ' ' + str(salida_a2_trimmed[i][1]) + ' ' + str(
salida_a2_trimmed[i][2]) + ' ' + str(salida_a2_trimmed[i][3]) + ' ' + str(salida_a2_trimmed[i][4])
# Original Daniel: impresion.append(linea)
impresionEntities.append(linea)
for i in range(len(listadeRIs)):
if listadeRIs[i][2] and listadeRIs[i][3]:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) + ' ' + 'Cause:' + str(listadeRIs[i][3])
linea = str(listadeRIs[i][0]) + ' ' + "Interaction." + str(listadeRIs[i][1]) + ' ' + 'Target:' + str(
listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3])
# Original Daniel: elif listadeRIs[i][2]:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2])
# Original Daniel: elif listadeRIs[i][3]:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Cause:' + str(listadeRIs[i][3])
# Original Daniel: else:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1])
# Original Daniel: impresion.append(linea)
impresionInteractionsComplete.append(linea)
#print("Interaction complete: {}".format(linea))
linea = str(listadeRIs[i][0]) + ' ' + "Interaction.regulator" + ' ' + 'Target:' + str(
listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3])
impresionInteractionsIncomplete.append(linea)
#print("Entities: {}".format(impresionEntities))
# Write entities of complete interactions to a1
for line in impresionEntities:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_entities_complete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
# Write entities of incomplete interactions to a1
for line in impresionEntities:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_entities_incomplete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
# Write complete interactions (regulator, effect, regulated)
# print("InteractionsComplete: {}".format(impresionInteractionsComplete))
for line in impresionInteractionsComplete:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_interactions_complete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
# Write incomplete interactions (regulator, "regulator", regulated)
# print("InteractionsIncomplete: {}".format(impresionInteractionsIncomplete))
for line in impresionInteractionsIncomplete:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_interactions_incomplete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
with open(file_name_text_complete, mode="w") as txtFile:
txtFile.write(dato)
with open(file_name_text_incomplete, mode="w") as txtFile:
txtFile.write(dato)
import fileinput
import re
import sys
if len(sys.argv) < 3:
sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <output_file>\n")
sys.stderr.flush()
exit(2)
else:
print("Ok.")
# READ INPUT FILE
text_file = open( sys.argv[1], "r" )
dato = text_file.read().splitlines()
text_file.close()
# REMOVE EXTENSION FROM FILE NAME
split_line = sys.argv[2]
split_line = split_line[:-4]
file_name=""
file_name = split_line + ".san"
open( file_name , 'w').close()
# APPLY REGEX SUBSTITUTIONS AND WRITE OUTPUT FILE (ARGV 2)
for line in dato:
line = re.sub('[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NNNNa_)
line = re.sub('[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NNNNa_]
line = re.sub('[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NN,NN,NN_)
line = re.sub('[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NN,NN,NN_]
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num_)
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num.num_)
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num-num_)
line = re.sub('[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num_]
line = re.sub('[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num.num_]
line = re.sub('[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num-num_]
line = re.sub('[\(]\s[a-zA-Z]{1}\s[\)]', '', line.rstrip()) #removes (_alpha_)
line = re.sub('[\[]\s[a-zA-Z]{1}\s[\]]', '', line.rstrip()) #removes [_alpha_]
line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman_)
line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman-Roman_)
line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman_)
line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman-roman_)
line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman_]
line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman-Roman_]
line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman_]
line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman-roman_]
line = re.sub('[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.|table)\s[^\(|^\)]*[\)]', '', line.rstrip(), flags=re.I) #removes parenthetical references such as (_fig . N_), (_see ..._), (_i . e . ..._)
line = re.sub('  ', ' ', line.rstrip()) #collapses double spaces left by the removals above
#print(line)
save_file = open( file_name, "a" )
save_file.write(line)
save_file.write("\n")
save_file.close()
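# Illustrative note (editor sketch, made-up sentence): the substitutions above strip
# parenthetical citations, figure pointers, and single-letter or Roman-numeral markers left
# by PDF extraction. For example, a tokenized line such as
#   "The argO gene ( Fig . 2a ) is activated by ArgP ( 12 )"
# would be written to the .san file approximately as
#   "The argO gene is activated by ArgP"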
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to TRN table
# Parameters:
# 1) --trnPath Path to TRN detail table
# 2) --trnFile File of TRN detail table
# 3) --outputPath Output path
# 4) --organismPath Path to Organism annotation table
# 5) --organismFile File of Organism annotation table
# Output:
# 1) Tsv file detail with:
# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
# OrganismScore = {
# If only salmonella or only non identified organism = 1,
# If (startswith salmonella or non identified organism) and other organisms = 0.5
# If only other organisms = 0
# }
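# Worked example (editor sketch, hypothetical annotations): for a sentence annotated with
# Organisms = "Salmonella enterica serovar Typhimurium;Escherichia coli", the list starts with a
# Salmonella mention but also names another organism, so OrganismScore = 0.5;
# "Salmonella Typhimurium;unidentified plasmid" alone would give 1, and
# "Escherichia coli" alone would give 0.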
# Execution:
# python3.4 get-TRN-Organism-v1.py
# Local
# python get-TRN-Organism-v1.py
# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --trnFile STMTRN_all.detail.tsv
# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results
# --organismFile annotations_STMTRN_all.sentences.csv
# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv
###########################################################
# MAIN PROGRAM #
###########################################################
def only_salmonella_or_non_identified_organism(list_temp):
non_identified_organisms = [
'unidentified plasmid',
'unidentified',
'bacterium',
'bacterium IFAM-3211',
'bacterium IFAM-2074',
'bacterium IFAM-1493',
'bacterium IFAM-3215',
'bacterium IFAM-3359',
'hybrid',
'Vector pMC1403',
'Transposon Tn10',
'unidentified cloning vector',
'Plasmid F',
'Cloning vector pUC19'
]
matches = 0
for o in list_temp:
if o.lower().startswith("salmonella") or o in non_identified_organisms:
matches += 1
if matches == len(list_temp):
return True
else:
return False
def salmonella_or_non_identified_and_other_organisms(list_temp):
non_identified_organisms = [
'unidentified plasmid',
'unidentified',
'bacterium',
'bacterium IFAM-3211',
'bacterium IFAM-2074',
'bacterium IFAM-1493',
'bacterium IFAM-3215',
'bacterium IFAM-3359',
'hybrid',
'Vector pMC1403',
'Transposon Tn10',
'unidentified cloning vector',
'Plasmid F',
'Cloning vector pUC19'
]
matches = 0
for o in list_temp:
if o.lower().startswith("salmonella") or o in non_identified_organisms:
matches += 1
if matches < len(list_temp) and matches > 0:
return True
else:
return False
def only_other_organims(list_temp):
non_identified_organisms = [
'unidentified plasmid',
'unidentified',
'bacterium',
'bacterium IFAM-3211',
'bacterium IFAM-2074',
'bacterium IFAM-1493',
'bacterium IFAM-3215',
'bacterium IFAM-3359',
'hybrid',
'Vector pMC1403',
'Transposon Tn10',
'unidentified cloning vector',
'Plasmid F',
'Cloning vector pUC19'
]
matches = 0
for o in list_temp:
if o.lower().startswith("salmonella") or o in non_identified_organisms:
matches += 1
if matches == 0:
return True
else:
return False
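# Editor sketch (illustrative, hypothetical lists): the three helpers above partition an
# organism-mention list into the three OrganismScore cases, e.g.
#   only_salmonella_or_non_identified_organism(['Salmonella Typhimurium', 'unidentified']) -> True
#   salmonella_or_non_identified_and_other_organisms(['Salmonella Typhimurium', 'Escherichia coli']) -> True
#   only_other_organims(['Escherichia coli']) -> True
# Exactly one of them is expected to hold for any non-empty list.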
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--trnPath", dest="trnPath",
help="Path to TRN detail table", metavar="PATH")
parser.add_option("--trnFile", dest="trnFile",
help="File of TRN detail table", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--organismPath", dest="organismPath",
help="Path to organism annotation table", metavar="PATH")
parser.add_option("--organismFile", dest="organismFile",
help="File of organism annotation table", metavar="FILE")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to TRN detail table: " + str(options.trnPath))
print("File of TRN detail table: " + str(options.trnFile))
print("Output path: " + str(options.outputPath))
print("Path to organism annotation table: " + str(options.organismPath))
print("File of organism annotation table: " + str(options.organismFile))
# Load organism annotation table
print("Loading organism annotation table")
df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',')
print("Total de frases anotadas con organism: {}".format(df_organisms.shape[0]))
# Load TRN detail table
print("Loading TRN detail table")
df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t')
print("Total de frases en TRN: {}".format(df_detail.shape[0]))
# Fix column for organism. We changed this issue in get-TRN-v2.py
df_detail = df_detail.rename(columns={"Organism": "Organisms"})
df_detail['OrganismScore'] = 1.00
print(df_detail.columns)
#print(df_detail['Sentence'].head(15))
for idx in df_organisms.index:
organisms = df_organisms['Organisms'][idx]
SentenceNumberInFile = df_organisms['SentenceNumberInFile'][idx]
SentenceNumberInFile = SentenceNumberInFile - 2
# print("Organisms before: {}".format(df_detail.Organisms[SentenceNumberInFile]))
df_detail.loc[SentenceNumberInFile, 'Organisms'] = organisms
# print("Organisms assigned: {}".format(df_detail.Organisms[SentenceNumberInFile]))
# OrganismScore = {
# If only salmonella or only non identified organism = 1,
# If (startswith salmonella or non identified organism) and other organisms = 0.5
# If only other organisms = 0
# }
list_organisms = organisms.split(';')
# print(" OrganismScore before: {}".format(df_detail.OrganismScore[SentenceNumberInFile]))
if only_salmonella_or_non_identified_organism(list_organisms):
df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 1.00
elif salmonella_or_non_identified_and_other_organisms(list_organisms):
df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.50
elif only_other_organims(list_organisms):
df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.00
# print(" OrganismScore assigned: {}".format(df_detail.OrganismScore[SentenceNumberInFile]))
hashPredictedRIs = {}
hashPredictedRIsCount = {}
hashPredictedRIsCountVer = {}
hashPredictedRIsCountDev = {}
hashPredictedRIsCountAtt = {}
hashPredictedRIsCountAuto = {}
hashPredictedRIsScore = {}
hashPredictedRIsRI = {}
for idx in df_detail.index:
tf = df_detail['TF'][idx]
TypeRegulated = df_detail['TypeRegulated'][idx]
Regulated = df_detail['Regulated'][idx]
Effect = df_detail['Effect'][idx]
pmid = df_detail['PMID'][idx]
numsent = df_detail['NumSentence'][idx]
type_sent = df_detail['TypeSentence'][idx]
sentence = df_detail['Sentence'][idx]
original_idsentence = df_detail['OriginalIdSentence'][idx]
original_sentence = df_detail['OriginalSentence'][idx]
section_num = df_detail['SectionNum'][idx]
section_name = df_detail['SectionName'][idx]
organisms = df_detail['Organisms'][idx]
organism_score = df_detail['OrganismScore'][idx]
llave = "{}\t{}\t{}\t{}".format(tf, TypeRegulated, Regulated, Effect)
if organism_score == 0:
continue
if llave in hashPredictedRIs:
hashPredictedRIs[llave].append(
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence,
original_sentence, section_num, section_name, organisms,
organism_score, "", "", "", "", "", ""))
hashPredictedRIsCount[llave] += 1
if type_sent == "ver/dev":
hashPredictedRIsCountVer[llave] += 1
elif type_sent == "dev":
hashPredictedRIsCountDev[llave] += 1
elif type_sent == "att":
hashPredictedRIsCountAtt[llave] += 1
elif type_sent == "auto":
hashPredictedRIsCountAuto[llave] += 1
# if organism_score == 0.5:
# We penalize RI
# hashPredictedRIsScore[llave] -= 0.05
else:
hashPredictedRIs[llave] = [
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence,
original_sentence, section_num, section_name, organisms,
organism_score, "", "", "", "", "", "")]
hashPredictedRIsCount[llave] = 1
hashPredictedRIsCountVer[llave] = 0
hashPredictedRIsCountDev[llave] = 0
hashPredictedRIsCountAtt[llave] = 0
hashPredictedRIsCountAuto[llave] = 0
hashPredictedRIsScore[llave] = 1
if type_sent == "ver/dev":
hashPredictedRIsCountVer[llave] = 1
elif type_sent == "dev":
hashPredictedRIsCountDev[llave] = 1
elif type_sent == "att":
hashPredictedRIsCountAtt[llave] = 1
elif type_sent == "auto":
hashPredictedRIsCountAuto[llave] = 1
# if organism_score == 0.5:
# We penalize RI
# hashPredictedRIsScore[llave] -= 0.05
print("Total RIs en TRN con organismo: {}".format(len(hashPredictedRIs)))
with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org")), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
for k,v in hashPredictedRIs.items():
RI_value = "True"
# if hashPredictedRIsScore[k] < 1:
# RI_value = "Possible"
oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k],
hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k],
hashPredictedRIsScore[k], RI_value))
with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org")), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n")
i = 0
for k,v in hashPredictedRIs.items():
for s in v:
oFile.write("{}\t{}\n".format(k, s))
i += 1
print("Total de frases en TRN organismo: {}".format(i))
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: generate TRN
# CFMC 2022-03-11: We added:
# 1) Section of output sentences
# 2)
# Parameters:
# 1) --predictedPath Path for predicted interactions
# 2) --outputPath Output path
# 3) --outputFile Prefix file for saving TRN
# 4) --diccPath Dictionary path
# 5) --diccSynon File with synonyms of TFs
# 6) --tsvPath Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf
# 7) --jsonpdfPath Path to read jsonpdf file to extract section name
# Output:
# 1) Tsv file detail with:
# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
# 1) Tsv file summary with:
# TF TypeRegulated Regulated Effect SentCount Ver/Dev Att Auto Score RI (True/False)
# Execution:
# Version 2 TRN Salmonella
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_v2
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_v2 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf
# articulos_sal_4
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_articulos_sal_4
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_4 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf
# articulos_sal_1
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_articulos_sal_1
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_1 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf
# all = articulos_sal_1 + articulos_sal_2 + articulos_sal_3 + articulos_sal_4
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_all
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_all --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf
####
# python3.4 get-TRN-v1.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# Con dataset automatic-extraction-STM-RIs-dataset
# python3.4 get-TRN-v1.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STM-RIs-dataset
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STM-RIs-dataset --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
###########################################################
# MAIN PROGRAM #
###########################################################
def updateHashPredicted(pr, hashP, pm, sF, ef):
# updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect])
if pr not in hashP:
hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
hashP[pr] = hashTemp
else:
hashTemp = hashP[pr]
if pm in hashTemp["pmids"]:
hashP[pr]["pmids"][pm].append(sF)
else:
hashP[pr]["pmids"][pm] = [sF]
def get_standard_name(regSynon):
reg = regSynon
if regSynon in hashSynon:
reg = hashSynon[regSynon]
else:
for syn, std in hashSynon.items():
if regSynon.startswith(syn):
reg = regSynon.replace(syn, std, 1)
break
return reg
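# Editor sketch (illustrative, hypothetical synonym entry): get_standard_name maps a TF synonym
# to its standard name using hashSynon loaded from --diccSynon, e.g. with
#   hashSynon = {"HilD2": "HilD"}
#   get_standard_name("HilD2")  -> "HilD"   (exact match)
#   get_standard_name("HilD2p") -> "HilDp"  (prefix match; the first matching synonym wins)
#   get_standard_name("InvF")   -> "InvF"   (unchanged when no synonym applies)
# updateHashPredicted groups predicted pairs as {"pmids": {pmid: [sentenceFile, ...]}, "orieff": effect}.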
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--predictedPath", dest="predictedPath",
help="Path predicted ris gcs", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--outputFile", dest="outputFile",
help="Preffix file for saving results", metavar="FILE")
parser.add_option("--diccPath", dest="diccPath",
help="Path to dictionary", metavar="PATH")
parser.add_option("--diccSynon", dest="diccSynon",
help="File with synonyms", metavar="FILE")
parser.add_option("--tsvPath", dest="tsvPath",
help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
parser.add_option("--jsonpdfPath", dest="jsonpdfPath",
help="Path to read jsonpdf file to extract section name", metavar="PATH")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path predicted ris gcs: " + str(options.predictedPath))
print("Output path: " + str(options.outputPath))
print("Preffix file for saving results: " + str(options.outputFile))
print("Path to dictionary: " + str(options.diccPath))
print("File with synonyms: " + str(options.diccSynon))
print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))
print("Path to read jsonpdf file to extract section name: " + str(options.jsonpdfPath))
use_synonyms = False
hashSynon = {}
if options.diccPath != None and options.diccSynon != "no-synonyms":
print("***** Using synonyms *****")
use_synonyms = True
print('Loading dictionary of synonyms...')
with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon:
hashSynon = json.load(diccSynon)
print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon)))
hashPredictedRIs = {}
hashPredictedRIsCount = {}
hashPredictedRIsCountVer = {}
hashPredictedRIsCountDev = {}
hashPredictedRIsCountAtt = {}
hashPredictedRIsCountAuto = {}
hashFiles = {}
for path, dirs, files in os.walk(options.predictedPath):
for file in files:
if file.endswith(".a1"):
filename = file[:-3]
if filename not in hashFiles:
hashFiles[filename] = 1
else:
hashFiles[filename] += 1
print("Files: {}".format(len(hashFiles)))
processedFiles = 0
id_ri = 1
regex_att_auto = re.compile(r"(\.att\.|\.auto\.)[0-9]*$")
for file in sorted(hashFiles.keys()):
print("File: {}".format(file))
type_sent = "ver/dev"
if file.find("dataSet_OnlyRI_sentences") > -1:
pmid = "000000"
if file.find("dataSet_OnlyRI_sentences.") > -1:
if file.find(".att.") > -1:
numsent = file[file.find("att.") + 4:]
type_sent = "att"
if pmid.find(".auto.") > -1:
numsent = file[file.find("auto.") + 5:]
type_sent = "auto"
else:
numsent = file[file.find("_", file.find("_", file.find("_") + 1) + 1) + 1:file.find("-")]
numsent = numsent.replace(".al", "")
print("dataSet_OnlyRI_sentences numsent: {}".format(numsent))
print("dataSet_OnlyRI_sentences pmid: {}".format(pmid))
else:
pmid = file[:file.find("_")]
# print("pmid: {}".format(pmid))
numsent = file[file.find("_")+1:file.find("-")]
numsent = numsent.replace(".al", "")
if pmid.find(".att.") > -1:
# CFMC 2022-03-11: Fix error in pmid
# CFMC 2022-03-11 Original: pmid = pmid.replace(".att.", "")
pmid = regex_att_auto.sub("", pmid)
numsent = file[file.find("att.")+4:]
type_sent = "att"
if pmid.find(".auto.") > -1:
# CFMC 2022-03-11: Fix error in pmid
# CFMC 2022-03-11 Original: pmid = pmid.replace(".auto.", "")
pmid = regex_att_auto.sub("", pmid)
numsent = file[file.find("auto.") + 5:]
type_sent = "auto"
# numsent = file[file.find("_"):file.find("-")]
# print("pmid {}".format(pmid))
# print("numsent: {}".format(numsent))
sentenceFile = file[:file.find("-", file.find("_"))] + ".txt"
hashEntitiesGenes = {}
hashEntitiesTUs = {}
hashEntitiesTFs = {}
hashEntitiesEffects = {}
hashOriginalEffect = {}
regex_fix_regulator = re.compile(r'(Regulated|Binds|Bind|deverbal_effect|Regulate)')
regex_fix_repressor = re.compile(r'(Repressing|Represses)')
with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
entity = listLine2[0]
entity_type = listLine2[0]
idEntity = listLine1[0]
originalEffect = listLine1[2]
if entity.startswith("EFFECT"):
entity = entity[entity.find(".") + 1:]
# print("Entity: {}".format(entity))
if pmid.find("_dev") > -1:
type_sent = "dev"
entity = entity.replace("_dev", "")
# print("Entity without _dev: {}".format(entity))
if idEntity not in hashOriginalEffect:
hashOriginalEffect[idEntity] = originalEffect
if idEntity not in hashEntitiesEffects:
# We fixed some wrong effects in TRN, but we must also fix this in the other script where the error is produced
if regex_fix_regulator.match(entity):
print("WARNING EFFECT: {}".format(entity))
entity = regex_fix_regulator.sub("regulator", entity)
print("WARNING EFFECT after: {}".format(entity))
if regex_fix_repressor.match(entity):
print("WARNING EFFECT: {}".format(entity))
entity = regex_fix_repressor.sub("repressor", entity)
print("WARNING EFFECT after: {}".format(entity))
hashEntitiesEffects[idEntity] = entity
else:
entity = listLine1[2]
if entity_type == "GENE":
if idEntity not in hashEntitiesGenes:
hashEntitiesGenes[idEntity] = entity
elif entity_type == "TU":
if idEntity not in hashEntitiesTUs:
hashEntitiesTUs[idEntity] = entity
elif entity_type == "TF":
if idEntity not in hashEntitiesTFs:
hashEntitiesTFs[idEntity] = entity
# print("hashEntities: {}".format(hashEntitiesGenes))
# print("hashEntities: {}".format(hashEntitiesTUs))
# print("hashEntities: {}".format(hashEntitiesTFs))
with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File:
sentence = ''
with open(os.path.join(options.predictedPath, file + ".txt"), mode="r") as txtFile:
sentence = txtFile.read()
listTokens = [token.split('|')[0] for token in sentence.split()]
sentence = ' '.join(listTokens)
# CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
# Open jsonpdf file
hash_sections = {}
sentences = {}
print('Loading jsonpdf file...')
with open(os.path.join(options.jsonpdfPath, pmid + ".jsonpdf"), "r", encoding="utf-8", errors="replace") as jsonpdfFile:
text_file = jsonpdfFile.read()
if file.startswith("26781240"):
text_file = text_file.replace(" \\ ", " \\\\ ")
elif file.startswith("26249345"):
text_file = text_file.replace('}], ', '}],"sections": {}')
try:
hash_jsonpdf = json.loads(text_file)
print(' Loading jsonpdf file... done!')
except Exception as e:
print(e)
print(" Loading jsonpdf file failed: {}".format(file))
hash_sections = hash_jsonpdf["sections"]
# print("Sections: {}".format(hash_sections))
sentences = hash_jsonpdf["sentences"]
# Open tsv file
print('Loading tsv file...')
file_tsv = pmid + ".pre.fil.tsv"
tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
# print("tsv_file.shape: {}".format(tsv_file.shape))
tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
# print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
tsv_file_new = tsv_file_filtered.reset_index(drop=True)
# print(tsv_file_new.head(10))
print(' Loading tsv file... done!')
numsent_int = int(numsent)
original_sentence = tsv_file_new.at[numsent_int, 'sentence']
section_num = tsv_file_new.at[numsent_int, 'section']
# print("type(section_num): {}".format(type(section_num)))
original_idsentence = tsv_file_new.at[numsent_int, 'idsentence']
section_num_str = str(section_num)
if section_num_str in hash_sections:
section_name = hash_sections[section_num_str]
else:
section_name = "Unknown"
for line in a2File:
# print("Line a2: {}".format(line))
# R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
tf = hashEntitiesTFs[regulator]
if tf.endswith("ed"):
tf = tf[:tf.find("-")]
#else:
# Clean TF names by expressions seen in TRN output file
tf = re.sub(r"(/absence|controlle|activation|‐regulate|‐mediate|mediate|-regulate|regulate|ˉ|-like|-mutant|-type|-independent|-dependent|dependent|-dependant|-binding|-and|-family|-bound|-deficient|-indepen-dent|-inducing|-green|-overproducing|-or|-depletion|-repressible|-dual|-box)", "", tf)
# Clean false TF names - 2329
result = re.match(r"(cyclic|RHONDA|Crawford|Hulett|Rhodobacter|Danino|Huang|Neisseria|Huang|HUGHES1|Robbe-Saule|Danchin|Roberts|Furer|Hunter|Furue|Humphreys|Nacional)", tf)
if result:
break
# H
tf = get_standard_name(tf)
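# Illustrative note (editor sketch, hypothetical mention): a raw regulator mention such as
# "ArgP-regulated" (typical of attributive sentences) ends with "ed", so it is cut at the first
# "-" to give "ArgP"; the regex cleanup and get_standard_name() then leave it as the standard
# TF name, assuming no synonym entry applies.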
# print("numsent: {}".format(numsent))
# For L&C do not increment 1
# CFMC 2022-03-11 Original: numsent_int = int(numsent)
if regulated in hashEntitiesGenes:
type_regulated = "Gene"
llave = "{}\t{}\t{}\t{}".format(tf, "gene", hashEntitiesGenes[regulated],
hashEntitiesEffects[effect])
elif regulated in hashEntitiesTUs:
type_regulated ="TU"
llave = "{}\t{}\t{}\t{}".format(tf, "TU", hashEntitiesTUs[regulated],
hashEntitiesEffects[effect])
else:
print("ERROR: Regulated entity not found!")
# Skip this interaction line; without a resolved regulated entity no key can be built
continue
# Clean false cases
if llave.startswith("Hu"):
break
if llave in hashPredictedRIs:
# CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
hashPredictedRIs[llave].append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, ""))
hashPredictedRIsCount[llave] += 1
if type_sent == "ver/dev":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountVer[llave] += 1
# else:
# hashPredictedRIsCountVer[llave] = 1
elif type_sent == "dev":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountDev[llave] += 1
# else:
# hashPredictedRIsCountDev[llave] = 1
elif type_sent == "att":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountAtt[llave] += 1
# else:
# hashPredictedRIsCountAtt[llave] = 1
elif type_sent == "auto":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountAuto[llave] += 1
# else:
# hashPredictedRIsCountAuto[llave] = 1
else:
# CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
hashPredictedRIs[llave] = ["{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")]
hashPredictedRIsCount[llave] = 1
hashPredictedRIsCountVer[llave] = 0
hashPredictedRIsCountDev[llave] = 0
hashPredictedRIsCountAtt[llave] = 0
hashPredictedRIsCountAuto[llave] = 0
if type_sent == "ver/dev":
hashPredictedRIsCountVer[llave] = 1
elif type_sent == "dev":
hashPredictedRIsCountDev[llave] = 1
elif type_sent == "att":
hashPredictedRIsCountAtt[llave] = 1
elif type_sent == "auto":
hashPredictedRIsCountAuto[llave] = 1
id_ri += 1
processedFiles += 1
print("Processed files: {}".format(processedFiles))
with open(os.path.join(options.outputPath, options.outputFile + ".summary.tsv"), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
for k,v in hashPredictedRIs.items():
oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k],
hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], "1", "True"))
#oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], hashPredictedRIsCountDev[k], hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], v))
with open(os.path.join(options.outputPath, options.outputFile + ".detail.tsv"), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tConfirmationLevel\n")
for k,v in hashPredictedRIs.items():
for s in v:
oFile.write("{}\t{}\n".format(k, s))
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
import re
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: obtain predicted ris from attributive sentences, such as ArgP-regulated gene argP
# Input format: transformed format.
# WARNING: Only one sentence per line
# Parameters:
# 1) --inputPath Input path
# 2) --inputFile Input file
# 3) --outputPath Output path
# 4) --diccPath Dictionary path
# 5) --diccEffect File with normalized effects
# 6) --diccFile JSON file with entity dictionaries (commented out below)
# 7) --diccEPAth Dictionary path for diccEffect (commented out below)
# 8) --format Output format: standoff, tabs (commented out below)
# Output:
# 1) File with predicted ris combined with existing files.
# Format standoff:
# T1 TF 0 0 ArgP-regulated
# T2 GENE 0 0 argP
# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors
# R1 Interaction.activator Target:T3 Agent:T1
# Execution
# C:\anaconda3\python ri-attributive-extraction.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences
# --inputFile ris-sentences-analysis.att.017.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs
# --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources
# --diccEffect normalized_Effects.json
# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.286.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
# python3 ri-attributive-extraction.py
# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences
# --inputFile ris-sentences-analysis.att.017.txt
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs
# --diccPath /home/cmendezc/terminologicalResources
# --diccEffect normalized_Effects.json
# python3 ri-attributive-extraction.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json
###########################################################
# MAIN PROGRAM #
###########################################################
def getPosWord(wordPos, endPos, text, termList):
offsetStart = 0
wordNum = 0
listText = text.split()
for w in listText:
# if filenameBefore.find('000-2') > -1:
# print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
if wordNum >= int(wordPos):
# for tok in word.split():
for t in termList:
# For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
if w == t:
return [w, offsetStart, offsetStart + len(w) - 1]
#else:
wordNum += 1
offsetStart += len(w) + 1
if wordNum > int(endPos):
return None
return None
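# Illustrative usage of getPosWord (hypothetical sentence, not from the corpus):
# getPosWord(2, 5, "ArgP represses the argP gene", ["argP"]) returns ["argP", 19, 22],
# i.e. the matched word plus its character start/end offsets within the sentence.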
def getIdEntity(aList, etype, idE):
entity = aList[0]
if etype == "EFFECT":
normalizedEffect = entity
#print("EFFECT: {}".format(entity))
if entity in hashNormalizedEffects:
normalizedEffect = hashNormalizedEffects[entity]
etype += "." + normalizedEffect
#print("etype: {}".format(etype))
entityPosStart = aList[1]
entityPosEnd = aList[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
#print("keyEntity: {}".format(keyEntity))
if keyEntity not in hashEntities:
idE += 1
idEntity = "T{}".format(idE)
hashEntities[keyEntity] = idEntity
#print("New entity {}: {}".format(idEntity, keyEntity))
return idEntity, idE
else:
idEntity = hashEntities[keyEntity]
return idEntity, idE
def getIdInteraction(regulator, regulated, effect, idI, hashInt):
#print("hashInt: {}".format(hashInt))
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashInt:
idI += 1
idInteraction = "R{}".format(idI)
hashInt[keyInteraction] = idInteraction
#print("New interaction {}: {}".format(idInteraction, keyInteraction))
#return idInteraction, idI
else:
idInteraction = hashInt[keyInteraction]
return idInteraction, idI
def saveFiles(filename, hashE, hashI, s, effect):
if effect:
outputPath = os.path.join(options.outputPath, "complete-ris")
else:
outputPath = os.path.join(options.outputPath, "incomplete-ris")
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
aList = k.split()
a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
aList = k.split()
a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile:
txtFile.write(s)
def loadFileEntities(filename, outputPath, hashTemp):
#print("Start loadFileEntities")
idE = 1
try:
f = filename[:filename.rfind(".")] + ".a1"
# print("file entities: {}".format(f))
with open(os.path.join(outputPath, f), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
etype = listLine2[0]
entityPosStart = listLine2[1]
entityPosEnd = listLine2[2]
entity = listLine1[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
idEntity = listLine1[0]
if keyEntity not in hashTemp:
hashTemp[keyEntity] = idEntity
if int(idEntity[1:]) > idE:
idE = int(idEntity[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, f)))
# idE = 1
return idE
def loadFileInteractions(filename, outputPath, hashTemp):
#print("Start loadFileInteractions")
idI = 1
try:
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File:
for line in a2File:
#print("Line a2: {}".format(line))
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
idInteraction = listLine1[0]
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashTemp:
hashTemp[keyInteraction] = idInteraction
if int(idInteraction[1:]) > idI:
idI = int(idInteraction[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")))
# idI = 1
return idI
def getRealPos(posStart, posEnd, lin):
return (posStart, posEnd)
def getRI(r, l):
regulator = r.group('regulator')
regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l)
# regulatorStart = getRealPos(r.start('regulator'), l)
# regulatorEnd = getRealPos(r.end('regulator'), l)
regulated = r.group('regulated')
regulatedPos = getRealPos(r.start('regulated'), r.end('regulated'), l)
# regulatedStart = getRealPos(r.start('regulated'), l)
# regulatedEnd = getRealPos(r.end('regulated'), l)
effect = r.group('effect')
effectPos = getRealPos(r.start('effect'), r.end('effect'), l)
# effectStart = getRealPos(r.start('effect'), l)
# effectEnd = getRealPos(r.end('effect'), l)
#print("Regulator {}, start {}, end {}".format(regulator, regulatorPos[0], regulatorPos[1]))
#print("Regulated {}, start {}, end {}".format(regulated, regulatedPos[0], regulatedPos[1]))
#print("Effect {}, start {}, end {}".format(effect, effectPos[0], effectPos[1]))
return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
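# getRI returns [regulator, regulated, effect, line]; regulator and regulated are encoded
# as "word|lemma|TYPE|charStart|charEnd" and effect as "word|charStart|charEnd", which is
# how the main loop below splits them.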
if __name__ == "__main__":
# Parameter definition
# python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py
# --inputPath $(dirname ${file})
# --inputFile $(basename ${file})
# --outputPath $OUTPUT_PATH
# --diccPath $DICC_PATH
# --diccEffect normalized_Effects.json
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--diccPath", dest="diccPath",
help="Path to read dictionaries", metavar="PATH")
# parser.add_option("--diccFile", dest="diccFile",
# help="JSON file with entity dictionaries", metavar="FILE")
parser.add_option("--diccEffect", dest="diccEffect",
help="File with normalized effects", metavar="FILE")
# parser.add_option("--format", dest="format",
# help="Output format: standoff", metavar="TEXT")
# parser.add_option("--diccEPAth", dest="diccEPAth",
# help="File with normalized effects", metavar="FILE")
(options, args) = parser.parse_args()
#if len(args) > 0:
# parser.error("None parameter entered.")
# sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file: " + str(options.inputFile))
print("Output path: " + str(options.outputPath))
print("Path to read dictionaries: " + str(options.diccPath))
# print("JSON file with entity dictionaries: " + str(options.diccFile))
print("File with normalized effects: " + str(options.diccEffect))
# print("Output format: " + str(options.format))
# print("Path to read normalized effects: " + str(options.diccEPAth))
# regularWords = words.words('en')
# print('Loading dictionaries...')
# with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
# hashDicc = json.load(diccFile)
# hashTermFiles = hashDicc["hashTermFiles"]
# hashTerms = hashDicc["hashTerms"]
# for key in hashTermFiles.keys():
# for f in hashTermFiles[key]:
# # print('File: ' + f)
# with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile:
# for line in iFile:
# line = line.strip('\n')
# line = line.replace(' ', '-')
# if line not in hashTerms[key]:
# hashTerms[key].append(line)
# # if options.termLower:
# # hashTerms[key].append(line.lower())
# # if options.termCapitalize:
# # hashTerms[key].append(line.capitalize())
# print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
# Loading normalized effects
print('Loading normalized effects ending with -d...')
hashNormalizedEffects = {}
with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
hashNormalizedEffects = json.load(diccFile)
listEffects = []
for eff in hashNormalizedEffects.keys():
if eff.endswith('d'):
listEffects.append(eff)
listEffects.append("dependent")
effects = "|".join(listEffects)
#print("Effects: {}".format(effects))
files = {}
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
# regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)\s([^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
# regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+( [^ ]+)')
# regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))\s([^|]+\|[^|]+\|(CC|,))?)+ ([^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
# regexAttRILeft = re.compile(r'(?:([^|\s]+\|[^|]+\|(?:GENE|TU))\s(?:[^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
# regexAttRILeft = re.compile(r'(?=([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)')
# regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)')
# regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+) ([^ ]+ )+(?P<regulator>[^|]+\|[^|]+\|TF)')
# regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>' + r'(' + effects + ')\|[^|]+\|TF) [^|]+\|gene')
# reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
# regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(regulated|repressed)\|[^|]+\|TF) [^|]+\|gene')
# regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ ){,5}(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene')
# CMC 2018-11-07: regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene')
regexAttRILeft = re.compile(
r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF)')
# regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ ){,5}(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
# CMC 2018-11-07: regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ )+(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
regexAttRIRight = re.compile(
r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) ([^ ]+ )*(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
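# Illustrative match (assuming "regulated" is among the loaded effects ending in -d):
# in a transformed line such as "ArgP-regulated|ArgP-regulate|TF genes|gene|NNS argP|argP|GENE",
# regexAttRIRight captures regulator "ArgP-regulated|ArgP-regulate|TF", effect "regulated"
# and regulated "argP|argP|GENE"; regexAttRILeft covers the mirror-image order (GENE/TU first).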
filename = options.inputFile
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
outputPath = os.path.join(options.outputPath, "complete-ris")
idEntities = loadFileEntities(filename, outputPath, hashEntities)
idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
outputPath = os.path.join(options.outputPath, "incomplete-ris")
idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
listRIs = []
with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
for line in iFile:
line = line.rstrip('\n')
# Search leftward
#print("Searching leftward <<")
result = regexAttRILeft.search(line)
#print("result: {}".format(result))
lineTemp = line
# print("lineTemp: {}".format(lineTemp))
while result:
#print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect')))
listRIs.append(getRI(result, line))
#print("listRIs: {}".format(listRIs))
lineTemp = lineTemp.replace(result.group('regulated'), '')
#print("lineTemp for: {}".format(lineTemp))
result = regexAttRILeft.search(lineTemp)
#print("result: {}".format(result))
# Search rightward
#print("Searching rightward >>")
result = regexAttRIRight.search(line)
#print("result: {}".format(result))
lineTemp = line
# print("lineTemp: {}".format(lineTemp))
while result:
#print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect')))
listRIs.append(getRI(result, line))
#print("listRIs: {}".format(listRIs))
lineTemp = lineTemp.replace(result.group('regulated'), '')
#print("lineTemp for: {}".format(lineTemp))
result = regexAttRIRight.search(lineTemp)
#print("result: {}".format(result))
# result = regexAttRIRight.finditer(line)
# lineTemp = line
# while result:
# listRIs.append(getRI(result, line))
# lineTemp = lineTemp.replace(result.group('regulated'), '')
# result = regexAttRIRight.finditer(lineTemp)
# return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
# regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
# effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
for ri in listRIs:
#print("ri: {}".format(ri))
if len(ri) != 4:
print("WARNING! corrupted list")
exit()
regulator = ri[0]
regulated = ri[1]
effect = ri[2]
line = ri[3]
listElem = regulator.split('|')
regulatorWord = listElem[0]
regulatorType = listElem[2]
regulatorStart = listElem[3]
regulatorEnd = listElem[4]
listElem = regulated.split('|')
regulatedWord = listElem[0]
regulatedType = listElem[2]
regulatedStart = listElem[3]
regulatedEnd = listElem[4]
listElem = effect.split('|')
effectWord = listElem[0]
effectType = "EFFECT"
effectStart = listElem[1]
effectEnd = listElem[2]
idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
if regulatedType == "GENE":
idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
elif regulatedType == "TU":
idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "TU", idEntities)
else:
print("WARNING! Unknown entity type")
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
idInteractions, hashInteractions)
idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
idInteractionsEffect,
hashInteractionsEffect)
saveFiles(filename, hashEntities, hashInteractions, line, effect=False)
saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True)
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
import re
from general_functions import getTypeRegulation
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: obtain predicted ris from autoregulation sentences,
# such as ArgP protein represses its own synthesis
# Input format: transformed format.
# WARNING: Only one sentence per line
# Parameters:
# 1) --inputPath Input path
# 2) --inputFile Input file
# 3) --outputPath Output path
# 4) --diccPath Dictionary path
# 5) --diccEffect File with normalized effects
# (the --diccFile, --diccEPAth and --format options are not defined in this script)
# Output:
# 1) File with predicted ris combined with existing files.
# Format standoff:
# T1 TF 0 0 ArgP
# T2 GENE 0 0 argP (TF name ArgP --> gene name argP)
# R1 Interaction.activator Target:T3 Agent:T1
# Sentence ArgP protein represses its own synthesis
# The FimZ transcription factor activates this promoter directly ,
# and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
# Execution
# python3 ri-autoregulation-extraction-v01.py
# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
# --inputFile dataSet_OnlyRI_sentences.auto.1017.txt
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# --diccPath /home/cmendezc/terminologicalResources
# --diccEffect normalized_Effects.json
# python3 ri-autoregulation-extraction-v01.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences --inputFile dataSet_OnlyRI_sentences.auto.1017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json
###########################################################
# MAIN PROGRAM #
###########################################################
def getPosWord(wordPos, endPos, text, termList):
offsetStart = 0
wordNum = 0
listText = text.split()
for w in listText:
# if filenameBefore.find('000-2') > -1:
# print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
if wordNum >= int(wordPos):
# for tok in word.split():
for t in termList:
# For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
if w == t:
return [w, offsetStart, offsetStart + len(w) - 1]
#else:
wordNum += 1
offsetStart += len(w) + 1
if wordNum > int(endPos):
return None
return None
def getIdEntity(aList, etype, idE):
entity = aList[0]
if etype == "EFFECT":
normalizedEffect = entity
#print("EFFECT: {}".format(entity))
if entity in hashNormalizedEffects:
normalizedEffect = hashNormalizedEffects[entity]
etype += "." + normalizedEffect
#print("etype: {}".format(etype))
entityPosStart = aList[1]
entityPosEnd = aList[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
#print("keyEntity: {}".format(keyEntity))
if keyEntity not in hashEntities:
idE += 1
idEntity = "T{}".format(idE)
hashEntities[keyEntity] = idEntity
#print("New entity {}: {}".format(idEntity, keyEntity))
return idEntity, idE
else:
idEntity = hashEntities[keyEntity]
return idEntity, idE
def getIdInteraction(regulator, regulated, effect, idI, hashInt):
#print("hashInt: {}".format(hashInt))
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashInt:
idI += 1
idInteraction = "R{}".format(idI)
hashInt[keyInteraction] = idInteraction
#print("New interaction {}: {}".format(idInteraction, keyInteraction))
#return idInteraction, idI
else:
idInteraction = hashInt[keyInteraction]
return idInteraction, idI
def saveFiles(filename, hashE, hashI, s, effect):
if effect:
outputPath = os.path.join(options.outputPath, "complete-ris")
else:
outputPath = os.path.join(options.outputPath, "incomplete-ris")
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
aList = k.split()
a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
aList = k.split()
a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile:
txtFile.write(s)
def loadFileEntities(filename, outputPath, hashTemp):
#print("Start loadFileEntities")
idE = 1
try:
f = filename[:filename.rfind(".")] + ".a1"
# print("file entities: {}".format(f))
with open(os.path.join(outputPath, f), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
etype = listLine2[0]
entityPosStart = listLine2[1]
entityPosEnd = listLine2[2]
entity = listLine1[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
idEntity = listLine1[0]
if keyEntity not in hashTemp:
hashTemp[keyEntity] = idEntity
if int(idEntity[1:]) > idE:
idE = int(idEntity[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, f)))
# idE = 1
return idE
def loadFileInteractions(filename, outputPath, hashTemp):
#print("Start loadFileInteractions")
idI = 1
try:
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File:
for line in a2File:
#print("Line a2: {}".format(line))
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
idInteraction = listLine1[0]
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashTemp:
hashTemp[keyInteraction] = idInteraction
if int(idInteraction[1:]) > idI:
idI = int(idInteraction[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")))
# idI = 1
return idI
'''
def getTypeRegulation(effect_group, posini, sent, type_sent):
# To change regulation effect in such as:
# negative regulator --> repressor
# positively regulates --> activator
effect_ret = effect_group
#listEff = effect_ret.split('|')
if type_sent == "tra":
regexTypeEffectPosi = re.compile(r'(?<=positive\|(RB|JJ) )' + effect_ret)
regexTypeEffectNega = re.compile(r'(?<=negative\|(RB|JJ) )' + effect_ret)
if regexTypeEffectPosi.search(sent, posini - 12):
# Probably not necessary: effect_ret = "activator|{}|{}".format(listEff[1], listEff[2])
effect_ret = "activator"
print("Change regulation effect: {}".format(sent))
elif regexTypeEffectNega.search(sent, posini - 12):
# Probably not necessary: effect_ret = "repressor|{}|{}".format(listEff[1], listEff[2])
effect_ret = "repressor"
print("Change regulation effect: {}".format(sent))
return effect_ret
'''
def getRealPos(posStart, posEnd, lin):
return (posStart, posEnd)
def getRI(r, l):
regulator = r.group('regulator')
regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l)
# We change TF name to GENE name
listRegulator = regulator.split('|')
regulatorWord = listRegulator[0]
regulated = regulatorWord[0].lower()+regulatorWord[1:]
regulated += "|{}|GENE".format(regulated)
regulatedPos = getRealPos(0, 0, l)
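# Illustrative: for regulator "ArgP|ArgP|TF" the regulated entity becomes "argP|argP|GENE"
# (TF name with its first letter lowercased), matching the "ArgP --> argP" example in the
# header comments; its positions are fixed at 0,0.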
effect = r.group('effect')
# print("effect from group: {}".format(effect))
effectPos = getRealPos(r.start('effect'), r.end('effect'), l)
# To change regulation effect in:
# negative regulator --> repressor
# positively regulates --> activator
effect = getTypeRegulation(effect, r.start('effect'), l, "tra")
return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--diccPath", dest="diccPath",
help="Path to read dictionaries", metavar="PATH")
parser.add_option("--diccEffect", dest="diccEffect",
help="File with normalized effects", metavar="FILE")
(options, args) = parser.parse_args()
#if len(args) > 0:
# parser.error("None parameter entered.")
# sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file: " + str(options.inputFile))
print("Output path: " + str(options.outputPath))
print("Path to read dictionaries: " + str(options.diccPath))
print("File with normalized effects: " + str(options.diccEffect))
# Loading normalized effects
print('Loading normalized effects (all)...')
hashNormalizedEffects = {}
with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
hashNormalizedEffects = json.load(diccFile)
listEffects = []
for eff in hashNormalizedEffects.keys():
listEffects.append(eff)
effects = "|".join(listEffects)
#print("Effects: {}".format(effects))
files = {}
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
# The FimZ transcription factor activates this promoter directly ,
# and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
regexAutoRI = re.compile(
# r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]\s){,4}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
#r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^(TF)\s]+\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
#r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^T][^F]\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT')
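# Illustrative match (assuming "regulator" is a key in normalized_Effects.json): for the
# OxyR example above, regexAutoRI captures regulator "OxyR|OxyR|TF" and effect "regulator",
# since "regulator|regulator|EFFECT" is followed within five tokens by
# "its|its|PRP$ own|own|JJ expression".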
filename = options.inputFile
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
outputPath = os.path.join(options.outputPath, "complete-ris")
idEntities = loadFileEntities(filename, outputPath, hashEntities)
idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
outputPath = os.path.join(options.outputPath, "incomplete-ris")
idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
listRIs = []
# print("Read autoregulation file")
with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
for line in iFile:
line = line.rstrip('\n')
print("Buscando autoregulation")
result = regexAutoRI.search(line)
#print("result: {}".format(result))
if result:
lineTemp = result.string[result.end('regulator'):result.end(0)]
# print("lineTemp: {}".format(lineTemp))
result2 = regexAutoRI.search(lineTemp)
if result2:
print("Regulator {} regulated {} effect {}".format(result2.group('regulator'), result2.group('regulator'), result2.group('effect')))
listRIs.append(getRI(result2, line))
print("listRIs: {}".format(listRIs))
elif result:
print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulator'), result.group('effect')))
listRIs.append(getRI(result, line))
print("listRIs: {}".format(listRIs))
for ri in listRIs:
#print("ri: {}".format(ri))
if len(ri) != 4:
print("WARNING! corrupted list")
exit()
regulator = ri[0]
regulated = ri[1]
effect = ri[2]
line = ri[3]
listElem = regulator.split('|')
regulatorWord = listElem[0]
regulatorType = listElem[2]
regulatorStart = listElem[3]
regulatorEnd = listElem[4]
listElem = regulated.split('|')
regulatedWord = listElem[0]
regulatedType = listElem[2]
regulatedStart = listElem[3]
regulatedEnd = listElem[4]
listElem = effect.split('|')
effectWord = listElem[0]
effectType = "EFFECT"
effectStart = listElem[1]
effectEnd = listElem[2]
idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
idInteractions, hashInteractions)
idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
idInteractionsEffect,
hashInteractionsEffect)
saveFiles(filename, hashEntities, hashInteractions, line, effect=False)
saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True)
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
from general_functions import getTypeRegulation
import re
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: obtain predicted ris from triplets extracted by OpenIE Stanford CoreNLP
# Input format:
# WARNING: Only one sentence per line
# Parameters:
# 1) --inputPath Input path
# 2) --inputFile Input file
# 3) --outputPath Output path
# 4) --diccPath Dictionary path
# 5) --diccFile JSON file with entity dictionaries
# 6) --diccEffect File with normalized effects
# 7) --format Output format: standoff, tabs
# 8) --diccEPAth Dictionary path for diccEffect
# Output:
# 1) File with predicted ris.
# Format standoff:
# T1 TF 0 0 MetR
# T2 TU 0 0 metH
# T3 GENE 0 0 metH
# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors
# T2 Growth_condition 150 179 switch rich to minimal medium
# R1 Interaction.activator Target:T3 Agent:T1
# R2 Interaction.activator Target:T2 Agent:T1
# Execution
# python3.4 ri-openie-extraction.py
# --inputFile /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris/predicted-ris.reverb
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/processing-ris
# --diccPath /home/cmendezc/terminologicalResources
# --diccFile normalized_Effects_Type.json
# --diccEffect termFilesTag_RIE_GCE_SYSTEM_ECCO.json
# --format standoff
###########################################################
# MAIN PROGRAM #
###########################################################
def getPosWord(wordPos, endPos, text, termList, type_entity=""):
#print("GETPOSWORD wordPOs {}".format(wordPos))
offsetStart = 0
wordNum = 0
listText = text.split()
for w in listText:
# if filenameBefore.find('000-2') > -1:
# print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
if wordNum >= int(wordPos):
# for tok in word.split():
for t in termList:
# For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
if w == t:
if type_entity == "EFFECT":
# To change regulation effect in:
# negative regulator --> repressor
# positively regulates --> activator
print("text: {}".format(text))
new_w = getTypeRegulation(w, int(wordPos), text, "word")
return [new_w, offsetStart, offsetStart + len(w) - 1]
else:
return [w, offsetStart, offsetStart + len(w) - 1]
#else:
wordNum += 1
offsetStart += len(w) + 1
if wordNum > int(endPos):
return None
return None
def getIdEntity(aList, etype, idE):
entity = aList[0]
if etype == "EFFECT":
normalizedEffect = entity
# print("EFFECT: {}".format(entity))
if entity in hashEffects:
normalizedEffect = hashEffects[entity]
etype += "." + normalizedEffect
# print("EFFECT: {}".format(entity))
entityPosStart = aList[1]
entityPosEnd = aList[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
#if filenameBefore.find('061-02') > -1:
# print("keyEntity: {}".format(keyEntity))
# print("idE: {}".format(idE))
# print("hashEntities: {}".format(hashEntities))
if keyEntity not in hashEntities:
idE += 1
idEntity = "T{}".format(idE)
#if filenameBefore.find('061-02') > -1:
# print("idEntity not in hashEntities: {}".format(keyEntity))
# print("idE not in hashEntities: {}".format(idE))
hashEntities[keyEntity] = idEntity
#print("New entity {}: {}".format(idEntity, keyEntity))
return idEntity, idE
else:
idEntity = hashEntities[keyEntity]
return idEntity, idE
def getIdInteraction(regulator, regulated, effect, idI, hashInt):
#print("hashInt: {}".format(hashInt))
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashInt:
idI += 1
idInteraction = "R{}".format(idI)
hashInt[keyInteraction] = idInteraction
#print("New interaction {}: {}".format(idInteraction, keyInteraction))
#return idInteraction, idI
else:
idInteraction = hashInt[keyInteraction]
return idInteraction, idI
def saveFiles(filename, hashE, hashI, s, effect):
if effect:
outputPath = os.path.join(options.outputPath, "complete-ris")
else:
outputPath = os.path.join(options.outputPath, "incomplete-ris")
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="w") as a1File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
aList = k.split()
a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="w") as a2File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
aList = k.split()
a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
with open(os.path.join(outputPath, filename[:file.find(".")] + ".txt"), mode="w") as txtFile:
txtFile.write(s)
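# Note: the slicing expressions above use the module-level variable "file" (set in the
# processing loop under __main__), not the "filename" argument; the same applies to
# loadFileEntities and loadFileInteractions below.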
def loadFileEntities(filename, outputPath, hashTemp):
idE = 1
try:
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
etype = listLine2[0]
entityPosStart = listLine2[1]
entityPosEnd = listLine2[2]
entity = listLine1[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
idEntity = listLine1[0]
if keyEntity not in hashTemp:
hashTemp[keyEntity] = idEntity
if int(idEntity[1:]) > idE:
idE = int(idEntity[1:])
except IOError:
print("IOError file, idEntity starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a1")))
# idE = 1
return idE
def loadFileInteractions(filename, outputPath, hashTemp):
idI = 1
try:
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="r") as a2File:
for line in a2File:
#print("Line a2: {}".format(line))
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
idInteraction = listLine1[0]
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashTemp:
hashTemp[keyInteraction] = idInteraction
if int(idInteraction[1:]) > idI:
idI = int(idInteraction[1:])
except IOError:
print("IOError file, idInteraction starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a2")))
# idI = 1
return idI
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
#parser.add_option("--outputFile", dest="outputFile",
#help="Output file", metavar="FILE")
parser.add_option("--diccPath", dest="diccPath",
help="Path to read dictionaries", metavar="PATH")
parser.add_option("--diccFile", dest="diccFile",
help="JSON file with entity dictionaries", metavar="FILE")
parser.add_option("--diccEffect", dest="diccEffect",
help="File with normalized effects", metavar="FILE")
parser.add_option("--format", dest="format",
help="Output format: standoff", metavar="TEXT")
parser.add_option("--diccEPAth", dest="diccEPAth",
help="File with normalized effects", metavar="FILE")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file: " + str(options.inputFile))
print("Output path: " + str(options.outputPath))
#print("Output file: " + str(options.outputFile))
print("Path to read dictionaries: " + str(options.diccPath))
print("JSON file with entity dictionaries: " + str(options.diccFile))
print("Path to read normalized effects: " + str(options.diccEPAth))
print("File with normalized effects: " + str(options.diccEffect))
print("Output format: " + str(options.format))
regularWords = words.words('en')
print('Loading dictionaries...')
with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
hashDicc = json.load(diccFile)
# hashTermFiles = hashDicc["hashTermFiles"]
# hashTerms = hashDicc["hashTerms"]
# for key in hashTermFiles.keys():
# for f in hashTermFiles[key]:
# # print('File: ' + f)
# with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile:
# for line in iFile:
# line = line.strip('\n')
# line = line.replace(' ', '-')
# if line not in hashTerms[key]:
# hashTerms[key].append(line)
# # if options.termLower:
# # hashTerms[key].append(line.lower())
# # if options.termCapitalize:
# # hashTerms[key].append(line.capitalize())
# print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
# Loading normalized effects
print('Loading normalized effects...')
with open(os.path.join(options.diccEPAth, options.diccEffect)) as diccFile:
hashEffects = json.load(diccFile)
files = {}
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
filenameBefore = ''
regexNumFile = re.compile(r'_([0-9]+)[.-]')
numFile = ""
inumFile = 0
hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []}
with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
for line in iFile:
line = line.rstrip('\n')
listLine = line.split('\t')
file = listLine[0]
filename = file.split("/")[-1]
filename = filename[:-4]
if filename not in files:
# New file, that is, new sentence
files[filename] = 1
if len(files) > 1:
if len(hashEntities) > 0:
#if filenameBefore.find('061-02') > -1:
# print("filenameBefore: {}".format(filenameBefore))
# print("Save hashEntities: {}".format(hashEntities))
# print("Save hashInteractions: {}".format(hashInteractions))
# print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect))
saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
filenameBefore = filename
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
outputPath = os.path.join(options.outputPath, "complete-ris")
idEntities = loadFileEntities(filename, outputPath, hashEntities)
idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
outputPath = os.path.join(options.outputPath, "incomplete-ris")
idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
result = regexNumFile.search(filenameBefore)
if result:
inumFile = int(result.group(1))
numFile = str(inumFile)
print("Numfile: {}".format(numFile))
else:
print("WARNING: numfile not found in filename")
hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []}
if numFile in hashDicc:
hashTemp = hashDicc[numFile]
#print("hashDicc[numFile]: {}".format(hashTemp))
for k, v in hashTemp.items():
if v == "TF":
# print("Verifiying TF")
if "TF" in hashTerms:
# print(" TF {}".format(k))
hashTerms["TF"].append(k)
else:
hashTerms["TF"] = [k]
elif v == "GENE":
if "GENE" in hashTerms:
hashTerms["GENE"].append(k)
else:
hashTerms["GENE"] = [k]
elif v == "TU":
if "TU" in hashTerms:
hashTerms["TU"].append(k)
else:
hashTerms["TU"] = [k]
elif v == "EFFECT":
if "EFFECT" in hashTerms:
hashTerms["EFFECT"].append(k)
else:
hashTerms["EFFECT"] = [k]
else:
print("WARNING: entity not found in dictionaries")
else:
print("WARNING: numfile not found in dictionaries")
#if filename.find('061-02') > -1:
# print("filename: {}".format(filename))
# print("Load hashEntities: {}".format(hashEntities))
# print("Load hashInteractions: {}".format(hashInteractions))
# print("Load hashInteractionsEffect: {}".format(hashInteractionsEffect))
wordA = listLine[2]
wordB = listLine[3]
wordC = listLine[4]
startA = listLine[5]
endA = listLine[6]
startB = listLine[7]
endB = listLine[8]
startC = listLine[9]
endC = listLine[10]
sent = listLine[12]
lemmaA = listLine[2]
lemmaB = listLine[3]
lemmaC = listLine[4]
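# Column layout assumed from the indices above: 0 = path of the per-sentence file,
# 2-4 = the OpenIE triplet (A and C are the arguments, B the relation), 5-10 = word-level
# start/end positions of A, B and C, 12 = the transformed sentence; the lemma variables
# reuse columns 2-4.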
# Return [tok, offsetStart, offsetEnd ]
# print("hashTerms[TF]: {}".format(hashTerms["TF"]))
listRegulator = getPosWord(startA, endA, sent, hashTerms["TF"])
if listRegulator is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulator found: {}".format(listRegulator[0]))
listRegulated = getPosWord(startC, endC, sent, hashTerms["GENE"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated GENE found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
else:
listRegulated = getPosWord(startC, endC, sent, hashTerms["TU"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated TU found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
else:
listRegulator = getPosWord(startC, endC, sent, hashTerms["TF"])
if listRegulator is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulator found: {}".format(listRegulator[0]))
listRegulated = getPosWord(startA, endA, sent, hashTerms["GENE"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated GENE found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
else:
listRegulated = getPosWord(startA, endA, sent, hashTerms["TU"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated TU found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
if len(files) > 1:
if len(hashEntities) > 0:
#print("filenameBefore: {}".format(filenameBefore))
#print("Save hashEntities: {}".format(hashEntities))
#print("Save hashInteractions: {}".format(hashInteractions))
#print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect))
saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
#!/bin/bash
###### Automatic extraction of TRN from several files ######
BRIES_HOME=/myhome/bries
PMIDS_HOME=/myhome/preprocessed-files
# We don't use REFERENCE_HOME because we don't evaluate. Path /reference-data-set doesn't exist. File no-reference.txt doesn't exist.
REFERENCE_HOME=/myhome/reference-data-set
for f in $PMIDS_HOME/original/text/*.*
do
FILE_NAME=$(basename "$f")
FILE_NAME="${FILE_NAME%.*}"
echo "File: $FILE_NAME"
./automatic-extraction-ris-gcs.sh $PMIDS_HOME/features/$FILE_NAME.tra.word.txt $PMIDS_HOME/transformed/$FILE_NAME.tra.txt $BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt $BRIES_HOME/predicted-ris-gcs Y Y FILT1 $REFERENCE_HOME no-reference.txt $BRIES_HOME/evaluation-reports no-evaluation.txt diccionario-SYNONYMS.json $PMIDS_HOME/original/tsv 1>uno-$FILE_NAME.txt 2>dos-$FILE_NAME.txt
done
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: Filter sentences with specific entities.
# Also extract attributive sentences: effect-TF
# And autoregulation: regulates its own gene
# CFMC 2022-03-08: We added updating tsv file with idsentence, sentence and section (.pre.tsv)
# to indicate filtered sentences.
# Parameters:
# 1) --inputFileWord Path and filename to read feature word file.
# 2) --inputFileTrans Path and filename to read transformed file.
# 3) --outputPath Path to place output file.
# 4) --outputFile Output file.
# 5) --filter FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath Path for attributive cases: ArgP-regulated genes
# 7) --dicPath Path for dictionary
# 8) --dicFile Path for dictionary file normalized_Effects.json
# 9) --autoPath Path for autoregulation cases: regulates its own gene
# /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
# 10) --tsvPath Path to tsv file with section, id sentence and sentence (extracted from jsonpdf)
# Output:
# 1) Filtered sentences.
# 2) Attributive sentences
# 3) Autoregulation sentences
###########################################################
# MAIN PROGRAM #
###########################################################
def getEntities(tline, filt):
# FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
entities = {}
tline = tline.rstrip('\n\r ')
for token in tline.split(" "):
# print("Token: {}".format(token))
listElem = token.split("|")
w = listElem[0]
l = listElem[1]
t = listElem[2]
if filt == "FILT1" or filt == "FILT2":
if t in ["GENE", "TU", "TF", "EFFECT"]:
if w not in entities:
entities[w] = t
# if filt == "FILT2":
# if t in ["GENE", "TU", "TF", "EFFECT"]:
# if w not in entities:
# entities[w] = t
return entities
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputFileWord", dest="inputFileWord",
help="Path and filename to read feature word file", metavar="PATH")
parser.add_option("--inputFileTrans", dest="inputFileTrans",
help="Path and filename to read transformed file", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--outputFile", dest="outputFile",
help="Output file", metavar="FILE")
parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None,
help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT")
parser.add_option("--attrPath", dest="attrPath",
help="Output path attributive sentences", metavar="PATH")
parser.add_option("--dicPath", dest="dicPath",
help="Output path dictionary", metavar="PATH")
parser.add_option("--dicFile", dest="dicFile",
help="Output file dictionary normalized_Effects.json", metavar="FILE")
parser.add_option("--autoPath", dest="autoPath",
help="Output path autoregulation sentences", metavar="PATH")
parser.add_option("--tsvPath", dest="tsvPath",
help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path and filename to read feature word file: " + str(options.inputFileWord))
print("Path and filename to read transformed file: " + str(options.inputFileTrans))
print("Output path: " + str(options.outputPath))
print("Output file: " + str(options.outputFile))
print("Filter: " + str(options.filter))
print("Output path attributive sentences: " + str(options.attrPath))
print("Output path autoregulation sentences: " + str(options.autoPath))
print("Output path dictionary: " + str(options.dicPath))
print("Output file dictionary normalized_Effects.json: " + str(options.dicFile))
print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))
# Loading normalized effects
# print('Loading normalized effects...')
hashNormalizedEffects = {}
with open(os.path.join(options.dicPath, options.dicFile)) as diccFile:
hashNormalizedEffects = json.load(diccFile)
listEffects = []
for eff in hashNormalizedEffects.keys():
if eff.endswith('d'):
listEffects.append(eff)
listEffects.append("dependent")
effects = "|".join(listEffects)
print("Effects: {}".format(effects))
t0 = time()
count = 0
hashEntities = {}
hashAttrSent = {}
hashAutoSent = {}
# Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
# We decided to extract all sentences containing effect-TF because we observed some patterns where
# "gene" does not appear, then, to recover these examples we employ a more general rule to separate
# attributive sentences.
reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF')
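# Illustrative: a transformed line containing a token such as
# "ArgP-regulated|ArgP-regulate|TF" satisfies reAttrSent and is routed to the
# attributive-sentence output, assuming "regulated" is among the effects ending in -d
# loaded above.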
# We decided to extract all sentences containing autoregulation
# The FimZ transcription factor activates this promoter directly ,
# and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ')
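# Illustrative check (not part of the pipeline): the autoregulation pattern above is expected
# to match the OxyR example quoted in the comments, i.e. the following search returns a match:
#   reAutoSent.search("OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT "
#                     "of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN")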
aFilter = options.filter
print(" Processing file...{}".format(options.inputFileTrans))
with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile:
with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile:
# CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf)
file = os.path.basename(options.inputFileTrans)
file_tsv = file.replace(".tra.txt", ".pre.tsv")
tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
print("tsv_file.shape: {}".format(tsv_file.shape))
tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
# print(tsv_file_filtered.head(10))
tsv_file_new = tsv_file_filtered.reset_index(drop=True)
# print(tsv_file_new.shape)
# print(tsv_file_new.head(10))
i = 0
for tLine, wLine in zip(tFile, wFile):
# FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
if aFilter is not None:
reGENETU = re.compile(r'(\|GENE|\|TU)')
reEFFECT = re.compile(r'\|EFFECT')
reTF = re.compile(r'\|TF')
tCount = str(count)
if aFilter == "FILT1":
if not (reGENETU.search(tLine) and reTF.search(tLine)):
#print("NOT FOUND")
# CFMC 2022-03-08
tsv_file_new.at[i, 'status'] = 0
i += 1
continue
else:
#print("FOUND")
oFile.write(wLine)
if tCount not in hashEntities:
hashEntities[tCount] = getEntities(tLine, aFilter)
if reAttrSent.search(tLine):
#print("ATTRIBUTIVE SENTENCE: {}".format(tLine))
if tCount not in hashAttrSent:
hashAttrSent[tCount] = tLine
# Autoregulation sentences
if reAutoSent.search(tLine):
# print("AUOREGULATION SENTENCE: {}".format(tLine))
if tCount not in hashAutoSent:
hashAutoSent[tCount] = tLine
#print(tLine)
elif aFilter == "FILT2":
if not (reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine)):
# CFMC 2022-03-08: mark the sentence as filtered out before skipping it
tsv_file_new.at[i, 'status'] = 0
i += 1
continue
else:
oFile.write(wLine)
if tCount not in hashEntities:
hashEntities[tCount] = getEntities(tLine, aFilter)
if reAttrSent.search(tLine):
if tCount not in hashAttrSent:
hashAttrSent[tCount] = tLine
if reAutoSent.search(tLine):
if tCount not in hashAutoSent:
hashAutoSent[tCount] = tLine
count += 1
i += 1
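# Propagate the filter decisions back to the original tsv: sentences discarded above keep
# status 0, every other row keeps its original status, and the result is written out next to
# the input tsv with the suffix .fil.tsv.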
merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left')
# print(merged.shape)
# print(merged.head(10))
tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int)
tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
# print(tsv_file_filtered.head(10))
tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t')
with open(os.path.join(options.outputPath, options.outputFile.replace(".txt", ".ents.json")), "w", encoding="utf-8",
errors="replace") as eFile:
json.dump(hashEntities, eFile)
for f, sent in hashAttrSent.items():
listPath = options.inputFileTrans.split('/')
fileName = listPath[-1]
fileName = fileName.replace('.tra.', '.att.' + f + '.')
print("Save file {}".format(fileName))
with open(os.path.join(options.attrPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
aFile.write(sent)
for f, sent in hashAutoSent.items():
listPath = options.inputFileTrans.split('/')
fileName = listPath[-1]
fileName = fileName.replace('.tra.', '.auto.' + f + '.')
print("Save file {}".format(fileName))
with open(os.path.join(options.autoPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
aFile.write(sent)
print("Files split in: %fs" % (time() - t0))
#!/bin/bash
#Validate arguments
if [[ ! ("$#" == 3 ) ]]; then
echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>'
exit 1
fi
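# Example invocation (all paths below are illustrative only, not taken from this repository):
#   ./sentence-simplification-main.sh ./attributive-sentences ./simplified-index.txt /opt/isimp_v2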
SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Define here the keyword of the group of sentences to simplify.
INPUT_PATH=$1
OUTPUT_INDEX_FILE_PATH=$2
ISIMP_PATH=$3
cd $SCRIPT_PATH
# RUN THE ISIMP ANALYSIS
echo "Analysing in iSimp..."
# Clear any previous iSimp output
if [ -n "$(ls -A ./iSimp_sentences/)" ]; then
rm ./iSimp_sentences/*
fi
#cd $INPUT_PATH
for j in $INPUT_PATH/*
do
echo $j
#echo "++++entrada_simp: $j salida_simp: $SCRIPT_PATH/iSimp_sentences/$(basename $j)"
$ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
done
cd $SCRIPT_PATH
# CREATE THE INDEX OF SIMPLIFIED FILES
#touch $SCRIPT_PATH/index.txt
>| $OUTPUT_INDEX_FILE_PATH
# FEED THE SIMPLIFICATION ALGORITHM
echo "Analysing in Algorithm..."
# Clear any previous algorithm output
if [ -n "$(ls -A ./algorithm_sentences/)" ]; then
rm ./algorithm_sentences/*
fi
#cd ./iSimp_sentences
for k in $SCRIPT_PATH/iSimp_sentences/*
do
echo $k
#echo "entrada: $k salida: $SCRIPT_PATH/algorithm_sentences/$(basename $k) index: $OUTPUT_INDEX_FILE_PATH"
python2 $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
done
cd $SCRIPT_PATH
import copy
import sys
import requests
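# simplifier.py (Python 2; invoked by sentence-simplification-main.sh as
#   python2 simplifier.py <isimp_analysis_file> <output_file> <index_file>).
# It reads one iSimp analysis (TYPE/TEXT/POS/TREE/SIMP records), applies the simplification
# rules below (coordination, parenthesis, apposition, member-collection, sentence coordination,
# full and reduced relative clauses) and writes each resulting simplified sentence to its own
# .alg file, appending every output path to the index file.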
class Simp(object):
def __init__(self):
self.TYPE=""
self.TYPEx=0
self.TYPEy=0
self.TEXT=""
self.COMP=[]
def agregarTYPE(self,Type):
self.TYPE=Type
def agregarTEXT(self,text):
self.TEXT=text
def agregarCOMP(self,comp):
self.COMP.append(comp)
class Frase(object):
def __init__(self):
self.TYPE=""
self.TEXT=""
self.POS=""
self.TREE=""
self.SIMP=[]
def agregarTYPE(self,Type):
self.TYPE=Type
def agregarTEXT(self,text):
self.TEXT=text
def agregarPOS(self,Pos):
self.POS=Pos
def agregarTREE(self,Tree):
self.TREE=Tree
def agregarSIMP(self):
self.SIMP.append(Simp())
class Sentence(object):
def __init__(self):
self.FLAG=True
self.TEXT=""
self.TREE=""
self.SIMP=[]
def agregarTEXT(self,text):
self.TEXT=text
def agregarTREE(self,Tree):
self.TREE=Tree
def agregarSIMP(self):
self.SIMP.append(Simp())
MEMORIAB=[]
MEMORIAA=[]
#---- read the data from the input file
arch=(sys.argv[1])
f = open(arch)
dato = f.read().splitlines()
f.close()
frase=Frase()
for i in range(len(dato)):
if 'TYPE: ' in dato[i][0:6]:
frase.agregarTYPE(dato[i][6:])
elif 'TEXT: ' in dato[i][0:6]:
frase.agregarTEXT(dato[i][6:])
elif 'POS : ' in dato[i][0:6]:
frase.agregarPOS(dato[i][6:])
elif 'TREE: ' in dato[i][0:6]:
frase.agregarTREE(dato[i][6:])
elif 'SIMP:' in dato[i]:
frase.agregarSIMP()
elif ' TYPE: ' in dato[i][0:8]:
frase.SIMP[-1].agregarTYPE(dato[i][8:])
elif ' TEXT: ' in dato[i][0:8]:
frase.SIMP[-1].agregarTEXT(dato[i][8:])
elif ' COMP: ' in dato[i]:
frase.SIMP[-1].agregarCOMP(dato[i][8:])
#------------
#------- Main program
# Algorithm v4
if (frase.TYPE.find('sentence') != -1) and (frase.SIMP != []) and (frase.SIMP[0].TYPE != ''):
y=1
w=1
SIMPworkspace=[]
# copy TREE and each SIMP into Sentence1
Sentence1=Sentence()
Sentence1.TREE=copy.deepcopy(frase.TREE)
Sentence1.TEXT=copy.deepcopy(frase.TEXT)
for i in range(len(frase.SIMP)):
#Sentence1.SIMP.append(Simp())
#Sentence1.SIMP[i]=copy.deepcopy(frase.SIMP[i])
SIMPworkspace.append(Simp())
SIMPworkspace[i]=copy.deepcopy(frase.SIMP[i])
## SORT THE SIMPs
for i in range(len(SIMPworkspace)):
#print SIMPworkspace[i].TEXT
#print SIMPworkspace[i].TYPE
SIMPworkspace[i].TYPEx = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('[')+1:SIMPworkspace[i].TYPE.find('..')])
SIMPworkspace[i].TYPEy = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('..')+2:SIMPworkspace[i].TYPE.find(']')])
if 'parenthesis' in SIMPworkspace[i].TYPE:
SIMPworkspace[i].TYPEy = SIMPworkspace[i].TYPEy + 2
#print SIMPworkspace[i].TYPEx
#print SIMPworkspace[i].TYPEy
SIMPworkspace.sort(key=lambda x: x.TYPEy, reverse=True)
SIMPworkspace.sort(key=lambda x: x.TYPEx)
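# After these two stable sorts the SIMPs are ordered by start offset (TYPEx) ascending and,
# for equal starts, by end offset (TYPEy) descending, so enclosing constructs are processed
# before the constructs nested inside them.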
# for i in range(len(SIMPworkspace)):
# print "\nSIMP " + str(i) + " :"
# print SIMPworkspace[i].TYPE
# print SIMPworkspace[i].TYPEx
# print SIMPworkspace[i].TYPEy
# print "\n"
for i in range(len(SIMPworkspace)):
Sentence1.SIMP.append(Simp())
Sentence1.SIMP[i]=copy.deepcopy(SIMPworkspace[i])
# Add the original sentence (Sentence1) to memory as the first object to be analysed
MEMORIAB.append(Sentence())
MEMORIAB[0]=copy.deepcopy(Sentence1)
# One pass of loop A for each distinct SIMP in Sentence1
numSimp=len(Sentence1.SIMP)
s = 0
# loop A
while s < numSimp :
#print "\nEntered loop A, pass " + str(s)
#print "Analysing all SIMPs of type: " + MEMORIAB[0].SIMP[s].TYPE
#Loop B runs once for each element currently in MEMORIAB
numMEM = len(MEMORIAB)
t = 0
# loop B
while t < numMEM :
#print "Entered loop B, pass " + str(t)
#Enter only if the sentence has not been analysed before (FLAG==True) and the SIMP text is present in the sentence.
#print "CONDITIONS:"
#print "SIMP " + MEMORIAB[0].SIMP[s].TEXT
#print "SIMP " + MEMORIAB[0].SIMP[s].TYPE
#print "MEMB " + str(MEMORIAB[t].FLAG)
#print "MEMB " + MEMORIAB[t].TEXT
if ( MEMORIAB[0].SIMP[s].TEXT in MEMORIAB[t].TEXT ) and ( MEMORIAB[t].FLAG == True ):
MEMORIAB[t].FLAG = False
#print "False to: " + MEMORIAB[t].TEXT
#print "Entered the conditional"
#Simplification rules
if ( 'coordination' in MEMORIAB[t].SIMP[s].TYPE ) and ( not ('sentence coordination' in MEMORIAB[t].SIMP[s].TYPE ) ) :
#print "Applying coordination rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
c = c - 1
if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'parenthesis' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying parenthesis rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
#print "entered the parenthesis while loop"
c = c - 1
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT + ' )'
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print "replaced: " + replaced
#print "replacer: " + replacer
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'apposition' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying apposition rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
#print "entered the while loop"
c = c - 1
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print "replaced: " + replaced
#print "replacer: " + replacer
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print "Copied to memory: " + MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'member-collection' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying member-collection rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
#print "entered the member-collection while loop"
c = c - 1
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print "replaced: " + replaced
#print "replacer: " + replacer
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print "Copied to memory: " + MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'sentence coordination' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying sentence-coordination rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
c = c - 1
if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
#replace the whole content of the temporary TEXT/TREE with the content of the coordinated clause
#replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print replacer
TEMPORALES[tt].TEXT = replacer
## if the sentence does not end in a period or !
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'full relative clause' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying RelCl rule"
TEMPORALES = []
c = 0
tt = 0
while c < 2 :
if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
if MEMORIAB[0].TEXT[MEMORIAB[0].TEXT.index(TEMPORALES[tt].SIMP[s].TEXT)+len(TEMPORALES[tt].SIMP[s].TEXT)-1] == ',':
replaced = MEMORIAB[0].SIMP[s].TEXT + ',' #possible error; if so, try ' ,' instead
else:
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
indice3 = indice1
indice4 = indice2
if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
TEMPORALES[tt].TEXT = copy.deepcopy(MEMORIAB[0].TEXT[indice3:indice4]+' '+MEMORIAB[0].TEXT[indice1:indice2] ) ##
cad3 = MEMORIAB[0].TEXT[indice1:indice2]
cad4 = cad3.split()
if (cad4[0]+'_WDT') in frase.POS:
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(' '+cad4[0],'')
tt = tt + 1
c = c + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'reduced relative clause' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying RelCl rule"
TEMPORALES = []
c = 0
tt = 0
while c < 2 :
if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#subj = MEMORIAB[0].TEXT[indice1:(indice2+1)]
subj = MEMORIAB[0].TEXT[indice1:(indice2)]
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #the referent must come before the clause so the order is correct
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
TEMPORALES[tt].TEXT = subj + " _ " + replacer #at this point, inserting the copula would require number and tense information
tt = tt + 1
c = c + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'hypernymy' in MEMORIAB[t].SIMP[s].TYPE:
print "**hypernymy detected**"
#print "True to: " + MEMORIAB[t].TEXT
MEMORIAB[t].FLAG = True
else:
print "Error: Unknown simplification construct detected."
#print "True to: " + MEMORIAB[t].TEXT
MEMORIAB[t].FLAG = True
t = t + 1
s = s + 1
#CONDITIONS FOR WRITING THE SIMPLIFICATIONS TO THE OUTPUT TEXT FILES
#print "Sentence simplified. New sentences generated:"
for i in range(len(MEMORIAB)):
#the FLAG is reused to mark the final sentences
MEMORIAB[i].FLAG = True
for j in range(len(MEMORIAB[0].SIMP)):
#NOTE: if a new simplifiable construct is added, also add it to this list:
if ( ('member-collection' in MEMORIAB[0].SIMP[j].TYPE) or ('apposition' in MEMORIAB[0].SIMP[j].TYPE) or ('coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('parenthesis' in MEMORIAB[0].SIMP[j].TYPE) or ('sentence coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('full relative clause' in MEMORIAB[0].SIMP[j].TYPE) or ('reduced relative clause' in MEMORIAB[0].SIMP[j].TYPE) ) and (MEMORIAB[0].SIMP[j].TEXT in MEMORIAB[i].TEXT) :
MEMORIAB[i].FLAG = False
##fix the numbering of the output files, e.g. 011
arcsalnum = 0
for i in range(len(MEMORIAB)):
if MEMORIAB[i].FLAG == True:
arcsalnum = arcsalnum + 1
length = len(str(arcsalnum))
#print('{:03d}'.format(arcsalnum)) # python >= 2.7 + python3
# >>> n = '4'
#>>> print n.zfill(3)
arcsalnum = 0
for i in range(len(MEMORIAB)):
if MEMORIAB[i].FLAG == True:
arcsalnum = arcsalnum + 1
print MEMORIAB[i].TEXT #Output
archSalNombre = sys.argv[2]
archSalNombre=archSalNombre[:-4] + "-" + (str(arcsalnum)).zfill(length) + '.alg'
archivoSalida=open(archSalNombre,"w")
archivoSalida.write(MEMORIAB[i].TEXT+"\n")##
archivoSalida.close()
#WRITE OUTPUT FILE PATH TO INDEX (Arg 3)
index_name = sys.argv[3]
index = open(index_name, "a+")
archSalNombreforIndex=archSalNombre + "\n"
index.write(archSalNombreforIndex)
index.close()
else:
print frase.TEXT #----Output when there were no simplifiable constructs
archSalNombre = sys.argv[2]
archSalNombre = archSalNombre[:-4] + ".alg"
archivoSalida = open(archSalNombre,"a+")
archivoSalida.write(frase.TEXT+"\n")##
archivoSalida.close()
#WRITE OUTPUT FILE PATH TO INDEX (Arg 3)
index_name = sys.argv[3]
index = open(index_name, "a+")
archSalNombreforIndex=archSalNombre + "\n"
index.write(archSalNombreforIndex)
index.close()
#END