Showing 31 changed files with 4658 additions and 0 deletions
README.md
0 → 100644
# Bacterial regulatory interaction extraction system

## Prerequisites
1. The input file must be tokenized and sentence-split, as in the example below.
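
A tokenized, sentence-split input holds one sentence per line with space-separated tokens (the line below is illustrative only; the exact tokenization depends on your preprocessing):

```
The FNR protein activates the expression of the dmsABC operon .
```

The transformed input contains the same sentences in word|lemma|pos format (see the parameter comments in automatic-extraction-ris-gcs.sh).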

## Run
### Several files
Set the filenames and paths in run-several-files.sh. A single file can also be processed directly with the main script, as sketched below.
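
A minimal direct invocation of the main script (all paths here are illustrative placeholders; the 13 positional arguments follow the script's usage message):

```bash
# 1 word file, 2 transformed file, 3 OpenIE work file, 4 output path,
# 5 simplify (Y/N), 6 separate deverbal (Y/N), 7 entity filter,
# 8-9 reference path and file, 10-11 evaluation path and file,
# 12 TF-synonyms dictionary, 13 tsv file with sections and sentences
./automatic-extraction-ris-gcs.sh \
  features/article.tra.word.txt \
  transformed/article.tra.txt \
  ri-openie-extraction/ris.txt \
  predicted-ris-gcs \
  Y Y FILT1 \
  reference unused.txt \
  evaluation-reports unused.txt \
  diccionario-STM-LT2-v7.0.SYNONYMS.json \
  sentences.tsv
```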

## Acknowledgments
This work was supported by UNAM-PAPIIT IA203420.
attributive-sentences/.gitignore
0 → 100644
automatic-extraction-ris-gcs.sh
0 → 100755
#!/bin/bash
# Main script for automatic extraction of regulatory interactions

# Parameters
# 1: Path and name of the file with preprocessed sentences in token (word) format
# 2: Path and name of the file with preprocessed sentences in transformed format (word|lemma|pos)
# 3: Path and name of the file for OpenIE processing
# 4: Output path for the a1 and a2 files with RIs and GCs
# 5: Simplify Y/N?
# 6: Separate verbal and deverbal Y/N?
# 7: Filter for sentences containing entities. FILT1 = (GENE OR TU) AND TF
# 8: Path with the reference a1 and a2 files (true RIs and GCs)
# 9: Reference file (true RIs and GCs)
# 10: Path to save the evaluation file
# 11: File to save the results of the evaluation against the reference
# 12: File with TF synonyms

# RUN EXTRACTION FOR L&C STM
# ./automatic-extraction-ris-gcs.sh
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# Y Y FILT1
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference
# unused.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports
# unused.txt
# diccionario-STM-LT2-v7.0.SYNONYMS.json
# 1>uno-STM-LC.txt
# 2>dos-STM-LC.txt
# ./automatic-extraction-ris-gcs.sh /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs Y Y FILT1 /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference unused.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports unused.txt diccionario-STM-LT2-v7.0.SYNONYMS.json 1>uno-STM-LC.txt 2>dos-STM-LC.txt

# Some help
# Filename without path: filename=$(basename "$fullfile")
# Filename extension: extension="${filename##*.}"
# Filename without extension: filename="${filename%.*}"
# For the "argument list too long" error with many files: find . -print0 | xargs -0 grep AcrR

PATH_TO_CORENLP=/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09
DICC_PATH=/home/cmendezc/terminologicalResources
ISIMP_PATH=/home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/sentence-simplification/isimp_v2

SCRIPT_PATH=$(cd `dirname $0` && pwd)
INPUT_PATH=$1
INPUT_PATH_TRANS=$2
OUTPUT_FILE=$3
OUTPUT_PATH=$4
INPUT_NAME_EXT=$(basename "$INPUT_PATH")
INPUT_NAME="${INPUT_NAME_EXT%.*}"
# Simplify sentences?
SIMPLIFY=$5
# Separate sentences with deverbal effect?
DEVERBAL_SEPARATOR=$6
FILT=$7
TRUE_PATH=$8
TRUE_FILE=$9
PATH_EVAL=${10}
FILE_EVAL=${11}
DICC_SYNON=${12}
# CFMC 2022-03-09: tsv file with section, sentence id, sentence (extracted from jsonpdf)
TSV_PATH=${13}

# Validate arguments
if [[ ! ("$#" == 13 ) ]]; then
    echo 'Usage: ./automatic-extraction-ris-gcs.sh <inputPath_wordFile>
    <inputPath_taggedFile> <outputFile> <outputPath> <simplify?> <deverbal_detector?>
    <filter> <true_path> <true_file> <path_evaluation_report> <file_evaluation_report>
    <dictionary_TFs_synonyms> <path_tsv_file>'
    exit 1
fi

echo "********** SELECTED PARAMETERS **********"
echo "INPUT PATH: $INPUT_PATH"
echo "INPUT PATH TRANSFORMED FILE: $INPUT_PATH_TRANS"
echo "OUTPUT FILE: $OUTPUT_FILE"
echo "OUTPUT PATH: $OUTPUT_PATH"
echo "SIMPLIFY SENTENCES? $SIMPLIFY"
echo "SEPARATE DEVERBAL SENTENCES? $DEVERBAL_SEPARATOR"
echo "FILTER SENTENCES WITH ENTITIES? $FILT"
echo "REFERENCE (TRUE) PATH: $TRUE_PATH"
echo "REFERENCE (TRUE) FILE: $TRUE_FILE"
echo "PATH EVALUATION REPORT: $PATH_EVAL"
echo "FILE EVALUATION REPORT: $FILE_EVAL"
echo "DICTIONARY OF SYNONYMS OF TFS: $DICC_SYNON"

echo "********** SELECTED PROCESSES **********"
CLEAN_OUTPUT=FALSE
echo " Clean output paths: $CLEAN_OUTPUT"

FILTER=TRUE
echo " Filter sentences: $FILTER"

CLEAN=TRUE
echo " Clean sentences for iSimp: $CLEAN"

SEPARATE=TRUE
echo " Separate sentences for iSimp: $SEPARATE"

SIMPLI=TRUE
echo " Simplify sentences: $SIMPLI"

DEVERBAL=TRUE
echo " Separate deverbal and verbal sentences: $DEVERBAL"

DEVTAG=TRUE # Needs DEVERBAL=TRUE
echo " Tag sentences to separate deverbal and verbal sentences: $DEVTAG"

DEVSEPAR=TRUE # Needs DEVERBAL=TRUE
echo " Do separate deverbal and verbal sentences: $DEVSEPAR"

EXTDEVERBAL=TRUE
echo " Extract RI deverbal: $EXTDEVERBAL"

OPENIE=TRUE
echo " OpenIE triplet extraction: $OPENIE"

EXTOPENIE=TRUE
echo " Extract RI verbal: $EXTOPENIE"

EXTATTRIB=TRUE
echo " Extract RI attributive: $EXTATTRIB"

EXTAUTOREG=TRUE
echo " Extract RI autoregulation: $EXTAUTOREG"

EXTGC=FALSE
echo " Extract growth conditions: $EXTGC"

EVAL=FALSE
echo " Evaluate extraction: $EVAL"

EVALGC=FALSE
echo " Evaluate growth condition extraction: $EVALGC"

#########################
# Cleaning output paths #
#########################
if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
    if [ -z "$(ls -A $OUTPUT_PATH/complete-ris/)" ]; then :
    else
        #echo "Not Empty"
        # Original: rm $OUTPUT_PATH/complete-ris/*
        find $OUTPUT_PATH/complete-ris -maxdepth 1 -name '*.*' -delete
    fi
    if [ -z "$(ls -A $OUTPUT_PATH/incomplete-ris/)" ]; then :
    else
        #echo "Not Empty"
        # Original: rm $OUTPUT_PATH/incomplete-ris/*
        find $OUTPUT_PATH/incomplete-ris -maxdepth 1 -name '*.*' -delete
    fi
fi # if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
#################
# Preliminaries #
#################
# Clone and update the simplification pipeline
#if [ ! -d "./sentence-simplification" ]
#  then
#    echo Downloading sentence simplifier...
#    git clone https://github.com/ezojg/sentence-simplification
#  else
#    cd ./sentence-simplification
#    git pull origin master
#    cd ..
#fi
# Check for iSimp
#if [ ! -d "./sentence-simplification/isimp_v2" ]
#  then
#    echo ERROR: ./sentence-simplification/isimp_v2 not found. Please manually copy iSimp to that path.
#    exit 1
#fi

if [ "$FILTER" = "TRUE" ]; then
echo "********** FILTER SENTENCES **********"
###################################################
# Filter sentences with entities of interest      #
# and collect attributive examples ArgP-regulated #
###################################################
# INPUT:
# 1) --inputFileWord $INPUT_PATH input file of feature 'word'
# 2) --inputFileTrans $INPUT_PATH_TRANS transformed input file
# 3) --outputPath $SCRIPT_PATH/filtered-sentences
# 4) --outputFile filtered-sentences.txt output file
# 5) --filter $FILT
#    FILT1: (GENE OR TU) AND TF
#    FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath $SCRIPT_PATH/attributive-sentences path for attributive cases: ArgP-regulated genes
# 7) --attrFile attributive-sentences.txt file for attributive cases: ArgP-regulated genes
# $DICC_PATH/normalized_Effects.json

cd $SCRIPT_PATH
if [ -z "$(ls -A ./filtered-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./filtered-sentences/*
fi
if [ -z "$(ls -A ./attributive-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./attributive-sentences/*
fi
if [ -z "$(ls -A ./autoregulation-sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./autoregulation-sentences/*
fi
# CFMC 2022-03-09: To update tsv file with filtered sentences
# python3.4 $SCRIPT_PATH/sentence-filter.py --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
python3.4 $SCRIPT_PATH/sentence-filter_v02.py --tsvPath $TSV_PATH --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
fi # if [ "$FILTER" = "TRUE" ]; then

if [ "$CLEAN" = "TRUE" ]; then
echo "********** CLEAN SENTENCES **********"
#############################
# Clean sentences for iSimp #
#############################
# INPUT - PREVIOUS OUTPUT: filtered sentences $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt
# output path and file $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
if [ -z "$(ls -A ./format/sanitized_sentences/)" ]; then :
else
    #echo "Not Empty"
    rm ./format/sanitized_sentences/*
fi
# Original Daniel: python2 $SCRIPT_PATH/format/regex-before.py $INPUT_PATH $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
python2 $SCRIPT_PATH/format/regex.py $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
fi # if [ "$CLEAN" = "TRUE" ]; then

if [ "$SEPARATE" = "TRUE" ]; then
echo "********** SEPARATE SENTENCES **********"
################################
# Separate sentences for iSimp #
################################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/sanitized_sentences/$l
# output path and file $SCRIPT_PATH/format/split_sentences/$BARE_NAME
cd $SCRIPT_PATH
if [ -z "$(ls -A ./format/split_sentences/)" ]; then :
else
    rm ./format/split_sentences/*
fi
cd ./format/sanitized_sentences
for l in $(\ls $INPUT_NAME*)
do
    # echo $l
    BARE_NAME=$(echo $l | cut -f 1 -d '.')
    BARE_NAME+="_"
    # Use the digit count of the file's line total as the numeric-suffix width for split
    LENGTH="$(wc -l < $l)"
    LENGTH="$(echo "${#LENGTH}")"
    # One sentence (line) per .spt file
    split -a $LENGTH -d -l 1 --additional-suffix=.spt $SCRIPT_PATH/format/sanitized_sentences/$l $SCRIPT_PATH/format/split_sentences/$BARE_NAME
done
fi # if [ "$SEPARATE" = "TRUE" ]; then

if [ "$SIMPLI" = "TRUE" ]; then
echo "********** SIMPLIFY SENTENCES **********"
######################
# Simplify sentences #
######################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/split_sentences
# output file $OUTPUT_FILE
# path to iSimp $ISIMP_PATH
# CALL: ./sentence-simplification/sentence-simplification-main.sh
# CALL: $ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
# CALL: $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
# $OUTPUT_INDEX_FILE_PATH = $OUTPUT_FILE
# OUTPUT: simplified sentences in path ./algorithm_sentences

# while true; do
#   read -p "Do you wish to simplify sentences? [Y/N]: " yn
#   case $yn in
#     [Yy]* ) SIMP=1; break;;
#     [Nn]* ) SIMP=0; break;;
#     * ) echo "Please answer yes [Y] or no [N].";;
#   esac
# done
case $SIMPLIFY in
    [Yy]* )
        SIMP=1
        ;;
    [Nn]* )
        SIMP=0
        ;;
    * )
        # Default: simplify
        SIMP=1
        ;;
esac
cd $SCRIPT_PATH
if [ $SIMP == 1 ]
    then # WITH SIMPLIFICATION
        echo "********** YES SIMPLIFY SENTENCES **********"
        # Copy file to sentence-simplification
        #FILE_NAME=$(basename "$INPUT_PATH")
        # Call the simplification pipeline AND create a file with the paths of the simplified sentences
        ./sentence-simplification/sentence-simplification-main.sh $SCRIPT_PATH/format/split_sentences $OUTPUT_FILE $ISIMP_PATH
        #echo "input: $SCRIPT_PATH/format/split_sentences -- output: $OUTPUT_FILE"
        #echo "Sentences simplified. Paths to simplified sentences saved in $OUTPUT_FILE"
    else # WITHOUT SIMPLIFICATION
        echo "********** NO SIMPLIFY SENTENCES **********"
        if [ -z "$(ls -A ./sentence-simplification/algorithm_sentences/)" ]; then :
        else
            #echo "Not Empty"
            rm ./sentence-simplification/algorithm_sentences/*
        fi
        ls $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE
        cp $SCRIPT_PATH/format/split_sentences/* $SCRIPT_PATH/sentence-simplification/algorithm_sentences
        #echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi
fi # if [ "$SIMPLI" = "TRUE" ]; then

if [ "$DEVERBAL" = "TRUE" ]; then
echo "********** SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
######################
# Deverbal separator #
######################
# $PATH_TO_CORENLP
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/sentence-simplification/algorithm_sentences
# output path $SCRIPT_PATH/deverbal-separator/separated_sentences
# $DICC_PATH
# $DEVTAG POS tagging sentences
# $DEVSEPAR Do separate sentences
# CALL: java -cp "$PATH_TO_CORENLP/*"
# $SCRIPT_PATH/filter.py
# OUTPUT: sentences separated into two paths according to verbal/deverbal effect

case $DEVERBAL_SEPARATOR in
    [Yy]* )
        DEVSEP=1
        ;;
    [Nn]* )
        DEVSEP=0
        ;;
    * )
        # Default: use the deverbal separator
        DEVSEP=1
        ;;
esac
if [ $DEVSEP == 1 ]
    then # WITH DEVERBAL SEPARATOR

        #if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/)" ]; then :
        #else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/*
            # find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -maxdepth 1 -name '*.vrb' -delete
        #fi
        #if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/)" ]; then :
        #else
            #echo "Not Empty"
            # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*
            # find $SCRIPT_PATH/deverbal-separator/separated_sentences/dev -maxdepth 1 -name '*.dev' -delete
        #fi

        echo "********** YES SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
        # Original Daniel 2018-12-06: ./deverbal-separator/separator.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
        ./deverbal-separator/separator-v02.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
    else # WITHOUT DEVERBAL SEPARATOR
        echo "********** NO SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
        ls $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE
        #echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi # [ $DEVSEP == 1 ]
fi # if [ "$DEVERBAL" = "TRUE" ]; then

if [ "$EXTDEVERBAL" = "TRUE" ]; then
echo "********** EXTRACT RI DEVERBAL **********"
#######################
# Extract RI deverbal #
#######################
# INPUT: deverbal files $(dirname ${file}) $(basename ${file})
# output path $OUTPUT_PATH $(basename ${file%.*})
# $DICC_PATH/names_EFFECT_ONTOGENE.txt
# $DICC_PATH/names_GENE.txt
# $DICC_PATH/names_GENE_ONTOGENE.txt
# $DICC_PATH/names_GENE_SYN.txt
# $DICC_PATH/names_TU.txt
# $DICC_PATH/names_TU_ONTOGENE.txt
# $DICC_PATH/names_TF_1grams.txt
# $DICC_PATH/names_TF_2grams.txt
# $DICC_PATH/names_TF_3grams.txt
# $DICC_PATH/names_TF_4grams.txt
# $DICC_PATH/names_TF_5Moregrams.txt
# $DICC_PATH/names_TF_ONTOGENE.txt
# $DICC_PATH/normalized_Effects.json
# OUTPUT: standoff files with RIs
# PATH ALREADY TAGGED ENTITIES: $SCRIPT_PATH/filtered-sentences
# FILE ALREADY TAGGED ENTITIES: filtered-sentences.ents.json
    for file in $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*.*
    do
        #python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-OriginalDaniel.py $file $OUTPUT_PATH/$(basename ${file%.*}) $DICC_PATH/names_EFFECT_ONTOGENE.txt $DICC_PATH/names_GENE.txt $DICC_PATH/names_GENE_ONTOGENE.txt $DICC_PATH/names_GENE_SYN.txt $DICC_PATH/names_TU.txt $DICC_PATH/names_TU_ONTOGENE.txt $DICC_PATH/names_TF_1grams.txt $DICC_PATH/names_TF_2grams.txt $DICC_PATH/names_TF_3grams.txt $DICC_PATH/names_TF_4grams.txt $DICC_PATH/names_TF_5Moregrams.txt $DICC_PATH/names_TF_ONTOGENE.txt
        #echo "Dir file: $(dirname ${file})"
        #echo "File: $(basename ${file})"
        #echo "OUTPUT_PATH: $OUTPUT_PATH"
        #echo "File: $(basename ${file%.*})"
        echo "Dir and files: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        #python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v02.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
        python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v03.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
    done
fi # if [ "$EXTDEVERBAL" = "TRUE" ]; then

if [ "$OPENIE" = "TRUE" ]; then
echo "********** OPENIE TRIPLET EXTRACTION **********"
    ########################
    # OpenIE RI extraction #
    ########################
    # Join the verbal sentences into one file list for OpenIE extraction
    # Error: /bin/ls: Argument list too long: ls $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE
    echo " Join verbal sentences into file for OpenIE extraction"
    find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -type f -name '*' > $OUTPUT_FILE
    #echo "Deverbal sentences separated. Paths to verbal sentences saved in $OUTPUT_FILE"

    echo " CoreNLP OpenIE..."
    java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.naturalli.OpenIE -filelist $OUTPUT_FILE -triple.strict false -triple.all_nominals true -format reverb > $OUTPUT_FILE.reverb
fi # if [ "$OPENIE" = "TRUE" ]; then

if [ "$EXTOPENIE" = "TRUE" ]; then
    echo "********** OPENIE RI EXTRACTION **********"
    ########################
    # OpenIE RI extraction #
    ########################
    # Replaced Nacho's oie_compress with a program written by CMC to analyze the triplets
    # and keep those that suggest the participants and the effect
    # Paste input and output for fancy printing
    # Original Nacho: echo " Fancy printing..."
    # Original Nacho: > $OUTPUT_FILE.fuzzy
    # Original Nacho: python3 oie_compress.py --oies $OUTPUT_FILE.reverb --op fuzzy --ris $DICC_PATH/normalized_Effects.json --out $OUTPUT_FILE.fuzzy
    #
    # --inputFile $OUTPUT_FILE.reverb file obtained with CoreNLP
    # --outputPath $OUTPUT_PATH
    # --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
    # --diccFile Before: termFilesTag_RIE_GCE_SYSTEM_ECCO.json
    # --diccEffect normalized_Effects.json
    # --format standoff
    # --diccEPAth $DICC_PATH
    # OUTPUT: standoff files with RIs

    # python3.4 $SCRIPT_PATH/ri-openie-extraction.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json --diccEffect normalized_Effects.json --format standoff
    python3.4 $SCRIPT_PATH/ri-openie-extraction-v02.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $SCRIPT_PATH/filtered-sentences --diccFile filtered-sentences.ents.json --diccEffect normalized_Effects.json --diccEPAth $DICC_PATH --format standoff

    # Join into a single file
    # Sort fuzzy
    # Original Nacho: echo " Sort fuzzy..."
    # Gets the effect type
    # Original Nacho: sort $OUTPUT_FILE.fuzzy -o $OUTPUT_FILE.fuzzy
    # Concatenate
    # CMC removed the following lines because simplification was
    # already decided earlier
    #if [ $SIMP == 1 ]
    #then # WITH SIMPLIFICATION
    #  ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    #  awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    #  cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    #  paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    #else # WITHOUT SIMPLIFICATION
    #  ls -l $SCRIPT_PATH/format/split_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    #  awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    #  cat $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE.als
    #  paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    #fi
    # Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    # Original Nacho: echo " Creating ils, fls and als files..."
    # Original Nacho: if [ $DEVSEP == 1 ]
    # Original Nacho: then # WITH DEVERBAL SEPARATOR
    # Original Nacho:   ls -l $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho:   awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho:   cat $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE.als
    # Original Nacho: else # WITHOUT DEVERBAL SEPARATOR
    # Original Nacho:   ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
    # Original Nacho:   awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
    # Original Nacho:   cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
    # Original Nacho: fi
    # Original Nacho: echo " Paste merger..."
    # Original Nacho: paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
    # Original Nacho: echo " Create dsp file..."
    # Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
    # rm $(dirname "$OUTPUT_FILE")/*.fls
    # rm $(dirname "$OUTPUT_FILE")/*.ils
    # rm $(dirname "$OUTPUT_FILE")/*.als
    #rm $SCRIPT_PATH/*.merger
    #rm $SCRIPT_PATH/*.reverb
    #rm $SCRIPT_PATH/*.fuzzy
fi # if [ "$EXTOPENIE" = "TRUE" ]; then

if [ "$EXTATTRIB" = "TRUE" ]; then
    echo "********** ATTRIBUTIVE RI EXTRACTION **********"
    #############################
    # Attributive RI extraction #
    #############################
    # Attributive RI extraction, e.g., "ArgP-regulated genes argP, argT"
    #
    # --inputPath $SCRIPT_PATH/attributive-sentences
    # --outputPath $OUTPUT_PATH
    # --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
    # --diccEffect normalized_Effects.json
    # OUTPUT: standoff files with RIs

    for file in $SCRIPT_PATH/attributive-sentences/*.*
    do
        echo "Dir file: $(dirname ${file})"
        echo "File: $(basename ${file})"
        # echo "OUTPUT_PATH: $OUTPUT_PATH"
        # echo "File: $(basename ${file%.*})"
        # echo "All: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        # If the glob matched nothing, the literal pattern "*.*" comes through
        if [ "$(basename ${file})" = "*.*" ]; then
            echo "No attributive sentences found"
        else
            python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
        fi
    done

fi # if [ "$EXTATTRIB" = "TRUE" ]; then

if [ "$EXTAUTOREG" = "TRUE" ]; then
    echo "********** AUTOREGULATION RI EXTRACTION **********"
    ################################
    # Autoregulation RI extraction #
    ################################
    # Autoregulation RI extraction, e.g., "ArgP protein represses its own synthesis"
    #
    # --inputPath $SCRIPT_PATH/autoregulation-sentences
    # --outputPath $OUTPUT_PATH
    # --diccPath $DICC_PATH
    # --diccEffect normalized_Effects.json
    # OUTPUT: standoff files with RIs

    for file in $SCRIPT_PATH/autoregulation-sentences/*.*
    do
        echo "Dir file: $(dirname ${file})"
        echo "File: $(basename ${file})"
        # echo "OUTPUT_PATH: $OUTPUT_PATH"
        # echo "File: $(basename ${file%.*})"
        # echo "All: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
        # If the glob matched nothing, the literal pattern "*.*" comes through
        if [ "$(basename ${file})" = "*.*" ]; then
            echo "No autoregulation sentences found"
        else
            python3 $SCRIPT_PATH/ri-autoregulation-extraction-v01.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
        fi
    done

fi # if [ "$EXTAUTOREG" = "TRUE" ]; then

if [ "$EXTGC" = "TRUE" ]; then
    echo "********** EXTRACT GROWTH CONDITIONS **********"
    #############################
    # Extract growth conditions #
    #############################
    python3.4 $SCRIPT_PATH/extract-gcs/extract-gcs-regex.py --inputPath $OUTPUT_PATH/complete-ris --outputPath $OUTPUT_PATH/complete-ris --termPath /home/cmendezc/terminologicalResources
    #python3 ./GCs-regex-before.py ./ejemplo_11.spt
    #/home/elwe/Documents/prueba3/RIE_reordenado/RI-searcher/GC/ejemplo_11.spt ./ejemplo_11.a2
    #./names_GC_ECCO_1grams.txt ./names_GC_ECCO_2grams.txt ./names_GC_ECCO_3grams.txt
    #./names_GC_ECCO_4grams.txt ./names_GC_ECCO_5Moregrams.txt
fi # if [ "$EXTGC" = "TRUE" ]; then

if [ "$EVAL" = "TRUE" ]; then
    echo "********** EVALUATE EXTRACTION **********"
    if [ "$EVALGC" = "TRUE" ]; then
        echo "********** EVALUATE GROWTH CONDITION EXTRACTION **********"
        python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON --evaluateGCs
    else
        echo "********** EVALUATE WITHOUT GROWTH CONDITION EXTRACTION **********"
        python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON
    fi # if [ "$EVALGC" = "TRUE" ]; then
fi # if [ "$EVAL" = "TRUE" ]; then
autoregulation-sentences/deleteme.txt
0 → 100644
File mode changed
deverbal-separator/filter-v03.py
0 → 100644
# import fileinput
# import regex as re
# from regex import finditer
import sys
import json

if ( len( sys.argv ) != 3 ):
    # Original Daniel: sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <EFFs_dictionary> \n" )
    sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <normalized_Effects> \n")
    sys.stderr.flush();

#   exit( 2 );

# READ INPUT FILE
text_file = open( sys.argv[1], "r" )
dato = text_file.read()
text_file.close()

# READ DICTIONARY

# Loading normalized effects
# print('Loading normalized effects...')
with open(sys.argv[2]) as diccFile:
    hashNormalizedEffects = json.load(diccFile)
DICC = list(hashNormalizedEffects.keys())

# Original Daniel: text_file = open( sys.argv[2], "r" )
# Original Daniel: DICC = text_file.read().splitlines()
# Original Daniel: text_file.close()


# Declare variables
is_dev = False
is_vrb = False


# DICC
# 2018-11-30 CMC: We separated nouns and only the past participle for deverbal processing,
# and all other verb forms as verbal
# VRB: VB  verb, base form: think
# VRB: VBZ verb, 3rd person singular present: she thinks
# VRB: VBP verb, non-3rd person singular present: I think
# VRB: VBD verb, past tense: they thought
# DEV: VBN verb, past participle: a sunken ship
# VRB: VBG verb, gerund or present participle: thinking is fun
# Tokens come as word/TAG, e.g. extend/VBP
for i in range(len(DICC)):
    # print(DICC[i])
    for token in dato.split():
        word = token[:token.find("/")]
        tag = token[token.find("/")+1:]
        # print("word: {}".format(word))
        # print("tag: {}".format(tag))
        if (DICC[i] in word) and (("NN" in tag) or ("VBN" == tag)):
            is_dev = True
            # print("deverbal: " + word)
        # Note: "VB" in tag also matches VBN, so a VBN effect marks the sentence
        # as both deverbal and verbal (it is then copied to both dev/ and vrb/)
        if (DICC[i] in word) and ("VB" in tag):
            is_vrb = True
            # print("verbal: " + word)

# Exit codes signal the result to the calling shell script:
# 11 = both deverbal and verbal, 12 = deverbal only, 13 = verbal only, 10 = neither
if is_dev and is_vrb:
    sys.exit(11)
elif is_dev:
    sys.exit(12)
elif is_vrb:
    sys.exit(13)
else:
    sys.exit(10)
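
The exit codes above communicate the classification to the calling shell; a minimal sketch of a consumer (file names are hypothetical; separator-v02.sh below is the real caller):

```bash
python3 filter-v03.py tagged-line/sentence_01.spt normalized_Effects.json
case $? in
  11) echo "deverbal and verbal" ;;
  12) echo "deverbal only" ;;
  13) echo "verbal only" ;;
  10) echo "neither" ;;
esac
```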
deverbal-separator/separator-v02.sh
0 → 100755
#!/bin/bash
# Separates sentences into deverbal (.dev) and verbal (.vrb)

# Original Daniel: PATH_TO_CORENLP=/home/elwe/Documents/temporal/CoreNLP

# Validate arguments
if [[ ! ("$#" == 6 ) ]]; then
    echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
    exit 1
fi

SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Original Daniel: INPUT_PATH=$1 # folder containing the files to separate
# Original Daniel: OUTPUT_PATH=$2
PATH_TO_CORENLP=$1
INPUT_PATH=$2 # folder containing the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
# Tag sentences to separate deverbal and verbal sentences: $DEVTAG
TAG=$5
# Do separate deverbal and verbal sentences: $DEVSEPAR
SEP=$6

if [ $TAG == "TRUE" ]
    then # TAG WITH STANFORD CORENLP

    if [ -z "$(ls -A $SCRIPT_PATH/tagged/)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged/*
        find $SCRIPT_PATH/tagged -maxdepth 1 -name '*.conll' -delete
    fi

    # Added by CMC
    if [ -z "$(ls -A $SCRIPT_PATH/tagged-line/)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged-line/*
        find $SCRIPT_PATH/tagged-line -maxdepth 1 -name '*.spt' -delete
    fi

    for j in $INPUT_PATH/*
    do
        #echo $j
        # Original Daniel: java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.parser.lexparser.LexicalizedParser -writeOutputFiles -retainTMPSubcategories -outputFormat "wordsAndTags" $SCRIPT_PATH/englishPCFG.ser.gz $j
        # Command line: java -cp "/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file datos_0.spt -outputDirectory tagged
        # java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
        # With parse: java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
        java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
    done

    # Original Daniel: mv $INPUT_PATH/*.stp $SCRIPT_PATH/tagged/
    for j in $SCRIPT_PATH/tagged/*
    do
        # Original Daniel: awk 'NF {print $2 "/" $4}' tagged/$j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${j%.spt}"
        filename=$(basename "$j")
        #filename="${filename%.*}"
        awk 'NF {print $2 "/" $4}' $j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${filename%.*}.spt"
        # Original Daniel: mv "$j" "${j%.stp}"
    done
fi # if [ $TAG == "TRUE" ]
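
# Note on the awk conversion above (illustrative row): in the CoNLL output the second
# column is the word and the fourth is the POS tag, so a row such as
# "2  regulates  _  VBZ  ..." becomes the token "regulates/VBZ", and paste joins each
# sentence's tokens into a single line under tagged-line/.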

if [ $SEP == "TRUE" ]
    then # SEPARATE FILES

    # Original Daniel: if [ -z "$(ls -A $OUTPUT_PATH)" ]; then :
    # Modified by Carlos Méndez
    if [ -z "$(ls -A $OUTPUT_PATH/dev)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/dev/*
        find $OUTPUT_PATH/dev -maxdepth 1 -name '*.dev' -delete
    fi

    if [ -z "$(ls -A $OUTPUT_PATH/vrb)" ]; then :
    else
        #echo "Not Empty"
        # Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/vrb/*
        find $OUTPUT_PATH/vrb -maxdepth 1 -name '*.vrb' -delete
    fi

    for j in $SCRIPT_PATH/tagged-line/*
    do
        # Original Daniel: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/names_EFFECT_ONTOGENE.txt
        # CMC 2018-12-04: Without separating verbal forms: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/normalized_Effects.json
        # CMC 2018-12-11: With separating verbal forms: python3 $SCRIPT_PATH/filter-v02.py $j $DICC_PATH/normalized_Effects.json
        # CMC 2018-12-11: Considering only the passive verbal form as deverbal: VBN verb, past participle
        python3 $SCRIPT_PATH/filter-v03.py $j $DICC_PATH/normalized_Effects.json
        VAR=$?
        # filename=${j##*/}
        # inputfile=${filename%.spt}
        # exit

        if [ $VAR == 11 ]; then :
            # contains dev and vrb
            # Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            # Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            #echo "Deverbal and verbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
        elif [ $VAR == 12 ]; then :
            # contains dev
            #echo "Deverbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
            # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
        elif [ $VAR == 13 ]; then :
            # contains vrb
            #echo "Verbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
        elif [ $VAR == 10 ]; then :
            # appears to contain neither dev nor vrb
            echo "Neither deverbal nor verbal"
            cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
            # cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
        fi
    done
fi # if [ $SEP == "TRUE" ]
deverbal-separator/tagged-line/.gitignore
0 → 100644
deverbal-separator/tagged/.gitignore
0 → 100644
evaluate-ris-gcs-standoff-v04.py
0 → 100644
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re

__author__ = 'CMendezC'


# Objective: evaluate predicted interactions in standoff format
# versus true interactions in tab format
# v04: add synonyms of TFs

# Parameters:
# 1) --truePath Path for true interactions
# 2) --trueFile File for true interactions
# 3) --predictedPath Path for predicted interactions
# 4) --outputPath Output path
# 5) --outputFile File for saving results
# 6) --evaluateGCs Evaluate with GCs
# 7) --diccPath Dictionary path
# 8) --diccSynon File with synonyms of TFs

# Output:
# 1) File with TP, FP, FN and the scores Precision, Recall, F1

# Execution:
# python3.4 evaluate-ris-gcs-standoff.py
# --truePath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/analysis-validation-data-sets
# --trueFile ris-analysis-reference.txt
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris-gcs
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/evaluation-reports
# --outputFile evaluation-riegce-system-ris-analysis.txt
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --evaluateGCs

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

def updateHashPredicted(pr, hashP, pm, sF, ef):
    if pr not in hashP:
        hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
        hashP[pr] = hashTemp
    else:
        hashTemp = hashP[pr]
        if pm in hashTemp["pmids"]:
            hashP[pr]["pmids"][pm].append(sF)
        else:
            hashP[pr]["pmids"][pm] = [sF]

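# Illustrative shape of the structure built by updateHashPredicted, keyed by the
# tab-separated RI string (values below are hypothetical): "pmids" maps each PMID to
# the list of sentence files supporting the RI, and "orieff" keeps the original effect.
# hashP = {"ArgP\targO\tactivator": {"orieff": "activates",
#                                    "pmids": {"12345678": ["sent_01.vrb", "sent_07.vrb"]}}}
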
def getSummary(r, hashTemp):
    pmids = 0
    sentences = 0
    orieff = ""
    if r in hashTemp:
        # print("r: {}".format(r))
        orieff = hashTemp[r]["orieff"]
        for pmid in hashTemp[r]["pmids"]:
            pmids += 1
            # print("PMID with sentences: {}".format(pmid))
            for sent in hashTemp[r]["pmids"][pmid]:
                sentences += 1
    else:
        return "WARNING: no data available!"
    return "Articles: {}\tSentences: {}\tOriginal effect: {}".format(pmids, sentences, orieff)


def getDetail(r, hashTemp):
    return_text = ""
    aHash = {}
    if r in hashTemp:
        for pmid in hashTemp[r]["pmids"]:
            # Number of sentences for this PMID
            sentences = len(hashTemp[r]["pmids"][pmid])
            if pmid not in aHash:
                aHash[pmid] = sentences
            else:
                return "WARNING: PMID duplicated!"
    else:
        return "WARNING: no data available!"
    for p, s in sorted(aHash.items(), key=operator.itemgetter(1), reverse=True):
        return_text += "\tPMID {}: {} sentences\n".format(p, s)

    return return_text


def get_standard_name(regSynon):
    reg = ""
    if regSynon in hashSynon:
        reg = hashSynon[regSynon]
    else:
        for syn, std in hashSynon.items():
            if regSynon.startswith(syn):
                reg = regSynon.replace(syn, std, 1)
                break
    return reg

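# Illustrative use (hypothetical synonym entry): with hashSynon = {"Fnr": "FNR"},
# get_standard_name("Fnr") returns "FNR", and get_standard_name("Fnr-dependent")
# returns "FNR-dependent" through the startswith branch.
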
def isCorrect(ripr, listT, rtype):
    # Checks a predicted RI (tab-separated regulator, regulated[, effect]) against the
    # reference list and returns (matched RI, match type):
    #   'Full' - regulator (or its standard name), regulated, and effect coincide
    #   'Start' - the predicted regulator starts with the true one, e.g. ArgP-regulated
    #   'Regulator' - the predicted effect is the generic "regulator" while the
    #                 reference has activator or repressor
    list_ripr = ripr.split('\t')
    regulator = list_ripr[0]
    regulatorStdName = ""
    if use_synonyms:
        regulatorStdName = get_standard_name(regulator)
    for rit in listT:
        # print("RI TRUE: {}".format(rit))
        listRT = rit.split('\t')
        regulatorT = listRT[0]
        regexRegulatorStarts = re.compile(r'(' + regulatorT + r').+')
        if rtype == "ri":
            regulated = list_ripr[1]
            regulatedT = listRT[1]
            if (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT:
                return (rit, 'Full')
            # For cases where the regulator is part of a word, such as ArgP-regulated
            result = regexRegulatorStarts.match(regulator)
            if result:
                # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                regulator = result.group(1)
                if regulator == regulatorT and regulated == regulatedT:
                    return (rit, 'Start')
            else:
                if use_synonyms:
                    result = regexRegulatorStarts.match(regulatorStdName)
                    if result:
                        # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                        regulator = result.group(1)
                        if regulator == regulatorT and regulated == regulatedT:
                            return (rit, 'Start')
        elif rtype == "rief":
            effect = list_ripr[2]
            regulated = list_ripr[1]
            effectT = listRT[2]
            regulatedT = listRT[1]
            # if ripr == "ArgP\ttargets\tregulator":
            #     print("RI-PREDICT: ArgP\ttargets\tregulator")
            #     print(" PREDICT: regulator {} effect {} regulated {}".format(regulator, effect, regulated))
            #     print(" TRUE: regulator {} effect {} regulated {}".format(regulatorT, effectT, regulatedT))
            if (regulator == regulatorT or regulatorStdName == regulatorT) and effect == effectT and regulated == regulatedT:
                return (rit, 'Full')
            elif (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT and effect == "regulator" and (
                    effectT == "activator" or effectT == "repressor"):
                # if ripr == "ArgP\ttargets\tregulator":
                #     print(" Correct RI with regulator: {}".format(ripr))
                # return rit CMC 20181014: I think it should be the predicted one, because otherwise I lose, in the output lists, whether it was correct or not
                return (ripr, 'Regulator')
            else:
                # For cases where the regulator is part of a word, such as ArgP-regulated
                result = regexRegulatorStarts.match(regulator)
                if result:
                    # print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
                    regulator = result.group(1)
                    if regulator == regulatorT and effect == effectT and regulated == regulatedT:
                        return (rit, 'Start')
                    elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
                            effectT == "activator" or effectT == "repressor"):
                        # return rit CMC 20181014: I think it should be the predicted one, because otherwise I lose, in the output lists, whether it was correct or not
                        # except that in this case I use only the regulator
                        # return rit
                        return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
                else:
                    if use_synonyms:
                        result = regexRegulatorStarts.match(regulatorStdName)
                        if result:
                            # Mirror the "ri" branch: compare against the matched prefix of the standard name
                            regulator = result.group(1)
                            if regulator == regulatorT and effect == effectT and regulated == regulatedT:
                                return (rit, 'Start')
                            elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
                                    effectT == "activator" or effectT == "repressor"):
                                # return rit CMC 20181014: I think it should be the predicted one, because otherwise I lose, in the output lists, whether it was correct or not
                                # except that in this case I use only the regulator
                                # return rit
                                return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')

    # CMC 2018-10-14: Review riefgc because it has not been updated
    # elif rtype == "riefgc":
    #     effect = list_ripr[2]
    #     regulated = list_ripr[1]
    #     gc = list_ripr[3]
    #     effectT = listRT[2]
    #     regulatedT = listRT[1]
    #     gcT = listRT[3]
    #     if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
    #         return rit
    #     elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
    #         return rit
    #     else:
    #         # For cases where the regulator is part of a word, such as ArgP-regulated
    #         result = regexRegulatorStarts.match(regulator)
    #         if result:
    #             #print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
    #             regulator = result.group(1)
    #             if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
    #                 return rit
    #             elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
    #                 return rit
    return ('', '')

214 | + | ||
215 | +def get_scores_rules(listTrue, listPredicted, hashTemp, title, ri_type): | ||
216 | + print("Evaluation") | ||
217 | + # print(listPredicted) | ||
218 | + # Precision = Extraídos correctos / Predichos | ||
219 | + # Recall = Extraídos correctos / Referencia | ||
220 | + # F - 1 = 2 * ((Precision * Recall) / (Precision + Recall)) | ||
221 | + correct = 0 | ||
222 | + incorrect = 0 | ||
223 | + # For registering correct and incorrect RIs | ||
224 | + hashPredicted = {} | ||
225 | + # To print output RIs | ||
226 | + hashOutputRIs = {} | ||
227 | + # For registering unrecovered RIs | ||
228 | + hashUnrecovered = {} | ||
229 | + | ||
230 | + predicted = len(listPredicted) | ||
231 | + print("len(listPredicted): {}".format(predicted)) | ||
232 | + reference = len(listTrue) | ||
233 | + # print("Reference: {}".format(reference)) | ||
234 | + | ||
235 | + listRecovered = [] | ||
236 | + for ri_pred in listPredicted: | ||
237 | + print("ri_pred: {}".format(ri_pred)) | ||
238 | + # if ri_pred in hashPredicted: | ||
239 | + # print("WARNING: RI predicted {} duplicated {}".format(ri_pred, hashPredicted[ri_pred])) | ||
240 | + # else: | ||
241 | + # First all predicted RIs are incorrect | ||
242 | + # hashPredicted[ri_pred] = "incorrect" | ||
243 | + # if ri_pred in listTrue: | ||
244 | + # hashPredicted[ri_pred] = "correct" | ||
245 | + # listRecovered.append(ri_pred) | ||
246 | + # correct += 1 | ||
247 | + # continue | ||
248 | + riTrue = '' | ||
249 | + result = isCorrect(ri_pred, listTrue, ri_type) | ||
250 | + riResult = result[0] | ||
251 | + matchType = result[1] | ||
252 | + if riResult != '': | ||
253 | + if riResult not in hashOutputRIs: | ||
254 | + hashOutputRIs[riResult] = "Correct" | ||
255 | + if ri_pred not in hashPredicted: | ||
256 | + hashPredicted[ri_pred] = "correct" | ||
257 | + print("ri_pred {} correct".format(ri_pred)) | ||
258 | + correct += 1 | ||
259 | + # Complete matching or the predicted regulator starts with entity | ||
260 | + if matchType == 'Full' or matchType == 'Start': | ||
261 | + # ri_pred matches with ri_true | ||
262 | + if riResult in listRecovered: | ||
263 | + print("WARNING: riResult {} already in listRecovered".format(riResult)) | ||
264 | + else: | ||
265 | + listRecovered.append(riResult) | ||
266 | + else: | ||
267 | + incorrect += 1 | ||
268 | + if riResult not in hashOutputRIs: | ||
269 | + hashOutputRIs[riResult] = "Incorrect" | ||
270 | + if ri_pred not in hashPredicted: | ||
271 | + hashPredicted[ri_pred] = "incorrect" | ||
272 | + print("ri_pred {} incorrect".format(ri_pred)) | ||
273 | + | ||
274 | + if len(hashPredicted) != predicted: | ||
275 | + print("ERROR: number of predicted RIs mismatch") | ||
276 | + # return | ||
277 | + print("Predicted: {}".format(predicted)) | ||
278 | + print("len(hashPredicted): {}".format(len(hashPredicted))) | ||
279 | + | ||
280 | + cor = 0 | ||
281 | + inc = 0 | ||
282 | + for r, v in hashPredicted.items(): | ||
283 | + if v == "correct": | ||
284 | + cor += 1 | ||
285 | + elif v == "incorrect": | ||
286 | + inc += 1 | ||
287 | + if cor != correct: | ||
288 | + print("ERROR: number of correct RIs mismatch") | ||
289 | + # return | ||
290 | + if inc != incorrect: | ||
291 | + print("ERROR: number of incorrect RIs mismatch") | ||
292 | + # return | ||
293 | + print("Correct: {}".format(correct)) | ||
294 | + print("Incorrect: {}".format(incorrect)) | ||
295 | + | ||
296 | + unrecovered = 0 | ||
297 | + recovered = 0 # Only when coincide with reference | ||
298 | + # without considering Regulator correct when Activator or Repressor appears in reference | ||
299 | + listRecovered2 = [] | ||
300 | + listUnrecovered = [] | ||
301 | + for ri in listTrue: | ||
302 | + if ri not in listRecovered: | ||
303 | + if ri in listUnrecovered: | ||
304 | + print("WARNING: ri {} already in listUnrecovered".format(ri)) | ||
305 | + else: | ||
306 | + listUnrecovered.append(ri) | ||
307 | + unrecovered += 1 | ||
308 | + else: | ||
309 | + if ri in listRecovered2: | ||
310 | + print("WARNING: ri {} already in listRecovered2".format(ri)) | ||
311 | + else: | ||
312 | + listRecovered2.append(ri) | ||
313 | + recovered += 1 | ||
314 | + | ||
315 | + print("Len listRecovered: {}".format(len(listRecovered))) | ||
316 | + print("Len listRecovered2: {}".format(len(listRecovered2))) | ||
317 | + print("Len listUnrecovered: {}".format(len(listUnrecovered))) | ||
318 | + # if (unrecovered + correct) != reference: | ||
319 | + # print("ERROR: number of unrecovered {} + correct {} and reference {} RIs mismatch".format(unrecovered, correct, reference)) | ||
320 | + # return | ||
321 | + | ||
322 | + print("{}".format(title)) | ||
323 | + print("Predicted: {}".format(predicted)) | ||
324 | + print("Reference: {}".format(reference)) | ||
325 | + print("Unrecovered: {}".format(unrecovered)) | ||
326 | + print("Recovered: {}".format(recovered)) | ||
327 | + | ||
328 | + precision = correct / predicted | ||
329 | + print("Precision = correct / predicted: {}".format(precision)) | ||
330 | + # recall = correct / reference | ||
331 | +    # We calculate recall as a recovery rate, because correct instances are calculated | ||
332 | +    # considering a Regulator correct when an Activator or Repressor appears in the reference | ||
333 | + recall = recovered / reference | ||
334 | + print("Recall = recovered / reference: {}".format(recall)) | ||
335 | + f1 = 2 * ((precision * recall) / (precision + recall)) | ||
336 | + print("F1: {}".format(f1)) | ||
337 | + | ||
338 | + with open(os.path.join(options.outputPath, options.outputFile), mode="a", errors="replace") as oFile: | ||
339 | + oFile.write("{}\n".format(title)) | ||
340 | + oFile.write("Predicted: {}\n".format(predicted)) | ||
341 | + oFile.write("Reference: {}\n".format(reference)) | ||
342 | + oFile.write("Correct: {}\n".format(correct)) | ||
343 | + oFile.write("Incorrect: {}\n".format(incorrect)) | ||
344 | + oFile.write("Unrecovered: {}\n".format(unrecovered)) | ||
345 | + oFile.write("Recovered: {}\n".format(recovered)) | ||
346 | + oFile.write("Precision = correct / predicted: {}\n".format(precision)) | ||
347 | + oFile.write("Recall = recovered / reference: {}\n".format(recall)) | ||
348 | + oFile.write("F1: {}\n".format(f1)) | ||
349 | + oFile.write("Unrecovered instances:\n") | ||
350 | + for r in sorted(listUnrecovered): | ||
351 | + oFile.write("\tUnrecovered: {}\n".format(r)) | ||
352 | + oFile.write("Recovered instances:\n") | ||
353 | + for r in sorted(listRecovered): | ||
354 | + oFile.write("\tRecovered: {}\n".format(r)) | ||
355 | + oFile.write("Incorrect instances:\n") | ||
356 | + for r, v in sorted(hashPredicted.items()): | ||
357 | + if v == "incorrect": | ||
358 | + oFile.write("\tIncorrect: {}\n".format(r)) | ||
359 | + oFile.write("Correct instances:\n") | ||
360 | + for r, v in sorted(hashPredicted.items()): | ||
361 | + if v == "correct": | ||
362 | + oFile.write("\tCorrect: {}\n".format(r)) | ||
363 | + # oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp))) | ||
364 | + # oFile.write("\t{}\n".format(getDetail(r, hashTemp))) | ||
365 | + | ||
366 | + | ||
367 | +def get_scores(listTrue, listPredicted, hashTemp, title): | ||
368 | +    # Precision = correct extracted / extracted | ||
369 | +    # Recall = correct extracted / reference | ||
370 | +    # F1 = 2 * ((Precision * Recall) / (Precision + Recall)) | ||
371 | + print("{}".format(title)) | ||
372 | + # print("listTrue: {}".format(listTrue)) | ||
373 | + # print("listPredicted: {}".format(listPredicted)) | ||
374 | + print("Predicted: {}".format(len(listPredicted))) | ||
375 | + print("Reference: {}".format(len(listTrue))) | ||
376 | + correct = set(listTrue) & set(listPredicted) | ||
377 | + print("Correct: {} ({})".format(len(correct), len(correct) / len(listPredicted))) | ||
378 | + incorrect = set(listPredicted) - set(listTrue) | ||
379 | + print("Incorrect: {} ({})".format(len(incorrect), len(incorrect) / len(listPredicted))) | ||
380 | + unrecovered = set(listTrue) - set(listPredicted) | ||
381 | + print("Unrecovered: {} ()".format(len(unrecovered), len(unrecovered) / len(listTrue))) | ||
382 | + precision = len(correct) / len(listPredicted) | ||
383 | + print("Precision: {}".format(precision)) | ||
384 | + recall = len(correct) / len(listTrue) | ||
385 | + print("Recall: {}".format(recall)) | ||
386 | + f1 = 2 * ((precision * recall) / (precision + recall)) | ||
387 | + print("F1: {}".format(f1)) | ||
388 | + | ||
389 | + with open(os.path.join(options.outputPath, options.outputFile), mode="a") as oFile: | ||
390 | + oFile.write("{}\n".format(title)) | ||
391 | + oFile.write("Predicted: {}\n".format(len(listPredicted))) | ||
392 | + oFile.write("Reference: {}\n".format(len(listTrue))) | ||
393 | + oFile.write("Correct: {}\n".format(len(correct))) | ||
394 | + oFile.write("Incorrect: {}\n".format(len(incorrect))) | ||
395 | + oFile.write("Unrecovered: {}\n".format(len(unrecovered))) | ||
396 | + oFile.write("Precision: {}\n".format(precision)) | ||
397 | + oFile.write("Recall: {}\n".format(recall)) | ||
398 | + oFile.write("F1: {}\n".format(f1)) | ||
399 | + oFile.write("Correct instances:\n") | ||
400 | + for r in sorted(correct): | ||
401 | + oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp))) | ||
402 | + oFile.write("\t{}\n".format(getDetail(r, hashTemp))) | ||
403 | + oFile.write("Incorrect instances:\n") | ||
404 | + for r in sorted(incorrect): | ||
405 | + oFile.write("\t{}\n".format(r)) | ||
406 | + oFile.write("Unrecovered instances:\n") | ||
407 | + for r in sorted(unrecovered): | ||
408 | + oFile.write("\t{}\n".format(r)) | ||
409 | + | ||
410 | + | ||
411 | +if __name__ == "__main__": | ||
412 | + # Parameter definition | ||
413 | + parser = OptionParser() | ||
414 | + parser.add_option("--truePath", dest="truePath", | ||
415 | + help="Path true ris gcs", metavar="PATH") | ||
416 | + parser.add_option("--trueFile", dest="trueFile", | ||
417 | + help="File true ris gcs", metavar="FILE") | ||
418 | + parser.add_option("--predictedPath", dest="predictedPath", | ||
419 | + help="Path predicted ris gcs", metavar="PATH") | ||
420 | + parser.add_option("--outputPath", dest="outputPath", | ||
421 | + help="Output path", metavar="PATH") | ||
422 | + parser.add_option("--outputFile", dest="outputFile", | ||
423 | + help="File for saving results", metavar="FILE") | ||
424 | + parser.add_option("--evaluateGCs", default=False, | ||
425 | + action="store_true", dest="evaluateGCs", | ||
426 | + help="Evaluate GCs?") | ||
427 | + parser.add_option("--diccPath", dest="diccPath", | ||
428 | + help="Path to dictionary", metavar="PATH") | ||
429 | + parser.add_option("--diccSynon", dest="diccSynon", | ||
430 | + help="File with synonyms", metavar="FILE") | ||
431 | + | ||
432 | + (options, args) = parser.parse_args() | ||
433 | + if len(args) > 0: | ||
434 | +        parser.error("Unexpected positional arguments.") | ||
435 | + sys.exit(1) | ||
436 | + | ||
437 | + # Printing parameter values | ||
438 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
439 | + print("Path true ris gcs: " + str(options.truePath)) | ||
440 | + print("File true ris gcs: " + str(options.trueFile)) | ||
441 | + print("Path predicted ris gcs: " + str(options.predictedPath)) | ||
442 | + print("Output path: " + str(options.outputPath)) | ||
443 | + print("File for saving results: " + str(options.outputFile)) | ||
444 | + print("Evaluate GCs: " + str(options.evaluateGCs)) | ||
445 | + print("Path to dictionary: " + str(options.diccPath)) | ||
446 | + print("File with synonyms: " + str(options.diccSynon)) | ||
447 | + | ||
448 | + use_synonyms = False | ||
449 | + hashSynon = {} | ||
450 | +    if options.diccPath is not None and options.diccSynon != "no-synonyms": | ||
451 | + print("***** Using synonyms *****") | ||
452 | + use_synonyms = True | ||
453 | + print('Loading dictionary of synonyms...') | ||
454 | + with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon: | ||
455 | + hashSynon = json.load(diccSynon) | ||
456 | + print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon))) | ||
457 | + | ||
458 | +    listTrueRI = []  # Without effect or gc | ||
459 | +    listTrueRIEF = []  # With effect but without gc | ||
460 | + if options.evaluateGCs: | ||
461 | + listTrueRIEFGC = [] # With effect and gc | ||
462 | + # Read and process Reference | ||
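463 | +    # Reference lines are tab-separated; when a line has more than four columns, | ||
464 | +    # the first two are skipped and regulator/regulated/effect(/gc) are read from | ||
465 | +    # columns 2-5; otherwise they are read from columns 0-3 | ||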
463 | + with open(os.path.join(options.truePath, options.trueFile), mode="r", encoding="utf-8") as iFile: | ||
464 | + for line in iFile: | ||
465 | + line = line.strip('\n') | ||
466 | + if line.startswith("#"): | ||
467 | + continue | ||
468 | + listElem = line.split('\t') | ||
469 | + if len(listElem) > 4: | ||
470 | + regulator = listElem[2] | ||
471 | + regulated = listElem[3] | ||
472 | + effect = listElem[4] | ||
473 | + if options.evaluateGCs: | ||
474 | + gc = listElem[5] | ||
475 | + else: | ||
476 | + regulator = listElem[0] | ||
477 | + regulated = listElem[1] | ||
478 | + effect = listElem[2] | ||
479 | + if options.evaluateGCs: | ||
480 | + gc = listElem[3] | ||
481 | + if effect == "binding": | ||
482 | + effect = "regulator" | ||
483 | + ri = "{}\t{}".format(regulator, regulated) | ||
484 | + if ri not in listTrueRI: | ||
485 | + listTrueRI.append(ri) | ||
486 | + rief = "{}\t{}\t{}".format(regulator, regulated, effect) | ||
487 | + if rief not in listTrueRIEF: | ||
488 | + listTrueRIEF.append(rief) | ||
489 | + if options.evaluateGCs: | ||
490 | + riefgc = "{}\t{}\t{}\t{}".format(regulator, regulated, effect, gc) | ||
491 | + if riefgc not in listTrueRIEFGC: | ||
492 | + listTrueRIEFGC.append(riefgc) | ||
493 | + print(" RIs en referencia antes regulators: {}".format(len(listTrueRI))) | ||
494 | + print(" RIEFs en referencia antes regulators: {}".format(len(listTrueRIEF))) | ||
495 | + if options.evaluateGCs: | ||
496 | + print(" RIEFGCs en referencia antes regulators: {}".format(len(listTrueRIEFGC))) | ||
497 | + | ||
498 | +    # Drop RIs whose effect is "regulator" when the same pair also appears with "activator" or "repressor" | ||
499 | + listRITemp = [] | ||
500 | + for ri in listTrueRIEF: | ||
501 | + listRI = ri.split('\t') | ||
502 | + regulator = listRI[0] | ||
503 | + regulated = listRI[1] | ||
504 | + effect = listRI[2] | ||
505 | + if effect == "regulator": | ||
506 | + tempRIA = "{}\t{}\t{}".format(regulator, regulated, "activator") | ||
507 | + tempRIR = "{}\t{}\t{}".format(regulator, regulated, "repressor") | ||
508 | + if tempRIA in listTrueRIEF or tempRIR in listTrueRIEF: | ||
509 | + pass | ||
510 | + # print("RI regulator matchs RI activator/repressor: {}".format(ri)) | ||
511 | + # listTrueRIEF.remove(ri) | ||
512 | + else: | ||
513 | + # print("Len before: {}".format(len(listRITemp))) | ||
514 | + listRITemp.append(ri) | ||
515 | + # print("Len after: {}".format(len(listRITemp))) | ||
516 | + else: | ||
517 | + listRITemp.append(ri) | ||
518 | + listTrueRIEF = listRITemp | ||
519 | + | ||
520 | + print(" RIEFs en referencia después regulators: {}".format(len(listTrueRIEF))) | ||
521 | + if options.evaluateGCs: | ||
522 | +        for ri in list(listTrueRIEFGC):  # iterate over a copy; calling remove() while iterating skips elements | ||
523 | + listRI = ri.split('\t') | ||
524 | + regulator = listRI[0] | ||
525 | + regulated = listRI[1] | ||
526 | + effect = listRI[2] | ||
527 | + gc = listRI[3] | ||
528 | + if effect == "regulator": | ||
529 | + tempRIGCA = "{}\t{}\t{}\t{}".format(regulator, regulated, "activator", gc) | ||
530 | + tempRIGCR = "{}\t{}\t{}\t{}".format(regulator, regulated, "repressor", gc) | ||
531 | + if tempRIGCA in listTrueRIEFGC or tempRIGCR in listTrueRIEFGC: | ||
532 | + listTrueRIEFGC.remove(ri) | ||
533 | + print(" RIEFGCs en referencia después regulators: {}".format(len(listTrueRIEFGC))) | ||
534 | + | ||
535 | + listPredictedRI = [] | ||
536 | + hashPredictedRI = {} | ||
537 | + listPredictedRIEF = [] | ||
538 | + hashPredictedRIEF = {} | ||
539 | + if options.evaluateGCs: | ||
540 | + listPredictedRIEFGC = [] | ||
541 | + hashPredictedRIEFGC = {} | ||
542 | + hashFiles = {} | ||
543 | + for path, dirs, files in os.walk(options.predictedPath): | ||
544 | + for file in files: | ||
545 | + if file.endswith(".a1"): | ||
546 | + filename = file[:-3] | ||
547 | + if filename not in hashFiles: | ||
548 | + hashFiles[filename] = 1 | ||
549 | + else: | ||
550 | + hashFiles[filename] += 1 | ||
551 | + print("Files: {}".format(len(hashFiles))) | ||
552 | + | ||
553 | + hashEntities = {} | ||
554 | + processedFiles = 0 | ||
555 | + for file in sorted(hashFiles.keys()): | ||
556 | + print("File: {}".format(file)) | ||
557 | + pmid = file[:file.find("_")] | ||
558 | + # print("pmid {}".format(pmid)) | ||
559 | + sentenceFile = file[:file.find("-", file.find("_"))] + ".txt" | ||
560 | + hashEntities = {} | ||
561 | + hashOriginalEffect = {} | ||
562 | + with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File: | ||
563 | + for line in a1File: | ||
564 | + line = line.strip('\n') | ||
565 | + listLine1 = line.split('\t') | ||
566 | + listLine2 = listLine1[1].split(' ') | ||
567 | + entity = listLine2[0] | ||
568 | + idEntity = listLine1[0] | ||
569 | + originalEffect = listLine1[2] | ||
570 | + if entity.startswith("EFFECT"): | ||
571 | + entity = entity[entity.find(".") + 1:] | ||
572 | + print("Entity: {}".format(entity)) | ||
573 | + entity = entity.replace("_dev", "") | ||
574 | + print("Entity without _dev: {}".format(entity)) | ||
575 | + if idEntity not in hashOriginalEffect: | ||
576 | + hashOriginalEffect[idEntity] = originalEffect | ||
577 | + else: | ||
578 | + entity = listLine1[2] | ||
579 | + if idEntity not in hashEntities: | ||
580 | + hashEntities[idEntity] = entity | ||
581 | + print("hashEntities: {}".format(hashEntities)) | ||
582 | + | ||
583 | + with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File: | ||
584 | + for line in a2File: | ||
585 | + # print("Line a2: {}".format(line)) | ||
586 | + # R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4 | ||
587 | + line = line.strip('\n') | ||
588 | + listLine1 = line.split('\t') | ||
589 | + listLine2 = listLine1[1].split(' ') | ||
590 | + regulator = listLine2[2] | ||
591 | + regulator = regulator[regulator.find(":") + 1:] | ||
592 | + regulated = listLine2[1] | ||
593 | + regulated = regulated[regulated.find(":") + 1:] | ||
594 | + effect = listLine2[0] | ||
595 | + effect = effect[effect.find(".") + 1:] | ||
596 | + # print("effect: {}".format(hashEntities[effect])) | ||
597 | + # if hashEntities[effect] == "binding": | ||
598 | + # continue | ||
599 | + if options.evaluateGCs: | ||
600 | + gc = listLine2[3] | ||
601 | + gc = gc[gc.find(":") + 1:] | ||
602 | + | ||
603 | + pri = "{}\t{}".format(hashEntities[regulator], hashEntities[regulated]) | ||
604 | + if pri not in listPredictedRI: | ||
605 | + listPredictedRI.append(pri) | ||
606 | + updateHashPredicted(pri, hashPredictedRI, pmid, sentenceFile, None) | ||
607 | + | ||
608 | + prief = "{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], hashEntities[effect]) | ||
609 | + print("prief: {}".format(prief)) | ||
610 | + if prief not in listPredictedRIEF: | ||
611 | + listPredictedRIEF.append(prief) | ||
612 | + updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect]) | ||
613 | + | ||
614 | + if options.evaluateGCs: | ||
615 | + priefgc = "{}\t{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], | ||
616 | + hashEntities[effect], hashEntities[gc]) | ||
617 | + if priefgc not in listPredictedRIEFGC: | ||
618 | + listPredictedRIEFGC.append(priefgc) | ||
619 | + updateHashPredicted(priefgc, hashPredictedRIEFGC, pmid, sentenceFile, hashOriginalEffect[effect]) | ||
620 | + processedFiles += 1 | ||
621 | + | ||
622 | + print("Processed files: {}".format(processedFiles)) | ||
623 | + with open(os.path.join(options.outputPath, options.outputFile), mode="w") as oFile: | ||
624 | + pass | ||
625 | + get_scores_rules(listTrueRIEF, listPredictedRIEF, hashPredictedRIEF, | ||
626 | + "Scores regulator-regulated-effect (without gc)", "rief") | ||
627 | + get_scores_rules(listTrueRI, listPredictedRI, hashPredictedRI, "Scores regulator-regulated (without effect nor gc)", | ||
628 | + "ri") | ||
629 | + if options.evaluateGCs: | ||
630 | + get_scores_rules(listTrueRIEFGC, listPredictedRIEFGC, hashPredictedRIEFGC, | ||
631 | + "Scores regulator-regulated-effect-gc", "riefgc") |
extract-ris-deverbal/EFF_DVB-regex-v03.py
0 → 100644
1 | +import fileinput | ||
2 | +#import regex as re | ||
3 | +#from regex import finditer | ||
4 | +# We use Python 3, so we had to drop the overlapped option from finditer. | ||
5 | +# Daniel wrote this script for Python 2.7 with the third-party regex package, | ||
6 | +# whose finditer supports overlapped=True; the standard re module does not. | ||
7 | +import re | ||
8 | +from re import finditer | ||
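9 | +# A minimal sketch (illustration only, not used below) of emulating overlapped | ||
10 | +# matching with plain re via a zero-width lookahead: | ||
11 | +#   for m in re.finditer(r'(?=(PATTERN))', text): | ||
12 | +#       start, end = m.span(1) | ||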
9 | +import sys | ||
10 | +import os | ||
11 | +import json | ||
12 | + | ||
13 | +if len(sys.argv) != 8: | ||
14 | +    sys.stderr.write("E: usage: " + sys.argv[0] + | ||
15 | +                     " <input_path> <input_file> <output_path> <output_file> <normalized_Effects> <entity_path> <entity_file>\n") | ||
16 | +    sys.stderr.flush() | ||
17 | +    exit(2) | ||
18 | + | ||
19 | +# READ INPUT FILE | ||
20 | +# Original Daniel: text_file = open( sys.argv[1], "r" ) | ||
21 | +# Original Daniel: dato = text_file.read() | ||
22 | +# Original Daniel: text_file.close() | ||
23 | +filename = sys.argv[2] | ||
24 | +input_file = open(os.path.join(sys.argv[1], filename), "r") | ||
25 | +#print("Input file: {}".format(os.path.join(sys.argv[1], sys.argv[2]))) | ||
26 | +dato = input_file.read() | ||
27 | +input_file.close() | ||
28 | + | ||
29 | +# Loading normalized effects | ||
30 | +# print('Loading normalized effects...') | ||
31 | +with open(os.path.join(sys.argv[5])) as diccFile: | ||
32 | + hashNormalizedEffects = json.load(diccFile) | ||
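33 | +# e.g. a hypothetical entry {"activation": "activator"} makes a matched word | ||
34 | +# "activation" be tagged below as "EFFECT.activator" | ||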
33 | + | ||
34 | +# USING ALREADY TAGGED ENTITIES OF THE FILE (in filter sentence step) | ||
35 | +#<entity_path> <entity_file> | ||
36 | +# READ DICTIONARY WITH ALREADY TAGGED ENTITIES | ||
37 | +entity_path = sys.argv[6] | ||
38 | +entity_file = sys.argv[7] | ||
39 | +print('Loading dictionaries with already tagged entities...') | ||
40 | +with open(os.path.join(entity_path, entity_file)) as entFile: | ||
41 | + hashDicc = json.load(entFile) | ||
42 | +print(' Loading dictionaries with already tagged entities... Done!') | ||
43 | +# CREATE LISTS WITH ALREADY TAGGED ENTITIES OF THE FILE | ||
44 | +regexNumFile = re.compile(r'_([0-9]+)[.-]') | ||
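45 | +# e.g. a hypothetical filename "article_123.word.txt" yields numFile "123" | ||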
45 | +result = regexNumFile.search(filename) | ||
46 | +numFile = "" | ||
47 | +inumFile = 0 | ||
48 | +if result: | ||
49 | + inumFile = int(result.group(1)) | ||
50 | + numFile = str(inumFile) | ||
51 | + print("Numfile: {}".format(numFile)) | ||
52 | +else: | ||
53 | + print("WARNING: numfile not found in filename") | ||
54 | + | ||
55 | +ATEREG1 = [] | ||
56 | +PTEREG1GENE = [] | ||
57 | +PTEREG1TU = [] | ||
58 | +listEffects = [] | ||
59 | + | ||
60 | +if numFile in hashDicc: | ||
61 | + hashTemp = hashDicc[numFile] | ||
62 | + # print("hashDicc[numFile]: {}".format(hashTemp)) | ||
63 | + for k, v in hashTemp.items(): | ||
64 | + if v == "TF": | ||
65 | + # print("Verifiying TF") | ||
66 | + if k not in ATEREG1: | ||
67 | + # print(" TF {}".format(k)) | ||
68 | + ATEREG1.append(k) | ||
69 | + elif v == "GENE": | ||
70 | + if k not in PTEREG1GENE: | ||
71 | + PTEREG1GENE.append(k) | ||
72 | + elif v == "TU": | ||
73 | + if k not in PTEREG1TU: | ||
74 | + PTEREG1TU.append(k) | ||
75 | + elif v == "EFFECT": | ||
76 | + if k not in listEffects: | ||
77 | + listEffects.append(k) | ||
78 | + else: | ||
79 | + print("WARNING: entity not found in dictionaries") | ||
80 | +else: | ||
81 | + print("WARNING: numfile not found in dictionaries") | ||
82 | + | ||
83 | +# STRIP THE EXTENSION FROM THE FILE NAME | ||
84 | +# Original Daniel: split_line = sys.argv[2] | ||
85 | +output_path = sys.argv[3] | ||
86 | +# Original Daniel: split_line = split_line[:-4] | ||
87 | +# Original Daniel: file_name = split_line + ".a2" | ||
88 | +input_file_name = sys.argv[2] | ||
89 | +# Original Daniel: open( file_name , 'w').close() | ||
90 | +file_name_entities_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a1") | ||
91 | +file_name_interactions_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a2") | ||
92 | +file_name_entities_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a1") | ||
93 | +file_name_interactions_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a2") | ||
94 | + | ||
95 | +file_name_text_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".txt") | ||
96 | +file_name_text_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".txt") | ||
97 | + | ||
98 | +open(file_name_entities_complete, 'w').close() | ||
99 | +open(file_name_interactions_complete, 'w').close() | ||
100 | +# Original Daniel: open( file_name , 'w').close() | ||
101 | +open(file_name_entities_incomplete, 'w').close() | ||
102 | +open(file_name_interactions_incomplete, 'w').close() | ||
103 | + | ||
104 | +# declare variables | ||
105 | +# Original Daniel: impresion = [] | ||
106 | +impresionEntities = [] | ||
107 | +impresionInteractionsComplete = [] | ||
108 | +impresionInteractionsIncomplete = [] | ||
109 | +salida_a2 = [] | ||
110 | +salida_a2_trimmed = [] | ||
111 | +salida_a2_str = [] | ||
112 | +q2line = () | ||
113 | +listadeRIs = [] | ||
114 | +posiblesminimos = [[], []] | ||
115 | +posiblesmaximos = [[], []] | ||
116 | +listasecundaria = [] | ||
117 | +listasecundaria_trimmed = [] | ||
121 | + | ||
122 | +# Effects | ||
123 | +for i in range(len(listEffects)): | ||
124 | + if listEffects[i] in dato: | ||
125 | +        for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at|for)\b)', dato):  # "of", "for" or "at" to the right of EFF | ||
126 | + # Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at)\b)', dato, | ||
127 | + # Original Daniel: overlapped=True): # "of" o "at" a la derecha de EFF | ||
128 | + spantup = match.span(1) | ||
129 | + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1)) | ||
130 | + if match.group(1).lower() in hashNormalizedEffects: | ||
131 | + effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()] | ||
132 | + else: | ||
133 | + effect = "EFFECT." + "deverbal_effect" | ||
134 | + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1)) | ||
135 | +            a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))  # end offset stored inclusive (hence the -1) | ||
136 | + #print("Append effect a2line: {}".format(a2line)) | ||
137 | + salida_a2.append(a2line) | ||
138 | +        for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato):  # "by" to the right of EFF | ||
139 | + # Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato, | ||
140 | + # Original Daniel: overlapped=True): # "by" a la derecha de EFF | ||
141 | + spantup = match.span(1) | ||
142 | + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1)) | ||
143 | + if match.group(1).lower() in hashNormalizedEffects: | ||
144 | + effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()] | ||
145 | + else: | ||
146 | + effect = "EFFECT." + "deverbal_effect" | ||
147 | + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1)) | ||
148 | + a2line = (effect, spantup[0], spantup[1] - 1, match.group(1)) | ||
149 | + salida_a2.append(a2line) | ||
150 | + #print("Append effect a2line: {}".format(a2line)) | ||
151 | +        for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato):  # "is the" plus 0-1 words to the left of EFF | ||
152 | + # Original Daniel: for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato, | ||
153 | + # Original Daniel: overlapped=True): # "is the" 0-1 palabras a la izquierda de EFF | ||
154 | + spantup = match.span(3) | ||
155 | + # Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(3)) | ||
156 | +            if match.group(3).lower() in hashNormalizedEffects:  # group(3) is the effect word; group(1) is the "is the ..." prefix | ||
157 | +                effect = "EFFECT." + hashNormalizedEffects[match.group(3).lower()] | ||
158 | + else: | ||
159 | + effect = "EFFECT." + "deverbal_effect" | ||
160 | + # Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(3)) | ||
161 | + a2line = (effect, spantup[0], spantup[1] - 1, match.group(3)) | ||
162 | + salida_a2.append(a2line) | ||
163 | + #print("Append effect a2line: {}".format(a2line)) | ||
164 | +#print("Efectos salida_a2: {}".format(salida_a2)) | ||
165 | + | ||
166 | +# PTEREG1GENE regulated (patient) GENE entities | ||
167 | +for i in range(len(PTEREG1GENE)): | ||
168 | + if PTEREG1GENE[i] in dato: | ||
169 | + # print(PTEREG1GENE[i]) | ||
170 | +        for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato):  # "of", "for" or "at" plus 0-1 words to the left of the regulated entity | ||
171 | + # Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato, | ||
172 | + # Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado | ||
173 | + spantup = match.span(3) | ||
174 | + # print("match {} spantup {}".format(match.group(3), match.span(3))) | ||
175 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3)) | ||
176 | + a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(3)) | ||
177 | + salida_a2.append(a2line) | ||
178 | + # print("Append regulados a2line: {}".format(a2line)) | ||
179 | +        for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato):  # regulated entities without a pattern | ||
180 | + # Original Daniel: for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato, overlapped=True): # regulados sin patron | ||
181 | + spantup = match.span(1) | ||
182 | + # print("match {} spantup {}".format(match.group(1), match.span(1))) | ||
183 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1)) | ||
184 | + a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(1)) | ||
185 | + listasecundaria.append(a2line) | ||
186 | +#print("Efectos regulados gene listasecundaria: {}".format(listasecundaria)) | ||
187 | + | ||
188 | +# CMC: ADDED TO SEPARATE REGULATED GENE AND TU | ||
189 | +# PTEREG1TU regulated (patient) TU entities | ||
190 | +for i in range(len(PTEREG1TU)): | ||
191 | + if PTEREG1TU[i] in dato: | ||
192 | + # print(PTEREG1TU[i]) | ||
193 | +        for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato):  # "of", "for" or "at" plus 0-1 words to the left of the regulated entity | ||
194 | + # Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato, | ||
195 | + # Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado | ||
196 | + spantup = match.span(3) | ||
197 | + # print("match: " + match.group(3)) | ||
198 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3)) | ||
199 | + a2line = ('TU', spantup[0], spantup[1] - 1, match.group(3)) | ||
200 | + salida_a2.append(a2line) | ||
201 | + # print("Append regulados a2line: {}".format(a2line)) | ||
202 | +        for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato):  # regulated entities without a pattern | ||
203 | + # for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato, overlapped=True): # regulados sin patron | ||
204 | + spantup = match.span(1) | ||
205 | + # Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1)) | ||
206 | + a2line = ('TU', spantup[0], spantup[1] - 1, match.group(1)) | ||
207 | + listasecundaria.append(a2line) | ||
208 | +#print("Efectos regulados tu listasecundaria: {}".format(listasecundaria)) | ||
209 | + | ||
210 | +# ATEREG1 regulator (agent) entities | ||
211 | +for i in range(len(ATEREG1)): | ||
212 | + if ATEREG1[i] in dato: | ||
213 | + # print(ATEREG1[i]) | ||
214 | +        for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato):  # "by" plus 0-1 words before the regulator | ||
215 | + # Original Daniel: for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato, | ||
216 | + # Original Daniel: overlapped=True): # "by" 0-1 palabras a la izq de regulado | ||
217 | + spantup = match.span(2) | ||
218 | + # print("match: " + match.group(2)) | ||
219 | + # print("match {} spantup {}".format(match.group(2), match.span(2))) | ||
220 | + # Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(2)) | ||
221 | + a2line = ('TF', spantup[0], spantup[1] - 1, match.group(2)) | ||
222 | + salida_a2.append(a2line) | ||
223 | + #print("Append regulator a2line: {}".format(a2line)) | ||
224 | +        for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato):  # regulators without a pattern | ||
225 | + # for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato, overlapped=True): # reguladores sin patron | ||
226 | + spantup = match.span(1) | ||
227 | + # print("match {} spantup {}".format(match.group(1), match.span(1))) | ||
228 | + # Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(1)) | ||
229 | + a2line = ('TF', spantup[0], spantup[1] - 1, match.group(1)) | ||
230 | + listasecundaria.append(a2line) | ||
231 | + #print("Append regulator a2line: {}".format(a2line)) | ||
232 | +#print("Reguladores agentes salida_a2: {}".format(salida_a2)) | ||
233 | +#print("Reguladores agentes listasecundaria: {}".format(listasecundaria)) | ||
234 | + | ||
235 | +# Remove duplicate tags and tags contained inside others | ||
236 | +if salida_a2: | ||
237 | + salida_a2.sort(key=lambda tup: tup[1]) | ||
238 | + salida_a2_trimmed.append(salida_a2[0]) | ||
239 | + for i in range(len(salida_a2)): | ||
240 | + copiar = True | ||
241 | + for j in range(len(salida_a2_trimmed)): | ||
242 | + if ((salida_a2[i][1] >= salida_a2_trimmed[j][1]) and (salida_a2[i][2] <= salida_a2_trimmed[j][2])): | ||
243 | + copiar = False | ||
244 | + if copiar: | ||
245 | + salida_a2_trimmed.append(salida_a2[i]) | ||
246 | +if listasecundaria: | ||
247 | + listasecundaria.sort(key=lambda tup: tup[1]) | ||
248 | + listasecundaria_trimmed.append(listasecundaria[0]) | ||
249 | + for i in range(len(listasecundaria)): | ||
250 | + copiar = True | ||
251 | + for j in range(len(listasecundaria_trimmed)): | ||
252 | + if ((listasecundaria[i][1] >= listasecundaria_trimmed[j][1]) and ( | ||
253 | + listasecundaria[i][2] <= listasecundaria_trimmed[j][2])): | ||
254 | + copiar = False | ||
255 | + if copiar: | ||
256 | + listasecundaria_trimmed.append(listasecundaria[i]) | ||
257 | +# print("Sin repeticiones salida_a2_trimmed: {}".format(salida_a2_trimmed)) | ||
258 | +#print("Sin repeticiones listasecundaria_trimmed: {}".format(listasecundaria_trimmed)) | ||
259 | + | ||
260 | +# Assign identifiers (TX) to entities (effect, regulator, regulated) | ||
261 | +lastID = 0 | ||
262 | +for i in range(len(salida_a2_trimmed)): | ||
263 | + # if sys.argv[2].find('355') > -1: | ||
264 | + # print("i : {}".format(i)) | ||
265 | + salida_a2_trimmed[i] = list(salida_a2_trimmed[i]) | ||
266 | + ID = "T" + str(i + 1) | ||
267 | + salida_a2_trimmed[i].insert(0, ID) | ||
268 | + lastID = i + 1 | ||
269 | + # if sys.argv[2].find('355') > -1: | ||
270 | + # print("lastID : {}".format(lastID)) | ||
271 | + | ||
272 | +for i in range(len(listasecundaria_trimmed)): | ||
273 | + # if sys.argv[2].find('355') > -1: | ||
274 | + # print("i : {}".format(i)) | ||
275 | + # print("lastID : {}".format(lastID)) | ||
276 | + listasecundaria_trimmed[i] = list(listasecundaria_trimmed[i]) | ||
277 | + ID = "T" + str(i + 1 + lastID) | ||
278 | + listasecundaria_trimmed[i].insert(0, ID) | ||
279 | + | ||
280 | +# print("Con identificadores salida_a2_trimmed: {}".format(salida_a2_trimmed)) | ||
281 | +#print("Con identificadores listasecundaria_trimmed: {}".format(listasecundaria_trimmed)) | ||
282 | + | ||
283 | +#print("salida_a2_trimmed") ######################### | ||
284 | +#print(salida_a2_trimmed) ######################### | ||
285 | +#print("listasecundaria_trimmed") | ||
286 | +#print(listasecundaria_trimmed) | ||
287 | + | ||
288 | +# Build regulatory interactions | ||
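289 | +# For each EFFECT entity: link the nearest pattern-matched GENE/TU (searching to | ||
290 | +# the right first, then to the left) as the Target, and the nearest pattern-matched | ||
291 | +# TF (right first, then left) as the Agent. | ||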
289 | +i = 0 | ||
290 | +while i < int(len(salida_a2_trimmed)): | ||
291 | + if "EFFECT" in salida_a2_trimmed[i][1]: | ||
292 | +        # SEARCH FOR THE REGULATED ENTITY TO THE RIGHT | ||
293 | +        nuevaRI = [salida_a2_trimmed[i][0], "", ""]  # effect, theme, cause | ||
294 | + ref = "" | ||
295 | + posiblesminimos = [[], []] | ||
296 | + j = 0 | ||
297 | + while j < int(len(salida_a2_trimmed)): | ||
298 | + # Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
299 | + if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
300 | + posiblesminimos[0].append(salida_a2_trimmed[j][2]) | ||
301 | + posiblesminimos[1].append(salida_a2_trimmed[j][0]) | ||
302 | + j = j + 1 | ||
303 | + if posiblesminimos[0]: | ||
304 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
305 | + ref = posiblesminimos[1][refpointer] | ||
306 | +        # if none found, SEARCH FOR THE REGULATED ENTITY TO THE LEFT | ||
307 | + if not ref: | ||
308 | + posiblesmaximos = [[], []] | ||
309 | + j = 0 | ||
310 | + while j < int(len(salida_a2_trimmed)): | ||
311 | + # Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
312 | + if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
313 | + posiblesmaximos[0].append(salida_a2_trimmed[j][3]) | ||
314 | + posiblesmaximos[1].append(salida_a2_trimmed[j][0]) | ||
315 | + j = j + 1 | ||
316 | + if posiblesmaximos[0]: | ||
317 | + refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0])) | ||
318 | + ref = posiblesmaximos[1][refpointer] | ||
319 | + nuevaRI[1] = ref | ||
320 | +        # SEARCH FOR THE REGULATOR TO THE RIGHT | ||
321 | + ref = "" | ||
322 | + posiblesminimos = [[], []] | ||
323 | + j = 0 | ||
324 | + while j < int(len(salida_a2_trimmed)): | ||
325 | + # Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
326 | + if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]): | ||
327 | + posiblesminimos[0].append(salida_a2_trimmed[j][2]) | ||
328 | + posiblesminimos[1].append(salida_a2_trimmed[j][0]) | ||
329 | + j = j + 1 | ||
330 | + if posiblesminimos[0]: | ||
331 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
332 | + ref = posiblesminimos[1][refpointer] | ||
333 | +        # if none found, SEARCH FOR THE REGULATOR TO THE LEFT | ||
334 | + if not ref: | ||
335 | + posiblesmaximos = [[], []] | ||
336 | + j = 0 | ||
337 | + while j < int(len(salida_a2_trimmed)): | ||
338 | + # Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
339 | + if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]): | ||
340 | + posiblesmaximos[0].append(salida_a2_trimmed[j][3]) | ||
341 | + posiblesmaximos[1].append(salida_a2_trimmed[j][0]) | ||
342 | + j = j + 1 | ||
343 | + if posiblesmaximos[0]: | ||
344 | + refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0])) | ||
345 | + ref = posiblesmaximos[1][refpointer] | ||
346 | + nuevaRI[2] = ref | ||
347 | + listadeRIs.append(nuevaRI) | ||
348 | + i = i + 1 | ||
349 | + | ||
350 | +# SECOND PHASE: SEARCH FOR MISSING REGULATORS AND REGULATED ENTITIES IN THE SECONDARY LIST | ||
351 | +i = 0 | ||
352 | +while i < int(len(listadeRIs)): | ||
353 | +    if not listadeRIs[i][1]:  # no regulated entity yet | ||
354 | + ref = "" | ||
355 | + posiblesminimos = [[], []] | ||
356 | +        # SEARCH FOR THE REGULATED ENTITY TO THE RIGHT | ||
357 | + j = 0 | ||
358 | + while j < int(len(listasecundaria_trimmed)): | ||
359 | + for k in range(len(salida_a2_trimmed)): | ||
360 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
361 | + ind = k | ||
362 | + # Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
363 | + if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
364 | + posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3])) | ||
365 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
366 | + j = j + 1 | ||
367 | +        # SEARCH FOR THE REGULATED ENTITY TO THE LEFT | ||
368 | + j = 0 | ||
369 | + while j < int(len(listasecundaria_trimmed)): | ||
370 | + for k in range(len(salida_a2_trimmed)): | ||
371 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
372 | + ind = k | ||
373 | + # Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
374 | + if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
375 | + posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3])) | ||
376 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
377 | + j = j + 1 | ||
378 | +        # CHOOSE THE CLOSEST REGULATED ENTITY | ||
379 | + if posiblesminimos[0]: | ||
380 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
381 | + ref = posiblesminimos[1][refpointer] | ||
382 | + # print(ref) | ||
383 | + listadeRIs[i][1] = ref | ||
384 | +    if not listadeRIs[i][2]:  # no regulator yet | ||
385 | + ref = "" | ||
386 | + posiblesminimos = [[], []] | ||
387 | +        # SEARCH FOR THE REGULATOR TO THE RIGHT | ||
388 | + j = 0 | ||
389 | + while j < int(len(listasecundaria_trimmed)): | ||
390 | + for k in range(len(salida_a2_trimmed)): | ||
391 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
392 | + ind = k | ||
393 | + # Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
394 | + if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]): | ||
395 | + posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3])) | ||
396 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
397 | + j = j + 1 | ||
398 | +        # SEARCH FOR THE REGULATOR TO THE LEFT | ||
399 | + j = 0 | ||
400 | + while j < int(len(listasecundaria_trimmed)): | ||
401 | + for k in range(len(salida_a2_trimmed)): | ||
402 | + if listadeRIs[i][0] == salida_a2_trimmed[k][0]: | ||
403 | + ind = k | ||
404 | + # Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
405 | + if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]): | ||
406 | + posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3])) | ||
407 | + posiblesminimos[1].append(listasecundaria_trimmed[j][0]) | ||
408 | + j = j + 1 | ||
409 | +        # CHOOSE THE CLOSEST REGULATOR | ||
410 | + if posiblesminimos[0]: | ||
411 | + refpointer = posiblesminimos[0].index(min(posiblesminimos[0])) | ||
412 | + ref = posiblesminimos[1][refpointer] | ||
413 | + # print(ref) | ||
414 | + listadeRIs[i][2] = ref | ||
415 | + i = i + 1 | ||
416 | +#print("ListadeRIs: {}".format(listadeRIs)) | ||
417 | + | ||
418 | +# Pick the regulators and regulated entities from the secondary list to be printed | ||
419 | +setmem = [] | ||
420 | +k = 0 | ||
421 | +while k < int(len(listadeRIs)): | ||
422 | + j = 0 | ||
423 | + copysec = False | ||
424 | + #while j < int(len(listasecundaria_trimmed)): | ||
425 | + while j < len(listasecundaria_trimmed): | ||
426 | + # print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs)) | ||
427 | + # Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]: | ||
428 | + if listasecundaria_trimmed[j][0] == listadeRIs[k][2]: | ||
429 | + # print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][2] {}".format(listasecundaria_trimmed[j][0], | ||
430 | + # listadeRIs[k][2])) | ||
431 | + copysec = True | ||
432 | + # print("j: {}".format(j)) | ||
433 | + indj = j | ||
434 | + j = j + 1 | ||
435 | + if copysec: | ||
436 | + setmem.append(listasecundaria_trimmed[indj]) | ||
437 | + # print("setmen: {}".format(setmem)) | ||
438 | + | ||
439 | +    #### CMC: I ADDED THIS CODE TO SEARCH FOR REGULATED ENTITIES, SINCE THE CODE ABOVE SEARCHES FOR REGULATORS | ||
440 | + j = 0 | ||
441 | + copysec = False | ||
442 | + #while j < int(len(listasecundaria_trimmed)): | ||
443 | + while j < len(listasecundaria_trimmed): | ||
444 | + # print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs)) | ||
445 | + # Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]: | ||
446 | + if listasecundaria_trimmed[j][0] == listadeRIs[k][1]: | ||
447 | + # print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][1] {}".format(listasecundaria_trimmed[j][0], | ||
448 | + # listadeRIs[k][1])) | ||
449 | + copysec = True | ||
450 | + # print("j: {}".format(j)) | ||
451 | + indj = j | ||
452 | + j = j + 1 | ||
453 | + if copysec: | ||
454 | + setmem.append(listasecundaria_trimmed[indj]) | ||
455 | + # print("setmen: {}".format(setmem)) | ||
456 | + | ||
457 | + k = k + 1 | ||
458 | +setmem = sorted(setmem) | ||
459 | +# print("setmen: {}".format(setmem)) | ||
460 | +dedup = [setmem[i] for i in range(len(setmem)) if i == 0 or setmem[i] != setmem[i - 1]] | ||
461 | +# print("dedup: {}".format(dedup)) | ||
462 | +salida_a2_trimmed.extend(dedup) | ||
463 | +#print("salida_a2_trimmed after listasecundaria_trimmed: {}".format(salida_a2_trimmed)) | ||
464 | + | ||
465 | +# Assign identifiers (RX) to events (RIs) | ||
466 | +for i in range(len(listadeRIs)): | ||
467 | + # Original Daniel: ID = "E" + str(i+1) | ||
468 | + ID = "R" + str(i + 1) | ||
469 | + listadeRIs[i].insert(0, ID) | ||
470 | +#print("Con identificadores ListadeRIs: {}".format(listadeRIs)) | ||
471 | + | ||
472 | +# CREATE THE LIST OF EVENTS (RX) AND ENTITIES (TX) IN PRINT FORMAT | ||
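473 | +# e.g. an entity line "T1 EFFECT.activator 10 19 activation" and an event line | ||
474 | +# "R1 Interaction.T1 Target:T2 Agent:T3" (hypothetical IDs and offsets) | ||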
473 | +for i in range(len(salida_a2_trimmed)): | ||
474 | + linea = str(salida_a2_trimmed[i][0]) + ' ' + str(salida_a2_trimmed[i][1]) + ' ' + str( | ||
475 | + salida_a2_trimmed[i][2]) + ' ' + str(salida_a2_trimmed[i][3]) + ' ' + str(salida_a2_trimmed[i][4]) | ||
476 | + # Original Daniel: impresion.append(linea) | ||
477 | + impresionEntities.append(linea) | ||
478 | + | ||
479 | +for i in range(len(listadeRIs)): | ||
480 | + if listadeRIs[i][2] and listadeRIs[i][3]: | ||
481 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) + ' ' + 'Cause:' + str(listadeRIs[i][3]) | ||
482 | + linea = str(listadeRIs[i][0]) + ' ' + "Interaction." + str(listadeRIs[i][1]) + ' ' + 'Target:' + str( | ||
483 | + listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3]) | ||
484 | + # Original Daniel: elif listadeRIs[i][2]: | ||
485 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) | ||
486 | + # Original Daniel: elif listadeRIs[i][3]: | ||
487 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Cause:' + str(listadeRIs[i][3]) | ||
488 | + # Original Daniel: else: | ||
489 | + # Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) | ||
490 | + # Original Daniel: impresion.append(linea) | ||
491 | + impresionInteractionsComplete.append(linea) | ||
492 | + #print("Interaction complete: {}".format(linea)) | ||
493 | + linea = str(listadeRIs[i][0]) + ' ' + "Interaction.regulator" + ' ' + 'Target:' + str( | ||
494 | + listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3]) | ||
495 | + impresionInteractionsIncomplete.append(linea) | ||
496 | + | ||
497 | +#print("Entities: {}".format(impresionEntities)) | ||
498 | + | ||
499 | +# Write entities of complete interactions to .a1 | ||
500 | +# Original Daniel: save_file = open( file_name, "a" ) | ||
501 | +with open(file_name_entities_complete, "a") as save_file: | ||
502 | +    for line in impresionEntities: | ||
503 | +        save_file.write(line) | ||
504 | +        save_file.write("\n") | ||
506 | + | ||
507 | +# Write entities of incomplete interactions to .a1 | ||
508 | +with open(file_name_entities_incomplete, "a") as save_file: | ||
509 | +    for line in impresionEntities: | ||
510 | +        save_file.write(line) | ||
511 | +        save_file.write("\n") | ||
514 | + | ||
515 | +# Write complete interactions (regulator, effect, regulated) | ||
516 | +# print("InteractionsComplete: {}".format(impresionInteractionsComplete)) | ||
517 | +with open(file_name_interactions_complete, "a") as save_file: | ||
518 | +    for line in impresionInteractionsComplete: | ||
519 | +        save_file.write(line) | ||
520 | +        save_file.write("\n") | ||
523 | + | ||
524 | +# Write incomplete interactions (regulator, "regulator", regulated) | ||
525 | +# print("InteractionsIncomplete: {}".format(impresionInteractionsIncomplete)) | ||
526 | +with open(file_name_interactions_incomplete, "a") as save_file: | ||
527 | +    for line in impresionInteractionsIncomplete: | ||
528 | +        save_file.write(line) | ||
529 | +        save_file.write("\n") | ||
532 | + | ||
533 | +with open(file_name_text_complete, mode="w") as txtFile: | ||
534 | + txtFile.write(dato) | ||
535 | +with open(file_name_text_incomplete, mode="w") as txtFile: | ||
536 | + txtFile.write(dato) |
filtered-sentences/.gitignore
0 → 100644
1 | + |
format/regex.py
0 → 100644
1 | +import fileinput | ||
2 | +import re | ||
3 | +import sys | ||
4 | + | ||
5 | +if ( len( sys.argv ) < 3 ): | ||
6 | + sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <output_file> \n" ) | ||
7 | + sys.stderr.flush(); | ||
8 | + | ||
9 | + exit( 2 ); | ||
10 | +else: | ||
11 | + print("Ok.") | ||
12 | + | ||
13 | +#READ INPUT FILE | ||
14 | +text_file = open( sys.argv[1], "r" ) | ||
15 | +dato = text_file.read().splitlines() | ||
16 | +text_file.close() | ||
17 | + | ||
18 | + | ||
19 | +#STRIP THE EXTENSION FROM THE FILE NAME | ||
20 | +split_line = sys.argv[2] | ||
21 | +split_line = split_line[:-4] | ||
22 | +file_name="" | ||
23 | +file_name = split_line + ".san" | ||
24 | +open( file_name , 'w').close() | ||
25 | + | ||
26 | +#APPLY THE CLEANUP REGEXES AND WRITE THE RESULT TO ARGV 2 | ||
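27 | +# e.g. a hypothetical input line "CRP activates acrAB ( Fig . 2 ) under stress ." | ||
28 | +# comes out as "CRP activates acrAB under stress ." | ||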
27 | +for line in dato: | ||
28 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NNNNa_) | ||
29 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NNNNa_] | ||
30 | +    line = re.sub('[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NN,NN,NN_) | ||
31 | +    line = re.sub('[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NN,NN,NN_] | ||
32 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num_) | ||
33 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num.num_) | ||
34 | +    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num-num_) | ||
35 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num_] | ||
36 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num.num_] | ||
37 | +    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num-num_] | ||
38 | +    line = re.sub('[\(]\s[a-zA-Z]{1}\s[\)]', '', line.rstrip()) #removes (_alpha_) | ||
39 | +    line = re.sub('[\[]\s[a-zA-Z]{1}\s[\]]', '', line.rstrip()) #removes [_alpha_] | ||
40 | +    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman_) | ||
41 | +    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman-Roman_) | ||
42 | +    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman_) | ||
43 | +    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman-roman_) | ||
44 | +    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman_] | ||
45 | +    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman-Roman_] | ||
46 | +    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman_] | ||
47 | +    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman-roman_] | ||
48 | +    line = re.sub('[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.|table)\s[^\(|^\)]*[\)]', '', line.rstrip(), flags=re.I) #removes citation parentheticals (_fig./figure/see/i.e./e.g./tab./table_) | ||
49 | +    line = re.sub('  ', ' ', line.rstrip()) #collapses the double spaces left by the removals | ||
50 | + #print(line) | ||
51 | + | ||
52 | + | ||
53 | + save_file = open( file_name, "a" ) | ||
54 | + save_file.write(line) | ||
55 | + save_file.write("\n") | ||
56 | + save_file.close() |
format/sanitized_sentences/.gitignore
0 → 100644
1 | + |
format/split_sentences/.gitignore
0 → 100644
1 | + |
get-TRN-Organism-v1.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +import operator | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +import json | ||
7 | +import re | ||
8 | +import pandas as pd | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | + | ||
13 | +# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to the TRN table | ||
14 | + | ||
15 | +# Parameters: | ||
16 | +# 1) --trnPath Path to TRN detail table | ||
17 | +# 2) --trnFile File of TRN detail table | ||
18 | +# 3) --outputPath Output path | ||
19 | +# 4) --organismPath Path to Organism annotation table | ||
20 | +# 5) --organismFile File of Organism annotation table | ||
21 | + | ||
22 | +# Output: | ||
23 | +# 1) Tsv file detail with: | ||
24 | +# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence | ||
25 | +# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel | ||
26 | +# OrganismScore = { | ||
27 | +# If only salmonella or only non identified organism = 1, | ||
28 | +# If (startswith salmonella or non identified organism) and other organisms = 0.5 | ||
29 | +# If only other organisms = 0 | ||
30 | +# } | ||
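31 | +#   e.g. ['Salmonella enterica'] -> 1, ['Salmonella enterica', 'Escherichia coli'] -> 0.5, | ||
32 | +#   ['Escherichia coli'] -> 0 (hypothetical organism lists) | ||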
31 | + | ||
32 | +# Execution: | ||
33 | +# python3.4 get-TRN-Organism-v1.py | ||
34 | + | ||
35 | +# Local | ||
36 | +# python get-TRN-Organism-v1.py | ||
37 | +# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" | ||
38 | +# --trnFile STMTRN_all.detail.tsv | ||
39 | +# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" | ||
40 | +# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results | ||
41 | +# --organismFile annotations_STMTRN_all.sentences.csv | ||
42 | +# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv | ||
43 | + | ||
44 | +########################################################### | ||
45 | +# MAIN PROGRAM # | ||
46 | +########################################################### | ||
47 | + | ||
48 | +def only_salmonella_or_non_identified_organism(list_temp): | ||
49 | + non_identified_organisms = [ | ||
50 | + 'unidentified plasmid', | ||
51 | + 'unidentified', | ||
52 | + 'bacterium', | ||
53 | + 'bacterium IFAM-3211', | ||
54 | + 'bacterium IFAM-2074', | ||
55 | + 'bacterium IFAM-1493', | ||
56 | + 'bacterium IFAM-3215', | ||
57 | + 'bacterium IFAM-3359', | ||
58 | + 'hybrid', | ||
59 | + 'Vector pMC1403', | ||
60 | + 'Transposon Tn10', | ||
61 | + 'unidentified cloning vector', | ||
62 | + 'Plasmid F', | ||
63 | + 'Cloning vector pUC19' | ||
64 | + ] | ||
65 | + matches = 0 | ||
66 | + for o in list_temp: | ||
67 | + if o.lower().startswith("salmonella") or o in non_identified_organisms: | ||
68 | + matches += 1 | ||
69 | +    return matches == len(list_temp)  # every organism is Salmonella or non-identified | ||
73 | + | ||
74 | +def salmonella_or_non_identified_and_other_organisms(list_temp): | ||
75 | + non_identified_organisms = [ | ||
76 | + 'unidentified plasmid', | ||
77 | + 'unidentified', | ||
78 | + 'bacterium', | ||
79 | + 'bacterium IFAM-3211', | ||
80 | + 'bacterium IFAM-2074', | ||
81 | + 'bacterium IFAM-1493', | ||
82 | + 'bacterium IFAM-3215', | ||
83 | + 'bacterium IFAM-3359', | ||
84 | + 'hybrid', | ||
85 | + 'Vector pMC1403', | ||
86 | + 'Transposon Tn10', | ||
87 | + 'unidentified cloning vector', | ||
88 | + 'Plasmid F', | ||
89 | + 'Cloning vector pUC19' | ||
90 | + ] | ||
91 | + matches = 0 | ||
92 | + for o in list_temp: | ||
93 | + if o.lower().startswith("salmonella") or o in non_identified_organisms: | ||
94 | + matches += 1 | ||
95 | +    return 0 < matches < len(list_temp)  # a mix of Salmonella/non-identified and other organisms | ||
99 | + | ||
100 | +def only_other_organisms(list_temp): | ||
101 | + non_identified_organisms = [ | ||
102 | + 'unidentified plasmid', | ||
103 | + 'unidentified', | ||
104 | + 'bacterium', | ||
105 | + 'bacterium IFAM-3211', | ||
106 | + 'bacterium IFAM-2074', | ||
107 | + 'bacterium IFAM-1493', | ||
108 | + 'bacterium IFAM-3215', | ||
109 | + 'bacterium IFAM-3359', | ||
110 | + 'hybrid', | ||
111 | + 'Vector pMC1403', | ||
112 | + 'Transposon Tn10', | ||
113 | + 'unidentified cloning vector', | ||
114 | + 'Plasmid F', | ||
115 | + 'Cloning vector pUC19' | ||
116 | + ] | ||
117 | + matches = 0 | ||
118 | + for o in list_temp: | ||
119 | + if o.lower().startswith("salmonella") or o in non_identified_organisms: | ||
120 | + matches += 1 | ||
121 | +    return matches == 0  # only other organisms | ||
125 | + | ||
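126 | +# The three predicates above differ only in how the Salmonella/non-identified | ||
127 | +# match count compares with len(list_temp); a minimal consolidation sketch (not | ||
128 | +# used below, assuming non_identified_organisms is hoisted to module level): | ||
129 | +#   def count_matches(orgs): | ||
130 | +#       return sum(1 for o in orgs | ||
131 | +#                  if o.lower().startswith("salmonella") or o in non_identified_organisms) | ||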
126 | +if __name__ == "__main__": | ||
127 | + # Parameter definition | ||
128 | + parser = OptionParser() | ||
129 | + parser.add_option("--trnPath", dest="trnPath", | ||
130 | + help="Path to TRN detail table", metavar="PATH") | ||
131 | + parser.add_option("--trnFile", dest="trnFile", | ||
132 | + help="File of TRN detail table", metavar="FILE") | ||
133 | + parser.add_option("--outputPath", dest="outputPath", | ||
134 | + help="Output path", metavar="PATH") | ||
135 | + parser.add_option("--organismPath", dest="organismPath", | ||
136 | + help="Path to organism annotation table", metavar="PATH") | ||
137 | + parser.add_option("--organismFile", dest="organismFile", | ||
138 | + help="File of organism annotation table", metavar="FILE") | ||
139 | + | ||
140 | + (options, args) = parser.parse_args() | ||
141 | + if len(args) > 0: | ||
142 | +        parser.error("Unexpected positional arguments.") | ||
143 | + sys.exit(1) | ||
144 | + | ||
145 | + # Printing parameter values | ||
146 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
147 | + print("Path to TRN detail table: " + str(options.trnPath)) | ||
148 | + print("File of TRN detail table: " + str(options.trnFile)) | ||
149 | + print("Output path: " + str(options.outputPath)) | ||
150 | + print("Path to organism annotation table: " + str(options.organismPath)) | ||
151 | + print("File of organism annotation table: " + str(options.organismFile)) | ||
152 | + | ||
153 | + # Load organism annotation table | ||
154 | + print("Loading organism annotation table") | ||
155 | + df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',') | ||
156 | + print("Total de frases anotadas con organism: {}".format(df_organisms.shape[0])) | ||
157 | + | ||
158 | + # Load TRN detail table | ||
159 | + print("Loading TRN detail table") | ||
160 | + df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t') | ||
161 | + print("Total de frases en TRN: {}".format(df_detail.shape[0])) | ||
162 | + | ||
163 | +    # Fix column name: an older get-TRN output wrote "Organism"; this was corrected in get-TRN-v2.py | ||
164 | + df_detail = df_detail.rename(columns={"Organism": "Organisms"}) | ||
165 | + df_detail['OrganismScore'] = 1.00 | ||
166 | + print(df_detail.columns) | ||
167 | + #print(df_detail['Sentence'].head(15)) | ||
168 | + | ||
169 | + for idx in df_organisms.index: | ||
170 | + organisms = df_organisms['Organisms'][idx] | ||
171 | + SentenceNumberInFile = df_organisms['SentenceNumberInFile'][idx] | ||
172 | +        SentenceNumberInFile = SentenceNumberInFile - 2  # align sentence number with the 0-based detail-table row index | ||
173 | + # print("Organisms before: {}".format(df_detail.Organisms[SentenceNumberInFile])) | ||
174 | +        df_detail.loc[SentenceNumberInFile, 'Organisms'] = organisms | ||
175 | + # print("Organisms assigned: {}".format(df_detail.Organisms[SentenceNumberInFile])) | ||
176 | + | ||
177 | +        # OrganismScore = { | ||
178 | +        #   1.0 if only Salmonella and/or non-identified organisms are mentioned, | ||
179 | +        #   0.5 if Salmonella or a non-identified organism co-occurs with other organisms, | ||
180 | +        #   0.0 if only other organisms are mentioned | ||
181 | +        # } | ||
182 | + list_organisms = organisms.split(';') | ||
183 | + # print(" OrganismScore before: {}".format(df_detail.OrganismScore[SentenceNumberInFile])) | ||
184 | +        if only_salmonella_or_non_identified_organism(list_organisms): | ||
185 | +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 1.00 | ||
186 | +        elif salmonella_or_non_identified_and_other_organisms(list_organisms): | ||
187 | +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.50 | ||
188 | +        elif only_other_organisms(list_organisms): | ||
189 | +            df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.00 | ||
190 | + # print(" OrganismScore assigned: {}".format(df_detail.OrganismScore[SentenceNumberInFile])) | ||
191 | + | ||
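To make the three-way rule concrete, a few hypothetical mention lists and the branch they would take (assuming the first predicate, defined earlier in this file, returns True when every mention matches):

```python
# Invented organism-mention lists exercising the scoring predicates above.
assert only_salmonella_or_non_identified_organism(
    ["Salmonella enterica", "unidentified plasmid"])            # score 1.00
assert salmonella_or_non_identified_and_other_organisms(
    ["Salmonella enterica", "Escherichia coli"])                # score 0.50
assert only_other_organisms(["Escherichia coli"])               # score 0.00
```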
192 | + hashPredictedRIs = {} | ||
193 | + hashPredictedRIsCount = {} | ||
194 | + hashPredictedRIsCountVer = {} | ||
195 | + hashPredictedRIsCountDev = {} | ||
196 | + hashPredictedRIsCountAtt = {} | ||
197 | + hashPredictedRIsCountAuto = {} | ||
198 | + hashPredictedRIsScore = {} | ||
199 | + hashPredictedRIsRI = {} | ||
200 | + for idx in df_detail.index: | ||
201 | + tf = df_detail['TF'][idx] | ||
202 | + TypeRegulated = df_detail['TypeRegulated'][idx] | ||
203 | + Regulated = df_detail['Regulated'][idx] | ||
204 | + Effect = df_detail['Effect'][idx] | ||
205 | + pmid = df_detail['PMID'][idx] | ||
206 | + numsent = df_detail['NumSentence'][idx] | ||
207 | + type_sent = df_detail['TypeSentence'][idx] | ||
208 | + sentence = df_detail['Sentence'][idx] | ||
209 | + original_idsentence = df_detail['OriginalIdSentence'][idx] | ||
210 | + original_sentence = df_detail['OriginalSentence'][idx] | ||
211 | + section_num = df_detail['SectionNum'][idx] | ||
212 | + section_name = df_detail['SectionName'][idx] | ||
213 | + organisms = df_detail['Organisms'][idx] | ||
214 | + organism_score = df_detail['OrganismScore'][idx] | ||
215 | + llave = "{}\t{}\t{}\t{}".format(tf, TypeRegulated, Regulated, Effect) | ||
216 | + if organism_score == 0: | ||
217 | + continue | ||
218 | + if llave in hashPredictedRIs: | ||
219 | + hashPredictedRIs[llave].append( | ||
220 | + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence, | ||
221 | + original_sentence, section_num, section_name, organisms, | ||
222 | + organism_score, "", "", "", "", "", "")) | ||
223 | + hashPredictedRIsCount[llave] += 1 | ||
224 | + if type_sent == "ver/dev": | ||
225 | + hashPredictedRIsCountVer[llave] += 1 | ||
226 | + elif type_sent == "dev": | ||
227 | + hashPredictedRIsCountDev[llave] += 1 | ||
228 | + elif type_sent == "att": | ||
229 | + hashPredictedRIsCountAtt[llave] += 1 | ||
230 | + elif type_sent == "auto": | ||
231 | + hashPredictedRIsCountAuto[llave] += 1 | ||
232 | + # if organism_score == 0.5: | ||
233 | + # We penalize RI | ||
234 | + # hashPredictedRIsScore[llave] -= 0.05 | ||
235 | + | ||
236 | + else: | ||
237 | + hashPredictedRIs[llave] = [ | ||
238 | + "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence, | ||
239 | + original_sentence, section_num, section_name, organisms, | ||
240 | + organism_score, "", "", "", "", "", "")] | ||
241 | + hashPredictedRIsCount[llave] = 1 | ||
242 | + hashPredictedRIsCountVer[llave] = 0 | ||
243 | + hashPredictedRIsCountDev[llave] = 0 | ||
244 | + hashPredictedRIsCountAtt[llave] = 0 | ||
245 | + hashPredictedRIsCountAuto[llave] = 0 | ||
246 | + hashPredictedRIsScore[llave] = 1 | ||
247 | + if type_sent == "ver/dev": | ||
248 | + hashPredictedRIsCountVer[llave] = 1 | ||
249 | + elif type_sent == "dev": | ||
250 | + hashPredictedRIsCountDev[llave] = 1 | ||
251 | + elif type_sent == "att": | ||
252 | + hashPredictedRIsCountAtt[llave] = 1 | ||
253 | + elif type_sent == "auto": | ||
254 | + hashPredictedRIsCountAuto[llave] = 1 | ||
255 | + # if organism_score == 0.5: | ||
256 | + # We penalize RI | ||
257 | + # hashPredictedRIsScore[llave] -= 0.05 | ||
258 | + | ||
259 | + print("Total RIs en TRN con organismo: {}".format(len(hashPredictedRIs))) | ||
260 | + with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org")), mode="w") as oFile: | ||
261 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
262 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n") | ||
263 | + for k,v in hashPredictedRIs.items(): | ||
264 | + RI_value = "True" | ||
265 | + # if hashPredictedRIsScore[k] < 1: | ||
266 | + # RI_value = "Possible" | ||
267 | + oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], | ||
268 | + hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], | ||
269 | + hashPredictedRIsScore[k], RI_value)) | ||
270 | + with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org")), mode="w") as oFile: | ||
271 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
272 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n") | ||
273 | + i = 0 | ||
274 | + for k,v in hashPredictedRIs.items(): | ||
275 | + for s in v: | ||
276 | + oFile.write("{}\t{}\n".format(k, s)) | ||
277 | + i += 1 | ||
278 | + print("Total de frases en TRN organismo: {}".format(i)) | ||
279 | + |
get-TRN-v2.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +import operator | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +import json | ||
7 | +import re | ||
8 | +import pandas as pd | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | + | ||
13 | +# Objective: generate TRN | ||
14 | +# CFMC 2022-03-11: We added: | ||
15 | +# 1) Section info (number and name) for output sentences | ||
16 | +# 2) Original sentence id and original sentence | ||
17 | + | ||
18 | +# Parameters: | ||
19 | +# 1) --predictedPath Path for predicted interactions | ||
20 | +# 2) --outputPath Output path | ||
21 | +# 3) --outputFile Prefix file for saving TRN | ||
22 | +# 4) --diccPath Dictionary path | ||
23 | +# 5) --diccSynon File with synonyms of TFs | ||
24 | +# 6) --tsvPath Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf | ||
25 | +# 7) --jsonpdfPath Path to read jsonpdf file to extract section name | ||
26 | + | ||
27 | +# Output: | ||
28 | +# 1) Tsv file detail with: | ||
29 | +# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence | ||
30 | +# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel | ||
31 | + | ||
32 | +# 1) Tsv file summary with: | ||
33 | +# TF TypeRegulated Regulated Effect SentCount Ver/Dev Att Auto Score RI (True/False) | ||
34 | + | ||
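A hypothetical summary row, to make the tab-separated layout concrete (values are invented; SentCount is the sum of the per-type counts):

```
TF	TypeRegulated	Regulated	Effect	SentCount	Ver/Dev	Att	Auto	Score	RI
ArgP	gene	argO	activator	3	2	1	0	1	True
```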
35 | +# Execution: | ||
36 | +# Version 2 TRN Salmonella | ||
37 | +# python3.4 get-TRN-v2.py | ||
38 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
39 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
40 | +# --outputFile STMTRN_v2 | ||
41 | +# --diccPath /home/cmendezc/terminologicalResources | ||
42 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
43 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv | ||
44 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf | ||
45 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_v2 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf | ||
46 | + | ||
47 | +# articulos_sal_4 | ||
48 | +# python3.4 get-TRN-v2.py | ||
49 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
50 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
51 | +# --outputFile STMTRN_articulos_sal_4 | ||
52 | +# --diccPath /home/cmendezc/terminologicalResources | ||
53 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
54 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv | ||
55 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf | ||
56 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_4 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf | ||
57 | + | ||
58 | +# articulos_sal_1 | ||
59 | +# python3.4 get-TRN-v2.py | ||
60 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
61 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
62 | +# --outputFile STMTRN_articulos_sal_1 | ||
63 | +# --diccPath /home/cmendezc/terminologicalResources | ||
64 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
65 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv | ||
66 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf | ||
67 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_1 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf | ||
68 | + | ||
69 | +# all = articulos_sal_1 + articulos_sal_2 + articulos_sal_3 + articulos_sal_4 | ||
70 | +# python3.4 get-TRN-v2.py | ||
71 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
72 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
73 | +# --outputFile STMTRN_all | ||
74 | +# --diccPath /home/cmendezc/terminologicalResources | ||
75 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
76 | +# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv | ||
77 | +# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf | ||
78 | +# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_all --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf | ||
79 | + | ||
80 | +#### | ||
81 | +# python3.4 get-TRN-v1.py | ||
82 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
83 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
84 | +# --outputFile STMTRN | ||
85 | +# --diccPath /home/cmendezc/terminologicalResources | ||
86 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
87 | +# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
88 | + | ||
89 | +# Con dataset automatic-extraction-STM-RIs-dataset | ||
90 | +# python3.4 get-TRN-v1.py | ||
91 | +# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris | ||
92 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn | ||
93 | +# --outputFile STM-RIs-dataset | ||
94 | +# --diccPath /home/cmendezc/terminologicalResources | ||
95 | +# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
96 | +# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STM-RIs-dataset --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json | ||
97 | + | ||
98 | +########################################################### | ||
99 | +# MAIN PROGRAM # | ||
100 | +########################################################### | ||
101 | + | ||
102 | +def updateHashPredicted(pr, hashP, pm, sF, ef): | ||
103 | + # updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect]) | ||
104 | + if pr not in hashP: | ||
105 | + hashTemp = {"pmids": {pm: [sF]}, "orieff": ef} | ||
106 | + hashP[pr] = hashTemp | ||
107 | + else: | ||
108 | + hashTemp = hashP[pr] | ||
109 | + if pm in hashTemp["pmids"]: | ||
110 | + hashP[pr]["pmids"][pm].append(sF) | ||
111 | + else: | ||
112 | + hashP[pr]["pmids"][pm] = [sF] | ||
113 | + | ||
114 | +def get_standard_name(regSynon): | ||
115 | + reg = regSynon | ||
116 | + if regSynon in hashSynon: | ||
117 | + reg = hashSynon[regSynon] | ||
118 | + else: | ||
119 | + for syn, std in hashSynon.items(): | ||
120 | + if regSynon.startswith(syn): | ||
121 | + reg = regSynon.replace(syn, std, 1) | ||
122 | + break | ||
123 | + return reg | ||
124 | + | ||
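A usage sketch for `get_standard_name` with an invented synonym entry (real entries are loaded from diccionario-STM-LT2-v7.0.SYNONYMS.json into the module-level `hashSynon`):

```python
hashSynon = {"Hns": "H-NS"}       # hypothetical synonym -> standard name
print(get_standard_name("Hns"))   # H-NS   (exact match)
print(get_standard_name("HnsB"))  # H-NSB  (matching prefix replaced once)
print(get_standard_name("ArgP"))  # ArgP   (no synonym: returned unchanged)
```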
125 | +if __name__ == "__main__": | ||
126 | + # Parameter definition | ||
127 | + parser = OptionParser() | ||
128 | + parser.add_option("--predictedPath", dest="predictedPath", | ||
129 | + help="Path predicted ris gcs", metavar="PATH") | ||
130 | + parser.add_option("--outputPath", dest="outputPath", | ||
131 | + help="Output path", metavar="PATH") | ||
132 | + parser.add_option("--outputFile", dest="outputFile", | ||
133 | + help="Preffix file for saving results", metavar="FILE") | ||
134 | + parser.add_option("--diccPath", dest="diccPath", | ||
135 | + help="Path to dictionary", metavar="PATH") | ||
136 | + parser.add_option("--diccSynon", dest="diccSynon", | ||
137 | + help="File with synonyms", metavar="FILE") | ||
138 | + parser.add_option("--tsvPath", dest="tsvPath", | ||
139 | + help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH") | ||
140 | + parser.add_option("--jsonpdfPath", dest="jsonpdfPath", | ||
141 | + help="Path to read jsonpdf file to extract section name", metavar="PATH") | ||
142 | + | ||
143 | + (options, args) = parser.parse_args() | ||
144 | + if len(args) > 0: | ||
145 | +        parser.error("Unexpected positional arguments.") | ||
146 | + sys.exit(1) | ||
147 | + | ||
148 | + # Printing parameter values | ||
149 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
150 | + print("Path predicted ris gcs: " + str(options.predictedPath)) | ||
151 | + print("Output path: " + str(options.outputPath)) | ||
152 | + print("Preffix file for saving results: " + str(options.outputFile)) | ||
153 | + print("Path to dictionary: " + str(options.diccPath)) | ||
154 | + print("File with synonyms: " + str(options.diccSynon)) | ||
155 | + print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath)) | ||
156 | + print("Path to read jsonpdf file to extract section name: " + str(options.jsonpdfPath)) | ||
157 | + | ||
158 | + use_synonyms = False | ||
159 | + hashSynon = {} | ||
160 | +    if options.diccPath is not None and options.diccSynon != "no-synonyms": | ||
161 | + print("***** Using synonyms *****") | ||
162 | + use_synonyms = True | ||
163 | + print('Loading dictionary of synonyms...') | ||
164 | + with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon: | ||
165 | + hashSynon = json.load(diccSynon) | ||
166 | + print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon))) | ||
167 | + | ||
168 | + hashPredictedRIs = {} | ||
169 | + hashPredictedRIsCount = {} | ||
170 | + hashPredictedRIsCountVer = {} | ||
171 | + hashPredictedRIsCountDev = {} | ||
172 | + hashPredictedRIsCountAtt = {} | ||
173 | + hashPredictedRIsCountAuto = {} | ||
174 | + hashFiles = {} | ||
175 | + for path, dirs, files in os.walk(options.predictedPath): | ||
176 | + for file in files: | ||
177 | + if file.endswith(".a1"): | ||
178 | + filename = file[:-3] | ||
179 | + if filename not in hashFiles: | ||
180 | + hashFiles[filename] = 1 | ||
181 | + else: | ||
182 | + hashFiles[filename] += 1 | ||
183 | + print("Files: {}".format(len(hashFiles))) | ||
184 | + | ||
185 | + processedFiles = 0 | ||
186 | + id_ri = 1 | ||
187 | + regex_att_auto = re.compile(r"(\.att\.|\.auto\.)[0-9]*$") | ||
188 | + for file in sorted(hashFiles.keys()): | ||
189 | + print("File: {}".format(file)) | ||
190 | + type_sent = "ver/dev" | ||
191 | + if file.find("dataSet_OnlyRI_sentences") > -1: | ||
192 | + pmid = "000000" | ||
193 | + if file.find("dataSet_OnlyRI_sentences.") > -1: | ||
194 | + if file.find(".att.") > -1: | ||
195 | + numsent = file[file.find("att.") + 4:] | ||
196 | + type_sent = "att" | ||
197 | + if pmid.find(".auto.") > -1: | ||
198 | + numsent = file[file.find("auto.") + 5:] | ||
199 | + type_sent = "auto" | ||
200 | + else: | ||
201 | + numsent = file[file.find("_", file.find("_", file.find("_") + 1) + 1) + 1:file.find("-")] | ||
202 | + numsent = numsent.replace(".al", "") | ||
203 | + print("dataSet_OnlyRI_sentences numsent: {}".format(numsent)) | ||
204 | + print("dataSet_OnlyRI_sentences pmid: {}".format(pmid)) | ||
205 | + else: | ||
206 | + pmid = file[:file.find("_")] | ||
207 | + # print("pmid: {}".format(pmid)) | ||
208 | + numsent = file[file.find("_")+1:file.find("-")] | ||
209 | + numsent = numsent.replace(".al", "") | ||
210 | + if pmid.find(".att.") > -1: | ||
211 | +                # CFMC 2022-03-11: Fix error in pmid | ||
212 | + # CFMC 2022-03-11 Original: pmid = pmid.replace(".att.", "") | ||
213 | + pmid = regex_att_auto.sub("", pmid) | ||
214 | + numsent = file[file.find("att.")+4:] | ||
215 | + type_sent = "att" | ||
216 | + if pmid.find(".auto.") > -1: | ||
217 | +                # CFMC 2022-03-11: Fix error in pmid | ||
218 | + # CFMC 2022-03-11 Original: pmid = pmid.replace(".auto.", "") | ||
219 | + pmid = regex_att_auto.sub("", pmid) | ||
220 | + numsent = file[file.find("auto.") + 5:] | ||
221 | + type_sent = "auto" | ||
222 | + # numsent = file[file.find("_"):file.find("-")] | ||
223 | + # print("pmid {}".format(pmid)) | ||
224 | + # print("numsent: {}".format(numsent)) | ||
225 | + | ||
226 | + sentenceFile = file[:file.find("-", file.find("_"))] + ".txt" | ||
227 | + hashEntitiesGenes = {} | ||
228 | + hashEntitiesTUs = {} | ||
229 | + hashEntitiesTFs = {} | ||
230 | + hashEntitiesEffects = {} | ||
231 | + hashOriginalEffect = {} | ||
232 | + regex_fix_regulator = re.compile(r'(Regulated|Binds|Bind|deverbal_effect|Regulate)') | ||
233 | + regex_fix_repressor = re.compile(r'(Repressing|Represses)') | ||
234 | + with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File: | ||
235 | + for line in a1File: | ||
236 | + line = line.strip('\n') | ||
237 | + listLine1 = line.split('\t') | ||
238 | + listLine2 = listLine1[1].split(' ') | ||
239 | + entity = listLine2[0] | ||
240 | + entity_type = listLine2[0] | ||
241 | + idEntity = listLine1[0] | ||
242 | + originalEffect = listLine1[2] | ||
243 | + if entity.startswith("EFFECT"): | ||
244 | + entity = entity[entity.find(".") + 1:] | ||
245 | + # print("Entity: {}".format(entity)) | ||
246 | + if pmid.find("_dev") > -1: | ||
247 | + type_sent = "dev" | ||
248 | + entity = entity.replace("_dev", "") | ||
249 | + # print("Entity without _dev: {}".format(entity)) | ||
250 | + if idEntity not in hashOriginalEffect: | ||
251 | + hashOriginalEffect[idEntity] = originalEffect | ||
252 | + if idEntity not in hashEntitiesEffects: | ||
253 | +                        # We fix some wrong effects here for the TRN, but this must also be fixed in the upstream script where the error is produced | ||
254 | + if regex_fix_regulator.match(entity): | ||
255 | + print("WARNING EFFECT: {}".format(entity)) | ||
256 | + entity = regex_fix_regulator.sub("regulator", entity) | ||
257 | + print("WARNING EFFECT after: {}".format(entity)) | ||
258 | + if regex_fix_repressor.match(entity): | ||
259 | + print("WARNING EFFECT: {}".format(entity)) | ||
260 | + entity = regex_fix_repressor.sub("repressor", entity) | ||
261 | + print("WARNING EFFECT after: {}".format(entity)) | ||
262 | + hashEntitiesEffects[idEntity] = entity | ||
263 | + else: | ||
264 | + entity = listLine1[2] | ||
265 | + if entity_type == "GENE": | ||
266 | + if idEntity not in hashEntitiesGenes: | ||
267 | + hashEntitiesGenes[idEntity] = entity | ||
268 | + elif entity_type == "TU": | ||
269 | + if idEntity not in hashEntitiesTUs: | ||
270 | + hashEntitiesTUs[idEntity] = entity | ||
271 | + elif entity_type == "TF": | ||
272 | + if idEntity not in hashEntitiesTFs: | ||
273 | + hashEntitiesTFs[idEntity] = entity | ||
274 | + | ||
275 | + # print("hashEntities: {}".format(hashEntitiesGenes)) | ||
276 | + # print("hashEntities: {}".format(hashEntitiesTUs)) | ||
277 | + # print("hashEntities: {}".format(hashEntitiesTFs)) | ||
278 | + | ||
279 | + with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File: | ||
280 | + sentence = '' | ||
281 | + with open(os.path.join(options.predictedPath, file + ".txt"), mode="r") as txtFile: | ||
282 | + sentence = txtFile.read() | ||
283 | + listTokens = [token.split('|')[0] for token in sentence.split()] | ||
284 | + sentence = ' '.join(listTokens) | ||
285 | + | ||
286 | + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence | ||
287 | + # Open jsonpdf file | ||
288 | + hash_sections = {} | ||
289 | + sentences = {} | ||
290 | + print('Loading jsonpdf file...') | ||
291 | + with open(os.path.join(options.jsonpdfPath, pmid + ".jsonpdf"), "r", encoding="utf-8", errors="replace") as jsonpdfFile: | ||
292 | + text_file = jsonpdfFile.read() | ||
293 | + if file.startswith("26781240"): | ||
294 | + text_file = text_file.replace(" \\ ", " \\\\ ") | ||
295 | + elif file.startswith("26249345"): | ||
296 | + text_file = text_file.replace('}], ', '}],"sections": {}') | ||
297 | + try: | ||
298 | + hash_jsonpdf = json.loads(text_file) | ||
299 | +                print(' Loading jsonpdf file... done!') | ||
300 | + except Exception as e: | ||
301 | + print(e) | ||
302 | + print(" Loading jsonpdf file failed: {}".format(file)) | ||
303 | + hash_sections = hash_jsonpdf["sections"] | ||
304 | + # print("Sections: {}".format(hash_sections)) | ||
305 | + sentences = hash_jsonpdf["sentences"] | ||
306 | + # Open tsv file | ||
307 | + print('Loading tsv file...') | ||
308 | + file_tsv = pmid + ".pre.fil.tsv" | ||
309 | + tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv)) | ||
310 | + # print("tsv_file.shape: {}".format(tsv_file.shape)) | ||
311 | + tsv_file_filtered = tsv_file[tsv_file['status'] == 1] | ||
312 | + # print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape)) | ||
313 | + tsv_file_new = tsv_file_filtered.reset_index(drop=True) | ||
314 | + # print(tsv_file_new.head(10)) | ||
315 | + print(' Loading tsv file... done!') | ||
316 | + numsent_int = int(numsent) | ||
317 | + original_sentence = tsv_file_new.at[numsent_int, 'sentence'] | ||
318 | + section_num = tsv_file_new.at[numsent_int, 'section'] | ||
319 | + # print("type(section_num): {}".format(type(section_num))) | ||
320 | + original_idsentence = tsv_file_new.at[numsent_int, 'idsentence'] | ||
321 | + section_num_str = str(section_num) | ||
322 | + if section_num_str in hash_sections: | ||
323 | + section_name = hash_sections[section_num_str] | ||
324 | + else: | ||
325 | + section_name = "Unknown" | ||
326 | + | ||
327 | + for line in a2File: | ||
328 | + # print("Line a2: {}".format(line)) | ||
329 | + # R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4 | ||
330 | + line = line.strip('\n') | ||
331 | + listLine1 = line.split('\t') | ||
332 | + listLine2 = listLine1[1].split(' ') | ||
333 | + regulator = listLine2[2] | ||
334 | + regulator = regulator[regulator.find(":") + 1:] | ||
335 | + regulated = listLine2[1] | ||
336 | + regulated = regulated[regulated.find(":") + 1:] | ||
337 | + effect = listLine2[0] | ||
338 | + effect = effect[effect.find(".") + 1:] | ||
339 | + | ||
340 | + tf = hashEntitiesTFs[regulator] | ||
341 | + if tf.endswith("ed"): | ||
342 | + tf = tf[:tf.find("-")] | ||
343 | + #else: | ||
344 | +                    # Clean TF names by expressions seen in TRN output file | ||
345 | + tf = re.sub(r"(/absence|controlle|activation|‐regulate|‐mediate|mediate|-regulate|regulate|ˉ|-like|-mutant|-type|-independent|-dependent|dependent|-dependant|-binding|-and|-family|-bound|-deficient|-indepen-dent|-inducing|-green|-overproducing|-or|-depletion|-repressible|-dual|-box)", "", tf) | ||
346 | + # Clean false TF names - 2329 | ||
347 | + result = re.match(r"(cyclic|RHONDA|Crawford|Hulett|Rhodobacter|Danino|Huang|Neisseria|Huang|HUGHES1|Robbe-Saule|Danchin|Roberts|Furer|Hunter|Furue|Humphreys|Nacional)", tf) | ||
348 | + if result: | ||
349 | + break | ||
350 | +                    # Map TF synonym to its standard name | ||
351 | + tf = get_standard_name(tf) | ||
352 | + | ||
353 | + # print("numsent: {}".format(numsent)) | ||
354 | + # For L&C do not increment 1 | ||
355 | + # CFMC 2022-03-11 Original: numsent_int = int(numsent) | ||
356 | + | ||
357 | + if regulated in hashEntitiesGenes: | ||
358 | + type_regulated = "Gene" | ||
359 | + llave = "{}\t{}\t{}\t{}".format(tf, "gene", hashEntitiesGenes[regulated], | ||
360 | + hashEntitiesEffects[effect]) | ||
361 | + elif regulated in hashEntitiesTUs: | ||
362 | + type_regulated ="TU" | ||
363 | + llave = "{}\t{}\t{}\t{}".format(tf, "TU", hashEntitiesTUs[regulated], | ||
364 | + hashEntitiesEffects[effect]) | ||
365 | + else: | ||
366 | + print("ERROR: Regulated did not found!") | ||
367 | + # Clean false cases | ||
368 | + if llave.startswith("Hu"): | ||
369 | + break | ||
370 | + | ||
371 | + if llave in hashPredictedRIs: | ||
372 | + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence | ||
373 | +                    hashPredictedRIs[llave].append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")) | ||
374 | + hashPredictedRIsCount[llave] += 1 | ||
375 | + if type_sent == "ver/dev": | ||
376 | + # if llave in hashPredictedRIsCountVer: | ||
377 | + hashPredictedRIsCountVer[llave] += 1 | ||
378 | + # else: | ||
379 | + # hashPredictedRIsCountVer[llave] = 1 | ||
380 | + elif type_sent == "dev": | ||
381 | + # if llave in hashPredictedRIsCountVer: | ||
382 | + hashPredictedRIsCountDev[llave] += 1 | ||
383 | + # else: | ||
384 | + # hashPredictedRIsCountDev[llave] = 1 | ||
385 | + elif type_sent == "att": | ||
386 | + # if llave in hashPredictedRIsCountVer: | ||
387 | + hashPredictedRIsCountAtt[llave] += 1 | ||
388 | + # else: | ||
389 | + # hashPredictedRIsCountAtt[llave] = 1 | ||
390 | + elif type_sent == "auto": | ||
391 | + # if llave in hashPredictedRIsCountVer: | ||
392 | + hashPredictedRIsCountAuto[llave] += 1 | ||
393 | + # else: | ||
394 | + # hashPredictedRIsCountAuto[llave] = 1 | ||
395 | + else: | ||
396 | + # CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence | ||
397 | +                    hashPredictedRIs[llave] = ["{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")] | ||
398 | + hashPredictedRIsCount[llave] = 1 | ||
399 | + hashPredictedRIsCountVer[llave] = 0 | ||
400 | + hashPredictedRIsCountDev[llave] = 0 | ||
401 | + hashPredictedRIsCountAtt[llave] = 0 | ||
402 | + hashPredictedRIsCountAuto[llave] = 0 | ||
403 | + if type_sent == "ver/dev": | ||
404 | + hashPredictedRIsCountVer[llave] = 1 | ||
405 | + elif type_sent == "dev": | ||
406 | + hashPredictedRIsCountDev[llave] = 1 | ||
407 | + elif type_sent == "att": | ||
408 | + hashPredictedRIsCountAtt[llave] = 1 | ||
409 | + elif type_sent == "auto": | ||
410 | + hashPredictedRIsCountAuto[llave] = 1 | ||
411 | + | ||
412 | + id_ri += 1 | ||
413 | + processedFiles += 1 | ||
414 | + | ||
415 | + print("Processed files: {}".format(processedFiles)) | ||
416 | + with open(os.path.join(options.outputPath, options.outputFile + ".summary.tsv"), mode="w") as oFile: | ||
417 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
418 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n") | ||
419 | + for k,v in hashPredictedRIs.items(): | ||
420 | + oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], | ||
421 | + hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], "1", "True")) | ||
422 | + #oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], hashPredictedRIsCountDev[k], hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], v)) | ||
423 | + with open(os.path.join(options.outputPath, options.outputFile + ".detail.tsv"), mode="w") as oFile: | ||
424 | + # oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n") | ||
425 | + oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tConfirmationLevel\n") | ||
426 | + for k,v in hashPredictedRIs.items(): | ||
427 | + for s in v: | ||
428 | + oFile.write("{}\t{}\n".format(k, s)) | ||
429 | + |
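For reference, get-TRN-v2.py consumes standoff (.a1/.a2) pairs line by line: tabs separate the id, the annotation, and the text; spaces separate the annotation fields. A self-contained sketch with invented example lines (for complete RIs, the suffix after `Interaction.` is the id of an EFFECT entity in the .a1 file):

```python
a1_line = "T1\tTF 0 13\tArgP-regulated"
id_entity, annotation, text = a1_line.split('\t')
entity_type, start, end = annotation.split(' ')     # 'TF', '0', '13'

a2_line = "R1\tInteraction.T3 Target:T2 Agent:T1"
id_ri, annotation = a2_line.split('\t')
effect_ref, target, agent = annotation.split(' ')
effect_id = effect_ref[effect_ref.find('.') + 1:]   # 'T3'
regulated_id = target[target.find(':') + 1:]        # 'T2'
regulator_id = agent[agent.find(':') + 1:]          # 'T1'
```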
predicted-ris-gcs/complete-ris/.gitignore
0 → 100644
1 | + |
predicted-ris-gcs/incomplete-ris/.gitignore
0 → 100644
1 | + |
ri-attributive-extraction-v02.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +from optparse import OptionParser | ||
3 | +import sys | ||
4 | +import os | ||
5 | +import json | ||
6 | +import operator | ||
7 | +import re | ||
8 | +from nltk.corpus import words | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | + | ||
13 | +# Objective: obtain predicted ris from attributive sentences, such as ArgP-regulated gene argP | ||
14 | +# Input format: transformed format (word|lemma|tag tokens). | ||
15 | +# WARNING: Only one sentence per line | ||
16 | + | ||
17 | +# Parameters: | ||
18 | +# 1) --inputPath Input path | ||
19 | +# 2) --inputFile Input file | ||
20 | +# 3) --outputPath Output path | ||
21 | +# 4) --diccPath Dictionary path | ||
22 | +# 5) --diccEffect File with normalized effects | ||
23 | + | ||
24 | +# Unused: --diccFile JSON file with entity dictionaries | ||
25 | +# Unused: --diccEPAth Path to read normalized effects | ||
26 | +# Unused: --format Output format: standoff, tabs | ||
27 | + | ||
28 | +# Output: | ||
29 | +# 1) File with predicted ris combined with existing files. | ||
30 | +# Format standoff: | ||
31 | +# T1 TF 0 0 ArgP-regulated | ||
32 | +# T2 GENE 0 0 argP | ||
33 | +# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors | ||
34 | +# R1 Interaction.activator Target:T3 Agent:T1 | ||
35 | + | ||
36 | +# Execution | ||
37 | +# C:\anaconda3\python ri-attributive-extraction.py | ||
38 | +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences | ||
39 | +# --inputFile ris-sentences-analysis.att.017.txt | ||
40 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs | ||
41 | +# --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources | ||
42 | +# --diccEffect normalized_Effects.json | ||
43 | +# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json | ||
44 | +# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.286.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json | ||
45 | + | ||
46 | +# python3 ri-attributive-extraction.py | ||
47 | +# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences | ||
48 | +# --inputFile ris-sentences-analysis.att.017.txt | ||
49 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs | ||
50 | +# --diccPath /home/cmendezc/terminologicalResources | ||
51 | +# --diccEffect normalized_Effects.json | ||
52 | +# python3 ri-attributive-extraction.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json | ||
53 | + | ||
54 | +########################################################### | ||
55 | +# MAIN PROGRAM # | ||
56 | +########################################################### | ||
57 | + | ||
58 | +def getPosWord(wordPos, endPos, text, termList): | ||
59 | + offsetStart = 0 | ||
60 | + wordNum = 0 | ||
61 | + listText = text.split() | ||
62 | + for w in listText: | ||
63 | + # if filenameBefore.find('000-2') > -1: | ||
64 | + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos)) | ||
65 | + if wordNum >= int(wordPos): | ||
66 | + # for tok in word.split(): | ||
67 | + for t in termList: | ||
68 | + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords): | ||
69 | + if w == t: | ||
70 | + return [w, offsetStart, offsetStart + len(w) - 1] | ||
71 | + #else: | ||
72 | + wordNum += 1 | ||
73 | + offsetStart += len(w) + 1 | ||
74 | + if wordNum > int(endPos): | ||
75 | + return None | ||
76 | + return None | ||
77 | + | ||
78 | +def getIdEntity(aList, etype, idE): | ||
79 | + entity = aList[0] | ||
80 | + if etype == "EFFECT": | ||
81 | + normalizedEffect = entity | ||
82 | + #print("EFFECT: {}".format(entity)) | ||
83 | + if entity in hashNormalizedEffects: | ||
84 | + normalizedEffect = hashNormalizedEffects[entity] | ||
85 | + etype += "." + normalizedEffect | ||
86 | + #print("etype: {}".format(etype)) | ||
87 | + entityPosStart = aList[1] | ||
88 | + entityPosEnd = aList[2] | ||
89 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
90 | + #print("keyEntity: {}".format(keyEntity)) | ||
91 | + if keyEntity not in hashEntities: | ||
92 | + idE += 1 | ||
93 | + idEntity = "T{}".format(idE) | ||
94 | + hashEntities[keyEntity] = idEntity | ||
95 | + #print("New entity {}: {}".format(idEntity, keyEntity)) | ||
96 | + return idEntity, idE | ||
97 | + else: | ||
98 | + idEntity = hashEntities[keyEntity] | ||
99 | + return idEntity, idE | ||
100 | + | ||
101 | +def getIdInteraction(regulator, regulated, effect, idI, hashInt): | ||
102 | + #print("hashInt: {}".format(hashInt)) | ||
103 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
104 | + if keyInteraction not in hashInt: | ||
105 | + idI += 1 | ||
106 | + idInteraction = "R{}".format(idI) | ||
107 | + hashInt[keyInteraction] = idInteraction | ||
108 | + #print("New interaction {}: {}".format(idInteraction, keyInteraction)) | ||
109 | + #return idInteraction, idI | ||
110 | + else: | ||
111 | + idInteraction = hashInt[keyInteraction] | ||
112 | + return idInteraction, idI | ||
113 | + | ||
114 | +def saveFiles(filename, hashE, hashI, s, effect): | ||
115 | + if effect: | ||
116 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
117 | + else: | ||
118 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
119 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File: | ||
120 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File: | ||
121 | + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)): | ||
122 | + aList = k.split() | ||
123 | + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3])) | ||
124 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File: | ||
125 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File: | ||
126 | + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)): | ||
127 | + aList = k.split() | ||
128 | + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0])) | ||
129 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile: | ||
130 | + txtFile.write(s) | ||
131 | + | ||
132 | +def loadFileEntities(filename, outputPath, hashTemp): | ||
133 | + #print("Start loadFileEntities") | ||
134 | + idE = 1 | ||
135 | + try: | ||
136 | + f = filename[:filename.rfind(".")] + ".a1" | ||
137 | + # print("file entities: {}".format(f)) | ||
138 | + with open(os.path.join(outputPath, f), mode="r") as a1File: | ||
139 | + for line in a1File: | ||
140 | + line = line.strip('\n') | ||
141 | + listLine1 = line.split('\t') | ||
142 | + listLine2 = listLine1[1].split(' ') | ||
143 | + etype = listLine2[0] | ||
144 | + entityPosStart = listLine2[1] | ||
145 | + entityPosEnd = listLine2[2] | ||
146 | + entity = listLine1[2] | ||
147 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
148 | + idEntity = listLine1[0] | ||
149 | + if keyEntity not in hashTemp: | ||
150 | + hashTemp[keyEntity] = idEntity | ||
151 | + if int(idEntity[1:]) > idE: | ||
152 | + idE = int(idEntity[1:]) | ||
153 | + except IOError: | ||
154 | + print("IOError file: {}".format(os.path.join(outputPath, f))) | ||
155 | + # idE = 1 | ||
156 | + return idE | ||
157 | + | ||
158 | +def loadFileInteractions(filename, outputPath, hashTemp): | ||
159 | + #print("Start loadFileInteractions") | ||
160 | + idI = 1 | ||
161 | + try: | ||
162 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File: | ||
163 | + for line in a2File: | ||
164 | + #print("Line a2: {}".format(line)) | ||
165 | + line = line.strip('\n') | ||
166 | + listLine1 = line.split('\t') | ||
167 | + listLine2 = listLine1[1].split(' ') | ||
168 | + regulator = listLine2[2] | ||
169 | + regulator = regulator[regulator.find(":") + 1:] | ||
170 | + regulated = listLine2[1] | ||
171 | + regulated = regulated[regulated.find(":") + 1:] | ||
172 | + effect = listLine2[0] | ||
173 | + effect = effect[effect.find(".") + 1:] | ||
174 | + idInteraction = listLine1[0] | ||
175 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
176 | + if keyInteraction not in hashTemp: | ||
177 | + hashTemp[keyInteraction] = idInteraction | ||
178 | + if int(idInteraction[1:]) > idI: | ||
179 | + idI = int(idInteraction[1:]) | ||
180 | + except IOError: | ||
181 | + print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"))) | ||
182 | + # idI = 1 | ||
183 | + return idI | ||
184 | + | ||
185 | +def getRealPos(posStart, posEnd, lin):  # pass-through: positions are already relative to the line | ||
186 | + return (posStart, posEnd) | ||
187 | + | ||
188 | +def getRI(r, l): | ||
189 | + regulator = r.group('regulator') | ||
190 | + regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l) | ||
191 | + # regulatorStart = getRealPos(r.start('regulator'), l) | ||
192 | + # regulatorEnd = getRealPos(r.end('regulator'), l) | ||
193 | + regulated = r.group('regulated') | ||
194 | + regulatedPos = getRealPos(r.start('regulated'), r.end('regulated'), l) | ||
195 | + # regulatedStart = getRealPos(r.start('regulated'), l) | ||
196 | + # regulatedEnd = getRealPos(r.end('regulated'), l) | ||
197 | + effect = r.group('effect') | ||
198 | + effectPos = getRealPos(r.start('effect'), r.end('effect'), l) | ||
199 | + # effectStart = getRealPos(r.start('effect'), l) | ||
200 | + # effectEnd = getRealPos(r.end('effect'), l) | ||
201 | + #print("Regulator {}, start {}, end {}".format(regulator, regulatorPos[0], regulatorPos[1])) | ||
202 | + #print("Regulated {}, start {}, end {}".format(regulated, regulatedPos[0], regulatedPos[1])) | ||
203 | + #print("Effect {}, start {}, end {}".format(effect, effectPos[0], effectPos[1])) | ||
204 | + return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]), | ||
205 | + regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]), | ||
206 | + effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l] | ||
207 | + | ||
208 | +if __name__ == "__main__": | ||
209 | + # Parameter definition | ||
210 | + # python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py | ||
211 | + # --inputPath $(dirname ${file}) | ||
212 | + # --inputFile $(basename ${file}) | ||
213 | + # --outputPath $OUTPUT_PATH | ||
214 | + # --diccPath $DICC_PATH | ||
215 | + # --diccEffect normalized_Effects.json | ||
216 | + parser = OptionParser() | ||
217 | + parser.add_option("--inputPath", dest="inputPath", | ||
218 | + help="Input path", metavar="PATH") | ||
219 | + parser.add_option("--inputFile", dest="inputFile", | ||
220 | + help="Input file", metavar="FILE") | ||
221 | + parser.add_option("--outputPath", dest="outputPath", | ||
222 | + help="Output path", metavar="PATH") | ||
223 | + parser.add_option("--diccPath", dest="diccPath", | ||
224 | + help="Path to read dictionaries", metavar="PATH") | ||
225 | + # parser.add_option("--diccFile", dest="diccFile", | ||
226 | + # help="JSON file with entity dictionaries", metavar="FILE") | ||
227 | + parser.add_option("--diccEffect", dest="diccEffect", | ||
228 | + help="File with normalized effects", metavar="FILE") | ||
229 | + | ||
230 | + # parser.add_option("--format", dest="format", | ||
231 | + # help="Output format: standoff", metavar="TEXT") | ||
232 | + # parser.add_option("--diccEPAth", dest="diccEPAth", | ||
233 | + # help="File with normalized effects", metavar="FILE") | ||
234 | + | ||
235 | + (options, args) = parser.parse_args() | ||
236 | + #if len(args) > 0: | ||
237 | + # parser.error("None parameter entered.") | ||
238 | + # sys.exit(1) | ||
239 | + | ||
240 | + # Printing parameter values | ||
241 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
242 | + print("Input path: " + str(options.inputPath)) | ||
243 | + print("Input file: " + str(options.inputFile)) | ||
244 | + print("Output path: " + str(options.outputPath)) | ||
245 | + print("Path to read dictionaries: " + str(options.diccPath)) | ||
246 | + # print("JSON file with entity dictionaries: " + str(options.diccFile)) | ||
247 | + print("File with normalized effects: " + str(options.diccEffect)) | ||
248 | + # print("Output format: " + str(options.format)) | ||
249 | + # print("Path to read normalized effects: " + str(options.diccEPAth)) | ||
250 | + | ||
251 | + # regularWords = words.words('en') | ||
252 | + | ||
253 | + # print('Loading dictionaries...') | ||
254 | + # with open(os.path.join(options.diccPath, options.diccFile)) as diccFile: | ||
255 | + # hashDicc = json.load(diccFile) | ||
256 | + | ||
257 | + # hashTermFiles = hashDicc["hashTermFiles"] | ||
258 | + # hashTerms = hashDicc["hashTerms"] | ||
259 | + | ||
260 | + # for key in hashTermFiles.keys(): | ||
261 | + # for f in hashTermFiles[key]: | ||
262 | + # # print('File: ' + f) | ||
263 | + # with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile: | ||
264 | + # for line in iFile: | ||
265 | + # line = line.strip('\n') | ||
266 | + # line = line.replace(' ', '-') | ||
267 | + # if line not in hashTerms[key]: | ||
268 | + # hashTerms[key].append(line) | ||
269 | + # # if options.termLower: | ||
270 | + # # hashTerms[key].append(line.lower()) | ||
271 | + # # if options.termCapitalize: | ||
272 | + # # hashTerms[key].append(line.capitalize()) | ||
273 | + # print(' Terms read {} size: {}'.format(key, len(hashTerms[key]))) | ||
274 | + | ||
275 | + # Loading normalized effects | ||
276 | + print('Loading normalized effects ending with -d...') | ||
277 | + hashNormalizedEffects = {} | ||
278 | + with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile: | ||
279 | + hashNormalizedEffects = json.load(diccFile) | ||
280 | + listEffects = [] | ||
281 | + for eff in hashNormalizedEffects.keys(): | ||
282 | + if eff.endswith('d'): | ||
283 | + listEffects.append(eff) | ||
284 | + listEffects.append("dependent") | ||
285 | + effects = "|".join(listEffects) | ||
286 | + #print("Effects: {}".format(effects)) | ||
287 | + | ||
288 | + files = {} | ||
289 | + hashEntities = {} | ||
290 | + hashInteractions = {} | ||
291 | + hashInteractionsEffect = {} | ||
292 | + idEntities = 1 | ||
293 | + idInteractions = 1 | ||
294 | + idInteractionsEffect = 1 | ||
295 | + | ||
296 | + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)\s([^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF') | ||
297 | + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+( [^ ]+)') | ||
298 | + # regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))\s([^|]+\|[^|]+\|(CC|,))?)+ ([^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF') | ||
299 | + # regexAttRILeft = re.compile(r'(?:([^|\s]+\|[^|]+\|(?:GENE|TU))\s(?:[^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF') | ||
300 | + # regexAttRILeft = re.compile(r'(?=([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)') | ||
301 | + # regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)') | ||
302 | + # regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+) ([^ ]+ )+(?P<regulator>[^|]+\|[^|]+\|TF)') | ||
303 | + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>' + r'(' + effects + ')\|[^|]+\|TF) [^|]+\|gene') | ||
304 | + | ||
305 | + # reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene') | ||
306 | + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(regulated|repressed)\|[^|]+\|TF) [^|]+\|gene') | ||
307 | + # regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ ){,5}(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene') | ||
308 | + # CMC 2018-11-07: regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene') | ||
309 | + regexAttRILeft = re.compile( | ||
310 | + r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF)') | ||
311 | + # regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ ){,5}(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))') | ||
312 | + # CMC 2018-11-07: regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ )+(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))') | ||
313 | + regexAttRIRight = re.compile( | ||
314 | + r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) ([^ ]+ )*(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))') | ||
315 | + | ||
316 | + filename = options.inputFile | ||
317 | + hashEntities = {} | ||
318 | + hashInteractions = {} | ||
319 | + hashInteractionsEffect = {} | ||
320 | + idEntities = 1 | ||
321 | + idInteractions = 1 | ||
322 | + idInteractionsEffect = 1 | ||
323 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
324 | + idEntities = loadFileEntities(filename, outputPath, hashEntities) | ||
325 | + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect) | ||
326 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
327 | + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions) | ||
328 | + | ||
329 | + listRIs = [] | ||
330 | + | ||
331 | + with open(os.path.join(options.inputPath, options.inputFile)) as iFile: | ||
332 | + for line in iFile: | ||
333 | + line = line.rstrip('\n') | ||
334 | +            # Search to the left (regulated ... regulator) | ||
335 | +            #print("Searching <<") | ||
336 | + result = regexAttRILeft.search(line) | ||
337 | + #print("result: {}".format(result)) | ||
338 | + lineTemp = line | ||
339 | + # print("lineTemp: {}".format(lineTemp)) | ||
340 | + while result: | ||
341 | + #print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect'))) | ||
342 | + listRIs.append(getRI(result, line)) | ||
343 | + #print("listRIs: {}".format(listRIs)) | ||
344 | + lineTemp = lineTemp.replace(result.group('regulated'), '') | ||
345 | + #print("lineTemp for: {}".format(lineTemp)) | ||
346 | + result = regexAttRILeft.search(lineTemp) | ||
347 | + #print("result: {}".format(result)) | ||
348 | + | ||
349 | +            # Search to the right (regulator ... regulated) | ||
350 | +            #print("Searching >>") | ||
351 | + result = regexAttRIRight.search(line) | ||
352 | + #print("result: {}".format(result)) | ||
353 | + lineTemp = line | ||
354 | + # print("lineTemp: {}".format(lineTemp)) | ||
355 | + while result: | ||
356 | + #print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect'))) | ||
357 | + listRIs.append(getRI(result, line)) | ||
358 | + #print("listRIs: {}".format(listRIs)) | ||
359 | + lineTemp = lineTemp.replace(result.group('regulated'), '') | ||
360 | + #print("lineTemp for: {}".format(lineTemp)) | ||
361 | + result = regexAttRIRight.search(lineTemp) | ||
362 | + #print("result: {}".format(result)) | ||
363 | + | ||
364 | + # result = regexAttRIRight.finditer(line) | ||
365 | + # lineTemp = line | ||
366 | + # while result: | ||
367 | + # listRIs.append(getRI(result, line)) | ||
368 | + # lineTemp = lineTemp.replace(result.group('regulated'), '') | ||
369 | + # result = regexAttRIRight.finditer(lineTemp) | ||
370 | + | ||
371 | + # return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]), | ||
372 | + # regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]), | ||
373 | + # effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l] | ||
374 | + for ri in listRIs: | ||
375 | + #print("ri: {}".format(ri)) | ||
376 | + if len(ri) != 4: | ||
377 | +            print("WARNING! Malformed RI (expected 4 elements)") | ||
378 | + exit() | ||
379 | + regulator = ri[0] | ||
380 | + regulated = ri[1] | ||
381 | + effect = ri[2] | ||
382 | + line = ri[3] | ||
383 | + | ||
384 | + listElem = regulator.split('|') | ||
385 | + regulatorWord = listElem[0] | ||
386 | + regulatorType = listElem[2] | ||
387 | + regulatorStart = listElem[3] | ||
388 | + regulatorEnd = listElem[4] | ||
389 | + | ||
390 | + listElem = regulated.split('|') | ||
391 | + regulatedWord = listElem[0] | ||
392 | + regulatedType = listElem[2] | ||
393 | + regulatedStart = listElem[3] | ||
394 | + regulatedEnd = listElem[4] | ||
395 | + | ||
396 | + listElem = effect.split('|') | ||
397 | + effectWord = listElem[0] | ||
398 | + effectType = "EFFECT" | ||
399 | + effectStart = listElem[1] | ||
400 | + effectEnd = listElem[2] | ||
401 | + | ||
402 | + idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities) | ||
403 | + if regulatedType == "GENE": | ||
404 | + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities) | ||
405 | + elif regulatedType == "TU": | ||
406 | + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "TU", idEntities) | ||
407 | + else: | ||
408 | + print("WARNING! Unknown entity type") | ||
409 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", | ||
410 | + idInteractions, hashInteractions) | ||
411 | + idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities) | ||
412 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, | ||
413 | + idInteractionsEffect, | ||
414 | + hashInteractionsEffect) | ||
415 | + | ||
416 | + saveFiles(filename, hashEntities, hashInteractions, line, effect=False) | ||
417 | + saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True) |
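418 | + | ||
419 | +# Usage sketch (illustration only, kept commented out so the script's behavior is unchanged): | ||
420 | +# how the left-attributive pattern matches a transformed sentence. The 'effects' | ||
421 | +# alternation here is a hypothetical three-item stand-in for the one built from | ||
422 | +# the normalized-effects dictionary. | ||
423 | +# import re | ||
424 | +# effects = "regulated|repressed|activated" | ||
425 | +# regexDemo = re.compile( | ||
426 | +#     r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+' | ||
427 | +#     r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + r'))\|[^|]+\|TF)') | ||
428 | +# line = "argO|argO|GENE is|be|VBZ an|a|DT ArgP-regulated|ArgP-regulated|TF gene|gene|NN" | ||
429 | +# m = regexDemo.search(line) | ||
430 | +# # Expected groups: regulated='argO|argO|GENE', regulator='ArgP-regulated|ArgP-regulated|TF', effect='regulated' | ||
431 | +# print(m.group('regulated'), m.group('regulator'), m.group('effect')) |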
ri-autoregulation-extraction-v01.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +from optparse import OptionParser | ||
3 | +import sys | ||
4 | +import os | ||
5 | +import json | ||
6 | +import operator | ||
7 | +import re | ||
8 | +from general_functions import getTypeRegulation | ||
9 | +from nltk.corpus import words | ||
10 | + | ||
11 | +__author__ = 'CMendezC' | ||
12 | + | ||
13 | + | ||
14 | +# Objective: obtain predicted ris from autoregulation sentences, | ||
15 | +# such as ArgP protein represses its own synthesis | ||
16 | +# Input format: transformed format. | ||
17 | +# WARNING: Only one sentence per line | ||
18 | + | ||
19 | +# Parameters: | ||
20 | +#  1) --inputPath Input path | ||
21 | +#  2) --inputFile Input file | ||
22 | +#  3) --outputPath Output path | ||
23 | +#  4) --diccPath Dictionary path | ||
24 | +#  5) --diccEffect File with normalized effects | ||
25 | + | ||
26 | +# Options not used by this script (kept for reference): | ||
27 | +#  --diccFile JSON file with entity dictionaries; --diccEPAth Path for diccEffect | ||
28 | +#  --format Output format: standoff, tabs | ||
29 | + | ||
30 | +# Output: | ||
31 | +# 1) File with predicted RIs combined with existing files. | ||
32 | +# Format standoff: | ||
33 | +# T1	TF 0 0 ArgP | ||
34 | +# T2	GENE 0 0 argP (the TF name ArgP is mapped to the gene name argP) | ||
35 | +# R1	Interaction.activator Target:T2 Agent:T1 | ||
36 | +# Sentence ArgP protein represses its own synthesis | ||
37 | +# The FimZ transcription factor activates this promoter directly , | ||
38 | +# and it also positively regulates the transcription of its own gene | ||
39 | +# FimZ is known to regulate the expression of its own gene positively | ||
40 | +# FimZ also positively regulates its own transcription | ||
41 | +# ArgP protein represses its own synthesis | ||
42 | +# ArgP both represses its own transcription | ||
43 | +# ArgP protein represses its own synthesis | ||
44 | +# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT | ||
45 | +# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN | ||
46 | + | ||
47 | +# Execution | ||
48 | +# python3 ri-autoregulation-extraction-v01.py | ||
49 | +# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences | ||
50 | +# --inputFile dataSet_OnlyRI_sentences.auto.1017.txt | ||
51 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs | ||
52 | +# --diccPath /home/cmendezc/terminologicalResources | ||
53 | +# --diccEffect normalized_Effects.json | ||
54 | +# python3 ri-autoregulation-extraction-v01.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences --inputFile dataSet_OnlyRI_sentences.auto.1017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json | ||
55 | + | ||
56 | +########################################################### | ||
57 | +# MAIN PROGRAM # | ||
58 | +########################################################### | ||
59 | + | ||
60 | +def getPosWord(wordPos, endPos, text, termList): | ||
61 | + offsetStart = 0 | ||
62 | + wordNum = 0 | ||
63 | + listText = text.split() | ||
64 | + for w in listText: | ||
65 | + # if filenameBefore.find('000-2') > -1: | ||
66 | + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos)) | ||
67 | + if wordNum >= int(wordPos): | ||
68 | + # for tok in word.split(): | ||
69 | + for t in termList: | ||
70 | + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords): | ||
71 | + if w == t: | ||
72 | + return [w, offsetStart, offsetStart + len(w) - 1] | ||
73 | + #else: | ||
74 | + wordNum += 1 | ||
75 | + offsetStart += len(w) + 1 | ||
76 | + if wordNum > int(endPos): | ||
77 | + return None | ||
78 | + return None | ||
79 | + | ||
80 | +def getIdEntity(aList, etype, idE): | ||
81 | + entity = aList[0] | ||
82 | + if etype == "EFFECT": | ||
83 | + normalizedEffect = entity | ||
84 | + #print("EFFECT: {}".format(entity)) | ||
85 | + if entity in hashNormalizedEffects: | ||
86 | + normalizedEffect = hashNormalizedEffects[entity] | ||
87 | + etype += "." + normalizedEffect | ||
88 | + #print("etype: {}".format(etype)) | ||
89 | + entityPosStart = aList[1] | ||
90 | + entityPosEnd = aList[2] | ||
91 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
92 | + #print("keyEntity: {}".format(keyEntity)) | ||
93 | + if keyEntity not in hashEntities: | ||
94 | + idE += 1 | ||
95 | + idEntity = "T{}".format(idE) | ||
96 | + hashEntities[keyEntity] = idEntity | ||
97 | + #print("New entity {}: {}".format(idEntity, keyEntity)) | ||
98 | + return idEntity, idE | ||
99 | + else: | ||
100 | + idEntity = hashEntities[keyEntity] | ||
101 | + return idEntity, idE | ||
102 | + | ||
103 | +def getIdInteraction(regulator, regulated, effect, idI, hashInt): | ||
104 | + #print("hashInt: {}".format(hashInt)) | ||
105 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
106 | + if keyInteraction not in hashInt: | ||
107 | + idI += 1 | ||
108 | + idInteraction = "R{}".format(idI) | ||
109 | + hashInt[keyInteraction] = idInteraction | ||
110 | + #print("New interaction {}: {}".format(idInteraction, keyInteraction)) | ||
111 | + #return idInteraction, idI | ||
112 | + else: | ||
113 | + idInteraction = hashInt[keyInteraction] | ||
114 | + return idInteraction, idI | ||
115 | + | ||
116 | +def saveFiles(filename, hashE, hashI, s, effect): | ||
117 | + if effect: | ||
118 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
119 | + else: | ||
120 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
121 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File: | ||
122 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File: | ||
123 | + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)): | ||
124 | + aList = k.split() | ||
125 | + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3])) | ||
126 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File: | ||
127 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File: | ||
128 | + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)): | ||
129 | + aList = k.split() | ||
130 | + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0])) | ||
131 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile: | ||
132 | + txtFile.write(s) | ||
133 | + | ||
134 | +def loadFileEntities(filename, outputPath, hashTemp): | ||
135 | + #print("Start loadFileEntities") | ||
136 | + idE = 1 | ||
137 | + try: | ||
138 | + f = filename[:filename.rfind(".")] + ".a1" | ||
139 | + # print("file entities: {}".format(f)) | ||
140 | + with open(os.path.join(outputPath, f), mode="r") as a1File: | ||
141 | + for line in a1File: | ||
142 | + line = line.strip('\n') | ||
143 | + listLine1 = line.split('\t') | ||
144 | + listLine2 = listLine1[1].split(' ') | ||
145 | + etype = listLine2[0] | ||
146 | + entityPosStart = listLine2[1] | ||
147 | + entityPosEnd = listLine2[2] | ||
148 | + entity = listLine1[2] | ||
149 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
150 | + idEntity = listLine1[0] | ||
151 | + if keyEntity not in hashTemp: | ||
152 | + hashTemp[keyEntity] = idEntity | ||
153 | + if int(idEntity[1:]) > idE: | ||
154 | + idE = int(idEntity[1:]) | ||
155 | + except IOError: | ||
156 | + print("IOError file: {}".format(os.path.join(outputPath, f))) | ||
157 | + # idE = 1 | ||
158 | + return idE | ||
159 | + | ||
160 | +def loadFileInteractions(filename, outputPath, hashTemp): | ||
161 | + #print("Start loadFileInteractions") | ||
162 | + idI = 1 | ||
163 | + try: | ||
164 | + with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File: | ||
165 | + for line in a2File: | ||
166 | + #print("Line a2: {}".format(line)) | ||
167 | + line = line.strip('\n') | ||
168 | + listLine1 = line.split('\t') | ||
169 | + listLine2 = listLine1[1].split(' ') | ||
170 | + regulator = listLine2[2] | ||
171 | + regulator = regulator[regulator.find(":") + 1:] | ||
172 | + regulated = listLine2[1] | ||
173 | + regulated = regulated[regulated.find(":") + 1:] | ||
174 | + effect = listLine2[0] | ||
175 | + effect = effect[effect.find(".") + 1:] | ||
176 | + idInteraction = listLine1[0] | ||
177 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
178 | + if keyInteraction not in hashTemp: | ||
179 | + hashTemp[keyInteraction] = idInteraction | ||
180 | + if int(idInteraction[1:]) > idI: | ||
181 | + idI = int(idInteraction[1:]) | ||
182 | + except IOError: | ||
183 | + print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"))) | ||
184 | + # idI = 1 | ||
185 | + return idI | ||
186 | + | ||
187 | +''' | ||
188 | +def getTypeRegulation(effect_group, posini, sent, type_sent): | ||
189 | + # To change regulation effect in such as: | ||
190 | + # negative regulator --> repressor | ||
191 | + # positively regulates --> activator | ||
192 | + effect_ret = effect_group | ||
193 | + #listEff = effect_ret.split('|') | ||
194 | + | ||
195 | + if type_sent == "tra": | ||
196 | + regexTypeEffectPosi = re.compile(r'(?<=positive\|(RB|JJ) )' + effect_ret) | ||
197 | + regexTypeEffectNega = re.compile(r'(?<=negative\|(RB|JJ) )' + effect_ret) | ||
198 | + if regexTypeEffectPosi.search(sent, posini - 12): | ||
199 | +            # Probably unnecessary: effect_ret = "activator|{}|{}".format(listEff[1], listEff[2]) | ||
200 | + effect_ret = "activator" | ||
201 | + print("Change regulation effect: {}".format(sent)) | ||
202 | + elif regexTypeEffectNega.search(sent, posini - 12): | ||
203 | +            # Probably unnecessary: effect_ret = "repressor|{}|{}".format(listEff[1], listEff[2]) | ||
204 | + effect_ret = "repressor" | ||
205 | + print("Change regulation effect: {}".format(sent)) | ||
206 | + return effect_ret | ||
207 | +''' | ||
208 | + | ||
209 | +def getRealPos(posStart, posEnd, lin): | ||
210 | +    return (posStart, posEnd)  # placeholder: token offsets are passed through unchanged | ||
211 | + | ||
212 | +def getRI(r, l): | ||
213 | + regulator = r.group('regulator') | ||
214 | + regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l) | ||
215 | + # We change TF name to GENE name | ||
216 | + listRegulator = regulator.split('|') | ||
217 | + regulatorWord = listRegulator[0] | ||
218 | + regulated = regulatorWord[0].lower()+regulatorWord[1:] | ||
219 | + regulated += "|{}|GENE".format(regulated) | ||
220 | + regulatedPos = getRealPos(0, 0, l) | ||
221 | + effect = r.group('effect') | ||
222 | + # print("effect from group: {}".format(effect)) | ||
223 | + effectPos = getRealPos(r.start('effect'), r.end('effect'), l) | ||
224 | + | ||
225 | + # To change regulation effect in: | ||
226 | + # negative regulator --> repressor | ||
227 | + # positively regulates --> activator | ||
228 | + effect = getTypeRegulation(effect, r.start('effect'), l, "tra") | ||
229 | + | ||
230 | + return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]), | ||
231 | + regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]), | ||
232 | + effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l] | ||
233 | + | ||
234 | +if __name__ == "__main__": | ||
235 | + # Parameter definition | ||
236 | + parser = OptionParser() | ||
237 | + parser.add_option("--inputPath", dest="inputPath", | ||
238 | + help="Input path", metavar="PATH") | ||
239 | + parser.add_option("--inputFile", dest="inputFile", | ||
240 | + help="Input file", metavar="FILE") | ||
241 | + parser.add_option("--outputPath", dest="outputPath", | ||
242 | + help="Output path", metavar="PATH") | ||
243 | + parser.add_option("--diccPath", dest="diccPath", | ||
244 | + help="Path to read dictionaries", metavar="PATH") | ||
245 | + parser.add_option("--diccEffect", dest="diccEffect", | ||
246 | + help="File with normalized effects", metavar="FILE") | ||
247 | + | ||
248 | + (options, args) = parser.parse_args() | ||
249 | + #if len(args) > 0: | ||
250 | + # parser.error("None parameter entered.") | ||
251 | + # sys.exit(1) | ||
252 | + | ||
253 | + # Printing parameter values | ||
254 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
255 | + print("Input path: " + str(options.inputPath)) | ||
256 | + print("Input file: " + str(options.inputFile)) | ||
257 | + print("Output path: " + str(options.outputPath)) | ||
258 | + print("Path to read dictionaries: " + str(options.diccPath)) | ||
259 | + print("File with normalized effects: " + str(options.diccEffect)) | ||
260 | + | ||
261 | + # Loading normalized effects | ||
262 | + print('Loading normalized effects (all)...') | ||
263 | + hashNormalizedEffects = {} | ||
264 | + with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile: | ||
265 | + hashNormalizedEffects = json.load(diccFile) | ||
266 | + listEffects = [] | ||
267 | + for eff in hashNormalizedEffects.keys(): | ||
268 | + listEffects.append(eff) | ||
269 | + effects = "|".join(listEffects) | ||
270 | + #print("Effects: {}".format(effects)) | ||
271 | + | ||
272 | + files = {} | ||
273 | + hashEntities = {} | ||
274 | + hashInteractions = {} | ||
275 | + hashInteractionsEffect = {} | ||
276 | + idEntities = 1 | ||
277 | + idInteractions = 1 | ||
278 | + idInteractionsEffect = 1 | ||
279 | + | ||
280 | + # The FimZ transcription factor activates this promoter directly , | ||
281 | + # and it also positively regulates the transcription of its own gene | ||
282 | + # FimZ is known to regulate the expression of its own gene positively | ||
283 | + # FimZ also positively regulates its own transcription | ||
284 | + # ArgP protein represses its own synthesis | ||
285 | + # ArgP both represses its own transcription | ||
286 | + # ArgP protein represses its own synthesis | ||
287 | + # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT | ||
288 | + # of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN | ||
289 | + regexAutoRI = re.compile( | ||
290 | + # r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]\s){,4}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
291 | + r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
292 | + #r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^(TF)\s]+\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
293 | + #r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^T][^F]\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT') | ||
294 | + | ||
295 | + filename = options.inputFile | ||
296 | + hashEntities = {} | ||
297 | + hashInteractions = {} | ||
298 | + hashInteractionsEffect = {} | ||
299 | + idEntities = 1 | ||
300 | + idInteractions = 1 | ||
301 | + idInteractionsEffect = 1 | ||
302 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
303 | + idEntities = loadFileEntities(filename, outputPath, hashEntities) | ||
304 | + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect) | ||
305 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
306 | + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions) | ||
307 | + | ||
308 | + listRIs = [] | ||
309 | + # print("Read autoregulation file") | ||
310 | + with open(os.path.join(options.inputPath, options.inputFile)) as iFile: | ||
311 | + for line in iFile: | ||
312 | + line = line.rstrip('\n') | ||
313 | +            print("Searching for autoregulation") | ||
314 | + result = regexAutoRI.search(line) | ||
315 | + #print("result: {}".format(result)) | ||
316 | + if result: | ||
317 | + lineTemp = result.string[result.end('regulator'):result.end(0)] | ||
318 | + # print("lineTemp: {}".format(lineTemp)) | ||
319 | + result2 = regexAutoRI.search(lineTemp) | ||
320 | + if result2: | ||
321 | +                    print("Autoregulation: regulator {} effect {}".format(result2.group('regulator'), result2.group('effect'))) | ||
322 | + listRIs.append(getRI(result2, line)) | ||
323 | + print("listRIs: {}".format(listRIs)) | ||
324 | +                else: | ||
325 | +                    print("Autoregulation: regulator {} effect {}".format(result.group('regulator'), result.group('effect'))) | ||
326 | +                    listRIs.append(getRI(result, line)) | ||
327 | +                    print("listRIs: {}".format(listRIs)) | ||
328 | + | ||
329 | + | ||
330 | + for ri in listRIs: | ||
331 | + #print("ri: {}".format(ri)) | ||
332 | + if len(ri) != 4: | ||
333 | +            print("WARNING! Malformed RI (expected 4 elements)") | ||
334 | +            sys.exit(1) | ||
335 | + regulator = ri[0] | ||
336 | + regulated = ri[1] | ||
337 | + effect = ri[2] | ||
338 | + line = ri[3] | ||
339 | + | ||
340 | + listElem = regulator.split('|') | ||
341 | + regulatorWord = listElem[0] | ||
342 | + regulatorType = listElem[2] | ||
343 | + regulatorStart = listElem[3] | ||
344 | + regulatorEnd = listElem[4] | ||
345 | + | ||
346 | + listElem = regulated.split('|') | ||
347 | + regulatedWord = listElem[0] | ||
348 | + regulatedType = listElem[2] | ||
349 | + regulatedStart = listElem[3] | ||
350 | + regulatedEnd = listElem[4] | ||
351 | + | ||
352 | + listElem = effect.split('|') | ||
353 | + effectWord = listElem[0] | ||
354 | + effectType = "EFFECT" | ||
355 | + effectStart = listElem[1] | ||
356 | + effectEnd = listElem[2] | ||
357 | + | ||
358 | + idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities) | ||
359 | + idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities) | ||
360 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", | ||
361 | + idInteractions, hashInteractions) | ||
362 | + idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities) | ||
363 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, | ||
364 | + idInteractionsEffect, | ||
365 | + hashInteractionsEffect) | ||
366 | + | ||
367 | + saveFiles(filename, hashEntities, hashInteractions, line, effect=False) | ||
368 | + saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True) |
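369 | + | ||
370 | +# Usage sketch (illustration only, kept commented out so the script's behavior is unchanged): | ||
371 | +# the autoregulation pattern applied to the OxyR example from the header. The | ||
372 | +# 'effects' alternation is a hypothetical one-item stand-in for the dictionary-built one. | ||
373 | +# import re | ||
374 | +# effects = "regulator" | ||
375 | +# regexDemo = re.compile( | ||
376 | +#     r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + r'))\|[^|]+\|EFFECT' | ||
377 | +#     r'\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)') | ||
378 | +# line = ("OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT " | ||
379 | +#         "of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN") | ||
380 | +# m = regexDemo.search(line) | ||
381 | +# # Expected groups: regulator='OxyR|OxyR|TF', effect='regulator' | ||
382 | +# print(m.group('regulator'), m.group('effect')) |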
ri-openie-extraction-v02.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | +from optparse import OptionParser | ||
3 | +import sys | ||
4 | +import os | ||
5 | +import json | ||
6 | +import operator | ||
7 | +from general_functions import getTypeRegulation | ||
8 | +import re | ||
9 | +from nltk.corpus import words | ||
10 | + | ||
11 | +__author__ = 'CMendezC' | ||
12 | + | ||
13 | + | ||
14 | +# Objective: obtain predicted ris from triplets extracted by OpenIE Stanford CoreNLP | ||
15 | +# Input format: tab-separated OpenIE triplet lines (see the column-layout note at the end of this file). | ||
16 | +# WARNING: Only one sentence per line | ||
17 | + | ||
18 | +# Parameters: | ||
19 | +#  1) --inputPath Input path | ||
20 | +#  2) --inputFile Input file | ||
21 | +#  3) --outputPath Output path | ||
22 | +#  4) --diccPath Dictionary path | ||
23 | +#  5) --diccFile JSON file with entity dictionaries | ||
24 | +#  6) --diccEffect File with normalized effects | ||
25 | +#  7) --format Output format: standoff, tabs | ||
26 | +#  8) --diccEPAth Path to read normalized effects | ||
27 | + | ||
28 | +# Output: | ||
29 | +# 1) File with predicted ris. | ||
30 | +# Format standoff: | ||
31 | +# T1 TF 0 0 MetR | ||
32 | +# T2 TU 0 0 metH | ||
33 | +# T3 GENE 0 0 metH | ||
34 | +# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors | ||
35 | +# T2 Growth_condition 150 179 switch rich to minimal medium | ||
36 | +# R1 Interaction.activator Target:T3 Agent:T1 | ||
37 | +# R2 Interaction.activator Target:T2 Agent:T1 | ||
38 | + | ||
39 | +# Execution | ||
40 | +# python3.4 ri-openie-extraction.py | ||
41 | +# --inputFile /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris/predicted-ris.reverb | ||
42 | +# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/processing-ris | ||
43 | +# --diccPath /home/cmendezc/terminologicalResources | ||
44 | +# --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json | ||
45 | +# --diccEffect normalized_Effects_Type.json | ||
46 | +# --format standoff | ||
47 | + | ||
48 | +########################################################### | ||
49 | +# MAIN PROGRAM # | ||
50 | +########################################################### | ||
51 | + | ||
52 | +def getPosWord(wordPos, endPos, text, termList, type_entity=""): | ||
53 | + #print("GETPOSWORD wordPOs {}".format(wordPos)) | ||
54 | + offsetStart = 0 | ||
55 | + wordNum = 0 | ||
56 | + listText = text.split() | ||
57 | + for w in listText: | ||
58 | + # if filenameBefore.find('000-2') > -1: | ||
59 | + # print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos)) | ||
60 | + if wordNum >= int(wordPos): | ||
61 | + # for tok in word.split(): | ||
62 | + for t in termList: | ||
63 | + # For entities starting word: if w == t or (w.startswith(t) and w not in regularWords): | ||
64 | + if w == t: | ||
65 | + if type_entity == "EFFECT": | ||
66 | + # To change regulation effect in: | ||
67 | + # negative regulator --> repressor | ||
68 | + # positively regulates --> activator | ||
69 | + print("text: {}".format(text)) | ||
70 | + new_w = getTypeRegulation(w, int(wordPos), text, "word") | ||
71 | + return [new_w, offsetStart, offsetStart + len(w) - 1] | ||
72 | + else: | ||
73 | + return [w, offsetStart, offsetStart + len(w) - 1] | ||
74 | + #else: | ||
75 | + wordNum += 1 | ||
76 | + offsetStart += len(w) + 1 | ||
77 | + if wordNum > int(endPos): | ||
78 | + return None | ||
79 | + return None | ||
80 | + | ||
81 | + | ||
82 | +def getIdEntity(aList, etype, idE): | ||
83 | + entity = aList[0] | ||
84 | + if etype == "EFFECT": | ||
85 | + normalizedEffect = entity | ||
86 | + # print("EFFECT: {}".format(entity)) | ||
87 | + if entity in hashEffects: | ||
88 | + normalizedEffect = hashEffects[entity] | ||
89 | + etype += "." + normalizedEffect | ||
90 | + # print("EFFECT: {}".format(entity)) | ||
91 | + entityPosStart = aList[1] | ||
92 | + entityPosEnd = aList[2] | ||
93 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
94 | + #if filenameBefore.find('061-02') > -1: | ||
95 | + # print("keyEntity: {}".format(keyEntity)) | ||
96 | + # print("idE: {}".format(idE)) | ||
97 | + # print("hashEntities: {}".format(hashEntities)) | ||
98 | + if keyEntity not in hashEntities: | ||
99 | + idE += 1 | ||
100 | + idEntity = "T{}".format(idE) | ||
101 | + #if filenameBefore.find('061-02') > -1: | ||
102 | + # print("idEntity not in hashEntities: {}".format(keyEntity)) | ||
103 | + # print("idE not in hashEntities: {}".format(idE)) | ||
104 | + hashEntities[keyEntity] = idEntity | ||
105 | + #print("New entity {}: {}".format(idEntity, keyEntity)) | ||
106 | + return idEntity, idE | ||
107 | + else: | ||
108 | + idEntity = hashEntities[keyEntity] | ||
109 | + return idEntity, idE | ||
110 | + | ||
111 | + | ||
112 | +def getIdInteraction(regulator, regulated, effect, idI, hashInt): | ||
113 | + #print("hashInt: {}".format(hashInt)) | ||
114 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
115 | + if keyInteraction not in hashInt: | ||
116 | + idI += 1 | ||
117 | + idInteraction = "R{}".format(idI) | ||
118 | + hashInt[keyInteraction] = idInteraction | ||
119 | + #print("New interaction {}: {}".format(idInteraction, keyInteraction)) | ||
120 | + #return idInteraction, idI | ||
121 | + else: | ||
122 | + idInteraction = hashInt[keyInteraction] | ||
123 | + return idInteraction, idI | ||
124 | + | ||
125 | + | ||
126 | +def saveFiles(filename, hashE, hashI, s, effect): | ||
127 | + if effect: | ||
128 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
129 | + else: | ||
130 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
131 | +    with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="w") as a1File:  # note: 'file' is the global path set in the main loop | ||
132 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File: | ||
133 | + for k, v in sorted(hashE.items(), key=operator.itemgetter(1)): | ||
134 | + aList = k.split() | ||
135 | + a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3])) | ||
136 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="w") as a2File: | ||
137 | + #with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File: | ||
138 | + for k, v in sorted(hashI.items(), key=operator.itemgetter(1)): | ||
139 | + aList = k.split() | ||
140 | + a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0])) | ||
141 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".txt"), mode="w") as txtFile: | ||
142 | + txtFile.write(s) | ||
143 | + | ||
144 | +def loadFileEntities(filename, outputPath, hashTemp): | ||
145 | + idE = 1 | ||
146 | + try: | ||
147 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="r") as a1File: | ||
148 | + for line in a1File: | ||
149 | + line = line.strip('\n') | ||
150 | + listLine1 = line.split('\t') | ||
151 | + listLine2 = listLine1[1].split(' ') | ||
152 | + etype = listLine2[0] | ||
153 | + entityPosStart = listLine2[1] | ||
154 | + entityPosEnd = listLine2[2] | ||
155 | + entity = listLine1[2] | ||
156 | + keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity) | ||
157 | + idEntity = listLine1[0] | ||
158 | + if keyEntity not in hashTemp: | ||
159 | + hashTemp[keyEntity] = idEntity | ||
160 | + if int(idEntity[1:]) > idE: | ||
161 | + idE = int(idEntity[1:]) | ||
162 | + except IOError: | ||
163 | + print("IOError file, idEntity starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a1"))) | ||
164 | + # idE = 1 | ||
165 | + return idE | ||
166 | + | ||
167 | +def loadFileInteractions(filename, outputPath, hashTemp): | ||
168 | + idI = 1 | ||
169 | + try: | ||
170 | + with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="r") as a2File: | ||
171 | + for line in a2File: | ||
172 | + #print("Line a2: {}".format(line)) | ||
173 | + line = line.strip('\n') | ||
174 | + listLine1 = line.split('\t') | ||
175 | + listLine2 = listLine1[1].split(' ') | ||
176 | + regulator = listLine2[2] | ||
177 | + regulator = regulator[regulator.find(":") + 1:] | ||
178 | + regulated = listLine2[1] | ||
179 | + regulated = regulated[regulated.find(":") + 1:] | ||
180 | + effect = listLine2[0] | ||
181 | + effect = effect[effect.find(".") + 1:] | ||
182 | + idInteraction = listLine1[0] | ||
183 | + keyInteraction = "{} {} {}".format(regulator, regulated, effect) | ||
184 | + if keyInteraction not in hashTemp: | ||
185 | + hashTemp[keyInteraction] = idInteraction | ||
186 | + if int(idInteraction[1:]) > idI: | ||
187 | + idI = int(idInteraction[1:]) | ||
188 | + except IOError: | ||
189 | + print("IOError file, idInteraction starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a2"))) | ||
190 | + # idI = 1 | ||
191 | + return idI | ||
192 | + | ||
193 | +if __name__ == "__main__": | ||
194 | + # Parameter definition | ||
195 | + parser = OptionParser() | ||
196 | + parser.add_option("--inputPath", dest="inputPath", | ||
197 | + help="Input path", metavar="PATH") | ||
198 | + parser.add_option("--inputFile", dest="inputFile", | ||
199 | + help="Input file", metavar="FILE") | ||
200 | + parser.add_option("--outputPath", dest="outputPath", | ||
201 | + help="Output path", metavar="PATH") | ||
202 | + #parser.add_option("--outputFile", dest="outputFile", | ||
203 | + #help="Output file", metavar="FILE") | ||
204 | + parser.add_option("--diccPath", dest="diccPath", | ||
205 | + help="Path to read dictionaries", metavar="PATH") | ||
206 | + parser.add_option("--diccFile", dest="diccFile", | ||
207 | + help="JSON file with entity dictionaries", metavar="FILE") | ||
208 | + parser.add_option("--diccEffect", dest="diccEffect", | ||
209 | + help="File with normalized effects", metavar="FILE") | ||
210 | + parser.add_option("--format", dest="format", | ||
211 | + help="Output format: standoff", metavar="TEXT") | ||
212 | +    parser.add_option("--diccEPAth", dest="diccEPAth", | ||
213 | +                      help="Path to read normalized effects", metavar="PATH") | ||
214 | + | ||
215 | + (options, args) = parser.parse_args() | ||
216 | + if len(args) > 0: | ||
217 | +        parser.error("No positional arguments expected.") | ||
218 | + sys.exit(1) | ||
219 | + | ||
220 | + # Printing parameter values | ||
221 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
222 | + print("Input path: " + str(options.inputPath)) | ||
223 | + print("Input file: " + str(options.inputFile)) | ||
224 | + print("Output path: " + str(options.outputPath)) | ||
225 | + #print("Output file: " + str(options.outputFile)) | ||
226 | + print("Path to read dictionaries: " + str(options.diccPath)) | ||
227 | + print("JSON file with entity dictionaries: " + str(options.diccFile)) | ||
228 | + print("Path to read normalized effects: " + str(options.diccEPAth)) | ||
229 | + print("File with normalized effects: " + str(options.diccEffect)) | ||
230 | + print("Output format: " + str(options.format)) | ||
231 | + | ||
232 | + regularWords = words.words('en') | ||
233 | + | ||
234 | + print('Loading dictionaries...') | ||
235 | + with open(os.path.join(options.diccPath, options.diccFile)) as diccFile: | ||
236 | + hashDicc = json.load(diccFile) | ||
237 | + | ||
238 | + # hashTermFiles = hashDicc["hashTermFiles"] | ||
239 | + # hashTerms = hashDicc["hashTerms"] | ||
240 | + | ||
241 | + # for key in hashTermFiles.keys(): | ||
242 | + # for f in hashTermFiles[key]: | ||
243 | + # # print('File: ' + f) | ||
244 | + # with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile: | ||
245 | + # for line in iFile: | ||
246 | + # line = line.strip('\n') | ||
247 | + # line = line.replace(' ', '-') | ||
248 | + # if line not in hashTerms[key]: | ||
249 | + # hashTerms[key].append(line) | ||
250 | + # # if options.termLower: | ||
251 | + # # hashTerms[key].append(line.lower()) | ||
252 | + # # if options.termCapitalize: | ||
253 | + # # hashTerms[key].append(line.capitalize()) | ||
254 | + # print(' Terms read {} size: {}'.format(key, len(hashTerms[key]))) | ||
255 | + | ||
256 | + # Loading normalized effects | ||
257 | + print('Loading normalized effects...') | ||
258 | + with open(os.path.join(options.diccEPAth, options.diccEffect)) as diccFile: | ||
259 | + hashEffects = json.load(diccFile) | ||
260 | + | ||
261 | + files = {} | ||
262 | + hashEntities = {} | ||
263 | + hashInteractions = {} | ||
264 | + hashInteractionsEffect = {} | ||
265 | + idEntities = 1 | ||
266 | + idInteractions = 1 | ||
267 | + idInteractionsEffect = 1 | ||
268 | + filenameBefore = '' | ||
269 | + regexNumFile = re.compile(r'_([0-9]+)[.-]') | ||
270 | + numFile = "" | ||
271 | + inumFile = 0 | ||
272 | + hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []} | ||
273 | + | ||
274 | + with open(os.path.join(options.inputPath, options.inputFile)) as iFile: | ||
275 | + for line in iFile: | ||
276 | + line = line.rstrip('\n') | ||
277 | + listLine = line.split('\t') | ||
278 | + file = listLine[0] | ||
279 | + filename = file.split("/")[-1] | ||
280 | + filename = filename[:-4] | ||
281 | + if filename not in files: | ||
282 | + # New file, that is, new sentence | ||
283 | + files[filename] = 1 | ||
284 | + if len(files) > 1: | ||
285 | + if len(hashEntities) > 0: | ||
286 | + #if filenameBefore.find('061-02') > -1: | ||
287 | + # print("filenameBefore: {}".format(filenameBefore)) | ||
288 | + # print("Save hashEntities: {}".format(hashEntities)) | ||
289 | + # print("Save hashInteractions: {}".format(hashInteractions)) | ||
290 | + # print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect)) | ||
291 | + saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False) | ||
292 | + saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True) | ||
293 | + filenameBefore = filename | ||
294 | + hashEntities = {} | ||
295 | + hashInteractions = {} | ||
296 | + hashInteractionsEffect = {} | ||
297 | + idEntities = 1 | ||
298 | + idInteractions = 1 | ||
299 | + idInteractionsEffect = 1 | ||
300 | + outputPath = os.path.join(options.outputPath, "complete-ris") | ||
301 | + idEntities = loadFileEntities(filename, outputPath, hashEntities) | ||
302 | + idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect) | ||
303 | + outputPath = os.path.join(options.outputPath, "incomplete-ris") | ||
304 | + idInteractions = loadFileInteractions(filename, outputPath, hashInteractions) | ||
305 | + result = regexNumFile.search(filenameBefore) | ||
306 | + if result: | ||
307 | + inumFile = int(result.group(1)) | ||
308 | + numFile = str(inumFile) | ||
309 | + print("Numfile: {}".format(numFile)) | ||
310 | + else: | ||
311 | + print("WARNING: numfile not found in filename") | ||
312 | + hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []} | ||
313 | + if numFile in hashDicc: | ||
314 | + hashTemp = hashDicc[numFile] | ||
315 | + #print("hashDicc[numFile]: {}".format(hashTemp)) | ||
316 | + for k, v in hashTemp.items(): | ||
317 | + if v == "TF": | ||
318 | + # print("Verifiying TF") | ||
319 | + if "TF" in hashTerms: | ||
320 | + # print(" TF {}".format(k)) | ||
321 | + hashTerms["TF"].append(k) | ||
322 | + else: | ||
323 | + hashTerms["TF"] = [k] | ||
324 | + elif v == "GENE": | ||
325 | + if "GENE" in hashTerms: | ||
326 | + hashTerms["GENE"].append(k) | ||
327 | + else: | ||
328 | + hashTerms["GENE"] = [k] | ||
329 | + elif v == "TU": | ||
330 | + if "TU" in hashTerms: | ||
331 | + hashTerms["TU"].append(k) | ||
332 | + else: | ||
333 | + hashTerms["TU"] = [k] | ||
334 | + elif v == "EFFECT": | ||
335 | + if "EFFECT" in hashTerms: | ||
336 | + hashTerms["EFFECT"].append(k) | ||
337 | + else: | ||
338 | + hashTerms["EFFECT"] = [k] | ||
339 | + else: | ||
340 | + print("WARNING: entity not found in dictionaries") | ||
341 | + else: | ||
342 | + print("WARNING: numfile not found in dictionaries") | ||
343 | + #if filename.find('061-02') > -1: | ||
344 | + # print("filename: {}".format(filename)) | ||
345 | + # print("Load hashEntities: {}".format(hashEntities)) | ||
346 | + # print("Load hashInteractions: {}".format(hashInteractions)) | ||
347 | + # print("Load hashInteractionsEffect: {}".format(hashInteractionsEffect)) | ||
348 | + | ||
349 | + wordA = listLine[2] | ||
350 | + wordB = listLine[3] | ||
351 | + wordC = listLine[4] | ||
352 | + startA = listLine[5] | ||
353 | + endA = listLine[6] | ||
354 | + startB = listLine[7] | ||
355 | + endB = listLine[8] | ||
356 | + startC = listLine[9] | ||
357 | + endC = listLine[10] | ||
358 | + sent = listLine[12] | ||
359 | + lemmaA = listLine[2] | ||
360 | + lemmaB = listLine[3] | ||
361 | + lemmaC = listLine[4] | ||
362 | + | ||
363 | + # Return [tok, offsetStart, offsetEnd ] | ||
364 | + # print("hashTerms[TF]: {}".format(hashTerms["TF"])) | ||
365 | + listRegulator = getPosWord(startA, endA, sent, hashTerms["TF"]) | ||
366 | + if listRegulator is not None: | ||
367 | + #if filenameBefore.find('061-02') > -1: | ||
368 | + # print(">> Regulator found: {}".format(listRegulator[0])) | ||
369 | + listRegulated = getPosWord(startC, endC, sent, hashTerms["GENE"]) | ||
370 | + if listRegulated is not None: | ||
371 | + #if filenameBefore.find('061-02') > -1: | ||
372 | + # print(">> Regulated GENE found: {}".format(listRegulated[0])) | ||
373 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
374 | + idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities) | ||
375 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
376 | + #print("Review EFFECT") | ||
377 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
378 | + if listEffect is not None: | ||
379 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
380 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
381 | + else: | ||
382 | + listRegulated = getPosWord(startC, endC, sent, hashTerms["TU"]) | ||
383 | + if listRegulated is not None: | ||
384 | + #if filenameBefore.find('061-02') > -1: | ||
385 | + # print(">> Regulated TU found: {}".format(listRegulated[0])) | ||
386 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
387 | + idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities) | ||
388 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
389 | + #print("Review EFFECT") | ||
390 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
391 | + if listEffect is not None: | ||
392 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
393 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
394 | + else: | ||
395 | + listRegulator = getPosWord(startC, endC, sent, hashTerms["TF"]) | ||
396 | + if listRegulator is not None: | ||
397 | + #if filenameBefore.find('061-02') > -1: | ||
398 | + # print(">> Regulator found: {}".format(listRegulator[0])) | ||
399 | + listRegulated = getPosWord(startA, endA, sent, hashTerms["GENE"]) | ||
400 | + if listRegulated is not None: | ||
401 | + #if filenameBefore.find('061-02') > -1: | ||
402 | + # print(">> Regulated GENE found: {}".format(listRegulated[0])) | ||
403 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
404 | + idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities) | ||
405 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
406 | + #print("Review EFFECT") | ||
407 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
408 | + if listEffect is not None: | ||
409 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
410 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
411 | + else: | ||
412 | + listRegulated = getPosWord(startA, endA, sent, hashTerms["TU"]) | ||
413 | + if listRegulated is not None: | ||
414 | + #if filenameBefore.find('061-02') > -1: | ||
415 | + # print(">> Regulated TU found: {}".format(listRegulated[0])) | ||
416 | + idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities) | ||
417 | + idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities) | ||
418 | + idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions) | ||
419 | + #print("Review EFFECT") | ||
420 | + listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT") | ||
421 | + if listEffect is not None: | ||
422 | + idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities) | ||
423 | + idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect) | ||
424 | + if len(files) > 1: | ||
425 | + if len(hashEntities) > 0: | ||
426 | + #print("filenameBefore: {}".format(filenameBefore)) | ||
427 | + #print("Save hashEntities: {}".format(hashEntities)) | ||
428 | + #print("Save hashInteractions: {}".format(hashInteractions)) | ||
429 | + #print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect)) | ||
430 | + saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False) | ||
431 | + saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True) |
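432 | + | ||
433 | +# Input layout note (added for clarity; columns 1 and 11 are read but never used, | ||
434 | +# so no meaning is assumed for them). Each line of the OpenIE triplet file is | ||
435 | +# tab-separated and consumed above as follows: | ||
436 | +#   col 0        path of the per-sentence file (its basename names the output files) | ||
437 | +#   cols 2-4     triplet words A, B, C (B is matched against EFFECT terms; | ||
438 | +#                A and C against TF and GENE/TU terms, in either order) | ||
439 | +#   cols 5-10    start/end word positions of A, B, and C | ||
440 | +#   col 12       the sentence itself; lemmaA/B/C above reuse columns 2-4 |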
ri-openie-extraction/.gitignore
0 → 100644
1 | + |
run-several-files.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | + | ||
3 | +###### Automatic extraction of TRN from several files ###### | ||
4 | + | ||
5 | +BRIES_HOME=/myhome/bries | ||
6 | +PMIDS_HOME=/myhome/preprocessed-files | ||
7 | +# REFERENCE_HOME is unused (no evaluation is run), so neither the /reference-data-set path nor the no-reference.txt file needs to exist. | ||
8 | +REFERENCE_HOME=/myhome/reference-data-set | ||
9 | + | ||
10 | +for f in $PMIDS_HOME/original/text/*.* | ||
11 | +do | ||
12 | + FILE_NAME=$(basename "$f") | ||
13 | + FILE_NAME="${FILE_NAME%.*}" | ||
14 | + echo "File: $FILE_NAME" | ||
15 | + ./automatic-extraction-ris-gcs.sh $PMIDS_HOME/features/$FILE_NAME.tra.word.txt $PMIDS_HOME/transformed/$FILE_NAME.tra.txt $BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt $BRIES_HOME/predicted-ris-gcs Y Y FILT1 $REFERENCE_HOME no-reference.txt $BRIES_HOME/evaluation-reports no-evaluation.txt diccionario-SYNONYMS.json $PMIDS_HOME/original/tsv 1>uno-$FILE_NAME.txt 2>dos-$FILE_NAME.txt | ||
16 | +done |
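17 | + | ||
18 | +# Note: for each input file, stdout is captured in uno-<file>.txt and stderr in | ||
19 | +# dos-<file>.txt in the current directory. |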
sentence-filter_v02.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +from optparse import OptionParser | ||
4 | +import os | ||
5 | +import sys | ||
6 | +from time import time | ||
7 | +import json | ||
8 | +import re | ||
9 | +import pandas as pd | ||
10 | + | ||
11 | +__author__ = 'CMendezC' | ||
12 | + | ||
13 | + | ||
14 | +# Objective: Filter sentences that contain specific entities. | ||
15 | +# It also extracts attributive sentences (effect-TF) | ||
16 | +# and autoregulation sentences (e.g., "regulates its own gene"). | ||
17 | +# CFMC 2022-03-08: We added updating of the tsv file with idsentence, sentence, and section (.pre.tsv) | ||
18 | +# to mark filtered sentences. | ||
19 | + | ||
20 | +# Parameters: | ||
21 | +#  1) --inputFileWord Path and filename to read feature word file. | ||
22 | +#  2) --inputFileTrans Path and filename to read transformed file. | ||
23 | +#  3) --outputPath Path to place output file. | ||
24 | +#  4) --outputFile Output file. | ||
25 | +#  5) --filter FILT1: (GENE OR TU) AND TF | ||
26 | +#             FILT2: (GENE OR TU) AND EFFECT AND TF | ||
27 | +#  6) --attrPath Path for attributive cases: ArgP-regulated genes | ||
28 | +#  7) --dicPath Path for dictionary | ||
29 | +#  8) --dicFile Dictionary file normalized_Effects.json | ||
30 | +#  9) --autoPath Path for autoregulation cases: regulates its own gene | ||
31 | +# 10) --tsvPath Path to tsv file with section, id sentence, sentence (extracted from jsonpdf) | ||
32 | + | ||
33 | +# Output: | ||
34 | +# 1) Filtered sentences. | ||
35 | +# 2) Attributive sentences | ||
36 | +# 3) Autoregulation sentences | ||
37 | + | ||
38 | + | ||
39 | +########################################################### | ||
40 | +# MAIN PROGRAM # | ||
41 | +########################################################### | ||
42 | + | ||
43 | +def getEntities(tline, filt): | ||
44 | + # FILT1: (GENE OR TU) AND TF | ||
45 | + # FILT2: (GENE OR TU) AND EFFECT AND TF | ||
46 | + entities = {} | ||
47 | + tline = tline.rstrip('\n\r ') | ||
48 | + for token in tline.split(" "): | ||
49 | + # print("Token: {}".format(token)) | ||
50 | + listElem = token.split("|") | ||
51 | + w = listElem[0] | ||
52 | + l = listElem[1] | ||
53 | + t = listElem[2] | ||
54 | + if filt == "FILT1" or filt == "FILT2": | ||
55 | + if t in ["GENE", "TU", "TF", "EFFECT"]: | ||
56 | + if w not in entities: | ||
57 | + entities[w] = t | ||
58 | + # if filt == "FILT2": | ||
59 | + # if t in ["GENE", "TU", "TF", "EFFECT"]: | ||
60 | + # if w not in entities: | ||
61 | + # entities[w] = t | ||
62 | + return entities | ||
63 | + | ||
64 | +if __name__ == "__main__": | ||
65 | + # Parameter definition | ||
66 | + parser = OptionParser() | ||
67 | + | ||
68 | + parser.add_option("--inputFileWord", dest="inputFileWord", | ||
69 | + help="Path and filename to read feature word file", metavar="PATH") | ||
70 | + parser.add_option("--inputFileTrans", dest="inputFileTrans", | ||
71 | + help="Path and filename to read transformed file", metavar="PATH") | ||
72 | + parser.add_option("--outputPath", dest="outputPath", | ||
73 | + help="Output path", metavar="PATH") | ||
74 | + parser.add_option("--outputFile", dest="outputFile", | ||
75 | + help="Output file", metavar="FILE") | ||
76 | + parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None, | ||
77 | + help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT") | ||
78 | + parser.add_option("--attrPath", dest="attrPath", | ||
79 | + help="Output path attributive sentences", metavar="PATH") | ||
80 | + parser.add_option("--dicPath", dest="dicPath", | ||
81 | + help="Output path dictionary", metavar="PATH") | ||
82 | + parser.add_option("--dicFile", dest="dicFile", | ||
83 | + help="Output file dictionary normalized_Effects.json", metavar="FILE") | ||
84 | + parser.add_option("--autoPath", dest="autoPath", | ||
85 | + help="Output path autoregulation sentences", metavar="PATH") | ||
86 | + parser.add_option("--tsvPath", dest="tsvPath", | ||
87 | + help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH") | ||
88 | + | ||
89 | + (options, args) = parser.parse_args() | ||
90 | + if len(args) > 0: | ||
91 | + parser.error("None parameters indicated.") | ||
92 | +        parser.error("No positional arguments expected.") | ||
93 | + | ||
94 | + # Printing parameter values | ||
95 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
96 | + print("Path and filename to read feature word file: " + str(options.inputFileWord)) | ||
97 | + print("Path and filename to read transformed file: " + str(options.inputFileTrans)) | ||
98 | + print("Output path: " + str(options.outputPath)) | ||
99 | + print("Output file: " + str(options.outputFile)) | ||
100 | + print("Filter: " + str(options.filter)) | ||
101 | + print("Output path attributive sentences: " + str(options.attrPath)) | ||
102 | + print("Output path autoregulation sentences: " + str(options.autoPath)) | ||
103 | + print("Output path dictionary: " + str(options.dicPath)) | ||
104 | + print("Output file dictionary normalized_Effects.json: " + str(options.dicFile)) | ||
105 | + print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath)) | ||
106 | + | ||
107 | + # Loading normalized effects | ||
108 | + # print('Loading normalized effects...') | ||
109 | + hashNormalizedEffects = {} | ||
110 | + with open(os.path.join(options.dicPath, options.dicFile)) as diccFile: | ||
111 | + hashNormalizedEffects = json.load(diccFile) | ||
112 | + listEffects = [] | ||
113 | + for eff in hashNormalizedEffects.keys(): | ||
114 | + if eff.endswith('d'): | ||
115 | + listEffects.append(eff) | ||
116 | + listEffects.append("dependent") | ||
117 | + effects = "|".join(listEffects) | ||
118 | + print("Effects: {}".format(effects)) | ||
119 | + | ||
120 | + t0 = time() | ||
121 | + count = 0 | ||
122 | + hashEntities = {} | ||
123 | + hashAttrSent = {} | ||
124 | + hashAutoSent = {} | ||
125 | + # Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene') | ||
126 | +    # We decided to extract all sentences containing effect-TF because we observed some patterns where | ||
127 | +    # "gene" does not appear; to recover those examples we employ a more general rule to separate | ||
128 | +    # attributive sentences. | ||
129 | + reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF') | ||
130 | + # We decided to extract all sentences containing autoregulation | ||
131 | + # The FimZ transcription factor activates this promoter directly , | ||
132 | + # and it also positively regulates the transcription of its own gene | ||
133 | + # FimZ is known to regulate the expression of its own gene positively | ||
134 | + # FimZ also positively regulates its own transcription | ||
135 | + # ArgP protein represses its own synthesis | ||
136 | + # ArgP both represses its own transcription | ||
137 | + # ArgP protein represses its own synthesis | ||
138 | + # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT | ||
139 | + # of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN | ||
140 | + reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ') | ||
141 | + aFilter = options.filter | ||
142 | + print(" Processing file...{}".format(options.inputFileTrans)) | ||
143 | + with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile: | ||
144 | + with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile: | ||
145 | + # CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf) | ||
146 | + file = options.inputFileTrans[options.inputFileTrans.rfind("/")+1:] | ||
147 | + file_tsv = file.replace(".tra.txt", ".pre.tsv") | ||
148 | + tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv)) | ||
149 | + print("tsv_file.shape: {}".format(tsv_file.shape)) | ||
150 | + tsv_file_filtered = tsv_file[tsv_file['status'] == 1] | ||
151 | + print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape)) | ||
152 | + # print(tsv_file_filtered.head(10)) | ||
153 | + tsv_file_new = tsv_file_filtered.reset_index(drop=True) | ||
154 | + # print(tsv_file_new.shape) | ||
155 | + # print(tsv_file_new.head(10)) | ||
156 | + i = 0 | ||
157 | + for tLine, wLine in zip(tFile, wFile): | ||
158 | + # FILT1: (GENE OR TU) AND TF | ||
159 | + # FILT2: (GENE OR TU) AND EFFECT AND TF | ||
160 | + if aFilter is not None: | ||
161 | + reGENETU = re.compile(r'(\|GENE|\|TU)') | ||
162 | + reEFFECT = re.compile(r'\|EFFECT') | ||
163 | + reTF = re.compile(r'\|TF') | ||
164 | + tCount = str(count) | ||
165 | + if aFilter == "FILT1": | ||
166 | + if not (reGENETU.search(tLine) and reTF.search(tLine)): | ||
167 | + #print("NOT FOUND") | ||
168 | + # CFMC 2022-03-08 | ||
169 | + tsv_file_new.at[i, 'status'] = 0 | ||
170 | + i += 1 | ||
171 | + continue | ||
172 | + else: | ||
173 | + #print("FOUND") | ||
174 | + oFile.write(wLine) | ||
175 | + if tCount not in hashEntities: | ||
176 | + hashEntities[tCount] = getEntities(tLine, aFilter) | ||
177 | + if reAttrSent.search(tLine): | ||
178 | + #print("ATTRIBUTIVE SENTENCE: {}".format(tLine)) | ||
179 | + if tCount not in hashAttrSent: | ||
180 | + hashAttrSent[tCount] = tLine | ||
181 | + # Autoregulation sentences | ||
182 | + if reAutoSent.search(tLine): | ||
183 | +                            # print("AUTOREGULATION SENTENCE: {}".format(tLine)) | ||
184 | + if tCount not in hashAutoSent: | ||
185 | + hashAutoSent[tCount] = tLine | ||
186 | + #print(tLine) | ||
187 | + elif aFilter == "FILT2": | ||
188 | +                        if not (reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine)): | ||
189 | +                            # CFMC 2022-03-08: flag the filtered-out sentence before skipping it | ||
190 | +                            tsv_file_new.at[i, 'status'] = 0 | ||
191 | +                            i += 1 | ||
192 | +                            continue | ||
193 | + else: | ||
194 | + oFile.write(wLine) | ||
195 | + if tCount not in hashEntities: | ||
196 | + hashEntities[tCount] = getEntities(tLine, aFilter) | ||
197 | + if reAttrSent.search(tLine): | ||
198 | + if tCount not in hashAttrSent: | ||
199 | + hashAttrSent[tCount] = tLine | ||
200 | + if reAutoSent.search(tLine): | ||
201 | + if tCount not in hashAutoSent: | ||
202 | + hashAutoSent[tCount] = tLine | ||
203 | + count += 1 | ||
204 | + i += 1 | ||
205 | + | ||
206 | + merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left') | ||
207 | + # print(merged.shape) | ||
208 | + # print(merged.head(10)) | ||
209 | + tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int) | ||
210 | + tsv_file_filtered = tsv_file[tsv_file['status'] == 1] | ||
211 | + print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape)) | ||
212 | + # print(tsv_file_filtered.head(10)) | ||
213 | + tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t') | ||
214 | + | ||
215 | + with open(os.path.join(options.outputPath, options.outputFile.replace(".txt", ".ents.json")), "w", encoding="utf-8", | ||
216 | + errors="replace") as eFile: | ||
217 | + json.dump(hashEntities, eFile) | ||
218 | + | ||
219 | + for f, sent in hashAttrSent.items(): | ||
220 | + listPath = options.inputFileTrans.split('/') | ||
221 | + fileName = listPath[-1] | ||
222 | + fileName = fileName.replace('.tra.', '.att.' + f + '.') | ||
223 | + print("Save file {}".format(fileName)) | ||
224 | + with open(os.path.join(options.attrPath, fileName), "w", encoding="utf-8", errors="replace") as aFile: | ||
225 | + aFile.write(sent) | ||
226 | + | ||
227 | + for f, sent in hashAutoSent.items(): | ||
228 | + listPath = options.inputFileTrans.split('/') | ||
229 | + fileName = listPath[-1] | ||
230 | + fileName = fileName.replace('.tra.', '.auto.' + f + '.') | ||
231 | + print("Save file {}".format(fileName)) | ||
232 | + with open(os.path.join(options.autoPath, fileName), "w", encoding="utf-8", errors="replace") as aFile: | ||
233 | + aFile.write(sent) | ||
234 | + | ||
235 | + print("Files split in: %fs" % (time() - t0)) |
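
The FILT1/FILT2 checks above boil down to three regular-expression searches over the entity-tagged line. A minimal, illustrative sketch of that logic; the tagged-line format and the `keep_sentence` helper are assumptions for illustration, not part of the script:

```python
import re

reGENETU = re.compile(r'(\|GENE|\|TU)')
reEFFECT = re.compile(r'\|EFFECT')
reTF = re.compile(r'\|TF')

def keep_sentence(t_line, a_filter):
    """Return True if the tagged sentence passes the entity filter."""
    if a_filter == "FILT1":   # (GENE OR TU) AND TF
        return bool(reGENETU.search(t_line) and reTF.search(t_line))
    if a_filter == "FILT2":   # (GENE OR TU) AND EFFECT AND TF
        return bool(reGENETU.search(t_line) and reEFFECT.search(t_line)
                    and reTF.search(t_line))
    return True               # no filter: keep everything

# Hypothetical tagged line in word|lemma|tag format:
line = "GadX|gadx|TF activates|activate|EFFECT gadA|gada|GENE"
print(keep_sentence(line, "FILT1"))  # True
print(keep_sentence(line, "FILT2"))  # True
```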
sentence-simplification/sentence-simplification-main.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | + | ||
3 | +#Validate arguments | ||
4 | +if [[ "$#" -ne 3 ]]; then | ||
5 | + echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>' | ||
6 | + exit 1 | ||
7 | +fi | ||
8 | + | ||
9 | +SCRIPT_PATH=$(cd `dirname $0` && pwd) | ||
10 | +#Set here the keyword of the sentence group to simplify. | ||
11 | +INPUT_PATH=$1 | ||
12 | +OUTPUT_INDEX_FILE_PATH=$2 | ||
13 | +ISIMP_PATH=$3 | ||
14 | +cd $SCRIPT_PATH | ||
15 | + | ||
16 | + | ||
17 | + | ||
18 | + | ||
19 | +#ANALYZE WITH iSimp | ||
20 | +echo "Analyzing with iSimp..." | ||
21 | +# clear any previous iSimp output | ||
22 | +if [ -n "$(ls -A ./iSimp_sentences/)" ]; then | ||
23 | + rm ./iSimp_sentences/* | ||
24 | +fi | ||
26 | +#cd $INPUT_PATH | ||
27 | +for j in "$INPUT_PATH"/* | ||
28 | +do | ||
29 | + echo "$j" | ||
30 | + #echo "++++simp input: $j simp output: $SCRIPT_PATH/iSimp_sentences/$(basename $j)" | ||
31 | + "$ISIMP_PATH"/simplify.sh "$j" "$SCRIPT_PATH/iSimp_sentences/$(basename "$j")" | ||
32 | +done | ||
33 | +cd $SCRIPT_PATH | ||
34 | + | ||
35 | +#CREATE THE INDEX OF SIMPLIFIED FILES (truncated if it already exists) | ||
36 | +#touch $SCRIPT_PATH/index.txt | ||
37 | +>| "$OUTPUT_INDEX_FILE_PATH" | ||
38 | + | ||
39 | +#FEED TO THE SIMPLIFICATION ALGORITHM | ||
40 | +echo "Running simplification algorithm..." | ||
41 | +# clear any previous algorithm output | ||
42 | +if [ -n "$(ls -A ./algorithm_sentences/)" ]; then | ||
43 | + rm ./algorithm_sentences/* | ||
44 | +fi | ||
46 | +#cd ./iSimp_sentences | ||
47 | +for k in "$SCRIPT_PATH"/iSimp_sentences/* | ||
48 | +do | ||
49 | + echo "$k" | ||
50 | + #echo "input: $k output: $SCRIPT_PATH/algorithm_sentences/$(basename $k) index: $OUTPUT_INDEX_FILE_PATH" | ||
51 | + python2 "$SCRIPT_PATH"/simplifier.py "$k" "$SCRIPT_PATH/algorithm_sentences/$(basename "$k")" "$OUTPUT_INDEX_FILE_PATH" | ||
52 | +done | ||
53 | +cd $SCRIPT_PATH |
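
A hypothetical invocation of this script, with placeholder paths (the third argument must point at an iSimp installation containing `simplify.sh`):

```bash
./sentence-simplification-main.sh ./input_sentences ./simplified-index.txt /opt/iSimp
```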
sentence-simplification/simplifier.py
0 → 100644
1 | +import copy | ||
2 | +import sys | ||
4 | + | ||
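5 | +# Simp: one iSimp simplification construct, i.e. its TYPE (carrying a [start..end] span), surface TEXT, and COMP components | ||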
5 | +class Simp(object): | ||
6 | + def __init__(self): | ||
7 | + self.TYPE="" | ||
8 | + self.TYPEx=0 | ||
9 | + self.TYPEy=0 | ||
10 | + self.TEXT="" | ||
11 | + self.COMP=[] | ||
12 | + def agregarTYPE(self,Type): | ||
13 | + self.TYPE=Type | ||
14 | + def agregarTEXT(self,text): | ||
15 | + self.TEXT=text | ||
16 | + def agregarCOMP(self,comp): | ||
17 | + self.COMP.append(comp) | ||
18 | + | ||
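19 | +# Frase ("sentence"): one parsed iSimp record, i.e. type, text, POS tags, parse tree, and its SIMP constructs | ||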
19 | +class Frase(object): | ||
20 | + def __init__(self): | ||
21 | + self.TYPE="" | ||
22 | + self.TEXT="" | ||
23 | + self.POS="" | ||
24 | + self.TREE="" | ||
25 | + self.SIMP=[] | ||
26 | + def agregarTYPE(self,Type): | ||
27 | + self.TYPE=Type | ||
28 | + def agregarTEXT(self,text): | ||
29 | + self.TEXT=text | ||
30 | + def agregarPOS(self,Pos): | ||
31 | + self.POS=Pos | ||
32 | + def agregarTREE(self,Tree): | ||
33 | + self.TREE=Tree | ||
34 | + def agregarSIMP(self): | ||
35 | + self.SIMP.append(Simp()) | ||
36 | + | ||
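37 | +# Sentence: a working copy used during simplification; FLAG marks whether it still needs to be analyzed | ||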
37 | +class Sentence(object): | ||
38 | + def __init__(self): | ||
39 | + self.FLAG=True | ||
40 | + self.TEXT="" | ||
41 | + self.TREE="" | ||
42 | + self.SIMP=[] | ||
43 | + def agregarTEXT(self,text): | ||
44 | + self.TEXT=text | ||
45 | + def agregarTREE(self,Tree): | ||
46 | + self.TREE=Tree | ||
47 | + def agregarSIMP(self): | ||
48 | + self.SIMP.append(Simp()) | ||
49 | + | ||
50 | + | ||
51 | +MEMORIAB=[] | ||
53 | + | ||
54 | + | ||
55 | +#---- read the iSimp analysis from the input file | ||
56 | +arch = sys.argv[1] | ||
57 | +f = open(arch) | ||
58 | +dato = f.read().splitlines() | ||
59 | +f.close() | ||
60 | +frase=Frase() | ||
61 | +for i in range(len(dato)): | ||
62 | + if 'TYPE: ' in dato[i][0:6]: | ||
63 | + frase.agregarTYPE(dato[i][6:]) | ||
64 | + elif 'TEXT: ' in dato[i][0:6]: | ||
65 | + frase.agregarTEXT(dato[i][6:]) | ||
66 | + elif 'POS : ' in dato[i][0:6]: | ||
67 | + frase.agregarPOS(dato[i][6:]) | ||
68 | + elif 'TREE: ' in dato[i][0:6]: | ||
69 | + frase.agregarTREE(dato[i][6:]) | ||
70 | + elif 'SIMP:' in dato[i]: | ||
71 | + frase.agregarSIMP() | ||
72 | + elif ' TYPE: ' in dato[i][0:8]: | ||
73 | + frase.SIMP[-1].agregarTYPE(dato[i][8:]) | ||
74 | + elif ' TEXT: ' in dato[i][0:8]: | ||
75 | + frase.SIMP[-1].agregarTEXT(dato[i][8:]) | ||
76 | + elif ' COMP: ' in dato[i]: | ||
77 | + frase.SIMP[-1].agregarCOMP(dato[i][8:]) | ||
78 | +#------------ | ||
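
For orientation, the record parsed above has roughly the following shape (reconstructed from the prefixes the parser matches; the sentence, spans, and fields are illustrative, and real iSimp output may differ in detail):

```text
TYPE: sentence
TEXT: GadX activates gadA and gadB .
POS : GadX_NNP activates_VBZ gadA_NN and_CC gadB_NN ._.
TREE: (S (NP GadX) (VP activates (NP gadA and gadB)) .)
SIMP:
  TYPE: coordination [15..28]
  TEXT: gadA and gadB
  COMP: conjunct [15..19]
  COMP: conjunction [20..23]
  COMP: conjunct [24..28]
```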
79 | + | ||
80 | + | ||
81 | +#------- Main program | ||
82 | +#Algorithm v4 | ||
83 | + | ||
84 | + | ||
85 | +if (frase.TYPE.find('sentence') != -1) and (frase.SIMP != []) and (frase.SIMP[0].TYPE != ''): | ||
86 | + SIMPworkspace = [] | ||
89 | + # copy TREE and each SIMP of the original sentence into Sentence1 | ||
90 | + Sentence1=Sentence() | ||
91 | + Sentence1.TREE=copy.deepcopy(frase.TREE) | ||
92 | + Sentence1.TEXT=copy.deepcopy(frase.TEXT) | ||
93 | + for i in range(len(frase.SIMP)): | ||
94 | + SIMPworkspace.append(Simp()) | ||
95 | + SIMPworkspace[i] = copy.deepcopy(frase.SIMP[i]) | ||
98 | + | ||
99 | +## SORT THE SIMPs BY SPAN | ||
100 | + for i in range(len(SIMPworkspace)): | ||
101 | + #print SIMPworkspace[i].TEXT | ||
102 | + #print SIMPworkspace[i].TYPE | ||
103 | + SIMPworkspace[i].TYPEx = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('[')+1:SIMPworkspace[i].TYPE.find('..')]) | ||
104 | + SIMPworkspace[i].TYPEy = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('..')+2:SIMPworkspace[i].TYPE.find(']')]) | ||
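105 | + # widen parenthesis spans so the ordering also covers the trailing " )" (cf. the parenthesis rule below, which replaces TEXT + ' )') | ||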
105 | + if 'parenthesis' in SIMPworkspace[i].TYPE: | ||
106 | + SIMPworkspace[i].TYPEy = SIMPworkspace[i].TYPEy + 2 | ||
107 | + #print SIMPworkspace[i].TYPEx | ||
108 | + #print SIMPworkspace[i].TYPEy | ||
109 | + | ||
110 | + | ||
111 | + SIMPworkspace.sort(key=lambda x: x.TYPEy, reverse=True) | ||
112 | + SIMPworkspace.sort(key=lambda x: x.TYPEx) | ||
113 | + | ||
114 | + | ||
115 | + # for i in range(len(SIMPworkspace)): | ||
116 | + # print "\nSIMP " + str(i) + " :" | ||
117 | + # print SIMPworkspace[i].TYPE | ||
118 | + # print SIMPworkspace[i].TYPEx | ||
119 | + # print SIMPworkspace[i].TYPEy | ||
120 | + # print "\n" | ||
121 | + | ||
122 | + for i in range(len(SIMPworkspace)): | ||
123 | + Sentence1.SIMP.append(Simp()) | ||
124 | + Sentence1.SIMP[i]=copy.deepcopy(SIMPworkspace[i]) | ||
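
The two stable sorts above order constructs by start offset ascending and, for ties, by end offset descending, so an enclosing construct is processed before anything nested inside it. A small self-contained illustration (the spans are made up):

```python
# Stable two-pass sort, as in the script: end descending, then start ascending.
spans = [(10, 20), (10, 35), (5, 40), (25, 30)]
spans.sort(key=lambda s: s[1], reverse=True)
spans.sort(key=lambda s: s[0])
print(spans)  # [(5, 40), (10, 35), (10, 20), (25, 30)] -- outer spans first
```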
125 | + | ||
126 | + | ||
127 | + # add the original sentence (Sentence1) to memory as the first object to be analyzed | ||
128 | + MEMORIAB.append(Sentence()) | ||
129 | + MEMORIAB[0]=copy.deepcopy(Sentence1) | ||
130 | + | ||
131 | + | ||
132 | + | ||
133 | + # loop A runs once for each distinct SIMP in Sentence1 | ||
134 | + numSimp = len(Sentence1.SIMP) | ||
135 | + s = 0 | ||
136 | + # loop A | ||
137 | + while s < numSimp : | ||
138 | + #print "\nEntering loop A, pass " + str(s) | ||
139 | + #print "Analyzing all SIMPs of type: " + MEMORIAB[0].SIMP[s].TYPE | ||
140 | + # loop B runs once for each element currently in MEMORIAB | ||
141 | + numMEM = len(MEMORIAB) | ||
142 | + t = 0 | ||
143 | + # loop B | ||
144 | + while t < numMEM : | ||
145 | + #print "Entering loop B, pass " + str(t) | ||
146 | + # enter only if the sentence has not been analyzed before (FLAG == True) and the SIMP text occurs in it | ||
147 | + #print "CONDITIONS:" | ||
148 | + #print "SIMP " + MEMORIAB[0].SIMP[s].TEXT | ||
149 | + #print "SIMP " + MEMORIAB[0].SIMP[s].TYPE | ||
150 | + #print "MEMB " + str(MEMORIAB[t].FLAG) | ||
151 | + #print "MEMB " + MEMORIAB[t].TEXT | ||
152 | + if ( MEMORIAB[0].SIMP[s].TEXT in MEMORIAB[t].TEXT ) and ( MEMORIAB[t].FLAG == True ): | ||
153 | + MEMORIAB[t].FLAG = False | ||
154 | + #print "False to: " + MEMORIAB[t].TEXT | ||
155 | + #print "Entered conditional" | ||
156 | + # simplification rules | ||
157 | + if ( 'coordination' in MEMORIAB[t].SIMP[s].TYPE ) and ( not ('sentence coordination' in MEMORIAB[t].SIMP[s].TYPE ) ) : | ||
158 | + #print "Applying coordination rule" | ||
159 | + TEMPORALES = [] | ||
160 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
161 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
162 | + tt = 0 | ||
163 | + while c > 0 : | ||
164 | + c = c - 1 | ||
165 | + if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) : | ||
166 | + TEMPORALES.append(Sentence()) | ||
167 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
168 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
169 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
170 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
171 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
172 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
173 | + tt = tt + 1 | ||
174 | + # copy simplifications from temporary memory into MEMORIAB | ||
175 | + indtempamem = 0 | ||
176 | + while indtempamem < len(TEMPORALES) : | ||
177 | + MEMORIAB.append(Sentence()) | ||
178 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
179 | + MEMORIAB[-1].FLAG = True | ||
180 | + #print MEMORIAB[-1].TEXT | ||
181 | + indtempamem = indtempamem + 1 | ||
182 | + elif 'parenthesis' in MEMORIAB[t].SIMP[s].TYPE: | ||
183 | + #print "Applying parenthesis rule" | ||
184 | + TEMPORALES = [] | ||
185 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
186 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
187 | + tt = 0 | ||
188 | + while c > 0 : | ||
189 | + #print "entered parenthesis while-loop" | ||
190 | + c = c - 1 | ||
191 | + TEMPORALES.append(Sentence()) | ||
192 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
193 | + replaced = MEMORIAB[0].SIMP[s].TEXT + ' )' | ||
194 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
195 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
196 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
197 | + #print "replaced: " + replaced | ||
198 | + #print "replacer: " + replacer | ||
199 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
200 | + tt = tt + 1 | ||
201 | + # copy simplifications from temporary memory into MEMORIAB | ||
202 | + indtempamem = 0 | ||
203 | + while indtempamem < len(TEMPORALES) : | ||
204 | + MEMORIAB.append(Sentence()) | ||
205 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
206 | + MEMORIAB[-1].FLAG = True | ||
207 | + #print MEMORIAB[-1].TEXT | ||
208 | + indtempamem = indtempamem + 1 | ||
209 | + elif 'apposition' in MEMORIAB[t].SIMP[s].TYPE: | ||
210 | + #print "Applying apposition rule" | ||
211 | + TEMPORALES = [] | ||
212 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
213 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
214 | + tt = 0 | ||
215 | + while c > 0 : | ||
216 | + #print "entered while-loop" | ||
217 | + c = c - 1 | ||
218 | + TEMPORALES.append(Sentence()) | ||
219 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
220 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
221 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
222 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
223 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
224 | + #print "replaced: " + replaced | ||
225 | + #print "replacer: " + replacer | ||
226 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
227 | + tt = tt + 1 | ||
228 | + # copy simplifications from temporary memory into MEMORIAB | ||
229 | + indtempamem = 0 | ||
230 | + while indtempamem < len(TEMPORALES) : | ||
231 | + MEMORIAB.append(Sentence()) | ||
232 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
233 | + MEMORIAB[-1].FLAG = True | ||
234 | + #print "Copied to memory: " + MEMORIAB[-1].TEXT | ||
235 | + indtempamem = indtempamem + 1 | ||
236 | + elif 'member-collection' in MEMORIAB[t].SIMP[s].TYPE: | ||
237 | + #print "Applying member-collection rule" | ||
238 | + TEMPORALES = [] | ||
239 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
240 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
241 | + tt = 0 | ||
242 | + while c > 0 : | ||
243 | + #print "entered member-collection while-loop" | ||
244 | + c = c - 1 | ||
245 | + TEMPORALES.append(Sentence()) | ||
246 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
247 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
248 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
249 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
250 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
251 | + #print "replaced: " + replaced | ||
252 | + #print "replacer: " + replacer | ||
253 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
254 | + tt = tt + 1 | ||
255 | + # copy simplifications from temporary memory into MEMORIAB | ||
256 | + indtempamem = 0 | ||
257 | + while indtempamem < len(TEMPORALES) : | ||
258 | + MEMORIAB.append(Sentence()) | ||
259 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
260 | + MEMORIAB[-1].FLAG = True | ||
261 | + #print "Copied to memory: " + MEMORIAB[-1].TEXT | ||
262 | + indtempamem = indtempamem + 1 | ||
263 | + elif 'sentence coordination' in MEMORIAB[t].SIMP[s].TYPE: | ||
264 | + #print "Applying sentence-coordination rule" | ||
265 | + TEMPORALES = [] | ||
266 | + c = len(MEMORIAB[t].SIMP[s].COMP) | ||
267 | + #print "There are " + str(c) + " COMPs in this SIMP" | ||
268 | + tt = 0 | ||
269 | + while c > 0 : | ||
270 | + c = c - 1 | ||
271 | + if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) : | ||
272 | + TEMPORALES.append(Sentence()) | ||
273 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) | ||
274 | + # replace the whole TEMPORAL TEXT/TREE content with the content of the coordinated clause | ||
275 | + #replaced = MEMORIAB[0].SIMP[s].TEXT | ||
276 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
277 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
278 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
279 | + #print replacer | ||
280 | + TEMPORALES[tt].TEXT = replacer | ||
281 | + ## TODO: handle the case where the sentence does not end in "." or "!" | ||
282 | + tt = tt + 1 | ||
283 | + # copy simplifications from temporary memory into MEMORIAB | ||
284 | + indtempamem = 0 | ||
285 | + while indtempamem < len(TEMPORALES) : | ||
286 | + MEMORIAB.append(Sentence()) | ||
287 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
288 | + MEMORIAB[-1].FLAG = True | ||
289 | + #print MEMORIAB[-1].TEXT | ||
290 | + indtempamem = indtempamem + 1 | ||
291 | + elif 'full relative clause' in MEMORIAB[t].SIMP[s].TYPE: | ||
292 | + #print "Applying full-relative-clause rule" | ||
293 | + TEMPORALES = [] | ||
294 | + c = 0 | ||
295 | + tt = 0 | ||
296 | + while c < 2 : | ||
297 | + if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
298 | + TEMPORALES.append(Sentence()) | ||
299 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok | ||
300 | + if MEMORIAB[0].TEXT[MEMORIAB[0].TEXT.index(TEMPORALES[tt].SIMP[s].TEXT)+len(TEMPORALES[tt].SIMP[s].TEXT)-1] == ',': | ||
301 | + replaced = MEMORIAB[0].SIMP[s].TEXT + ',' # possible error; if so, try ' ,' instead | ||
302 | + else: | ||
303 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
304 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
305 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
306 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
307 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
308 | + indice3 = indice1 | ||
309 | + indice4 = indice2 | ||
310 | + if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
311 | + TEMPORALES.append(Sentence()) | ||
312 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok | ||
313 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
314 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
315 | + TEMPORALES[tt].TEXT = copy.deepcopy(MEMORIAB[0].TEXT[indice3:indice4]+' '+MEMORIAB[0].TEXT[indice1:indice2] ) ## | ||
316 | + cad3 = MEMORIAB[0].TEXT[indice1:indice2] | ||
317 | + cad4 = cad3.split() | ||
318 | + if (cad4[0]+'_WDT') in frase.POS: | ||
319 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(' '+cad4[0],'') | ||
320 | + tt = tt + 1 | ||
321 | + c = c + 1 | ||
322 | + # copy simplifications from temporary memory into MEMORIAB | ||
323 | + indtempamem = 0 | ||
324 | + while indtempamem < len(TEMPORALES) : | ||
325 | + MEMORIAB.append(Sentence()) | ||
326 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
327 | + MEMORIAB[-1].FLAG = True | ||
328 | + #print MEMORIAB[-1].TEXT | ||
329 | + indtempamem = indtempamem + 1 | ||
330 | + elif 'reduced relative clause' in MEMORIAB[t].SIMP[s].TYPE: | ||
331 | + #print "Applying reduced-relative-clause rule" | ||
332 | + TEMPORALES = [] | ||
333 | + c = 0 | ||
334 | + tt = 0 | ||
335 | + while c < 2 : | ||
336 | + if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
337 | + TEMPORALES.append(Sentence()) | ||
338 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok | ||
339 | + replaced = MEMORIAB[0].SIMP[s].TEXT | ||
340 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
341 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
342 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
343 | + #subj = MEMORIAB[0].TEXT[indice1:(indice2+1)] | ||
344 | + subj = MEMORIAB[0].TEXT[indice1:(indice2)] | ||
345 | + TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer) | ||
346 | + if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] : | ||
347 | + TEMPORALES.append(Sentence()) | ||
348 | + TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #the referent must precede the clause to keep the correct order | ||
349 | + indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')]) | ||
350 | + indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')]) | ||
351 | + replacer = MEMORIAB[0].TEXT[indice1:indice2] | ||
352 | + TEMPORALES[tt].TEXT = subj + " _ " + replacer #inserting a copula here would require number and tense information | ||
353 | + tt = tt + 1 | ||
354 | + c = c + 1 | ||
355 | + # copy simplifications from temporary memory into MEMORIAB | ||
356 | + indtempamem = 0 | ||
357 | + while indtempamem < len(TEMPORALES) : | ||
358 | + MEMORIAB.append(Sentence()) | ||
359 | + MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem]) | ||
360 | + MEMORIAB[-1].FLAG = True | ||
361 | + #print MEMORIAB[-1].TEXT | ||
362 | + indtempamem = indtempamem + 1 | ||
363 | + elif 'hypernymy' in MEMORIAB[t].SIMP[s].TYPE: | ||
364 | + print "**hypernymy detected**" | ||
365 | + #print "True to: " + MEMORIAB[t].TEXT | ||
366 | + MEMORIAB[t].FLAG = True | ||
367 | + else: | ||
368 | + print "Error: Unknown simplification construct detected." | ||
369 | + #print "True to: " + MEMORIAB[t].TEXT | ||
370 | + MEMORIAB[t].FLAG = True | ||
371 | + t = t + 1 | ||
372 | + s = s + 1 | ||
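
Every rule above re-derives the character offsets from the COMP string with the same `find('[')` / `find('..')` / `find(']')` slicing. A hypothetical helper capturing that repeated idiom (a refactoring sketch, not part of the script):

```python
def span_of(comp):
    """Parse the [start..end] offsets out of an iSimp COMP or TYPE string."""
    start = int(comp[comp.find('[') + 1:comp.find('..')])
    end = int(comp[comp.find('..') + 2:comp.find(']')])
    return start, end

print(span_of('conjunct [15..19]'))  # (15, 19)
```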
373 | + | ||
374 | + # CONDITIONS FOR WRITING THE SIMPLIFICATIONS TO TEXT FILES | ||
375 | + #print "Sentence simplified. New sentences generated:" | ||
376 | + for i in range(len(MEMORIAB)): | ||
377 | + # FLAG is reused here to mark the final (fully simplified) sentences | ||
378 | + MEMORIAB[i].FLAG = True | ||
379 | + for j in range(len(MEMORIAB[0].SIMP)): | ||
380 | + # NOTE: if a new simplifiable construct is added, also add it to this list: | ||
381 | + if ( ('member-collection' in MEMORIAB[0].SIMP[j].TYPE) or ('apposition' in MEMORIAB[0].SIMP[j].TYPE) or ('coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('parenthesis' in MEMORIAB[0].SIMP[j].TYPE) or ('sentence coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('full relative clause' in MEMORIAB[0].SIMP[j].TYPE) or ('reduced relative clause' in MEMORIAB[0].SIMP[j].TYPE) ) and (MEMORIAB[0].SIMP[j].TEXT in MEMORIAB[i].TEXT) : | ||
382 | + MEMORIAB[i].FLAG = False | ||
383 | + | ||
384 | + ## zero-pad the numbering of output files, e.g. 011 | ||
385 | + arcsalnum = 0 | ||
386 | + for i in range(len(MEMORIAB)): | ||
387 | + if MEMORIAB[i].FLAG == True: | ||
388 | + arcsalnum = arcsalnum + 1 | ||
389 | + # width for zero-padding the output index, e.g. '4'.zfill(3) -> '004' | ||
390 | + length = len(str(arcsalnum)) | ||
391 | + arcsalnum = 0 | ||
394 | + for i in range(len(MEMORIAB)): | ||
395 | + if MEMORIAB[i].FLAG == True: | ||
396 | + arcsalnum = arcsalnum + 1 | ||
397 | + print MEMORIAB[i].TEXT # output | ||
398 | + archSalNombre = sys.argv[2] | ||
399 | + archSalNombre=archSalNombre[:-4] + "-" + (str(arcsalnum)).zfill(length) + '.alg' | ||
400 | + archivoSalida=open(archSalNombre,"w") | ||
401 | + archivoSalida.write(MEMORIAB[i].TEXT+"\n")## | ||
402 | + archivoSalida.close() | ||
403 | + #WRITE OUTPUT FILE PATH TO INDEX (Arg 3) | ||
404 | + index_name = sys.argv[3] | ||
405 | + index = open(index_name, "a+") | ||
406 | + archSalNombreforIndex=archSalNombre + "\n" | ||
407 | + index.write(archSalNombreforIndex) | ||
408 | + index.close() | ||
409 | +else: | ||
410 | + print frase.TEXT #---- output when there were no simplifiable constructs | ||
411 | + archSalNombre = sys.argv[2] | ||
412 | + archSalNombre = archSalNombre[:-4] + ".alg" | ||
413 | + archivoSalida = open(archSalNombre,"a+") | ||
414 | + archivoSalida.write(frase.TEXT+"\n")## | ||
415 | + archivoSalida.close() | ||
416 | + #WRITE OUTPUT FILE PATH TO INDEX (Arg 3) | ||
417 | + index_name = sys.argv[3] | ||
418 | + index = open(index_name, "a+") | ||
419 | + archSalNombreforIndex=archSalNombre + "\n" | ||
420 | + index.write(archSalNombreforIndex) | ||
421 | + index.close() | ||
422 | + | ||
423 | + | ||
424 | +#END | ||
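
The net effect of, for example, the coordination rule: each conjunct replaces the whole coordinated span in a fresh copy of the sentence. A minimal sketch with made-up offsets:

```python
# Coordination rule in miniature: substitute each conjunct for the full SIMP text.
text = "GadX activates gadA and gadB ."
simp_text = "gadA and gadB"        # SIMP TEXT (the coordinated span)
conjuncts = [(15, 19), (24, 28)]   # COMP conjunct offsets into text
for start, end in conjuncts:
    print(text.replace(simp_text, text[start:end]))
# GadX activates gadA .
# GadX activates gadB .
```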
trn/empty-file.txt
0 → 100644
1 | +Delete me | ||
... | \ No newline at end of file | ... | \ No newline at end of file |