# Bacterial regulatory interaction extraction system
## Prerequisites
1. The input file must be tokenized and sentence-split (see the format sketch below).
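The pipeline expects two preprocessed views of the same sentences: a token (word) file and a transformed file with word|lemma|pos triples (parameters 1 and 2 of automatic-extraction-ris-gcs.sh). A minimal sketch of one sentence in both formats, assuming one sentence per line and space-separated tokens (the example values are illustrative):

```
# word file (tokens only)
ArgP represses the argP gene .

# transformed file (word|lemma|pos)
ArgP|ArgP|NNP represses|repress|VBZ the|the|DT argP|argP|NN gene|gene|NN .|.|.
```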
## Run
### Several files
Set filenames and paths in run-several-files.sh
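### Single file
For one article, automatic-extraction-ris-gcs.sh can be called directly. Below is a sketch of the invocation with placeholder paths, assuming the argument order documented in the header of the script (it mirrors the commented example call inside the script, plus the TSV path as the thirteenth argument):

```bash
# 1: word file, 2: transformed (word|lemma|pos) file, 3: working file for OpenIE,
# 4: output path for a1/a2 files, 5: simplify (Y/N), 6: separate deverbal (Y/N), 7: entity filter,
# 8-9: reference path and file, 10-11: evaluation report path and file,
# 12: TF synonyms dictionary, 13: path of the TSV file (section, sentence id, sentence)
./automatic-extraction-ris-gcs.sh \
    /path/to/features/article.tra.word.txt \
    /path/to/transformed/article.tra.txt \
    /path/to/ri-openie-extraction/ris.txt \
    /path/to/predicted-ris-gcs \
    Y Y FILT1 \
    /path/to/reference reference.txt \
    /path/to/evaluation-reports evaluation.txt \
    diccionario-STM-LT2-v7.0.SYNONYMS.json \
    /path/to/tsv-files \
    1> out.log 2> err.log
```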
## Acknowledgments
This work was supported by UNAM-PAPIIT IA203420.
#!/bin/bash
# Main script for automatic extraction of regulatory interactions
#Parameters
#1: Path and filename of the preprocessed sentences in token (word) format
#2: Path and filename of the preprocessed sentences in transformed format (word|lemma|pos)
#3: Path and filename for OpenIE processing
#4: Output path for the a1 and a2 files with RIs and GCs
#5: Simplify sentences? Y/N
#6: Separate verbal and deverbal sentences? Y/N
#7: Filter for sentences containing entities. FILT1 = (GENE OR TU) AND TF
#8: Path with the reference a1 and a2 files (true RIs and GCs)
#9: Reference file (true RIs and GCs)
#10: Path to save the evaluation file
#11: File to save the results of the evaluation against the reference
#12: File with TF synonyms
#13: Path of the TSV file with section, sentence id, and sentence (extracted from jsonpdf)
# RUN EXTRACTION FOR L&C STM
# ./automatic-extraction-ris-gcs.sh
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# Y Y FILT1
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference
# unused.txt
# /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports
# unused.txt
# diccionario-STM-LT2-v7.0.SYNONYMS.json
# 1>uno-STM-LC.txt
# 2>dos-STM-LC.txt
# ./automatic-extraction-ris-gcs.sh /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/features/6094508.tra.word.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/transformed/6094508.tra.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/ri-openie-extraction/ris-STM.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs Y Y FILT1 /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/unused-reference unused.txt /home/cmendezc/gitlab_repositories/lisen-curate-nlp-tasks/ris-extraction/bries-bacterial-regulatory-interaction-extraction-system/evaluation-reports unused.txt diccionario-STM-LT2-v7.0.SYNONYMS.json 1>uno-STM-LC.txt 2>dos-STM-LC.txt
# Some help
# Filename without path: filename=$(basename "$fullfile")
# Filename extension: extension="${filename##*.}"
# Filename without extension: filename="${filename%.*}"
# To work around the "argument list too long" error with many files: find . -print0 | xargs -0 grep AcrR
PATH_TO_CORENLP=/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09
DICC_PATH=/home/cmendezc/terminologicalResources
ISIMP_PATH=/home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/sentence-simplification/isimp_v2
SCRIPT_PATH=$(cd `dirname $0` && pwd)
INPUT_PATH=$1
INPUT_PATH_TRANS=$2
OUTPUT_FILE=$3
OUTPUT_PATH=$4
INPUT_NAME_EXT=$(basename "$INPUT_PATH")
INPUT_NAME="${INPUT_NAME_EXT%.*}"
# Simplify sentences?
SIMPLIFY=$5
# Separate sentences with deverbal effect?
DEVERBAL_SEPARATOR=$6
FILT=$7
TRUE_PATH=$8
TRUE_FILE=$9
PATH_EVAL=${10}
FILE_EVAL=${11}
DICC_SYNON=${12}
# CFMC 2022-03-09: TSV file with section, sentence id, and sentence (extracted from jsonpdf)
TSV_PATH=${13}
#Validate arguments
if [[ ! ("$#" == 13 ) ]]; then
echo 'Usage: ./automatic-extraction-ris-gcs.sh <inputPath_wordFile>
<inputPath_taggedFile> <outputPath_file> <outputPath_ris_gcs> <simplify?> <deverbal_detector?>
<filter> <true_path> <true_file> <path_evaluation_report> <file_evaluation_report>
<dictionary_TFs_synonyms> <path_tsv_file>'
exit 1
fi
echo "********** SELECTED PARAMETERS **********"
echo "INPUT PATH: $INPUT_PATH"
echo "INPUT PATH TRANSFORMED FILE $INPUT_PATH_TRANS"
echo "OUTPUT FILE: $OUTPUT_FILE"
echo "OUTPUT PATH: $OUTPUT_PATH"
echo "SIMPLIFY SENTENCES? $SIMPLIFY"
echo "SEPARATE DEVERBAL SENTENCES? $DEVERBAL_SEPARATOR"
echo "FILTER SENTENCES WITH ENTITIES? $FILT"
echo "REFERENCE (TRUE) PATH: $TRUE_PATH"
echo "REFERENCE (TRUE) FILE: $TRUE_FILE"
echo "PATH EVALUATION REPORT: $PATH_EVAL"
echo "FILE EVALUATION REPORT: $FILE_EVAL"
echo "DICTIONARY OF SYNONYMS OF TFS: $DICC_SYNON"
echo "********** SELECTED PROCESSES **********"
CLEAN_OUTPUT=FALSE
echo " Clean output paths: $CLEAN_OUTPUT"
FILTER=TRUE
echo " Filter sentences: $FILTER"
CLEAN=TRUE
echo " Clean sentences for iSimp: $CLEAN"
SEPARATE=TRUE
echo " Separate sentences to iSimp: $SEPARATE"
SIMPLI=TRUE
echo " Simplify sentences: $SIMPLI"
DEVERBAL=TRUE
echo " Separate deverbal and verbal sentences: $DEVERBAL"
DEVTAG=TRUE # Needs DEVERBAL=TRUE
echo " Tag sentences to separate deverbal and verbal sentences: $DEVTAG"
DEVSEPAR=TRUE # Needs DEVERBAL=TRUE
echo " Do separate deverbal and verbal sentences: $DEVSEPAR"
EXTDEVERBAL=TRUE
echo " Extract RI deverbal: $EXTDEVERBAL"
OPENIE=TRUE
echo " OpenIE triplet extraction: $OPENIE"
EXTOPENIE=TRUE
echo " Extract RI verbal: $EXTOPENIE"
EXTATTRIB=TRUE
echo " Extract RI attributive: $EXTATTRIB"
EXTAUTOREG=TRUE
echo " Extract RI autoregulation: $EXTAUTOREG"
EXTGC=FALSE
echo " Extract growth conditions: $EXTGC"
EVAL=FALSE
echo " Evaluate extraction: $EVAL"
EVALGC=FALSE
echo " Evaluate growth condition extraction: $EVALGC"
#########################
# Cleaning output paths #
#########################
if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
if [ -z "$(ls -A $OUTPUT_PATH/complete-ris/)" ]; then :
else
#echo "Not Empty"
# Original: rm $OUTPUT_PATH/complete-ris/*
find $OUTPUT_PATH/complete-ris -maxdepth 1 -name '*.*' -delete
fi
if [ -z "$(ls -A $OUTPUT_PATH/incomplete-ris/)" ]; then :
else
#echo "Not Empty"
# Original: rm $OUTPUT_PATH/incomplete-ris/*
find $OUTPUT_PATH/incomplete-ris -maxdepth 1 -name '*.*' -delete
fi
fi # if [ "$CLEAN_OUTPUT" = "TRUE" ]; then
#################
# preliminaries #
#################
#Clone and update simplification pipeline
#if [ ! -d "./sentence-simplification" ]
# then
# echo Downloading sentence simplificator...
# git clone https://github.com/ezojg/sentence-simplification
# else
# cd ./sentence-simplification
# git pull origin master
# cd ..
#fi
#Check for iSimp
#if [ ! -d "./sentence-simplification/isimp_v2" ]
# then
# echo ERROR: ./sentence-simplification/isimp_v2 not found. Please manually copy iSimp to said path.
# exit 1
#fi
if [ "$FILTER" = "TRUE" ]; then
echo "********** FILTER SENTENCES **********"
###################################################
# filter sentences with entities of interest #
# and collect attributive examples ArgP-regulated #
###################################################
# INPUT:
# 1) --inputFileWord $INPUT_PATH input file of feature 'word'
# 2) --inputFileTrans $INPUT_PATH_TRANS input file transformed (word|lemma|pos)
# 3) --outputPath $SCRIPT_PATH/filtered-sentences
# 4) --outputFile filtered-sentences.txt output File
# 5) --filter filter $FILT
# FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath $SCRIPT_PATH/attributive-sentences Path for attributive cases: ArgP-regulated genes
# 7) --attrFile attributive-sentences.txt File for attributive cases: ArgP-regulated genes
# $DICC_PATH/normalized_Effects.json
cd $SCRIPT_PATH
if [ -z "$(ls -A ./filtered-sentences/)" ]; then :
else
#echo "Not Empty"
rm ./filtered-sentences/*
fi
if [ -z "$(ls -A ./attributive-sentences/)" ]; then :
else
#echo "Not Empty"
rm ./attributive-sentences/*
fi
if [ -z "$(ls -A ./autoregulation-sentences/)" ]; then :
else
#echo "Not Empty"
rm ./autoregulation-sentences/*
fi
# CFMC 2022-03-09: To update tsv file with filtered sentences
# python3.4 $SCRIPT_PATH/sentence-filter.py --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
python3.4 $SCRIPT_PATH/sentence-filter_v02.py --tsvPath $TSV_PATH --inputFileWord $INPUT_PATH --inputFileTrans $INPUT_PATH_TRANS --outputPath $SCRIPT_PATH/filtered-sentences --outputFile filtered-sentences.txt --filter $FILT --attrPath $SCRIPT_PATH/attributive-sentences --autoPath $SCRIPT_PATH/autoregulation-sentences --dicPath $DICC_PATH --dicFile normalized_Effects.json
fi # if [ "$PRE" = "TRUE" ]; then
if [ "$CLEAN" = "TRUE" ]; then
echo "********** CLEAN SENTENCES **********"
#################################
# Clean sentences for iSimp #
#################################
# INPUT - PREVIOUS OUTPUT: filtered sentences $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt
# output path and file $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
if [ -z "$(ls -A ./format/sanitized_sentences/)" ]; then :
else
#echo "Not Empty"
rm ./format/sanitized_sentences/*
fi
#Original Daniel: python2 $SCRIPT_PATH/format/regex-before.py $INPUT_PATH $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
python2 $SCRIPT_PATH/format/regex.py $SCRIPT_PATH/filtered-sentences/filtered-sentences.txt $SCRIPT_PATH/format/sanitized_sentences/$INPUT_NAME_EXT
fi # if [ "$CLEAN" = "TRUE" ]; then
if [ "$SEPARATE" = "TRUE" ]; then
echo "********** SEPARATE SENTENCES **********"
################################
# Separate sentences for iSimp #
################################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/sanitized_sentences/$l
# output path and file $SCRIPT_PATH/format/split_sentences/$BARE_NAME
cd $SCRIPT_PATH
if [ -z "$(ls -A ./format/split_sentences/)" ]; then :
else
rm ./format/split_sentences/*
fi
cd ./format/sanitized_sentences
for l in $(\ls $INPUT_NAME*)
do
# echo $l
BARE_NAME=$(echo $l | cut -f 1 -d '.')
BARE_NAME+="_"
LENGTH="$(wc -l < $l)"
LENGTH="$(echo "${#LENGTH}")"
split -a $LENGTH -d -l 1 --additional-suffix=.spt $SCRIPT_PATH/format/sanitized_sentences/$l $SCRIPT_PATH/format/split_sentences/$BARE_NAME
done
fi # if [ "$SEPARATE" = "TRUE" ]; then
if [ "$SIMPLI" = "TRUE" ]; then
echo "********** SIMPLIFY SENTENCES **********"
######################
# Simplify sentences #
######################
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/format/split_sentences
# output file $OUTPUT_FILE
# path to iSimp $ISIMP_PATH
# CALL: ./sentence-simplification/sentence-simplification-main.sh
# CALL: $ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
# CALL: $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
# $OUTPUT_INDEX_FILE_PATH = $OUTPUT_FILE
# OUTPUT: simplified sentences in path ./algorithm_sentences
# while true; do
# read -p "Do you wish to simplificate sentences? [Y/N]: " yn
# case $yn in
# [Yy]* ) SIMP=1; break;;
# [Nn]* ) SIMP=0; break;;
# * ) echo "Please answer yes [Y] or no [N].";;
# esac
# done
case $SIMPLIFY in
[Yy]* )
SIMP=1
;;
[Nn]* )
SIMP=0
;;
* )
SIMP=1
;;
esac
cd $SCRIPT_PATH
if [ $SIMP == 1 ]
then #USING SIMPLIFICATION
echo "********** YES SIMPLIFY SENTENCES **********"
#Copy file to sentence-simplification
#FILE_NAME=$(basename "$INPUT_PATH")
#Call simplification pipeline AND create a file with the paths of the simplified sentences
./sentence-simplification/sentence-simplification-main.sh $SCRIPT_PATH/format/split_sentences $OUTPUT_FILE $ISIMP_PATH
#echo "input: $SCRIPT_PATH/format/split_sentences --output: $OUTPUT_FILE"
#echo "Sentences simplified. Paths to simplified sentences saved in $OUTPUT_FILE"
else #WITHOUT SIMPLIFICATION
echo "********** NO SIMPLIFY SENTENCES **********"
if [ -z "$(ls -A ./sentence-simplification/algorithm_sentences/)" ]; then :
else
#echo "Not Empty"
rm ./sentence-simplification/algorithm_sentences/*
fi
ls $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE
cp $SCRIPT_PATH/format/split_sentences/* $SCRIPT_PATH/sentence-simplification/algorithm_sentences
#echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi
fi # if [ "$SIMPLI" = "TRUE" ]; then
if [ "$DEVERBAL" = "TRUE" ]; then
echo "********** SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
######################
# Deverbal separator #
######################
# $PATH_TO_CORENLP
# INPUT - PREVIOUS OUTPUT: $SCRIPT_PATH/sentence-simplification/algorithm_sentences
# output path $SCRIPT_PATH/deverbal-separator/separated_sentences
# $DICC_PATH
# $DEVTAG POS tagging of sentences
# $DEVSEPAR Do separate sentences
# CALL: java -cp "$PATH_TO_CORENLP/*"
# $SCRIPT_PATH/filter.py
# OUTPUT: sentences separated in two paths according to verbal/deverbal effect
case $DEVERBAL_SEPARATOR in
[Yy]* )
DEVSEP=1
;;
[Nn]* )
DEVSEP=0
;;
* )
DEVSEP=1
;;
esac
if [ $DEVSEP == 1 ]
then #USING DEVERBAL SEPARATOR
#if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/)" ]; then :
#else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/*
# find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -maxdepth 1 -name '*.vrb' -delete
#fi
#if [ -z "$(ls -A $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/)" ]; then :
#else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*
# find $SCRIPT_PATH/deverbal-separator/separated_sentences/dev -maxdepth 1 -name '*.dev' -delete
#fi
echo "********** YES SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
# Original Daniel 2018-12-06: ./deverbal-separator/separator.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
./deverbal-separator/separator-v02.sh $PATH_TO_CORENLP $SCRIPT_PATH/sentence-simplification/algorithm_sentences $SCRIPT_PATH/deverbal-separator/separated_sentences $DICC_PATH $DEVTAG $DEVSEPAR
else #WITHOUT DEVERBAL SEPARATOR
echo "********** NO SEPARATE VERBAL AND DEVERBAL SENTENCES **********"
ls $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE
#echo "Sentences split. Paths to split sentences saved in $OUTPUT_FILE"
fi # [ $DEVSEP == 1 ]
fi # if [ "$DEVERBAL" = "TRUE" ]; then
if [ "$EXTDEVERBAL" = "TRUE" ]; then
echo "********** EXTRACT RI DEVERBAL **********"
#######################
# Extract RI deverbal #
#######################
# INPUT: deverbal files $(dirname ${file}) $(basename ${file})
# output path $OUTPUT_PATH $(basename ${file%.*})
# $DICC_PATH/names_EFFECT_ONTOGENE.txt
# $DICC_PATH/names_GENE.txt
# $DICC_PATH/names_GENE_ONTOGENE.txt
# $DICC_PATH/names_GENE_SYN.txt
# $DICC_PATH/names_TU.txt
# $DICC_PATH/names_TU_ONTOGENE.txt
# $DICC_PATH/names_TF_1grams.txt
# $DICC_PATH/names_TF_2grams.txt
# $DICC_PATH/names_TF_3grams.txt
# $DICC_PATH/names_TF_4grams.txt
# $DICC_PATH/names_TF_5Moregrams.txt
# $DICC_PATH/names_TF_ONTOGENE.txt
# $DICC_PATH/normalized_Effects.json
# OUTPUT: standoff files with RIs
# PATH ALREADY TAGGED ENTITIES: $SCRIPT_PATH/filtered-sentences
# FILE ALREADY TAGGED ENTITIES: filtered-sentences.ents.json
for file in $SCRIPT_PATH/deverbal-separator/separated_sentences/dev/*.*
do
#python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-OriginalDaniel.py $file $OUTPUT_PATH/$(basename ${file%.*}) $DICC_PATH/names_EFFECT_ONTOGENE.txt $DICC_PATH/names_GENE.txt $DICC_PATH/names_GENE_ONTOGENE.txt $DICC_PATH/names_GENE_SYN.txt $DICC_PATH/names_TU.txt $DICC_PATH/names_TU_ONTOGENE.txt $DICC_PATH/names_TF_1grams.txt $DICC_PATH/names_TF_2grams.txt $DICC_PATH/names_TF_3grams.txt $DICC_PATH/names_TF_4grams.txt $DICC_PATH/names_TF_5Moregrams.txt $DICC_PATH/names_TF_ONTOGENE.txt
#echo "Dir file: $(dirname ${file})"
#echo "File $(basename ${file})"
#echo "OUTOUT_PATH $OUTPUT_PATH"
#echo "File $(basename ${file%.*})"
echo "Dir and files: $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
#python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v02.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
python3 $SCRIPT_PATH/extract-ris-deverbal/EFF_DVB-regex-v03.py $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*}) $DICC_PATH/normalized_Effects.json $SCRIPT_PATH/filtered-sentences filtered-sentences.ents.json
done
fi # if [ "$EXTDEVERBAL" = "TRUE" ]; then
if [ "$OPENIE" = "TRUE" ]; then
echo "********** OPENIE TRIPLET EXTRACTION **********"
#########################
# OpenIE RI extraction #
#########################
# Gather the verbal sentences into a file list for OpenIE extraction
# Error: /bin/ls: Argument list too long: ls $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE
echo " Join verbal sentences into file for OpenIE extraction"
find $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb -type f -name '*' > $OUTPUT_FILE
#echo "Deberval sentences separated. Paths to verbal sentences saved in $OUTPUT_FILE"
echo " CoreNLP OpenIE..."
java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.naturalli.OpenIE -filelist $OUTPUT_FILE -triple.strict false -triple.all_nominals true -format reverb > $OUTPUT_FILE.reverb
fi # if [ "$OPENIE" = "TRUE" ]; then
if [ "$EXTOPENIE" = "TRUE" ]; then
echo "********** OPENIE RI EXTRACTION **********"
#########################
# OpenIE RI extraction #
#########################
# Nacho's oie_compress was replaced by a program written by CMC to analyze the triplets
# and keep those that suggest the participants and the effect
#Paste input and output for fancy printing
# Original Nacho: echo " Fancy printing..."
# Original Nacho: > $OUTPUT_FILE.fuzzy
# Original Nacho: python3 oie_compress.py --oies $OUTPUT_FILE.reverb --op fuzzy --ris $DICC_PATH/normalized_Effects.json --out $OUTPUT_FILE.fuzzy
#
# --inputFile $OUTPUT_FILE.reverb file obtained with CoreNLP
# --outputPath $OUTPUT_PATH
# --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
# --diccFile filtered-sentences.ents.json Before: termFilesTag_RIE_GCE_SYSTEM_ECCO.json
# --diccEffect normalized_Effects.json
# --format standoff
# --diccEPAth $DICC_PATH
# OUTPUT: standoff files with RIs
# python3.4 $SCRIPT_PATH/ri-openie-extraction.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccFile termFilesTag_RIE_GCE_SYSTEM_ECCO.json --diccEffect normalized_Effects.json --format standoff
python3.4 $SCRIPT_PATH/ri-openie-extraction-v02.py --inputFile $OUTPUT_FILE.reverb --outputPath $OUTPUT_PATH --diccPath $SCRIPT_PATH/filtered-sentences --diccFile filtered-sentences.ents.json --diccEffect normalized_Effects.json --diccEPAth $DICC_PATH --format standoff
#Join into single file
#Sort fuzzy
# Original Nacho: echo " Sort fuzzy..."
# Gets the effect type
# Original Nacho: sort $OUTPUT_FILE.fuzzy -o $OUTPUT_FILE.fuzzy
#Concatenate
# CMC eliminated following lines because simplification was
#discriminated before
#if [ $SIMP == 1 ]
#then #USING SIMPLIFICATION
#ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
#awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
#cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
#paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
#else #WITHOUT SIMPLIFICACION
#ls -l $SCRIPT_PATH/format/split_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
#awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
#cat $SCRIPT_PATH/format/split_sentences/* > $OUTPUT_FILE.als
#paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
#fi
# Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
# Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
# Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
# Original Nacho: echo " Creating ils, fls and als files..."
# Original Nacho: if [ $DEVSEP == 1 ]
# Original Nacho: then #USING DEVERBAL SEPARATOR
# Original Nacho: ls -l $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
# Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
# Original Nacho: cat $SCRIPT_PATH/deverbal-separator/separated_sentences/vrb/* > $OUTPUT_FILE.als
# Original Nacho: else #WITHOUT DEVERBAL SEPARATOR
# Original Nacho: ls -l $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* | awk -F '/' '{print $NF}' > $OUTPUT_FILE.ils
# Original Nacho: awk '{print $0":"}' $OUTPUT_FILE.ils > $OUTPUT_FILE.fls
# Original Nacho: cat $SCRIPT_PATH/sentence-simplification/algorithm_sentences/* > $OUTPUT_FILE.als
# Original Nacho: fi
# Original Nacho: echo " Paste merger..."
# Original Nacho: paste $OUTPUT_FILE.fls $OUTPUT_FILE.als > $OUTPUT_FILE.merger
# Original Nacho: echo " Create dsp file..."
# Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
# Original Nacho: awk -F "\t" 'NR==FNR{a[$1]=$0} NR>FNR && a[$1]>0{print a[$1],"\t",$2}' $OUTPUT_FILE.fuzzy $OUTPUT_FILE.merger > $OUTPUT_FILE.dsp
# rm $(dirname "$OUTPUT_FILE")/*.fls
# rm $(dirname "$OUTPUT_FILE")/*.ils
# rm $(dirname "$OUTPUT_FILE")/*.als
#rm $SCRIPT_PATH/*.merger
#rm $SCRIPT_PATH/*.reverb
#rm $SCRIPT_PATH/*.fuzzy
fi # if [ "$EXTOPENIE" = "TRUE" ]; then
if [ "$EXTATTRIB" = "TRUE" ]; then
echo "********** ATTRIBUTIVE RI EXTRACTION **********"
#########################
# Attributive RI extraction #
#########################
# Attributive RI extraction, such as ArgP-regulated genes aragP, aragT
#
# --inputPath $SCRIPT_PATH/attributive-sentences
# --outputPath $OUTPUT_PATH
# --diccPath $SCRIPT_PATH/filtered-sentences Before: $DICC_PATH
# --diccEffect normalized_Effects.json
# OUTPUT: standoff files with RIs
for file in $SCRIPT_PATH/attributive-sentences/*.*
do
echo "Dir file: $(dirname ${file})"
echo "File: $(basename ${file})"
# echo "OUTOUT_PATH $OUTPUT_PATH"
# echo "File $(basename ${file%.*})"
# echo "All $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
if [ "$(basename ${file})" = "*.*" ]; then
echo "None attributive sentence found"
else
python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
fi
done
fi # if [ "$EXTATTRIB" = "TRUE" ]; then
if [ "$EXTAUTOREG" = "TRUE" ]; then
echo "********** AUTOREGULATION RI EXTRACTION **********"
#########################
# Autoregulation RI extraction #
#########################
# Autoregulation RI extraction, such as ArgP protein represses its own synthesis
#
# --inputPath $SCRIPT_PATH/autoregulation-sentences
# --outputPath $OUTPUT_PATH
# --diccPath $DICC_PATH
# --diccEffect normalized_Effects.json
# OUTPUT: standoff files with RIs
for file in $SCRIPT_PATH/autoregulation-sentences/*.*
do
echo "Dir file: $(dirname ${file})"
echo "File: $(basename ${file})"
# echo "OUTOUT_PATH $OUTPUT_PATH"
# echo "File $(basename ${file%.*})"
# echo "All $(dirname ${file}) $(basename ${file}) $OUTPUT_PATH $(basename ${file%.*})"
if [ "$(basename ${file})" = "*.*" ]; then
echo "None autoregulation sentence found"
else
python3 $SCRIPT_PATH/ri-autoregulation-extraction-v01.py --inputPath $(dirname ${file}) --inputFile $(basename ${file}) --outputPath $OUTPUT_PATH --diccPath $DICC_PATH --diccEffect normalized_Effects.json
fi
done
fi # if [ "$EXTAUTOREG" = "TRUE" ]; then
if [ "$EXTGC" = "TRUE" ]; then
echo "********** EXTRACT GROWTH CONDITIONS **********"
#############################
# Extract growth conditions #
#############################
python3.4 $SCRIPT_PATH/extract-gcs/extract-gcs-regex.py --inputPath $OUTPUT_PATH/complete-ris --outputPath $OUTPUT_PATH/complete-ris --termPath /home/cmendezc/terminologicalResources
#python3 ./GCs-regex-before.py ./ejemplo_11.spt
#/home/elwe/Documents/prueba3/RIE_reordenado/RI-searcher/GC/ejemplo_11.spt ./ejemplo_11.a2
#./names_GC_ECCO_1grams.txt ./names_GC_ECCO_2grams.txt ./names_GC_ECCO_3grams.txt
#./names_GC_ECCO_4grams.txt ./names_GC_ECCO_5Moregrams.txt
fi # if [ "$EXTGC" = "TRUE" ]; then
if [ "$EVAL" = "TRUE" ]; then
echo "********** EVALUATE EXTRACTION **********"
if [ "$EVALGC" = "TRUE" ]; then
echo "********** EVALUATE GROWTH CONDITION EXTRACTION **********"
python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON --evaluateGCs
else
echo "********** EVALUATE WITHOUT GROWTH CONDITION EXTRACTION **********"
python3.4 $SCRIPT_PATH/evaluate-ris-gcs-standoff-v04.py --truePath $TRUE_PATH --trueFile $TRUE_FILE --predictedPath $OUTPUT_PATH/complete-ris --outputPath $PATH_EVAL --outputFile $FILE_EVAL --diccPath $DICC_PATH --diccSynon $DICC_SYNON
fi # if [ "$EVALGC" = "TRUE" ]; then
fi # if [ "$EVAL" = "TRUE" ]; then
# import fileinput
# import regex as re
# from regex import finditer
import sys
import json
if ( len( sys.argv ) != 3 ):
# Original Daniel: sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <EFFs_dictionary> \n" )
sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <normalized_Effects> \n")
sys.stderr.flush();
# exit( 2 );
# READ INPUT FILE
text_file = open( sys.argv[1], "r" )
dato = text_file.read()
text_file.close()
# READ DICTIONARY
# Loading normalized effects
# print('Loading normalized effects...')
with open(sys.argv[2]) as diccFile:
hashNormalizedEffects = json.load(diccFile)
DICC = list(hashNormalizedEffects.keys())
# Original Daniel: text_file = open( sys.argv[2], "r" )
# Original Daniel: DICC = text_file.read().splitlines()
# Original Daniel: text_file.close()
# declare variables
is_dev = False
is_vrb = False
# DICC
# 2018-11-30 CMC: We separated noun and only past participle for deverbal processing
# and all verb forms as verbal
# VRB: VB verb, base form think
# VRB: VBZ verb, 3rd person singular present she thinks
# VRB: VBP verb, non-3rd person singular present I think
# VRB: VBD verb, past tense they thought
# DEV: VBN verb, past participle a sunken ship
# VRB: VBG verb, gerund or present participle thinking is fun
# extend/VBP
for i in range(len(DICC)):
# print(DICC[i])
for token in dato.split():
word = token[:token.find("/")]
tag = token[token.find("/")+1:]
# print("word: {}".format(word))
# print("tag: {}".format(tag))
if (DICC[i] in word) and (("NN" in tag)
or ("VBN" == tag)
):
is_dev = True
# print("deverbal: " + word)
if (DICC[i] in word) and ("VB" in tag):
is_vrb = True
# print("verbal: " + word)
if is_dev and is_vrb:
sys.exit(11)
elif is_dev:
sys.exit(12)
elif is_vrb:
sys.exit(13)
else:
sys.exit(10)
#!/bin/bash
# Separates sentences by deverbal (.dev) and verbal (.vrb)
# Original Daniel: PATH_TO_CORENLP=/home/elwe/Documents/temporal/CoreNLP
#Validate arguments
if [[ ! ("$#" == 6 ) ]]; then
echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
exit 1
fi
SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Original Daniel: INPUT_PATH=$1 # folder containing the files to separate
# Original Daniel: OUTPUT_PATH=$2
PATH_TO_CORENLP=$1
INPUT_PATH=$2 # folder containing the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
# Tag sentences to separate deverbal and verbal sentences: $DEVTAG
TAG=$5
# Do separate deverbal and verbal sentences: $DEVSEPAR
SEP=$6
if [ $TAG == "TRUE" ]
then # ANALYZE WITH THE STANFORD PARSER
if [ -z "$(ls -A $SCRIPT_PATH/tagged/)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged/*
find $SCRIPT_PATH/tagged -maxdepth 1 -name '*.conll' -delete
fi
# Added by CMC
if [ -z "$(ls -A $SCRIPT_PATH/tagged-line/)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $SCRIPT_PATH/tagged-line/*
find $SCRIPT_PATH/tagged-line -maxdepth 1 -name '*.spt' -delete
fi
for j in $INPUT_PATH/*
do
#echo $j
#Original Daniel: java -Xms2g -cp "$PATH_TO_CORENLP/*" edu.stanford.nlp.parser.lexparser.LexicalizedParser -writeOutputFiles -retainTMPSubcategories -outputFormat "wordsAndTags" $SCRIPT_PATH/englishPCFG.ser.gz $j
# Command line: java -cp "/home/cmendezc/STANFORD_CORENLP/stanford-corenlp-full-2017-06-09/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file datos_0.spt -outputDirectory tagged
# java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
# With parse: java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,parse -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos -outputFormat conll -file $j -outputDirectory $SCRIPT_PATH/tagged
done
# Original Daniel: mv $INPUT_PATH/*.stp $SCRIPT_PATH/tagged/
for j in $SCRIPT_PATH/tagged/*
do
# Original Daniel: awk 'NF {print $2 "/" $4}' tagged/$j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${j%.spt}"
filename=$(basename "$j")
#filename="${filename%.*}"
awk 'NF {print $2 "/" $4}' $j | paste -d" " -s > $SCRIPT_PATH/tagged-line/"${filename%.*}.spt"
# Original Daniel: mv "$j" "${j%.stp}"
done
fi # if [ $TAG == "TRUE" ]
if [ $SEP == "TRUE" ]
then # SEPARATE FILES
# Original Daniel: if [ -z "$(ls -A $OUTPUT_PATH)" ]; then :
# Modified by Carlos Méndez
if [ -z "$(ls -A $OUTPUT_PATH/dev)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/dev/*
find $OUTPUT_PATH/dev -maxdepth 1 -name '*.dev' -delete
fi
if [ -z "$(ls -A $OUTPUT_PATH/vrb)" ]; then :
else
#echo "Not Empty"
# Error: /bin/rm: Argument list too long: rm $OUTPUT_PATH/vrb/*
find $OUTPUT_PATH/vrb -maxdepth 1 -name '*.vrb' -delete
fi
for j in $SCRIPT_PATH/tagged-line/*
do
# Original Daniel: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/names_EFFECT_ONTOGENE.txt
# CMC 2018-12-04: Without separating verbal forms: python3 $SCRIPT_PATH/filter.py $j $DICC_PATH/normalized_Effects.json
# CMC 2018-12-11: With separating verbal forms: python3 $SCRIPT_PATH/filter-v02.py $j $DICC_PATH/normalized_Effects.json
# CMC 2018-12-11: Considering only passive verbal form as deverbal: VBN verb, past participle
python3 $SCRIPT_PATH/filter-v03.py $j $DICC_PATH/normalized_Effects.json
VAR=$?
# filename=${j##*/}
# inputfile=${filename%.spt}
# exit
if [ $VAR == 11 ]; then :
# contains both dev and vrb
#Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
#Original Daniel: cp $INPUT_PATH/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
#echo "Deverbal and verbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
elif [ $VAR == 12 ]; then :
# contains dev
#echo "Deverbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
# cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/dev/$(basename ${j%.*}).dev
elif [ $VAR == 13 ]; then :
# contains vrb
#echo "Verbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
# cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
elif [ $VAR == 10 ]; then :
# appears to contain neither dev nor vrb
echo "Neither deverbal nor verbal"
cp $INPUT_PATH/$(basename ${j%.*}) $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
# cp $SCRIPT_PATH/tagged-line/${j##*/} $OUTPUT_PATH/vrb/$(basename ${j%.*}).vrb
fi
done
fi # if [ $SEP == "TRUE" ]
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
__author__ = 'CMendezC'
# Objective: evaluate predicted interactions in standoff format
# versus true interactions in tab format
# v04: add synonyms of TFs
# Parameters:
# 1) --truePath Path for true interactions
# 2) --trueFile File for true interactions
# 3) --predictedPath Path for predicted interactions
# 4) --outputPath Output path
# 5) --outputFile File for saving results
# 6) --evaluateGCs Evaluate with GCs
# 7) --diccPath Dictionary path
# 8) --diccSynon File with synonyms of TFs
# Output:
# 1) File with TP, FP, FN and the Precision, Recall, F1 scores
# Execution:
# python3.4 evaluate-ris-gcs-standoff.py
# --truePath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/analysis-validation-data-sets
# --trueFile ris-analysis-reference.txt
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris-gcs
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/evaluation-reports
# --outputFile evaluation-riegce-system-ris-analysis.txt
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --evaluateGCs
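# Reference (tab) file format, as parsed in the main program below: lines starting with '#'
# are skipped and the remaining lines are tab-separated. Sketch of the two accepted layouts
# (values are illustrative; the short example encodes the autoregulation "ArgP represses argP"):
#   short form: regulator<TAB>regulated<TAB>effect[<TAB>growth_condition]
#               e.g. ArgP<TAB>argP<TAB>repressor
#   long form (more than 4 tab-separated fields): two leading fields, then regulator,
#               regulated, effect and, when GCs are evaluated, the growth condition
# The effect "binding" is normalized to "regulator" when the reference is loaded.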
###########################################################
# MAIN PROGRAM #
###########################################################
def updateHashPredicted(pr, hashP, pm, sF, ef):
if pr not in hashP:
hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
hashP[pr] = hashTemp
else:
hashTemp = hashP[pr]
if pm in hashTemp["pmids"]:
hashP[pr]["pmids"][pm].append(sF)
else:
hashP[pr]["pmids"][pm] = [sF]
def getSummary(r, hashTemp):
pmids = 0
sentences = 0
orieff = ""
if r in hashTemp:
# print("r: {}".format(r))
orieff = hashTemp[r]["orieff"]
for pmid in hashTemp[r]["pmids"]:
pmids += 1
# print("PMID with sentences: {}".format(pmid))
for sent in hashTemp[r]["pmids"][pmid]:
sentences += 1
else:
return "WARNING: no data available!"
return "Artículos: {}\tFrases: {}\tOriginal effect: {}".format(pmids, sentences, orieff)
def getDetail(r, hashTemp):
return_text = ""
sentences = 0
aHash = {}
if r in hashTemp:
for pmid in hashTemp[r]["pmids"]:
for sent in hashTemp[r]["pmids"][pmid]:
sentences += 1
if pmid not in aHash:
aHash[pmid] = sentences
else:
return "WARNING: PMID duplicated!"
else:
return "WARNING: no data available!"
for p, s in sorted(aHash.items(), key=operator.itemgetter(1), reverse=True):
return_text += "\tPMID {}: {} sentences\n".format(p, s)
return return_text
def get_standard_name(regSynon):
reg = ""
if regSynon in hashSynon:
reg = hashSynon[regSynon]
else:
for syn, std in hashSynon.items():
if regSynon.startswith(syn):
reg = regSynon.replace(syn, std, 1)
break
return reg
def isCorrect(ripr, listT, rtype):
# The predicted regulator starts with entity
# Effect and regulated coincide
# Regulator coincides with activator or repressor
# We return a flag to indicate the type of matching: Full, Start, or Regulator
list_ripr = ripr.split('\t')
regulator = list_ripr[0]
regulatorStdName = ""
if use_synonyms:
regulatorStdName = get_standard_name(regulator)
for rit in listT:
# print("RI TRUE: {}".format(rit))
listRT = rit.split('\t')
regulatorT = listRT[0]
regexRegulatorStarts = re.compile(r'(' + regulatorT + r').+')
if rtype == "ri":
regulated = list_ripr[1]
regulatedT = listRT[1]
if (regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT:
return (rit, 'Full')
# For cases where regulator is part of the word, such as ArgP-regulated
result = regexRegulatorStarts.match(regulator)
if result:
# print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
regulator = result.group(1)
if regulator == regulatorT and regulated == regulatedT:
return (rit, 'Start')
else:
if use_synonyms:
result = regexRegulatorStarts.match(regulatorStdName)
if result:
# print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
regulator = result.group(1)
if regulator == regulatorT and regulated == regulatedT:
return (rit, 'Start')
elif rtype == "rief":
effect = list_ripr[2]
regulated = list_ripr[1]
effectT = listRT[2]
regulatedT = listRT[1]
# if ripr == "ArgP\ttargets\tregulator":
# print("RI-PREDICT: ArgP\ttargets\tregulator")
# print(" PREDICT: regulator {} effect {} regulated {}".format(regulator, effect, regulated))
# print(" TRUE: regulator {} effect {} regulated {}".format(regulatorT, effectT, regulatedT))
if (
regulator == regulatorT or regulatorStdName == regulatorT) and effect == effectT and regulated == regulatedT:
return (rit, 'Full')
elif (
regulator == regulatorT or regulatorStdName == regulatorT) and regulated == regulatedT and effect == "regulator" and (
effectT == "activator" or effectT == "repressor"):
# if ripr == "ArgP\ttargets\tregulator":
# print(" Correct RI with regulator: {}".format(ripr))
# return rit CMC 20181014: I think it should be the predicted RI, because otherwise the output lists lose whether it was correct or not
return (ripr, 'Regulator')
else:
# For cases where regulator is part of the word, such as ArgP-regulated
result = regexRegulatorStarts.match(regulator)
if result:
# print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
regulator = result.group(1)
if regulator == regulatorT and effect == effectT and regulated == regulatedT:
return (rit, 'Start')
elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
effectT == "activator" or effectT == "repressor"):
# if ripr == "ArgP\ttargets\tregulator":
# print(" Correct RI with regulator: {}".format(ripr))
# return rit CMC 20181014: I think it should be the predicted RI, because otherwise the output lists lose whether it was correct or not
# except that in this case only the regulator is used
# return rit
return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
else:
if use_synonyms:
result = regexRegulatorStarts.match(regulatorStdName)
if result:
if regulator == regulatorT and effect == effectT and regulated == regulatedT:
return (rit, 'Start')
elif regulator == regulatorT and regulated == regulatedT and effect == "regulator" and (
effectT == "activator" or effectT == "repressor"):
# if ripr == "ArgP\ttargets\tregulator":
# print(" Correct RI with regulator: {}".format(ripr))
# return rit CMC 20181014: I think it should be the predicted RI, because otherwise the output lists lose whether it was correct or not
# except that in this case only the regulator is used
# return rit
return (regulator + '\t' + regulated + '\t' + effect, 'Regulator')
# CMC 2018-10-14: Review riefgc because it has not been updated
# elif rtype == "riefgc":
# effect = list_ripr[2]
# regulated = list_ripr[1]
# gc = list_ripr[3]
# effectT = listRT[2]
# regulatedT = listRT[1]
# gcT = listRT[3]
# if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
# return rit
# elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
# return rit
# else:
# # For cases where regulator is part of the word, such as ArgP-regulated
# result = regexRegulatorStarts.match(regulator)
# if result:
# #print("Regulator predicted {} starts with regulator true {}".format(regulator, result.group(1)))
# regulator = result.group(1)
# if regulatorT == regulator and effect == effectT and regulated == regulatedT and gc == gcT:
# return rit
# elif regulatorT == regulator and effect == "regulator" and (effectT == "activator" or effectT == "repressor") and gc == gcT:
# return rit
return ('', '')
def get_scores_rules(listTrue, listPredicted, hashTemp, title, ri_type):
print("Evaluation")
# print(listPredicted)
# Precision = correctly extracted / predicted
# Recall = correctly extracted / reference
# F1 = 2 * ((Precision * Recall) / (Precision + Recall))
correct = 0
incorrect = 0
# For registering correct and incorrect RIs
hashPredicted = {}
# To print output RIs
hashOutputRIs = {}
# For registering unrecovered RIs
hashUnrecovered = {}
predicted = len(listPredicted)
print("len(listPredicted): {}".format(predicted))
reference = len(listTrue)
# print("Reference: {}".format(reference))
listRecovered = []
for ri_pred in listPredicted:
print("ri_pred: {}".format(ri_pred))
# if ri_pred in hashPredicted:
# print("WARNING: RI predicted {} duplicated {}".format(ri_pred, hashPredicted[ri_pred]))
# else:
# First all predicted RIs are incorrect
# hashPredicted[ri_pred] = "incorrect"
# if ri_pred in listTrue:
# hashPredicted[ri_pred] = "correct"
# listRecovered.append(ri_pred)
# correct += 1
# continue
riTrue = ''
result = isCorrect(ri_pred, listTrue, ri_type)
riResult = result[0]
matchType = result[1]
if riResult != '':
if riResult not in hashOutputRIs:
hashOutputRIs[riResult] = "Correct"
if ri_pred not in hashPredicted:
hashPredicted[ri_pred] = "correct"
print("ri_pred {} correct".format(ri_pred))
correct += 1
# Complete matching or the predicted regulator starts with entity
if matchType == 'Full' or matchType == 'Start':
# ri_pred matches with ri_true
if riResult in listRecovered:
print("WARNING: riResult {} already in listRecovered".format(riResult))
else:
listRecovered.append(riResult)
else:
incorrect += 1
if riResult not in hashOutputRIs:
hashOutputRIs[riResult] = "Incorrect"
if ri_pred not in hashPredicted:
hashPredicted[ri_pred] = "incorrect"
print("ri_pred {} incorrect".format(ri_pred))
if len(hashPredicted) != predicted:
print("ERROR: number of predicted RIs mismatch")
# return
print("Predicted: {}".format(predicted))
print("len(hashPredicted): {}".format(len(hashPredicted)))
cor = 0
inc = 0
for r, v in hashPredicted.items():
if v == "correct":
cor += 1
elif v == "incorrect":
inc += 1
if cor != correct:
print("ERROR: number of correct RIs mismatch")
# return
if inc != incorrect:
print("ERROR: number of incorrect RIs mismatch")
# return
print("Correct: {}".format(correct))
print("Incorrect: {}".format(incorrect))
unrecovered = 0
recovered = 0 # Only when it coincides with the reference,
# without counting Regulator as correct when Activator or Repressor appears in the reference
listRecovered2 = []
listUnrecovered = []
for ri in listTrue:
if ri not in listRecovered:
if ri in listUnrecovered:
print("WARNING: ri {} already in listUnrecovered".format(ri))
else:
listUnrecovered.append(ri)
unrecovered += 1
else:
if ri in listRecovered2:
print("WARNING: ri {} already in listRecovered2".format(ri))
else:
listRecovered2.append(ri)
recovered += 1
print("Len listRecovered: {}".format(len(listRecovered)))
print("Len listRecovered2: {}".format(len(listRecovered2)))
print("Len listUnrecovered: {}".format(len(listUnrecovered)))
# if (unrecovered + correct) != reference:
# print("ERROR: number of unrecovered {} + correct {} and reference {} RIs mismatch".format(unrecovered, correct, reference))
# return
print("{}".format(title))
print("Predicted: {}".format(predicted))
print("Reference: {}".format(reference))
print("Unrecovered: {}".format(unrecovered))
print("Recovered: {}".format(recovered))
precision = correct / predicted
print("Precision = correct / predicted: {}".format(precision))
# recall = correct / reference
# We calculate recall as the recovery rate, because correct instances are counted
# considering Regulator correct when Activator or Repressor appears in the reference
recall = recovered / reference
print("Recall = recovered / reference: {}".format(recall))
f1 = 2 * ((precision * recall) / (precision + recall))
print("F1: {}".format(f1))
with open(os.path.join(options.outputPath, options.outputFile), mode="a", errors="replace") as oFile:
oFile.write("{}\n".format(title))
oFile.write("Predicted: {}\n".format(predicted))
oFile.write("Reference: {}\n".format(reference))
oFile.write("Correct: {}\n".format(correct))
oFile.write("Incorrect: {}\n".format(incorrect))
oFile.write("Unrecovered: {}\n".format(unrecovered))
oFile.write("Recovered: {}\n".format(recovered))
oFile.write("Precision = correct / predicted: {}\n".format(precision))
oFile.write("Recall = recovered / reference: {}\n".format(recall))
oFile.write("F1: {}\n".format(f1))
oFile.write("Unrecovered instances:\n")
for r in sorted(listUnrecovered):
oFile.write("\tUnrecovered: {}\n".format(r))
oFile.write("Recovered instances:\n")
for r in sorted(listRecovered):
oFile.write("\tRecovered: {}\n".format(r))
oFile.write("Incorrect instances:\n")
for r, v in sorted(hashPredicted.items()):
if v == "incorrect":
oFile.write("\tIncorrect: {}\n".format(r))
oFile.write("Correct instances:\n")
for r, v in sorted(hashPredicted.items()):
if v == "correct":
oFile.write("\tCorrect: {}\n".format(r))
# oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp)))
# oFile.write("\t{}\n".format(getDetail(r, hashTemp)))
def get_scores(listTrue, listPredicted, hashTemp, title):
# Precision = correctly extracted / extracted
# Recall = correctly extracted / reference
# F1 = 2 * ((Precision * Recall) / (Precision + Recall))
print("{}".format(title))
# print("listTrue: {}".format(listTrue))
# print("listPredicted: {}".format(listPredicted))
print("Predicted: {}".format(len(listPredicted)))
print("Reference: {}".format(len(listTrue)))
correct = set(listTrue) & set(listPredicted)
print("Correct: {} ({})".format(len(correct), len(correct) / len(listPredicted)))
incorrect = set(listPredicted) - set(listTrue)
print("Incorrect: {} ({})".format(len(incorrect), len(incorrect) / len(listPredicted)))
unrecovered = set(listTrue) - set(listPredicted)
print("Unrecovered: {} ()".format(len(unrecovered), len(unrecovered) / len(listTrue)))
precision = len(correct) / len(listPredicted)
print("Precision: {}".format(precision))
recall = len(correct) / len(listTrue)
print("Recall: {}".format(recall))
f1 = 2 * ((precision * recall) / (precision + recall))
print("F1: {}".format(f1))
with open(os.path.join(options.outputPath, options.outputFile), mode="a") as oFile:
oFile.write("{}\n".format(title))
oFile.write("Predicted: {}\n".format(len(listPredicted)))
oFile.write("Reference: {}\n".format(len(listTrue)))
oFile.write("Correct: {}\n".format(len(correct)))
oFile.write("Incorrect: {}\n".format(len(incorrect)))
oFile.write("Unrecovered: {}\n".format(len(unrecovered)))
oFile.write("Precision: {}\n".format(precision))
oFile.write("Recall: {}\n".format(recall))
oFile.write("F1: {}\n".format(f1))
oFile.write("Correct instances:\n")
for r in sorted(correct):
oFile.write("\t{}\t{}\n".format(r, getSummary(r, hashTemp)))
oFile.write("\t{}\n".format(getDetail(r, hashTemp)))
oFile.write("Incorrect instances:\n")
for r in sorted(incorrect):
oFile.write("\t{}\n".format(r))
oFile.write("Unrecovered instances:\n")
for r in sorted(unrecovered):
oFile.write("\t{}\n".format(r))
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--truePath", dest="truePath",
help="Path true ris gcs", metavar="PATH")
parser.add_option("--trueFile", dest="trueFile",
help="File true ris gcs", metavar="FILE")
parser.add_option("--predictedPath", dest="predictedPath",
help="Path predicted ris gcs", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--outputFile", dest="outputFile",
help="File for saving results", metavar="FILE")
parser.add_option("--evaluateGCs", default=False,
action="store_true", dest="evaluateGCs",
help="Evaluate GCs?")
parser.add_option("--diccPath", dest="diccPath",
help="Path to dictionary", metavar="PATH")
parser.add_option("--diccSynon", dest="diccSynon",
help="File with synonyms", metavar="FILE")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path true ris gcs: " + str(options.truePath))
print("File true ris gcs: " + str(options.trueFile))
print("Path predicted ris gcs: " + str(options.predictedPath))
print("Output path: " + str(options.outputPath))
print("File for saving results: " + str(options.outputFile))
print("Evaluate GCs: " + str(options.evaluateGCs))
print("Path to dictionary: " + str(options.diccPath))
print("File with synonyms: " + str(options.diccSynon))
use_synonyms = False
hashSynon = {}
if options.diccPath != None and options.diccSynon != "no-synonyms":
print("***** Using synonyms *****")
use_synonyms = True
print('Loading dictionary of synonyms...')
with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon:
hashSynon = json.load(diccSynon)
print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon)))
listTrueRI = [] # Without effect or gc
listTrueRIEF = [] # With effect, without gc
if options.evaluateGCs:
listTrueRIEFGC = [] # With effect and gc
# Read and process Reference
with open(os.path.join(options.truePath, options.trueFile), mode="r", encoding="utf-8") as iFile:
for line in iFile:
line = line.strip('\n')
if line.startswith("#"):
continue
listElem = line.split('\t')
if len(listElem) > 4:
regulator = listElem[2]
regulated = listElem[3]
effect = listElem[4]
if options.evaluateGCs:
gc = listElem[5]
else:
regulator = listElem[0]
regulated = listElem[1]
effect = listElem[2]
if options.evaluateGCs:
gc = listElem[3]
if effect == "binding":
effect = "regulator"
ri = "{}\t{}".format(regulator, regulated)
if ri not in listTrueRI:
listTrueRI.append(ri)
rief = "{}\t{}\t{}".format(regulator, regulated, effect)
if rief not in listTrueRIEF:
listTrueRIEF.append(rief)
if options.evaluateGCs:
riefgc = "{}\t{}\t{}\t{}".format(regulator, regulated, effect, gc)
if riefgc not in listTrueRIEFGC:
listTrueRIEFGC.append(riefgc)
print(" RIs en referencia antes regulators: {}".format(len(listTrueRI)))
print(" RIEFs en referencia antes regulators: {}".format(len(listTrueRIEF)))
if options.evaluateGCs:
print(" RIEFGCs en referencia antes regulators: {}".format(len(listTrueRIEFGC)))
# Eliminate those RIs with regulator which also have RIs with activator or repressor
listRITemp = []
for ri in listTrueRIEF:
listRI = ri.split('\t')
regulator = listRI[0]
regulated = listRI[1]
effect = listRI[2]
if effect == "regulator":
tempRIA = "{}\t{}\t{}".format(regulator, regulated, "activator")
tempRIR = "{}\t{}\t{}".format(regulator, regulated, "repressor")
if tempRIA in listTrueRIEF or tempRIR in listTrueRIEF:
pass
# print("RI regulator matchs RI activator/repressor: {}".format(ri))
# listTrueRIEF.remove(ri)
else:
# print("Len before: {}".format(len(listRITemp)))
listRITemp.append(ri)
# print("Len after: {}".format(len(listRITemp)))
else:
listRITemp.append(ri)
listTrueRIEF = listRITemp
print(" RIEFs en referencia después regulators: {}".format(len(listTrueRIEF)))
if options.evaluateGCs:
for ri in listTrueRIEFGC:
listRI = ri.split('\t')
regulator = listRI[0]
regulated = listRI[1]
effect = listRI[2]
gc = listRI[3]
if effect == "regulator":
tempRIGCA = "{}\t{}\t{}\t{}".format(regulator, regulated, "activator", gc)
tempRIGCR = "{}\t{}\t{}\t{}".format(regulator, regulated, "repressor", gc)
if tempRIGCA in listTrueRIEFGC or tempRIGCR in listTrueRIEFGC:
listTrueRIEFGC.remove(ri)
print(" RIEFGCs en referencia después regulators: {}".format(len(listTrueRIEFGC)))
listPredictedRI = []
hashPredictedRI = {}
listPredictedRIEF = []
hashPredictedRIEF = {}
if options.evaluateGCs:
listPredictedRIEFGC = []
hashPredictedRIEFGC = {}
hashFiles = {}
for path, dirs, files in os.walk(options.predictedPath):
for file in files:
if file.endswith(".a1"):
filename = file[:-3]
if filename not in hashFiles:
hashFiles[filename] = 1
else:
hashFiles[filename] += 1
print("Files: {}".format(len(hashFiles)))
hashEntities = {}
processedFiles = 0
for file in sorted(hashFiles.keys()):
print("File: {}".format(file))
pmid = file[:file.find("_")]
# print("pmid {}".format(pmid))
sentenceFile = file[:file.find("-", file.find("_"))] + ".txt"
hashEntities = {}
hashOriginalEffect = {}
with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
entity = listLine2[0]
idEntity = listLine1[0]
originalEffect = listLine1[2]
if entity.startswith("EFFECT"):
entity = entity[entity.find(".") + 1:]
print("Entity: {}".format(entity))
entity = entity.replace("_dev", "")
print("Entity without _dev: {}".format(entity))
if idEntity not in hashOriginalEffect:
hashOriginalEffect[idEntity] = originalEffect
else:
entity = listLine1[2]
if idEntity not in hashEntities:
hashEntities[idEntity] = entity
print("hashEntities: {}".format(hashEntities))
with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File:
for line in a2File:
# print("Line a2: {}".format(line))
# R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
# print("effect: {}".format(hashEntities[effect]))
# if hashEntities[effect] == "binding":
# continue
if options.evaluateGCs:
gc = listLine2[3]
gc = gc[gc.find(":") + 1:]
pri = "{}\t{}".format(hashEntities[regulator], hashEntities[regulated])
if pri not in listPredictedRI:
listPredictedRI.append(pri)
updateHashPredicted(pri, hashPredictedRI, pmid, sentenceFile, None)
prief = "{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated], hashEntities[effect])
print("prief: {}".format(prief))
if prief not in listPredictedRIEF:
listPredictedRIEF.append(prief)
updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect])
if options.evaluateGCs:
priefgc = "{}\t{}\t{}\t{}".format(hashEntities[regulator], hashEntities[regulated],
hashEntities[effect], hashEntities[gc])
if priefgc not in listPredictedRIEFGC:
listPredictedRIEFGC.append(priefgc)
updateHashPredicted(priefgc, hashPredictedRIEFGC, pmid, sentenceFile, hashOriginalEffect[effect])
processedFiles += 1
print("Processed files: {}".format(processedFiles))
with open(os.path.join(options.outputPath, options.outputFile), mode="w") as oFile:
pass
get_scores_rules(listTrueRIEF, listPredictedRIEF, hashPredictedRIEF,
"Scores regulator-regulated-effect (without gc)", "rief")
get_scores_rules(listTrueRI, listPredictedRI, hashPredictedRI, "Scores regulator-regulated (without effect nor gc)",
"ri")
if options.evaluateGCs:
get_scores_rules(listTrueRIEFGC, listPredictedRIEFGC, hashPredictedRIEFGC,
"Scores regulator-regulated-effect-gc", "riefgc")
import fileinput
#import regex as re
#from regex import finditer
# We use Python 3, so the 'overlapped' option of finditer had to be dropped:
# Daniel wrote this script for Python 2.7 with the third-party regex module, whose
# finditer supports overlapped matching; the standard re module used here does not.
import re
from re import finditer
import sys
import os
import json
if (len(sys.argv) != 8):
sys.stderr.write("E: usage: " + sys.argv[
0] + " <input_path> <input_file> <output_path> <output_file> <normalized_Effects> <entity_path> <entity_file>\n")
sys.stderr.flush();
exit(2);
# READ INPUT FILE
# Original Daniel: text_file = open( sys.argv[1], "r" )
# Original Daniel: dato = text_file.read()
# Original Daniel: text_file.close()
filename = sys.argv[2]
input_file = open(os.path.join(sys.argv[1], filename), "r")
#print("Input file: {}".format(os.path.join(sys.argv[1], sys.argv[2])))
dato = input_file.read()
input_file.close()
# Loading normalized effects
# print('Loading normalized effects...')
with open(os.path.join(sys.argv[5])) as diccFile:
hashNormalizedEffects = json.load(diccFile)
# USING ALREADY TAGGED ENTITIES OF THE FILE (in filter sentence step)
#<entity_path> <entity_file>
# READ DICTIONARY WITH ALREADY TAGGED ENTITIES
entity_path = sys.argv[6]
entity_file = sys.argv[7]
print('Loading dictionaries with already tagged entities...')
with open(os.path.join(entity_path, entity_file)) as entFile:
hashDicc = json.load(entFile)
print(' Loading dictionaries with already tagged entities... Done!')
# CREATE LISTS WITH ALREADY TAGGED ENTITIES OF THE FILE
regexNumFile = re.compile(r'_([0-9]+)[.-]')
result = regexNumFile.search(filename)
numFile = ""
inumFile = 0
if result:
inumFile = int(result.group(1))
numFile = str(inumFile)
print("Numfile: {}".format(numFile))
else:
print("WARNING: numfile not found in filename")
ATEREG1 = []
PTEREG1GENE = []
PTEREG1TU = []
listEffects = []
if numFile in hashDicc:
hashTemp = hashDicc[numFile]
# print("hashDicc[numFile]: {}".format(hashTemp))
for k, v in hashTemp.items():
if v == "TF":
# print("Verifiying TF")
if k not in ATEREG1:
# print(" TF {}".format(k))
ATEREG1.append(k)
elif v == "GENE":
if k not in PTEREG1GENE:
PTEREG1GENE.append(k)
elif v == "TU":
if k not in PTEREG1TU:
PTEREG1TU.append(k)
elif v == "EFFECT":
if k not in listEffects:
listEffects.append(k)
else:
print("WARNING: entity not found in dictionaries")
else:
print("WARNING: numfile not found in dictionaries")
# REMOVE EXTENSION FROM FILE NAME
# Original Daniel: split_line = sys.argv[2]
output_path = sys.argv[3]
# Original Daniel: split_line = split_line[:-4]
# Original Daniel: file_name = split_line + ".a2"
input_file_name = sys.argv[2]
# Original Daniel: open( file_name , 'w').close()
file_name_entities_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a1")
file_name_interactions_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".a2")
file_name_entities_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a1")
file_name_interactions_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".a2")
file_name_text_complete = os.path.join(output_path, "complete-ris", input_file_name[:-4] + ".txt")
file_name_text_incomplete = os.path.join(output_path, "incomplete-ris", input_file_name[:-4] + ".txt")
open(file_name_entities_complete, 'w').close()
open(file_name_interactions_complete, 'w').close()
# Original Daniel: open( file_name , 'w').close()
open(file_name_entities_incomplete, 'w').close()
open(file_name_interactions_incomplete, 'w').close()
# declare variables
# Original Daniel: impresion = []
impresionEntities = []
impresionInteractionsComplete = []
impresionInteractionsIncomplete = []
salida_a2 = []
salida_a2_trimmed = []
salida_a2_str = []
q2line = ()
listadeRIs = []
posiblesminimos = [[], []]
posiblesmaximos = [[], []]
listasecundaria = []
listasecundaria_trimmed = []
impresionEntities = []
impresionInteractionsComplete = []
impresionInteractionsIncomplete = []
# Effects
for i in range(len(listEffects)):
if listEffects[i] in dato:
for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at|for)\b)', dato): # "of", "for", or "at" to the right of EFF
# Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\b(of|at)\b)', dato,
# Original Daniel: overlapped=True): # "of" o "at" a la derecha de EFF
spantup = match.span(1)
# Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1))
if match.group(1).lower() in hashNormalizedEffects:
effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()]
else:
effect = "EFFECT." + "deverbal_effect"
# Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1))
a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))
#print("Append effect a2line: {}".format(a2line))
salida_a2.append(a2line)
for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato): # "by" to the right of EFF
# Original Daniel: for match in finditer(r'\b(' + listEffects[i] + r')\b(\s\bby\b)', dato,
# Original Daniel: overlapped=True): # "by" a la derecha de EFF
spantup = match.span(1)
# Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(1))
if match.group(1).lower() in hashNormalizedEffects:
effect = "EFFECT." + hashNormalizedEffects[match.group(1).lower()]
else:
effect = "EFFECT." + "deverbal_effect"
# Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(1))
a2line = (effect, spantup[0], spantup[1] - 1, match.group(1))
salida_a2.append(a2line)
#print("Append effect a2line: {}".format(a2line))
for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato): # "is the" plus 0-1 words to the left of EFF
# Original Daniel: for match in finditer(r'(is\sthe\s(\S+\s){0,1})\b(' + listEffects[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "is the" 0-1 palabras a la izquierda de EFF
spantup = match.span(3)
# Original Daniel: a2line = ('deverbal_effect', spantup[0], spantup[1], match.group(3))
if match.group(3).lower() in hashNormalizedEffects:
effect = "EFFECT." + hashNormalizedEffects[match.group(3).lower()]
else:
effect = "EFFECT." + "deverbal_effect"
# Original Daniel: a2line = (effect, spantup[0], spantup[1], match.group(3))
a2line = (effect, spantup[0], spantup[1] - 1, match.group(3))
salida_a2.append(a2line)
#print("Append effect a2line: {}".format(a2line))
#print("Efectos salida_a2: {}".format(salida_a2))
# PTEREG1GENE regulated (patient) entities: GENE
for i in range(len(PTEREG1GENE)):
if PTEREG1GENE[i] in dato:
# print(PTEREG1GENE[i])
for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato): # "of", "for", or "at" plus 0-1 words to the left of the regulated entity
# Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1GENE[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado
spantup = match.span(3)
# print("match {} spantup {}".format(match.group(3), match.span(3)))
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3))
a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(3))
salida_a2.append(a2line)
# print("Append regulados a2line: {}".format(a2line))
for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato): # regulated entities without pattern
# Original Daniel: for match in finditer(r'\b(' + PTEREG1GENE[i] + r')\b', dato, overlapped=True): # regulados sin patron
spantup = match.span(1)
# print("match {} spantup {}".format(match.group(1), match.span(1)))
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1))
a2line = ('GENE', spantup[0], spantup[1] - 1, match.group(1))
listasecundaria.append(a2line)
#print("Efectos regulados gene listasecundaria: {}".format(listasecundaria))
# CMC: ADDED TO SEPARATE REGULATED GENE AND TU
# PTEREG1TU regulated (patient) entities: TU
for i in range(len(PTEREG1TU)):
if PTEREG1TU[i] in dato:
# print(PTEREG1TU[i])
for match in finditer(r'\b(of|at|for)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato): # "of", "for", or "at" plus 0-1 words to the left of the regulated entity
# Original Daniel: for match in finditer(r'\b(of|at)\b\s+(\w\s){0,1}\b(' + PTEREG1TU[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "of" o "at" 0-1 palabras a la izq de regulado
spantup = match.span(3)
# print("match: " + match.group(3))
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(3))
a2line = ('TU', spantup[0], spantup[1] - 1, match.group(3))
salida_a2.append(a2line)
# print("Append regulados a2line: {}".format(a2line))
for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato): # regulated entities without pattern
# for match in finditer(r'\b(' + PTEREG1TU[i] + r')\b', dato, overlapped=True): # regulados sin patron
spantup = match.span(1)
# Original Daniel: a2line = ('regulated', spantup[0], spantup[1], match.group(1))
a2line = ('TU', spantup[0], spantup[1] - 1, match.group(1))
listasecundaria.append(a2line)
#print("Efectos regulados tu listasecundaria: {}".format(listasecundaria))
# ATEREG1 regulator (agent) entities: TF
for i in range(len(ATEREG1)):
if ATEREG1[i] in dato:
# print(ATEREG1[i])
for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato): # "by" plus 0-1 words to the left of the regulator (TF)
# Original Daniel: for match in finditer(r'\bby\b\s+(\w\s){0,1}\b(' + ATEREG1[i] + r')\b', dato,
# Original Daniel: overlapped=True): # "by" 0-1 palabras a la izq de regulado
spantup = match.span(2)
# print("match: " + match.group(2))
# print("match {} spantup {}".format(match.group(2), match.span(2)))
# Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(2))
a2line = ('TF', spantup[0], spantup[1] - 1, match.group(2))
salida_a2.append(a2line)
#print("Append regulator a2line: {}".format(a2line))
for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato): # regulators without pattern
# for match in finditer(r'\b(' + ATEREG1[i] + r')\b', dato, overlapped=True): # reguladores sin patron
spantup = match.span(1)
# print("match {} spantup {}".format(match.group(1), match.span(1)))
# Original Daniel: a2line = ('regulator', spantup[0], spantup[1], match.group(1))
a2line = ('TF', spantup[0], spantup[1] - 1, match.group(1))
listasecundaria.append(a2line)
#print("Append regulator a2line: {}".format(a2line))
#print("Reguladores agentes salida_a2: {}".format(salida_a2))
#print("Reguladores agentes listasecundaria: {}".format(listasecundaria))
# Remove repeated tags or tags contained within others
if salida_a2:
salida_a2.sort(key=lambda tup: tup[1])
salida_a2_trimmed.append(salida_a2[0])
for i in range(len(salida_a2)):
copiar = True
for j in range(len(salida_a2_trimmed)):
if ((salida_a2[i][1] >= salida_a2_trimmed[j][1]) and (salida_a2[i][2] <= salida_a2_trimmed[j][2])):
copiar = False
if copiar:
salida_a2_trimmed.append(salida_a2[i])
if listasecundaria:
listasecundaria.sort(key=lambda tup: tup[1])
listasecundaria_trimmed.append(listasecundaria[0])
for i in range(len(listasecundaria)):
copiar = True
for j in range(len(listasecundaria_trimmed)):
if ((listasecundaria[i][1] >= listasecundaria_trimmed[j][1]) and (
listasecundaria[i][2] <= listasecundaria_trimmed[j][2])):
copiar = False
if copiar:
listasecundaria_trimmed.append(listasecundaria[i])
# print("Sin repeticiones salida_a2_trimmed: {}".format(salida_a2_trimmed))
#print("Sin repeticiones listasecundaria_trimmed: {}".format(listasecundaria_trimmed))
# Assign identifiers (TX) to entities (effect, regulator, regulated)
lastID = 0
for i in range(len(salida_a2_trimmed)):
# if sys.argv[2].find('355') > -1:
# print("i : {}".format(i))
salida_a2_trimmed[i] = list(salida_a2_trimmed[i])
ID = "T" + str(i + 1)
salida_a2_trimmed[i].insert(0, ID)
lastID = i + 1
# if sys.argv[2].find('355') > -1:
# print("lastID : {}".format(lastID))
for i in range(len(listasecundaria_trimmed)):
# if sys.argv[2].find('355') > -1:
# print("i : {}".format(i))
# print("lastID : {}".format(lastID))
listasecundaria_trimmed[i] = list(listasecundaria_trimmed[i])
ID = "T" + str(i + 1 + lastID)
listasecundaria_trimmed[i].insert(0, ID)
# print("Con identificadores salida_a2_trimmed: {}".format(salida_a2_trimmed))
#print("Con identificadores listasecundaria_trimmed: {}".format(listasecundaria_trimmed))
#print("salida_a2_trimmed") #########################
#print(salida_a2_trimmed) #########################
#print("listasecundaria_trimmed")
#print(listasecundaria_trimmed)
# Build Regulatory Interactions
i = 0
while i < int(len(salida_a2_trimmed)):
if "EFFECT" in salida_a2_trimmed[i][1]:
# SEARCH FOR REGULATED ENTITY TO THE RIGHT
nuevaRI = [salida_a2_trimmed[i][0], "", ""] # effect, theme (regulated), cause (regulator)
ref = ""
posiblesminimos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
posiblesminimos[0].append(salida_a2_trimmed[j][2])
posiblesminimos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# if not found, SEARCH FOR REGULATED ENTITY TO THE LEFT
if not ref:
posiblesmaximos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulated" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
if ("GENE" in salida_a2_trimmed[j][1] or "TU" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
posiblesmaximos[0].append(salida_a2_trimmed[j][3])
posiblesmaximos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesmaximos[0]:
refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0]))
ref = posiblesmaximos[1][refpointer]
nuevaRI[1] = ref
# SEARCH FOR REGULATOR TO THE RIGHT
ref = ""
posiblesminimos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][3] < salida_a2_trimmed[j][2]):
posiblesminimos[0].append(salida_a2_trimmed[j][2])
posiblesminimos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# if not found, SEARCH FOR REGULATOR TO THE LEFT
if not ref:
posiblesmaximos = [[], []]
j = 0
while j < int(len(salida_a2_trimmed)):
# Original Daniel: if ("regulator" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
if ("TF" in salida_a2_trimmed[j][1]) and (salida_a2_trimmed[i][2] > salida_a2_trimmed[j][3]):
posiblesmaximos[0].append(salida_a2_trimmed[j][3])
posiblesmaximos[1].append(salida_a2_trimmed[j][0])
j = j + 1
if posiblesmaximos[0]:
refpointer = posiblesmaximos[0].index(max(posiblesmaximos[0]))
ref = posiblesmaximos[1][refpointer]
nuevaRI[2] = ref
listadeRIs.append(nuevaRI)
i = i + 1
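# Illustrative note (editor sketch, hypothetical IDs): each entry of listadeRIs built above is
# [effect_id, regulated_id, regulator_id], e.g. ['T1', 'T3', 'T2'], meaning effect T1 is linked
# to the nearest regulated entity to its right (or, failing that, to its left) T3 and to the
# nearest TF T2. An empty string is left when no candidate was found; those gaps are filled in
# the second phase below using listasecundaria_trimmed.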
# SECOND PHASE OF SEARCH FOR REGULATORS AND REGULATED ENTITIES
i = 0
while i < int(len(listadeRIs)):
if not listadeRIs[i][1]: # no regulated entity
ref = ""
posiblesminimos = [[], []]
# SEARCH FOR REGULATED ENTITY TO THE RIGHT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# SEARCH FOR REGULATED ENTITY TO THE LEFT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulated" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
if ("GENE" in listasecundaria_trimmed[j][1] or "TU" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# CHOOSE THE CLOSEST REGULATED ENTITY
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# print(ref)
listadeRIs[i][1] = ref
if not listadeRIs[i][2]: # no regulator
ref = ""
posiblesminimos = [[], []]
# SEARCH FOR REGULATOR TO THE RIGHT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][3] < listasecundaria_trimmed[j][2]):
posiblesminimos[0].append((listasecundaria_trimmed[j][2] - salida_a2_trimmed[ind][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# SEARCH FOR REGULATOR TO THE LEFT
j = 0
while j < int(len(listasecundaria_trimmed)):
for k in range(len(salida_a2_trimmed)):
if listadeRIs[i][0] == salida_a2_trimmed[k][0]:
ind = k
# Original Daniel: if ("regulator" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
if ("TF" in listasecundaria_trimmed[j][1]) and (salida_a2_trimmed[ind][2] > listasecundaria_trimmed[j][3]):
posiblesminimos[0].append((salida_a2_trimmed[ind][2] - listasecundaria_trimmed[j][3]))
posiblesminimos[1].append(listasecundaria_trimmed[j][0])
j = j + 1
# CHOOSE THE CLOSEST REGULATOR
if posiblesminimos[0]:
refpointer = posiblesminimos[0].index(min(posiblesminimos[0]))
ref = posiblesminimos[1][refpointer]
# print(ref)
listadeRIs[i][2] = ref
i = i + 1
#print("ListadeRIs: {}".format(listadeRIs))
# Choose regulators and regulated entities from listasecundaria to be printed
setmem = []
k = 0
while k < int(len(listadeRIs)):
j = 0
copysec = False
#while j < int(len(listasecundaria_trimmed)):
while j < len(listasecundaria_trimmed):
# print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs))
# Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
if listasecundaria_trimmed[j][0] == listadeRIs[k][2]:
# print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][2] {}".format(listasecundaria_trimmed[j][0],
# listadeRIs[k][2]))
copysec = True
# print("j: {}".format(j))
indj = j
j = j + 1
if copysec:
setmem.append(listasecundaria_trimmed[indj])
# print("setmen: {}".format(setmem))
#### CMC: I ADDED THIS CODE TO SEARCH FOR REGULATED ENTITIES, SINCE THE CODE ABOVE SEARCHES FOR REGULATORS
j = 0
copysec = False
#while j < int(len(listasecundaria_trimmed)):
while j < len(listasecundaria_trimmed):
# print("listasecundaria_trimmed {} and listadeRIs {}".format(listasecundaria_trimmed, listadeRIs))
# Original Daniel: if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
if listasecundaria_trimmed[j][0] == listadeRIs[k][1]:
# print("listasecundaria_trimmed[j][0] {} == listadeRIs[k][1] {}".format(listasecundaria_trimmed[j][0],
# listadeRIs[k][1]))
copysec = True
# print("j: {}".format(j))
indj = j
j = j + 1
if copysec:
setmem.append(listasecundaria_trimmed[indj])
# print("setmen: {}".format(setmem))
k = k + 1
setmem = sorted(setmem)
# print("setmen: {}".format(setmem))
dedup = [setmem[i] for i in range(len(setmem)) if i == 0 or setmem[i] != setmem[i - 1]]
# print("dedup: {}".format(dedup))
salida_a2_trimmed.extend(dedup)
#print("salida_a2_trimmed after listasecundaria_trimmed: {}".format(salida_a2_trimmed))
# Assign identifiers (RX) to events (RIs)
for i in range(len(listadeRIs)):
# Original Daniel: ID = "E" + str(i+1)
ID = "R" + str(i + 1)
listadeRIs[i].insert(0, ID)
#print("Con identificadores ListadeRIs: {}".format(listadeRIs))
# CREATE LIST OF EVENTS (RX) AND ENTITIES (TX) IN PRINT FORMAT
for i in range(len(salida_a2_trimmed)):
linea = str(salida_a2_trimmed[i][0]) + ' ' + str(salida_a2_trimmed[i][1]) + ' ' + str(
salida_a2_trimmed[i][2]) + ' ' + str(salida_a2_trimmed[i][3]) + ' ' + str(salida_a2_trimmed[i][4])
# Original Daniel: impresion.append(linea)
impresionEntities.append(linea)
for i in range(len(listadeRIs)):
if listadeRIs[i][2] and listadeRIs[i][3]:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2]) + ' ' + 'Cause:' + str(listadeRIs[i][3])
linea = str(listadeRIs[i][0]) + ' ' + "Interaction." + str(listadeRIs[i][1]) + ' ' + 'Target:' + str(
listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3])
# Original Daniel: elif listadeRIs[i][2]:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Theme:' + str(listadeRIs[i][2])
# Original Daniel: elif listadeRIs[i][3]:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1]) + ' ' + 'Cause:' + str(listadeRIs[i][3])
# Original Daniel: else:
# Original Daniel: linea = str(listadeRIs[i][0]) + ' ' + "deverbal_effect:" + str(listadeRIs[i][1])
# Original Daniel: impresion.append(linea)
impresionInteractionsComplete.append(linea)
#print("Interaction complete: {}".format(linea))
linea = str(listadeRIs[i][0]) + ' ' + "Interaction.regulator" + ' ' + 'Target:' + str(
listadeRIs[i][2]) + ' ' + 'Agent:' + str(listadeRIs[i][3])
impresionInteractionsIncomplete.append(linea)
#print("Entities: {}".format(impresionEntities))
# Write entities of complete interactions to a1
for line in impresionEntities:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_entities_complete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
# Write entities of incomplete interactions to a1
for line in impresionEntities:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_entities_incomplete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
# Write complete interactions (regulator, effect, regulated)
# print("InteractionsComplete: {}".format(impresionInteractionsComplete))
for line in impresionInteractionsComplete:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_interactions_complete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
# Write incomplete interactions (regulator, "regulator", regulated)
# print("InteractionsIncomplete: {}".format(impresionInteractionsIncomplete))
for line in impresionInteractionsIncomplete:
# Original Daniel: save_file = open( file_name, "a" )
save_file = open(file_name_interactions_incomplete, "a")
save_file.write(line)
save_file.write("\n")
save_file.close()
with open(file_name_text_complete, mode="w") as txtFile:
txtFile.write(dato)
with open(file_name_text_incomplete, mode="w") as txtFile:
txtFile.write(dato)
import fileinput
import re
import sys
if len(sys.argv) < 3:
sys.stderr.write("E: usage: " + sys.argv[0] + " <input_file> <output_file>\n")
sys.stderr.flush()
exit(2)
else:
print("Ok.")
# READ INPUT FILE
text_file = open( sys.argv[1], "r" )
dato = text_file.read().splitlines()
text_file.close()
# REMOVE EXTENSION FROM FILE NAME
split_line = sys.argv[2]
split_line = split_line[:-4]
file_name=""
file_name = split_line + ".san"
open( file_name , 'w').close()
# APPLY REGEX SUBSTITUTIONS AND WRITE OUTPUT FILE (ARGV 2)
for line in dato:
line = re.sub('[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NNNNa_)
line = re.sub('[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NNNNa_]
line = re.sub('[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_NN,NN,NN_)
line = re.sub('[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_NN,NN,NN_]
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num_)
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num.num_)
line = re.sub('[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #removes (_num-num_)
line = re.sub('[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num_]
line = re.sub('[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num.num_]
line = re.sub('[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #removes [_num-num_]
line = re.sub('[\(]\s[a-zA-Z]{1}\s[\)]', '', line.rstrip()) #removes (_alpha_)
line = re.sub('[\[]\s[a-zA-Z]{1}\s[\]]', '', line.rstrip()) #removes [_alpha_]
line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman_)
line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #removes (_Roman-Roman_)
line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman_)
line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #removes (_roman-roman_)
line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman_]
line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #removes [_Roman-Roman_]
line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman_]
line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #removes [_roman-roman_]
line = re.sub('[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.|table)\s[^\(|^\)]*[\)]', '', line.rstrip(), flags=re.I) #removes parenthetical references such as (_fig . N_), (_see ..._), (_i . e . ..._)
line = re.sub('  ', ' ', line.rstrip()) #collapses double spaces left by the removals above
#print(line)
save_file = open( file_name, "a" )
save_file.write(line)
save_file.write("\n")
save_file.close()
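# Illustrative note (editor sketch, made-up sentence): the substitutions above strip
# parenthetical citations, figure pointers, and single-letter or Roman-numeral markers left
# by PDF extraction. For example, a tokenized line such as
#   "The argO gene ( Fig . 2a ) is activated by ArgP ( 12 )"
# would be written to the .san file approximately as
#   "The argO gene is activated by ArgP"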
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to TRN table
# Parameters:
# 1) --trnPath Path to TRN detail table
# 2) --trnFile File of TRN detail table
# 3) --outputPath Output path
# 4) --organismPath Path to Organism annotation table
# 5) --organismFile File of Organism annotation table
# Output:
# 1) Tsv file detail with:
# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
# OrganismScore = {
# If only salmonella or only non identified organism = 1,
# If (startswith salmonella or non identified organism) and other organisms = 0.5
# If only other organisms = 0
# }
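# Worked example (editor sketch, hypothetical annotations): for a sentence annotated with
# Organisms = "Salmonella enterica serovar Typhimurium;Escherichia coli", the list starts with a
# Salmonella mention but also names another organism, so OrganismScore = 0.5;
# "Salmonella Typhimurium;unidentified plasmid" alone would give 1, and
# "Escherichia coli" alone would give 0.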
# Execution:
# python3.4 get-TRN-Organism-v1.py
# Local
# python get-TRN-Organism-v1.py
# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --trnFile STMTRN_all.detail.tsv
# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results
# --organismFile annotations_STMTRN_all.sentences.csv
# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv
###########################################################
# MAIN PROGRAM #
###########################################################
def only_salmonella_or_non_identified_organism(list_temp):
non_identified_organisms = [
'unidentified plasmid',
'unidentified',
'bacterium',
'bacterium IFAM-3211',
'bacterium IFAM-2074',
'bacterium IFAM-1493',
'bacterium IFAM-3215',
'bacterium IFAM-3359',
'hybrid',
'Vector pMC1403',
'Transposon Tn10',
'unidentified cloning vector',
'Plasmid F',
'Cloning vector pUC19'
]
matches = 0
for o in list_temp:
if o.lower().startswith("salmonella") or o in non_identified_organisms:
matches += 1
if matches == len(list_temp):
return True
else:
return False
def salmonella_or_non_identified_and_other_organisms(list_temp):
non_identified_organisms = [
'unidentified plasmid',
'unidentified',
'bacterium',
'bacterium IFAM-3211',
'bacterium IFAM-2074',
'bacterium IFAM-1493',
'bacterium IFAM-3215',
'bacterium IFAM-3359',
'hybrid',
'Vector pMC1403',
'Transposon Tn10',
'unidentified cloning vector',
'Plasmid F',
'Cloning vector pUC19'
]
matches = 0
for o in list_temp:
if o.lower().startswith("salmonella") or o in non_identified_organisms:
matches += 1
if matches < len(list_temp) and matches > 0:
return True
else:
return False
def only_other_organims(list_temp):
non_identified_organisms = [
'unidentified plasmid',
'unidentified',
'bacterium',
'bacterium IFAM-3211',
'bacterium IFAM-2074',
'bacterium IFAM-1493',
'bacterium IFAM-3215',
'bacterium IFAM-3359',
'hybrid',
'Vector pMC1403',
'Transposon Tn10',
'unidentified cloning vector',
'Plasmid F',
'Cloning vector pUC19'
]
matches = 0
for o in list_temp:
if o.lower().startswith("salmonella") or o in non_identified_organisms:
matches += 1
if matches == 0:
return True
else:
return False
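# Editor sketch (illustrative, hypothetical lists): the three helpers above partition an
# organism-mention list into the three OrganismScore cases, e.g.
#   only_salmonella_or_non_identified_organism(['Salmonella Typhimurium', 'unidentified']) -> True
#   salmonella_or_non_identified_and_other_organisms(['Salmonella Typhimurium', 'Escherichia coli']) -> True
#   only_other_organims(['Escherichia coli']) -> True
# Exactly one of them is expected to hold for any non-empty list.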
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--trnPath", dest="trnPath",
help="Path to TRN detail table", metavar="PATH")
parser.add_option("--trnFile", dest="trnFile",
help="File of TRN detail table", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--organismPath", dest="organismPath",
help="Path to organism annotation table", metavar="PATH")
parser.add_option("--organismFile", dest="organismFile",
help="File of organism annotation table", metavar="FILE")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to TRN detail table: " + str(options.trnPath))
print("File of TRN detail table: " + str(options.trnFile))
print("Output path: " + str(options.outputPath))
print("Path to organism annotation table: " + str(options.organismPath))
print("File of organism annotation table: " + str(options.organismFile))
# Load organism annotation table
print("Loading organism annotation table")
df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',')
print("Total de frases anotadas con organism: {}".format(df_organisms.shape[0]))
# Load TRN detail table
print("Loading TRN detail table")
df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t')
print("Total de frases en TRN: {}".format(df_detail.shape[0]))
# Fix column for organism. We changed this issue in get-TRN-v2.py
df_detail = df_detail.rename(columns={"Organism": "Organisms"})
df_detail['OrganismScore'] = 1.00
print(df_detail.columns)
#print(df_detail['Sentence'].head(15))
for idx in df_organisms.index:
organisms = df_organisms['Organisms'][idx]
SentenceNumberInFile = df_organisms['SentenceNumberInFile'][idx]
SentenceNumberInFile = SentenceNumberInFile - 2
# print("Organisms before: {}".format(df_detail.Organisms[SentenceNumberInFile]))
df_detail.loc[SentenceNumberInFile, 'Organisms'] = organisms
# print("Organisms assigned: {}".format(df_detail.Organisms[SentenceNumberInFile]))
# OrganismScore = {
# If only salmonella or only non identified organism = 1,
# If (startswith salmonella or non identified organism) and other organisms = 0.5
# If only other organisms = 0
# }
list_organisms = organisms.split(';')
# print(" OrganismScore before: {}".format(df_detail.OrganismScore[SentenceNumberInFile]))
if only_salmonella_or_non_identified_organism(list_organisms):
df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 1.00
elif salmonella_or_non_identified_and_other_organisms(list_organisms):
df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.50
elif only_other_organims(list_organisms):
df_detail.loc[SentenceNumberInFile, 'OrganismScore'] = 0.00
# print(" OrganismScore assigned: {}".format(df_detail.OrganismScore[SentenceNumberInFile]))
hashPredictedRIs = {}
hashPredictedRIsCount = {}
hashPredictedRIsCountVer = {}
hashPredictedRIsCountDev = {}
hashPredictedRIsCountAtt = {}
hashPredictedRIsCountAuto = {}
hashPredictedRIsScore = {}
hashPredictedRIsRI = {}
for idx in df_detail.index:
tf = df_detail['TF'][idx]
TypeRegulated = df_detail['TypeRegulated'][idx]
Regulated = df_detail['Regulated'][idx]
Effect = df_detail['Effect'][idx]
pmid = df_detail['PMID'][idx]
numsent = df_detail['NumSentence'][idx]
type_sent = df_detail['TypeSentence'][idx]
sentence = df_detail['Sentence'][idx]
original_idsentence = df_detail['OriginalIdSentence'][idx]
original_sentence = df_detail['OriginalSentence'][idx]
section_num = df_detail['SectionNum'][idx]
section_name = df_detail['SectionName'][idx]
organisms = df_detail['Organisms'][idx]
organism_score = df_detail['OrganismScore'][idx]
llave = "{}\t{}\t{}\t{}".format(tf, TypeRegulated, Regulated, Effect)
if organism_score == 0:
continue
if llave in hashPredictedRIs:
hashPredictedRIs[llave].append(
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence,
original_sentence, section_num, section_name, organisms,
organism_score, "", "", "", "", "", ""))
hashPredictedRIsCount[llave] += 1
if type_sent == "ver/dev":
hashPredictedRIsCountVer[llave] += 1
elif type_sent == "dev":
hashPredictedRIsCountDev[llave] += 1
elif type_sent == "att":
hashPredictedRIsCountAtt[llave] += 1
elif type_sent == "auto":
hashPredictedRIsCountAuto[llave] += 1
# if organism_score == 0.5:
# We penalize RI
# hashPredictedRIsScore[llave] -= 0.05
else:
hashPredictedRIs[llave] = [
"{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent, type_sent, sentence, original_idsentence,
original_sentence, section_num, section_name, organisms,
organism_score, "", "", "", "", "", "")]
hashPredictedRIsCount[llave] = 1
hashPredictedRIsCountVer[llave] = 0
hashPredictedRIsCountDev[llave] = 0
hashPredictedRIsCountAtt[llave] = 0
hashPredictedRIsCountAuto[llave] = 0
hashPredictedRIsScore[llave] = 1
if type_sent == "ver/dev":
hashPredictedRIsCountVer[llave] = 1
elif type_sent == "dev":
hashPredictedRIsCountDev[llave] = 1
elif type_sent == "att":
hashPredictedRIsCountAtt[llave] = 1
elif type_sent == "auto":
hashPredictedRIsCountAuto[llave] = 1
# if organism_score == 0.5:
# We penalize RI
# hashPredictedRIsScore[llave] -= 0.05
print("Total RIs en TRN con organismo: {}".format(len(hashPredictedRIs)))
with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org")), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
for k,v in hashPredictedRIs.items():
RI_value = "True"
# if hashPredictedRIsScore[k] < 1:
# RI_value = "Possible"
oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k],
hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k],
hashPredictedRIsScore[k], RI_value))
with open(os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org")), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n")
i = 0
for k,v in hashPredictedRIs.items():
for s in v:
oFile.write("{}\t{}\n".format(k, s))
i += 1
print("Total de frases en TRN organismo: {}".format(i))
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: generate TRN
# CFMC 2022-03-11: We added:
# 1) Section of output sentences
# 2)
# Parameters:
# 1) --predictedPath Path for predicted interactions
# 2) --outputPath Output path
# 3) --outputFile Prefix file for saving TRN
# 4) --diccPath Dictionary path
# 5) --diccSynon File with synonyms of TFs
# 6) --tsvPath Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf
# 7) --jsonpdfPath Path to read jsonpdf file to extract section name
# Output:
# 1) Tsv file detail with:
# TF TypeRegulated Regulated Effect PMID IdSentence TypeSentence Sentence
# Original_idsentence Original_sentence SectionNum SectionName OrganismMentions OrganismScore ConfirmationLevel
# 1) Tsv file summary with:
# TF TypeRegulated Regulated Effect SentCount Ver/Dev Att Auto Score RI (True/False)
# Execution:
# Version 2 TRN Salmonella
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_v2
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_v2 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/original-toy/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/jsonpdf
# articulos_sal_4
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_articulos_sal_4
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-4/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_4 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_4/jsonpdf
# articulos_sal_1
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_articulos_sal_1
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-1/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_articulos_sal_1 --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_1/jsonpdf
# all = articulos_sal_1 + articulos_sal_2 + articulos_sal_3 + articulos_sal_4
# python3.4 get-TRN-v2.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN_all
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv
# --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf
# python3.4 get-TRN-v2.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN-2021-all/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN_all --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json --tsvPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/original/tsv --jsonpdfPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/dataSets/data-sets-STM/preprocessed-STMTRN-2021/articulos_sal_all/jsonpdf
####
# python3.4 get-TRN-v1.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STMTRN
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STMTRN/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STMTRN --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# Con dataset automatic-extraction-STM-RIs-dataset
# python3.4 get-TRN-v1.py
# --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn
# --outputFile STM-RIs-dataset
# --diccPath /home/cmendezc/terminologicalResources
# --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
# python3.4 get-TRN-v1.py --predictedPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs/complete-ris --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/trn --outputFile STM-RIs-dataset --diccPath /home/cmendezc/terminologicalResources --diccSynon diccionario-STM-LT2-v7.0.SYNONYMS.json
###########################################################
# MAIN PROGRAM #
###########################################################
def updateHashPredicted(pr, hashP, pm, sF, ef):
# updateHashPredicted(prief, hashPredictedRIEF, pmid, sentenceFile, hashOriginalEffect[effect])
if pr not in hashP:
hashTemp = {"pmids": {pm: [sF]}, "orieff": ef}
hashP[pr] = hashTemp
else:
hashTemp = hashP[pr]
if pm in hashTemp["pmids"]:
hashP[pr]["pmids"][pm].append(sF)
else:
hashP[pr]["pmids"][pm] = [sF]
def get_standard_name(regSynon):
reg = regSynon
if regSynon in hashSynon:
reg = hashSynon[regSynon]
else:
for syn, std in hashSynon.items():
if regSynon.startswith(syn):
reg = regSynon.replace(syn, std, 1)
break
return reg
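# Editor sketch (illustrative, hypothetical synonym entry): get_standard_name maps a TF synonym
# to its standard name using hashSynon loaded from --diccSynon, e.g. with
#   hashSynon = {"HilD2": "HilD"}
#   get_standard_name("HilD2")  -> "HilD"   (exact match)
#   get_standard_name("HilD2p") -> "HilDp"  (prefix match; the first matching synonym wins)
#   get_standard_name("InvF")   -> "InvF"   (unchanged when no synonym applies)
# updateHashPredicted groups predicted pairs as {"pmids": {pmid: [sentenceFile, ...]}, "orieff": effect}.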
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--predictedPath", dest="predictedPath",
help="Path predicted ris gcs", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--outputFile", dest="outputFile",
help="Preffix file for saving results", metavar="FILE")
parser.add_option("--diccPath", dest="diccPath",
help="Path to dictionary", metavar="PATH")
parser.add_option("--diccSynon", dest="diccSynon",
help="File with synonyms", metavar="FILE")
parser.add_option("--tsvPath", dest="tsvPath",
help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
parser.add_option("--jsonpdfPath", dest="jsonpdfPath",
help="Path to read jsonpdf file to extract section name", metavar="PATH")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path predicted ris gcs: " + str(options.predictedPath))
print("Output path: " + str(options.outputPath))
print("Preffix file for saving results: " + str(options.outputFile))
print("Path to dictionary: " + str(options.diccPath))
print("File with synonyms: " + str(options.diccSynon))
print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))
print("Path to read jsonpdf file to extract section name: " + str(options.jsonpdfPath))
use_synonyms = False
hashSynon = {}
if options.diccPath != None and options.diccSynon != "no-synonyms":
print("***** Using synonyms *****")
use_synonyms = True
print('Loading dictionary of synonyms...')
with open(os.path.join(options.diccPath, options.diccSynon)) as diccSynon:
hashSynon = json.load(diccSynon)
print('Loading dictionary of synonyms {}... done!'.format(len(hashSynon)))
hashPredictedRIs = {}
hashPredictedRIsCount = {}
hashPredictedRIsCountVer = {}
hashPredictedRIsCountDev = {}
hashPredictedRIsCountAtt = {}
hashPredictedRIsCountAuto = {}
hashFiles = {}
for path, dirs, files in os.walk(options.predictedPath):
for file in files:
if file.endswith(".a1"):
filename = file[:-3]
if filename not in hashFiles:
hashFiles[filename] = 1
else:
hashFiles[filename] += 1
print("Files: {}".format(len(hashFiles)))
processedFiles = 0
id_ri = 1
regex_att_auto = re.compile(r"(\.att\.|\.auto\.)[0-9]*$")
for file in sorted(hashFiles.keys()):
print("File: {}".format(file))
type_sent = "ver/dev"
if file.find("dataSet_OnlyRI_sentences") > -1:
pmid = "000000"
if file.find("dataSet_OnlyRI_sentences.") > -1:
if file.find(".att.") > -1:
numsent = file[file.find("att.") + 4:]
type_sent = "att"
if pmid.find(".auto.") > -1:
numsent = file[file.find("auto.") + 5:]
type_sent = "auto"
else:
numsent = file[file.find("_", file.find("_", file.find("_") + 1) + 1) + 1:file.find("-")]
numsent = numsent.replace(".al", "")
print("dataSet_OnlyRI_sentences numsent: {}".format(numsent))
print("dataSet_OnlyRI_sentences pmid: {}".format(pmid))
else:
pmid = file[:file.find("_")]
# print("pmid: {}".format(pmid))
numsent = file[file.find("_")+1:file.find("-")]
numsent = numsent.replace(".al", "")
if pmid.find(".att.") > -1:
# CFMC 2022-03-11: Fix error in pmid
# CFMC 2022-03-11 Original: pmid = pmid.replace(".att.", "")
pmid = regex_att_auto.sub("", pmid)
numsent = file[file.find("att.")+4:]
type_sent = "att"
if pmid.find(".auto.") > -1:
# CFMC 2022-03-11: Fix error in pmid
# CFMC 2022-03-11 Original: pmid = pmid.replace(".auto.", "")
pmid = regex_att_auto.sub("", pmid)
numsent = file[file.find("auto.") + 5:]
type_sent = "auto"
# numsent = file[file.find("_"):file.find("-")]
# print("pmid {}".format(pmid))
# print("numsent: {}".format(numsent))
sentenceFile = file[:file.find("-", file.find("_"))] + ".txt"
hashEntitiesGenes = {}
hashEntitiesTUs = {}
hashEntitiesTFs = {}
hashEntitiesEffects = {}
hashOriginalEffect = {}
regex_fix_regulator = re.compile(r'(Regulated|Binds|Bind|deverbal_effect|Regulate)')
regex_fix_repressor = re.compile(r'(Repressing|Represses)')
with open(os.path.join(options.predictedPath, file + ".a1"), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
entity = listLine2[0]
entity_type = listLine2[0]
idEntity = listLine1[0]
originalEffect = listLine1[2]
if entity.startswith("EFFECT"):
entity = entity[entity.find(".") + 1:]
# print("Entity: {}".format(entity))
if pmid.find("_dev") > -1:
type_sent = "dev"
entity = entity.replace("_dev", "")
# print("Entity without _dev: {}".format(entity))
if idEntity not in hashOriginalEffect:
hashOriginalEffect[idEntity] = originalEffect
if idEntity not in hashEntitiesEffects:
# We fixed some wrong effects in TRN, but we must also fix this in the other script where the error is produced
if regex_fix_regulator.match(entity):
print("WARNING EFFECT: {}".format(entity))
entity = regex_fix_regulator.sub("regulator", entity)
print("WARNING EFFECT after: {}".format(entity))
if regex_fix_repressor.match(entity):
print("WARNING EFFECT: {}".format(entity))
entity = regex_fix_repressor.sub("repressor", entity)
print("WARNING EFFECT after: {}".format(entity))
hashEntitiesEffects[idEntity] = entity
else:
entity = listLine1[2]
if entity_type == "GENE":
if idEntity not in hashEntitiesGenes:
hashEntitiesGenes[idEntity] = entity
elif entity_type == "TU":
if idEntity not in hashEntitiesTUs:
hashEntitiesTUs[idEntity] = entity
elif entity_type == "TF":
if idEntity not in hashEntitiesTFs:
hashEntitiesTFs[idEntity] = entity
# print("hashEntities: {}".format(hashEntitiesGenes))
# print("hashEntities: {}".format(hashEntitiesTUs))
# print("hashEntities: {}".format(hashEntitiesTFs))
with open(os.path.join(options.predictedPath, file + ".a2"), mode="r") as a2File:
sentence = ''
with open(os.path.join(options.predictedPath, file + ".txt"), mode="r") as txtFile:
sentence = txtFile.read()
listTokens = [token.split('|')[0] for token in sentence.split()]
sentence = ' '.join(listTokens)
# CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
# Open jsonpdf file
hash_sections = {}
sentences = {}
print('Loading jsonpdf file...')
with open(os.path.join(options.jsonpdfPath, pmid + ".jsonpdf"), "r", encoding="utf-8", errors="replace") as jsonpdfFile:
text_file = jsonpdfFile.read()
if file.startswith("26781240"):
text_file = text_file.replace(" \\ ", " \\\\ ")
elif file.startswith("26249345"):
text_file = text_file.replace('}], ', '}],"sections": {}')
try:
hash_jsonpdf = json.loads(text_file)
print(' Loading jsonpdf file... done!')
except Exception as e:
print(e)
print(" Loading jsonpdf file failed: {}".format(file))
hash_sections = hash_jsonpdf["sections"]
# print("Sections: {}".format(hash_sections))
sentences = hash_jsonpdf["sentences"]
# Open tsv file
print('Loading tsv file...')
file_tsv = pmid + ".pre.fil.tsv"
tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
# print("tsv_file.shape: {}".format(tsv_file.shape))
tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
# print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
tsv_file_new = tsv_file_filtered.reset_index(drop=True)
# print(tsv_file_new.head(10))
print(' Loading tsv file... done!')
numsent_int = int(numsent)
original_sentence = tsv_file_new.at[numsent_int, 'sentence']
section_num = tsv_file_new.at[numsent_int, 'section']
# print("type(section_num): {}".format(type(section_num)))
original_idsentence = tsv_file_new.at[numsent_int, 'idsentence']
section_num_str = str(section_num)
if section_num_str in hash_sections:
section_name = hash_sections[section_num_str]
else:
section_name = "Unknown"
for line in a2File:
# print("Line a2: {}".format(line))
# R1 Interaction.T3 Target:T2 Agent:T1 Condition: T4
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
tf = hashEntitiesTFs[regulator]
if tf.endswith("ed"):
tf = tf[:tf.find("-")]
#else:
# Clean TF names by expressions seen in TRN output file
tf = re.sub(r"(/absence|controlle|activation|‐regulate|‐mediate|mediate|-regulate|regulate|ˉ|-like|-mutant|-type|-independent|-dependent|dependent|-dependant|-binding|-and|-family|-bound|-deficient|-indepen-dent|-inducing|-green|-overproducing|-or|-depletion|-repressible|-dual|-box)", "", tf)
# Clean false TF names - 2329
result = re.match(r"(cyclic|RHONDA|Crawford|Hulett|Rhodobacter|Danino|Huang|Neisseria|Huang|HUGHES1|Robbe-Saule|Danchin|Roberts|Furer|Hunter|Furue|Humphreys|Nacional)", tf)
if result:
break
# H
tf = get_standard_name(tf)
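# Illustrative note (editor sketch, hypothetical mention): a raw regulator mention such as
# "ArgP-regulated" (typical of attributive sentences) ends with "ed", so it is cut at the first
# "-" to give "ArgP"; the regex cleanup and get_standard_name() then leave it as the standard
# TF name, assuming no synonym entry applies.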
# print("numsent: {}".format(numsent))
# For L&C do not increment 1
# CFMC 2022-03-11 Original: numsent_int = int(numsent)
if regulated in hashEntitiesGenes:
type_regulated = "Gene"
llave = "{}\t{}\t{}\t{}".format(tf, "gene", hashEntitiesGenes[regulated],
hashEntitiesEffects[effect])
elif regulated in hashEntitiesTUs:
type_regulated ="TU"
llave = "{}\t{}\t{}\t{}".format(tf, "TU", hashEntitiesTUs[regulated],
hashEntitiesEffects[effect])
else:
print("ERROR: Regulated entity not found!")
# Skip this interaction line; without a resolved regulated entity no key can be built
continue
# Clean false cases
if llave.startswith("Hu"):
break
if llave in hashPredictedRIs:
# CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
hashPredictedRIs[llave].append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, ""))
hashPredictedRIsCount[llave] += 1
if type_sent == "ver/dev":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountVer[llave] += 1
# else:
# hashPredictedRIsCountVer[llave] = 1
elif type_sent == "dev":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountDev[llave] += 1
# else:
# hashPredictedRIsCountDev[llave] = 1
elif type_sent == "att":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountAtt[llave] += 1
# else:
# hashPredictedRIsCountAtt[llave] = 1
elif type_sent == "auto":
# if llave in hashPredictedRIsCountVer:
hashPredictedRIsCountAuto[llave] += 1
# else:
# hashPredictedRIsCountAuto[llave] = 1
else:
# CFMC 2022-03-11: We included section of sentences (num, name) and original idsentence and original sentence
hashPredictedRIs[llave] = ["{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(pmid, numsent_int, type_sent, sentence, original_idsentence, original_sentence, section_num, section_name, "", 0, "")]
hashPredictedRIsCount[llave] = 1
hashPredictedRIsCountVer[llave] = 0
hashPredictedRIsCountDev[llave] = 0
hashPredictedRIsCountAtt[llave] = 0
hashPredictedRIsCountAuto[llave] = 0
if type_sent == "ver/dev":
hashPredictedRIsCountVer[llave] = 1
elif type_sent == "dev":
hashPredictedRIsCountDev[llave] = 1
elif type_sent == "att":
hashPredictedRIsCountAtt[llave] = 1
elif type_sent == "auto":
hashPredictedRIsCountAuto[llave] = 1
id_ri += 1
processedFiles += 1
print("Processed files: {}".format(processedFiles))
with open(os.path.join(options.outputPath, options.outputFile + ".summary.tsv"), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
for k,v in hashPredictedRIs.items():
oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k],
hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], "1", "True"))
#oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, hashPredictedRIsCount[k], hashPredictedRIsCountVer[k], hashPredictedRIsCountDev[k], hashPredictedRIsCountAtt[k], hashPredictedRIsCountAuto[k], v))
with open(os.path.join(options.outputPath, options.outputFile + ".detail.tsv"), mode="w") as oFile:
# oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tDev\tAtt\tAuto\tSentences\n")
oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tConfirmationLevel\n")
for k,v in hashPredictedRIs.items():
for s in v:
oFile.write("{}\t{}\n".format(k, s))
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
import re
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: obtain predicted ris from attributive sentences, such as ArgP-regulated gene argP
# Input format: transformed format.
# WARNING: Only one sentence per line
# Parameters:
# 1) --inputPath Input path
# 2) --inputFile Input file
# 3) --outputPath Output path
# 4) --diccPath Dictionary path
# 5) --diccEffect File with normalized effects
# 6) --diccFile JSON file with entity dictionaries (commented out below)
# 7) --diccEPAth Dictionary path for diccEffect (commented out below)
# 8) --format Output format: standoff, tabs (commented out below)
# Output:
# 1) File with predicted ris combined with existing files.
# Format standoff:
# T1 TF 0 0 ArgP-regulated
# T2 GENE 0 0 argP
# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors
# R1 Interaction.activator Target:T3 Agent:T1
# Execution
# C:\anaconda3\python ri-attributive-extraction.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences
# --inputFile ris-sentences-analysis.att.017.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs
# --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources
# --diccEffect normalized_Effects.json
# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.286.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
# python3 ri-attributive-extraction.py
# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences
# --inputFile ris-sentences-analysis.att.017.txt
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs
# --diccPath /home/cmendezc/terminologicalResources
# --diccEffect normalized_Effects.json
# python3 ri-attributive-extraction.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json
###########################################################
# MAIN PROGRAM #
###########################################################
def getPosWord(wordPos, endPos, text, termList):
offsetStart = 0
wordNum = 0
listText = text.split()
for w in listText:
# if filenameBefore.find('000-2') > -1:
# print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
if wordNum >= int(wordPos):
# for tok in word.split():
for t in termList:
# For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
if w == t:
return [w, offsetStart, offsetStart + len(w) - 1]
#else:
wordNum += 1
offsetStart += len(w) + 1
if wordNum > int(endPos):
return None
return None
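# Illustrative usage of getPosWord (hypothetical sentence, not from the corpus):
# getPosWord(2, 5, "ArgP represses the argP gene", ["argP"]) returns ["argP", 19, 22],
# i.e. the matched word plus its character start/end offsets within the sentence.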
def getIdEntity(aList, etype, idE):
entity = aList[0]
if etype == "EFFECT":
normalizedEffect = entity
#print("EFFECT: {}".format(entity))
if entity in hashNormalizedEffects:
normalizedEffect = hashNormalizedEffects[entity]
etype += "." + normalizedEffect
#print("etype: {}".format(etype))
entityPosStart = aList[1]
entityPosEnd = aList[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
#print("keyEntity: {}".format(keyEntity))
if keyEntity not in hashEntities:
idE += 1
idEntity = "T{}".format(idE)
hashEntities[keyEntity] = idEntity
#print("New entity {}: {}".format(idEntity, keyEntity))
return idEntity, idE
else:
idEntity = hashEntities[keyEntity]
return idEntity, idE
def getIdInteraction(regulator, regulated, effect, idI, hashInt):
#print("hashInt: {}".format(hashInt))
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashInt:
idI += 1
idInteraction = "R{}".format(idI)
hashInt[keyInteraction] = idInteraction
#print("New interaction {}: {}".format(idInteraction, keyInteraction))
#return idInteraction, idI
else:
idInteraction = hashInt[keyInteraction]
return idInteraction, idI
def saveFiles(filename, hashE, hashI, s, effect):
if effect:
outputPath = os.path.join(options.outputPath, "complete-ris")
else:
outputPath = os.path.join(options.outputPath, "incomplete-ris")
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
aList = k.split()
a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
aList = k.split()
a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile:
txtFile.write(s)
def loadFileEntities(filename, outputPath, hashTemp):
#print("Start loadFileEntities")
idE = 1
try:
f = filename[:filename.rfind(".")] + ".a1"
# print("file entities: {}".format(f))
with open(os.path.join(outputPath, f), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
etype = listLine2[0]
entityPosStart = listLine2[1]
entityPosEnd = listLine2[2]
entity = listLine1[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
idEntity = listLine1[0]
if keyEntity not in hashTemp:
hashTemp[keyEntity] = idEntity
if int(idEntity[1:]) > idE:
idE = int(idEntity[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, f)))
# idE = 1
return idE
def loadFileInteractions(filename, outputPath, hashTemp):
#print("Start loadFileInteractions")
idI = 1
try:
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File:
for line in a2File:
#print("Line a2: {}".format(line))
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
idInteraction = listLine1[0]
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashTemp:
hashTemp[keyInteraction] = idInteraction
if int(idInteraction[1:]) > idI:
idI = int(idInteraction[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")))
# idI = 1
return idI
def getRealPos(posStart, posEnd, lin):
return (posStart, posEnd)
def getRI(r, l):
regulator = r.group('regulator')
regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l)
# regulatorStart = getRealPos(r.start('regulator'), l)
# regulatorEnd = getRealPos(r.end('regulator'), l)
regulated = r.group('regulated')
regulatedPos = getRealPos(r.start('regulated'), r.end('regulated'), l)
# regulatedStart = getRealPos(r.start('regulated'), l)
# regulatedEnd = getRealPos(r.end('regulated'), l)
effect = r.group('effect')
effectPos = getRealPos(r.start('effect'), r.end('effect'), l)
# effectStart = getRealPos(r.start('effect'), l)
# effectEnd = getRealPos(r.end('effect'), l)
#print("Regulator {}, start {}, end {}".format(regulator, regulatorPos[0], regulatorPos[1]))
#print("Regulated {}, start {}, end {}".format(regulated, regulatedPos[0], regulatedPos[1]))
#print("Effect {}, start {}, end {}".format(effect, effectPos[0], effectPos[1]))
return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
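# getRI returns [regulator, regulated, effect, line]; regulator and regulated are encoded
# as "word|lemma|TYPE|charStart|charEnd" and effect as "word|charStart|charEnd", which is
# how the main loop below splits them.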
if __name__ == "__main__":
# Parameter definition
# python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py
# --inputPath $(dirname ${file})
# --inputFile $(basename ${file})
# --outputPath $OUTPUT_PATH
# --diccPath $DICC_PATH
# --diccEffect normalized_Effects.json
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--diccPath", dest="diccPath",
help="Path to read dictionaries", metavar="PATH")
# parser.add_option("--diccFile", dest="diccFile",
# help="JSON file with entity dictionaries", metavar="FILE")
parser.add_option("--diccEffect", dest="diccEffect",
help="File with normalized effects", metavar="FILE")
# parser.add_option("--format", dest="format",
# help="Output format: standoff", metavar="TEXT")
# parser.add_option("--diccEPAth", dest="diccEPAth",
# help="File with normalized effects", metavar="FILE")
(options, args) = parser.parse_args()
#if len(args) > 0:
# parser.error("None parameter entered.")
# sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file: " + str(options.inputFile))
print("Output path: " + str(options.outputPath))
print("Path to read dictionaries: " + str(options.diccPath))
# print("JSON file with entity dictionaries: " + str(options.diccFile))
print("File with normalized effects: " + str(options.diccEffect))
# print("Output format: " + str(options.format))
# print("Path to read normalized effects: " + str(options.diccEPAth))
# regularWords = words.words('en')
# print('Loading dictionaries...')
# with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
# hashDicc = json.load(diccFile)
# hashTermFiles = hashDicc["hashTermFiles"]
# hashTerms = hashDicc["hashTerms"]
# for key in hashTermFiles.keys():
# for f in hashTermFiles[key]:
# # print('File: ' + f)
# with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile:
# for line in iFile:
# line = line.strip('\n')
# line = line.replace(' ', '-')
# if line not in hashTerms[key]:
# hashTerms[key].append(line)
# # if options.termLower:
# # hashTerms[key].append(line.lower())
# # if options.termCapitalize:
# # hashTerms[key].append(line.capitalize())
# print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
# Loading normalized effects
print('Loading normalized effects ending with -d...')
hashNormalizedEffects = {}
with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
hashNormalizedEffects = json.load(diccFile)
listEffects = []
for eff in hashNormalizedEffects.keys():
if eff.endswith('d'):
listEffects.append(eff)
listEffects.append("dependent")
effects = "|".join(listEffects)
#print("Effects: {}".format(effects))
files = {}
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
# regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)\s([^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
# regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+)\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+( [^ ]+)')
# regexAttRILeft = re.compile(r'((?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))\s([^|]+\|[^|]+\|(CC|,))?)+ ([^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
# regexAttRILeft = re.compile(r'(?:([^|\s]+\|[^|]+\|(?:GENE|TU))\s(?:[^|]+\|[^|]+\|(CC|,))?)+ (?:[^ ]+ ){1,3}(?P<regulator>[^|]+)\|[^|]+\|TF')
# regexAttRILeft = re.compile(r'(?=([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)')
# regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU))(\s[^|]+\|[^|]+\|(CC|,))?)')
# regexAttRILeft = re.compile(r'(?P<regulated>([^|\s]+\|[^|]+\|(GENE|TU)(\s[^|]+\|[^|]+\|(CC|,))?)+) ([^ ]+ )+(?P<regulator>[^|]+\|[^|]+\|TF)')
# regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>' + r'(' + effects + ')\|[^|]+\|TF) [^|]+\|gene')
# reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
# regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(regulated|repressed)\|[^|]+\|TF) [^|]+\|gene')
# regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ ){,5}(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene')
# CMC 2018-11-07: regexAttRILeft = re.compile(r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene')
regexAttRILeft = re.compile(
r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF)')
# regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ ){,5}(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
# CMC 2018-11-07: regexAttRIRight = re.compile(r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) [^|]+\|gene\|[^\s]+ ([^ ]+ )+(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
regexAttRIRight = re.compile(
r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + '))\|[^|]+\|TF) ([^ ]+ )*(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')
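# Illustrative match (assuming "regulated" is among the loaded effects ending in -d):
# in a transformed line such as "ArgP-regulated|ArgP-regulate|TF genes|gene|NNS argP|argP|GENE",
# regexAttRIRight captures regulator "ArgP-regulated|ArgP-regulate|TF", effect "regulated"
# and regulated "argP|argP|GENE"; regexAttRILeft covers the mirror-image order (GENE/TU first).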
filename = options.inputFile
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
outputPath = os.path.join(options.outputPath, "complete-ris")
idEntities = loadFileEntities(filename, outputPath, hashEntities)
idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
outputPath = os.path.join(options.outputPath, "incomplete-ris")
idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
listRIs = []
with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
for line in iFile:
line = line.rstrip('\n')
# Search leftward
#print("Searching leftward <<")
result = regexAttRILeft.search(line)
#print("result: {}".format(result))
lineTemp = line
# print("lineTemp: {}".format(lineTemp))
while result:
#print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect')))
listRIs.append(getRI(result, line))
#print("listRIs: {}".format(listRIs))
lineTemp = lineTemp.replace(result.group('regulated'), '')
#print("lineTemp for: {}".format(lineTemp))
result = regexAttRILeft.search(lineTemp)
#print("result: {}".format(result))
# Search rightward
#print("Searching rightward >>")
result = regexAttRIRight.search(line)
#print("result: {}".format(result))
lineTemp = line
# print("lineTemp: {}".format(lineTemp))
while result:
#print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulated'), result.group('effect')))
listRIs.append(getRI(result, line))
#print("listRIs: {}".format(listRIs))
lineTemp = lineTemp.replace(result.group('regulated'), '')
#print("lineTemp for: {}".format(lineTemp))
result = regexAttRIRight.search(lineTemp)
#print("result: {}".format(result))
# result = regexAttRIRight.finditer(line)
# lineTemp = line
# while result:
# listRIs.append(getRI(result, line))
# lineTemp = lineTemp.replace(result.group('regulated'), '')
# result = regexAttRIRight.finditer(lineTemp)
# return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
# regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
# effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
for ri in listRIs:
#print("ri: {}".format(ri))
if len(ri) != 4:
print("WARNING! corrupted list")
exit()
regulator = ri[0]
regulated = ri[1]
effect = ri[2]
line = ri[3]
listElem = regulator.split('|')
regulatorWord = listElem[0]
regulatorType = listElem[2]
regulatorStart = listElem[3]
regulatorEnd = listElem[4]
listElem = regulated.split('|')
regulatedWord = listElem[0]
regulatedType = listElem[2]
regulatedStart = listElem[3]
regulatedEnd = listElem[4]
listElem = effect.split('|')
effectWord = listElem[0]
effectType = "EFFECT"
effectStart = listElem[1]
effectEnd = listElem[2]
idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
if regulatedType == "GENE":
idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
elif regulatedType == "TU":
idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "TU", idEntities)
else:
print("WARNING! Unknown entity type")
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
idInteractions, hashInteractions)
idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
idInteractionsEffect,
hashInteractionsEffect)
saveFiles(filename, hashEntities, hashInteractions, line, effect=False)
saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True)
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
import re
from general_functions import getTypeRegulation
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: obtain predicted ris from autoregulation sentences,
# such as ArgP protein represses its own synthesis
# Input format: transformed format.
# WARNING: Only one sentence per line
# Parameters:
# 1) --inputPath Input path
# 2) --inputFile Input file
# 3) --outputPath Output path
# 4) --diccPath Dictionary path
# 5) --diccEffect File with normalized effects
# (the --diccFile, --diccEPAth and --format options are not defined in this script)
# Output:
# 1) File with predicted ris combined with existing files.
# Format standoff:
# T1 TF 0 0 ArgP
# T2 GENE 0 0 argP (TF name ArgP --> gene name argP)
# R1 Interaction.activator Target:T3 Agent:T1
# Sentence ArgP protein represses its own synthesis
# The FimZ transcription factor activates this promoter directly ,
# and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
# Execution
# python3 ri-autoregulation-extraction-v01.py
# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
# --inputFile dataSet_OnlyRI_sentences.auto.1017.txt
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# --diccPath /home/cmendezc/terminologicalResources
# --diccEffect normalized_Effects.json
# python3 ri-autoregulation-extraction-v01.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences --inputFile dataSet_OnlyRI_sentences.auto.1017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json
###########################################################
# MAIN PROGRAM #
###########################################################
def getPosWord(wordPos, endPos, text, termList):
offsetStart = 0
wordNum = 0
listText = text.split()
for w in listText:
# if filenameBefore.find('000-2') > -1:
# print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
if wordNum >= int(wordPos):
# for tok in word.split():
for t in termList:
# For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
if w == t:
return [w, offsetStart, offsetStart + len(w) - 1]
#else:
wordNum += 1
offsetStart += len(w) + 1
if wordNum > int(endPos):
return None
return None
def getIdEntity(aList, etype, idE):
entity = aList[0]
if etype == "EFFECT":
normalizedEffect = entity
#print("EFFECT: {}".format(entity))
if entity in hashNormalizedEffects:
normalizedEffect = hashNormalizedEffects[entity]
etype += "." + normalizedEffect
#print("etype: {}".format(etype))
entityPosStart = aList[1]
entityPosEnd = aList[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
#print("keyEntity: {}".format(keyEntity))
if keyEntity not in hashEntities:
idE += 1
idEntity = "T{}".format(idE)
hashEntities[keyEntity] = idEntity
#print("New entity {}: {}".format(idEntity, keyEntity))
return idEntity, idE
else:
idEntity = hashEntities[keyEntity]
return idEntity, idE
def getIdInteraction(regulator, regulated, effect, idI, hashInt):
#print("hashInt: {}".format(hashInt))
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashInt:
idI += 1
idInteraction = "R{}".format(idI)
hashInt[keyInteraction] = idInteraction
#print("New interaction {}: {}".format(idInteraction, keyInteraction))
#return idInteraction, idI
else:
idInteraction = hashInt[keyInteraction]
return idInteraction, idI
def saveFiles(filename, hashE, hashI, s, effect):
if effect:
outputPath = os.path.join(options.outputPath, "complete-ris")
else:
outputPath = os.path.join(options.outputPath, "incomplete-ris")
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1"), mode="w") as a1File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
aList = k.split()
a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="w") as a2File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
aList = k.split()
a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".txt"), mode="w") as txtFile:
txtFile.write(s)
def loadFileEntities(filename, outputPath, hashTemp):
#print("Start loadFileEntities")
idE = 1
try:
f = filename[:filename.rfind(".")] + ".a1"
# print("file entities: {}".format(f))
with open(os.path.join(outputPath, f), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
etype = listLine2[0]
entityPosStart = listLine2[1]
entityPosEnd = listLine2[2]
entity = listLine1[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
idEntity = listLine1[0]
if keyEntity not in hashTemp:
hashTemp[keyEntity] = idEntity
if int(idEntity[1:]) > idE:
idE = int(idEntity[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, f)))
# idE = 1
return idE
def loadFileInteractions(filename, outputPath, hashTemp):
#print("Start loadFileInteractions")
idI = 1
try:
with open(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2"), mode="r") as a2File:
for line in a2File:
#print("Line a2: {}".format(line))
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
idInteraction = listLine1[0]
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashTemp:
hashTemp[keyInteraction] = idInteraction
if int(idInteraction[1:]) > idI:
idI = int(idInteraction[1:])
except IOError:
print("IOError file: {}".format(os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")))
# idI = 1
return idI
'''
def getTypeRegulation(effect_group, posini, sent, type_sent):
# To change regulation effect in such as:
# negative regulator --> repressor
# positively regulates --> activator
effect_ret = effect_group
#listEff = effect_ret.split('|')
if type_sent == "tra":
regexTypeEffectPosi = re.compile(r'(?<=positive\|(RB|JJ) )' + effect_ret)
regexTypeEffectNega = re.compile(r'(?<=negative\|(RB|JJ) )' + effect_ret)
if regexTypeEffectPosi.search(sent, posini - 12):
# Probably not necessary: effect_ret = "activator|{}|{}".format(listEff[1], listEff[2])
effect_ret = "activator"
print("Change regulation effect: {}".format(sent))
elif regexTypeEffectNega.search(sent, posini - 12):
# Probably not necessary: effect_ret = "repressor|{}|{}".format(listEff[1], listEff[2])
effect_ret = "repressor"
print("Change regulation effect: {}".format(sent))
return effect_ret
'''
def getRealPos(posStart, posEnd, lin):
return (posStart, posEnd)
def getRI(r, l):
regulator = r.group('regulator')
regulatorPos = getRealPos(r.start('regulator'), r.end('regulator'), l)
# We change TF name to GENE name
listRegulator = regulator.split('|')
regulatorWord = listRegulator[0]
regulated = regulatorWord[0].lower()+regulatorWord[1:]
regulated += "|{}|GENE".format(regulated)
regulatedPos = getRealPos(0, 0, l)
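# Illustrative: for regulator "ArgP|ArgP|TF" the regulated entity becomes "argP|argP|GENE"
# (TF name with its first letter lowercased), matching the "ArgP --> argP" example in the
# header comments; its positions are fixed at 0,0.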
effect = r.group('effect')
# print("effect from group: {}".format(effect))
effectPos = getRealPos(r.start('effect'), r.end('effect'), l)
# To change regulation effect in:
# negative regulator --> repressor
# positively regulates --> activator
effect = getTypeRegulation(effect, r.start('effect'), l, "tra")
return [regulator + '|' + str(regulatorPos[0]) + '|' + str(regulatorPos[1]),
regulated + '|' + str(regulatedPos[0]) + '|' + str(regulatedPos[1]),
effect + '|' + str(effectPos[0]) + '|' + str(effectPos[1]), l]
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--diccPath", dest="diccPath",
help="Path to read dictionaries", metavar="PATH")
parser.add_option("--diccEffect", dest="diccEffect",
help="File with normalized effects", metavar="FILE")
(options, args) = parser.parse_args()
#if len(args) > 0:
# parser.error("None parameter entered.")
# sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file: " + str(options.inputFile))
print("Output path: " + str(options.outputPath))
print("Path to read dictionaries: " + str(options.diccPath))
print("File with normalized effects: " + str(options.diccEffect))
# Loading normalized effects
print('Loading normalized effects (all)...')
hashNormalizedEffects = {}
with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
hashNormalizedEffects = json.load(diccFile)
listEffects = []
for eff in hashNormalizedEffects.keys():
listEffects.append(eff)
effects = "|".join(listEffects)
#print("Effects: {}".format(effects))
files = {}
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
# The FimZ transcription factor activates this promoter directly ,
# and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
regexAutoRI = re.compile(
# r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]\s){,4}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
#r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^(TF)\s]+\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')
#r'(?P<regulator>[^|\s]+\|[^|]+\|TF)\s([^|\s]+\|[^|]+\|[^T][^F]\s)+(?P<effect>(' + effects + '))\|[^|]+\|EFFECT')
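# Illustrative match (assuming "regulator" is a key in normalized_Effects.json): for the
# OxyR example above, regexAutoRI captures regulator "OxyR|OxyR|TF" and effect "regulator",
# since "regulator|regulator|EFFECT" is followed within five tokens by
# "its|its|PRP$ own|own|JJ expression".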
filename = options.inputFile
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
outputPath = os.path.join(options.outputPath, "complete-ris")
idEntities = loadFileEntities(filename, outputPath, hashEntities)
idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
outputPath = os.path.join(options.outputPath, "incomplete-ris")
idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
listRIs = []
# print("Read autoregulation file")
with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
for line in iFile:
line = line.rstrip('\n')
print("Buscando autoregulation")
result = regexAutoRI.search(line)
#print("result: {}".format(result))
if result:
lineTemp = result.string[result.end('regulator'):result.end(0)]
# print("lineTemp: {}".format(lineTemp))
result2 = regexAutoRI.search(lineTemp)
if result2:
print("Regulator {} regulated {} effect {}".format(result2.group('regulator'), result2.group('regulator'), result2.group('effect')))
listRIs.append(getRI(result2, line))
print("listRIs: {}".format(listRIs))
elif result:
print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulator'), result.group('effect')))
listRIs.append(getRI(result, line))
print("listRIs: {}".format(listRIs))
for ri in listRIs:
#print("ri: {}".format(ri))
if len(ri) != 4:
print("WARNING! corrupted list")
exit()
regulator = ri[0]
regulated = ri[1]
effect = ri[2]
line = ri[3]
listElem = regulator.split('|')
regulatorWord = listElem[0]
regulatorType = listElem[2]
regulatorStart = listElem[3]
regulatorEnd = listElem[4]
listElem = regulated.split('|')
regulatedWord = listElem[0]
regulatedType = listElem[2]
regulatedStart = listElem[3]
regulatedEnd = listElem[4]
listElem = effect.split('|')
effectWord = listElem[0]
effectType = "EFFECT"
effectStart = listElem[1]
effectEnd = listElem[2]
idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
idInteractions, hashInteractions)
idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
idInteractionsEffect,
hashInteractionsEffect)
saveFiles(filename, hashEntities, hashInteractions, line, effect=False)
saveFiles(filename, hashEntities, hashInteractionsEffect, line, effect=True)
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
from general_functions import getTypeRegulation
import re
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: obtain predicted ris from triplets extracted by OpenIE Stanford CoreNLP
# Input format:
# WARNING: Only one sentence per line
# Parameters:
# 1) --inputPath Input path
# 2) --inputFile Input file
# 3) --outputPath Output path
# 4) --diccPath Dictionary path
# 5) --diccFile JSON file with entity dictionaries
# 6) --diccEffect File with normalized effects
# 7) --format Output format: standoff, tabs
# 8) --diccEPAth Dictionary path for diccEffect
# Output:
# 1) File with predicted ris.
# Format standoff:
# T1 TF 0 0 MetR
# T2 TU 0 0 metH
# T3 GENE 0 0 metH
# T1 Growth_condition 88 137 mitochondrial electron transport chain inhibitors
# T2 Growth_condition 150 179 switch rich to minimal medium
# R1 Interaction.activator Target:T3 Agent:T1
# R2 Interaction.activator Target:T2 Agent:T1
# Execution
# python3.4 ri-openie-extraction.py
# --inputFile /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris/predicted-ris.reverb
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/processing-ris
# --diccPath /home/cmendezc/terminologicalResources
# --diccFile normalized_Effects_Type.json
# --diccEffect termFilesTag_RIE_GCE_SYSTEM_ECCO.json
# --format standoff
###########################################################
# MAIN PROGRAM #
###########################################################
def getPosWord(wordPos, endPos, text, termList, type_entity=""):
#print("GETPOSWORD wordPOs {}".format(wordPos))
offsetStart = 0
wordNum = 0
listText = text.split()
for w in listText:
# if filenameBefore.find('000-2') > -1:
# print("Word {} in wordNum {} with wordPos {}".format(w, wordNum, wordPos))
if wordNum >= int(wordPos):
# for tok in word.split():
for t in termList:
# For entities starting word: if w == t or (w.startswith(t) and w not in regularWords):
if w == t:
if type_entity == "EFFECT":
# To change regulation effect in:
# negative regulator --> repressor
# positively regulates --> activator
print("text: {}".format(text))
new_w = getTypeRegulation(w, int(wordPos), text, "word")
return [new_w, offsetStart, offsetStart + len(w) - 1]
else:
return [w, offsetStart, offsetStart + len(w) - 1]
#else:
wordNum += 1
offsetStart += len(w) + 1
if wordNum > int(endPos):
return None
return None
def getIdEntity(aList, etype, idE):
entity = aList[0]
if etype == "EFFECT":
normalizedEffect = entity
# print("EFFECT: {}".format(entity))
if entity in hashEffects:
normalizedEffect = hashEffects[entity]
etype += "." + normalizedEffect
# print("EFFECT: {}".format(entity))
entityPosStart = aList[1]
entityPosEnd = aList[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
#if filenameBefore.find('061-02') > -1:
# print("keyEntity: {}".format(keyEntity))
# print("idE: {}".format(idE))
# print("hashEntities: {}".format(hashEntities))
if keyEntity not in hashEntities:
idE += 1
idEntity = "T{}".format(idE)
#if filenameBefore.find('061-02') > -1:
# print("idEntity not in hashEntities: {}".format(keyEntity))
# print("idE not in hashEntities: {}".format(idE))
hashEntities[keyEntity] = idEntity
#print("New entity {}: {}".format(idEntity, keyEntity))
return idEntity, idE
else:
idEntity = hashEntities[keyEntity]
return idEntity, idE
def getIdInteraction(regulator, regulated, effect, idI, hashInt):
#print("hashInt: {}".format(hashInt))
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashInt:
idI += 1
idInteraction = "R{}".format(idI)
hashInt[keyInteraction] = idInteraction
#print("New interaction {}: {}".format(idInteraction, keyInteraction))
#return idInteraction, idI
else:
idInteraction = hashInt[keyInteraction]
return idInteraction, idI
def saveFiles(filename, hashE, hashI, s, effect):
if effect:
outputPath = os.path.join(options.outputPath, "complete-ris")
else:
outputPath = os.path.join(options.outputPath, "incomplete-ris")
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="w") as a1File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="a+") as a1File:
for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
aList = k.split()
a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="w") as a2File:
#with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="a+") as a2File:
for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
aList = k.split()
a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
with open(os.path.join(outputPath, filename[:file.find(".")] + ".txt"), mode="w") as txtFile:
txtFile.write(s)
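# Note: the slicing expressions above use the module-level variable "file" (set in the
# processing loop under __main__), not the "filename" argument; the same applies to
# loadFileEntities and loadFileInteractions below.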
def loadFileEntities(filename, outputPath, hashTemp):
idE = 1
try:
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a1"), mode="r") as a1File:
for line in a1File:
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
etype = listLine2[0]
entityPosStart = listLine2[1]
entityPosEnd = listLine2[2]
entity = listLine1[2]
keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
idEntity = listLine1[0]
if keyEntity not in hashTemp:
hashTemp[keyEntity] = idEntity
if int(idEntity[1:]) > idE:
idE = int(idEntity[1:])
except IOError:
print("IOError file, idEntity starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a1")))
# idE = 1
return idE
def loadFileInteractions(filename, outputPath, hashTemp):
idI = 1
try:
with open(os.path.join(outputPath, filename[:file.find(".")] + ".a2"), mode="r") as a2File:
for line in a2File:
#print("Line a2: {}".format(line))
line = line.strip('\n')
listLine1 = line.split('\t')
listLine2 = listLine1[1].split(' ')
regulator = listLine2[2]
regulator = regulator[regulator.find(":") + 1:]
regulated = listLine2[1]
regulated = regulated[regulated.find(":") + 1:]
effect = listLine2[0]
effect = effect[effect.find(".") + 1:]
idInteraction = listLine1[0]
keyInteraction = "{} {} {}".format(regulator, regulated, effect)
if keyInteraction not in hashTemp:
hashTemp[keyInteraction] = idInteraction
if int(idInteraction[1:]) > idI:
idI = int(idInteraction[1:])
except IOError:
print("IOError file, idInteraction starts in 1: {}".format(os.path.join(outputPath, filename[:file.find(".")] + ".a2")))
# idI = 1
return idI
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
#parser.add_option("--outputFile", dest="outputFile",
#help="Output file", metavar="FILE")
parser.add_option("--diccPath", dest="diccPath",
help="Path to read dictionaries", metavar="PATH")
parser.add_option("--diccFile", dest="diccFile",
help="JSON file with entity dictionaries", metavar="FILE")
parser.add_option("--diccEffect", dest="diccEffect",
help="File with normalized effects", metavar="FILE")
parser.add_option("--format", dest="format",
help="Output format: standoff", metavar="TEXT")
parser.add_option("--diccEPAth", dest="diccEPAth",
help="File with normalized effects", metavar="FILE")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameter entered.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file: " + str(options.inputFile))
print("Output path: " + str(options.outputPath))
#print("Output file: " + str(options.outputFile))
print("Path to read dictionaries: " + str(options.diccPath))
print("JSON file with entity dictionaries: " + str(options.diccFile))
print("Path to read normalized effects: " + str(options.diccEPAth))
print("File with normalized effects: " + str(options.diccEffect))
print("Output format: " + str(options.format))
regularWords = words.words('en')
print('Loading dictionaries...')
with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
hashDicc = json.load(diccFile)
# hashTermFiles = hashDicc["hashTermFiles"]
# hashTerms = hashDicc["hashTerms"]
# for key in hashTermFiles.keys():
# for f in hashTermFiles[key]:
# # print('File: ' + f)
# with open(os.path.join(options.diccPath, f), "r", encoding="utf-8", errors="replace") as iFile:
# for line in iFile:
# line = line.strip('\n')
# line = line.replace(' ', '-')
# if line not in hashTerms[key]:
# hashTerms[key].append(line)
# # if options.termLower:
# # hashTerms[key].append(line.lower())
# # if options.termCapitalize:
# # hashTerms[key].append(line.capitalize())
# print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
# Loading normalized effects
print('Loading normalized effects...')
with open(os.path.join(options.diccEPAth, options.diccEffect)) as diccFile:
hashEffects = json.load(diccFile)
files = {}
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
filenameBefore = ''
regexNumFile = re.compile(r'_([0-9]+)[.-]')
numFile = ""
inumFile = 0
hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []}
with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
for line in iFile:
line = line.rstrip('\n')
listLine = line.split('\t')
file = listLine[0]
filename = file.split("/")[-1]
filename = filename[:-4]
if filename not in files:
# New file, that is, new sentence
files[filename] = 1
if len(files) > 1:
if len(hashEntities) > 0:
#if filenameBefore.find('061-02') > -1:
# print("filenameBefore: {}".format(filenameBefore))
# print("Save hashEntities: {}".format(hashEntities))
# print("Save hashInteractions: {}".format(hashInteractions))
# print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect))
saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
filenameBefore = filename
hashEntities = {}
hashInteractions = {}
hashInteractionsEffect = {}
idEntities = 1
idInteractions = 1
idInteractionsEffect = 1
outputPath = os.path.join(options.outputPath, "complete-ris")
idEntities = loadFileEntities(filename, outputPath, hashEntities)
idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
outputPath = os.path.join(options.outputPath, "incomplete-ris")
idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)
result = regexNumFile.search(filenameBefore)
if result:
inumFile = int(result.group(1))
numFile = str(inumFile)
print("Numfile: {}".format(numFile))
else:
print("WARNING: numfile not found in filename")
hashTerms = {"TF": [], "TU": [], "EFFECT": [], "GENE": []}
if numFile in hashDicc:
hashTemp = hashDicc[numFile]
#print("hashDicc[numFile]: {}".format(hashTemp))
for k, v in hashTemp.items():
if v == "TF":
# print("Verifiying TF")
if "TF" in hashTerms:
# print(" TF {}".format(k))
hashTerms["TF"].append(k)
else:
hashTerms["TF"] = [k]
elif v == "GENE":
if "GENE" in hashTerms:
hashTerms["GENE"].append(k)
else:
hashTerms["GENE"] = [k]
elif v == "TU":
if "TU" in hashTerms:
hashTerms["TU"].append(k)
else:
hashTerms["TU"] = [k]
elif v == "EFFECT":
if "EFFECT" in hashTerms:
hashTerms["EFFECT"].append(k)
else:
hashTerms["EFFECT"] = [k]
else:
print("WARNING: entity not found in dictionaries")
else:
print("WARNING: numfile not found in dictionaries")
#if filename.find('061-02') > -1:
# print("filename: {}".format(filename))
# print("Load hashEntities: {}".format(hashEntities))
# print("Load hashInteractions: {}".format(hashInteractions))
# print("Load hashInteractionsEffect: {}".format(hashInteractionsEffect))
wordA = listLine[2]
wordB = listLine[3]
wordC = listLine[4]
startA = listLine[5]
endA = listLine[6]
startB = listLine[7]
endB = listLine[8]
startC = listLine[9]
endC = listLine[10]
sent = listLine[12]
lemmaA = listLine[2]
lemmaB = listLine[3]
lemmaC = listLine[4]
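# Column layout assumed from the indices above: 0 = path of the per-sentence file,
# 2-4 = the OpenIE triplet (A and C are the arguments, B the relation), 5-10 = word-level
# start/end positions of A, B and C, 12 = the transformed sentence; the lemma variables
# reuse columns 2-4.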
# Return [tok, offsetStart, offsetEnd ]
# print("hashTerms[TF]: {}".format(hashTerms["TF"]))
listRegulator = getPosWord(startA, endA, sent, hashTerms["TF"])
if listRegulator is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulator found: {}".format(listRegulator[0]))
listRegulated = getPosWord(startC, endC, sent, hashTerms["GENE"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated GENE found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
else:
listRegulated = getPosWord(startC, endC, sent, hashTerms["TU"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated TU found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
else:
listRegulator = getPosWord(startC, endC, sent, hashTerms["TF"])
if listRegulator is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulator found: {}".format(listRegulator[0]))
listRegulated = getPosWord(startA, endA, sent, hashTerms["GENE"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated GENE found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "GENE", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
else:
listRegulated = getPosWord(startA, endA, sent, hashTerms["TU"])
if listRegulated is not None:
#if filenameBefore.find('061-02') > -1:
# print(">> Regulated TU found: {}".format(listRegulated[0]))
idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
idRegulated, idEntities = getIdEntity(listRegulated, "TU", idEntities)
idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
#print("Review EFFECT")
listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
if listEffect is not None:
idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)
if len(files) > 1:
if len(hashEntities) > 0:
#print("filenameBefore: {}".format(filenameBefore))
#print("Save hashEntities: {}".format(hashEntities))
#print("Save hashInteractions: {}".format(hashInteractions))
#print("Save hashInteractionsEffect: {}".format(hashInteractionsEffect))
saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
#!/bin/bash
###### Automatic extraction of TRN from several files ######
BRIES_HOME=/myhome/bries
PMIDS_HOME=/myhome/preprocessed-files
# We don't use REFERENCE_HOME because we don't evaluate. Path /reference-data-set doesn't exist. File no-reference.txt doesn't exist.
REFERENCE_HOME=/myhome/reference-data-set
for f in $PMIDS_HOME/original/text/*.*
do
FILE_NAME=$(basename "$f")
FILE_NAME="${FILE_NAME%.*}"
echo "File: $FILE_NAME"
./automatic-extraction-ris-gcs.sh $PMIDS_HOME/features/$FILE_NAME.tra.word.txt $PMIDS_HOME/transformed/$FILE_NAME.tra.txt $BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt $BRIES_HOME/predicted-ris-gcs Y Y FILT1 $REFERENCE_HOME no-reference.txt $BRIES_HOME/evaluation-reports no-evaluation.txt diccionario-SYNONYMS.json $PMIDS_HOME/original/tsv 1>uno-$FILE_NAME.txt 2>dos-$FILE_NAME.txt
done
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import json
import re
import pandas as pd
__author__ = 'CMendezC'
# Objective: Filter sentences with specific entities.
# Also extract attributive sentences: effect-TF
# And autoregulation: regulates its own gene
# CFMC 2022-03-08: We added updating tsv file with idsentence, sentence and section (.pre.tsv)
# to indicate filtered sentences.
# Parameters:
# 1) --inputFileWord Path and filename to read feature word file.
# 2) --inputFileTrans Path and filename to read transformed file.
# 3) --outputPath Path to place output file.
# 4) --outputFile Output file.
# 5) --filter FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
# 6) --attrPath Path for attributive cases: ArgP-regulated genes
# 7) --dicPath Path for dictionary
# 8) --dicFile Path for dictionary file normalized_Effects.json
# 9) --autoPath Path for autoregulation cases: regulates its own gene
# /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
# 10) --tsvPath Path to tsv file with section, id sentence and sentence (extracted from jsonpdf)
# Output:
# 1) Filtered sentences.
# 2) Attributive sentences
# 3) Autoregulation sentences
###########################################################
# MAIN PROGRAM #
###########################################################
def getEntities(tline, filt):
# FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
entities = {}
tline = tline.rstrip('\n\r ')
for token in tline.split(" "):
# print("Token: {}".format(token))
listElem = token.split("|")
w = listElem[0]
l = listElem[1]
t = listElem[2]
if filt == "FILT1" or filt == "FILT2":
if t in ["GENE", "TU", "TF", "EFFECT"]:
if w not in entities:
entities[w] = t
# if filt == "FILT2":
# if t in ["GENE", "TU", "TF", "EFFECT"]:
# if w not in entities:
# entities[w] = t
return entities
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputFileWord", dest="inputFileWord",
help="Path and filename to read feature word file", metavar="PATH")
parser.add_option("--inputFileTrans", dest="inputFileTrans",
help="Path and filename to read transformed file", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
parser.add_option("--outputFile", dest="outputFile",
help="Output file", metavar="FILE")
parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None,
help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT")
parser.add_option("--attrPath", dest="attrPath",
help="Output path attributive sentences", metavar="PATH")
parser.add_option("--dicPath", dest="dicPath",
help="Output path dictionary", metavar="PATH")
parser.add_option("--dicFile", dest="dicFile",
help="Output file dictionary normalized_Effects.json", metavar="FILE")
parser.add_option("--autoPath", dest="autoPath",
help="Output path autoregulation sentences", metavar="PATH")
parser.add_option("--tsvPath", dest="tsvPath",
help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path and filename to read feature word file: " + str(options.inputFileWord))
print("Path and filename to read transformed file: " + str(options.inputFileTrans))
print("Output path: " + str(options.outputPath))
print("Output file: " + str(options.outputFile))
print("Filter: " + str(options.filter))
print("Output path attributive sentences: " + str(options.attrPath))
print("Output path autoregulation sentences: " + str(options.autoPath))
print("Output path dictionary: " + str(options.dicPath))
print("Output file dictionary normalized_Effects.json: " + str(options.dicFile))
print("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf): " + str(options.tsvPath))
# Loading normalized effects
# print('Loading normalized effects...')
hashNormalizedEffects = {}
with open(os.path.join(options.dicPath, options.dicFile)) as diccFile:
hashNormalizedEffects = json.load(diccFile)
listEffects = []
for eff in hashNormalizedEffects.keys():
if eff.endswith('d'):
listEffects.append(eff)
listEffects.append("dependent")
effects = "|".join(listEffects)
print("Effects: {}".format(effects))
t0 = time()
count = 0
hashEntities = {}
hashAttrSent = {}
hashAutoSent = {}
# Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF [^|]+\|gene')
# We decided to extract all sentences containing effect-TF because we observed some patterns where
# "gene" does not appear, then, to recover these examples we employ a more general rule to separate
# attributive sentences.
reAttrSent = re.compile(r'(' + effects + ')\|[^|]+\|TF')
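# Illustrative: a transformed line containing a token such as
# "ArgP-regulated|ArgP-regulate|TF" satisfies reAttrSent and is routed to the
# attributive-sentence output, assuming "regulated" is among the effects ending in -d
# loaded above.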
# We decided to extract all sentences containing autoregulation
# The FimZ transcription factor activates this promoter directly ,
# and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
# of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ')
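# Illustrative check (not part of the pipeline): the autoregulation pattern above is expected
# to match the OxyR example quoted in the comments, i.e. the following search returns a match:
#   reAutoSent.search("OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT "
#                     "of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN")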
aFilter = options.filter
print(" Processing file...{}".format(options.inputFileTrans))
with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile:
with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile:
# CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf)
file = os.path.basename(options.inputFileTrans)
file_tsv = file.replace(".tra.txt", ".pre.tsv")
tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
print("tsv_file.shape: {}".format(tsv_file.shape))
tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
# print(tsv_file_filtered.head(10))
tsv_file_new = tsv_file_filtered.reset_index(drop=True)
# print(tsv_file_new.shape)
# print(tsv_file_new.head(10))
i = 0
for tLine, wLine in zip(tFile, wFile):
# FILT1: (GENE OR TU) AND TF
# FILT2: (GENE OR TU) AND EFFECT AND TF
if aFilter is not None:
reGENETU = re.compile(r'(\|GENE|\|TU)')
reEFFECT = re.compile(r'\|EFFECT')
reTF = re.compile(r'\|TF')
tCount = str(count)
if aFilter == "FILT1":
if not (reGENETU.search(tLine) and reTF.search(tLine)):
#print("NOT FOUND")
# CFMC 2022-03-08
tsv_file_new.at[i, 'status'] = 0
i += 1
continue
else:
#print("FOUND")
oFile.write(wLine)
if tCount not in hashEntities:
hashEntities[tCount] = getEntities(tLine, aFilter)
if reAttrSent.search(tLine):
#print("ATTRIBUTIVE SENTENCE: {}".format(tLine))
if tCount not in hashAttrSent:
hashAttrSent[tCount] = tLine
# Autoregulation sentences
if reAutoSent.search(tLine):
# print("AUOREGULATION SENTENCE: {}".format(tLine))
if tCount not in hashAutoSent:
hashAutoSent[tCount] = tLine
#print(tLine)
elif aFilter == "FILT2":
if not (reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine)):
# CFMC 2022-03-08: mark the sentence as filtered out before skipping it
tsv_file_new.at[i, 'status'] = 0
i += 1
continue
else:
oFile.write(wLine)
if tCount not in hashEntities:
hashEntities[tCount] = getEntities(tLine, aFilter)
if reAttrSent.search(tLine):
if tCount not in hashAttrSent:
hashAttrSent[tCount] = tLine
if reAutoSent.search(tLine):
if tCount not in hashAutoSent:
hashAutoSent[tCount] = tLine
count += 1
i += 1
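# Propagate the filter decisions back to the original tsv: sentences discarded above keep
# status 0, every other row keeps its original status, and the result is written out next to
# the input tsv with the suffix .fil.tsv.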
merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left')
# print(merged.shape)
# print(merged.head(10))
tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int)
tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
# print(tsv_file_filtered.head(10))
tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t')
with open(os.path.join(options.outputPath, options.outputFile.replace(".txt", ".ents.json")), "w", encoding="utf-8",
errors="replace") as eFile:
json.dump(hashEntities, eFile)
for f, sent in hashAttrSent.items():
listPath = options.inputFileTrans.split('/')
fileName = listPath[-1]
fileName = fileName.replace('.tra.', '.att.' + f + '.')
print("Save file {}".format(fileName))
with open(os.path.join(options.attrPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
aFile.write(sent)
for f, sent in hashAutoSent.items():
listPath = options.inputFileTrans.split('/')
fileName = listPath[-1]
fileName = fileName.replace('.tra.', '.auto.' + f + '.')
print("Save file {}".format(fileName))
with open(os.path.join(options.autoPath, fileName), "w", encoding="utf-8", errors="replace") as aFile:
aFile.write(sent)
print("Files split in: %fs" % (time() - t0))
#!/bin/bash
#Validate arguments
if [[ ! ("$#" == 3 ) ]]; then
echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>'
exit 1
fi
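# Example invocation (all paths below are illustrative only, not taken from this repository):
#   ./sentence-simplification-main.sh ./attributive-sentences ./simplified-index.txt /opt/isimp_v2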
SCRIPT_PATH=$(cd `dirname $0` && pwd)
# Define here the keyword of the group of sentences to simplify.
INPUT_PATH=$1
OUTPUT_INDEX_FILE_PATH=$2
ISIMP_PATH=$3
cd $SCRIPT_PATH
# RUN THE ISIMP ANALYSIS
echo "Analysing in iSimp..."
# Clear any previous iSimp output
if [ -n "$(ls -A ./iSimp_sentences/)" ]; then
rm ./iSimp_sentences/*
fi
#cd $INPUT_PATH
for j in $INPUT_PATH/*
do
echo $j
#echo "++++entrada_simp: $j salida_simp: $SCRIPT_PATH/iSimp_sentences/$(basename $j)"
$ISIMP_PATH/simplify.sh $j $SCRIPT_PATH/iSimp_sentences/$(basename $j)
done
cd $SCRIPT_PATH
# CREATE THE INDEX OF SIMPLIFIED FILES
#touch $SCRIPT_PATH/index.txt
>| $OUTPUT_INDEX_FILE_PATH
# FEED THE SIMPLIFICATION ALGORITHM
echo "Analysing in Algorithm..."
# Clear any previous algorithm output
if [ -n "$(ls -A ./algorithm_sentences/)" ]; then
rm ./algorithm_sentences/*
fi
#cd ./iSimp_sentences
for k in $SCRIPT_PATH/iSimp_sentences/*
do
echo $k
#echo "entrada: $k salida: $SCRIPT_PATH/algorithm_sentences/$(basename $k) index: $OUTPUT_INDEX_FILE_PATH"
python2 $SCRIPT_PATH/simplifier.py $k $SCRIPT_PATH/algorithm_sentences/$(basename $k) $OUTPUT_INDEX_FILE_PATH
done
cd $SCRIPT_PATH
import copy
import sys
import requests
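# simplifier.py (Python 2; invoked by sentence-simplification-main.sh as
#   python2 simplifier.py <isimp_analysis_file> <output_file> <index_file>).
# It reads one iSimp analysis (TYPE/TEXT/POS/TREE/SIMP records), applies the simplification
# rules below (coordination, parenthesis, apposition, member-collection, sentence coordination,
# full and reduced relative clauses) and writes each resulting simplified sentence to its own
# .alg file, appending every output path to the index file.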
class Simp(object):
def __init__(self):
self.TYPE=""
self.TYPEx=0
self.TYPEy=0
self.TEXT=""
self.COMP=[]
def agregarTYPE(self,Type):
self.TYPE=Type
def agregarTEXT(self,text):
self.TEXT=text
def agregarCOMP(self,comp):
self.COMP.append(comp)
class Frase(object):
def __init__(self):
self.TYPE=""
self.TEXT=""
self.POS=""
self.TREE=""
self.SIMP=[]
def agregarTYPE(self,Type):
self.TYPE=Type
def agregarTEXT(self,text):
self.TEXT=text
def agregarPOS(self,Pos):
self.POS=Pos
def agregarTREE(self,Tree):
self.TREE=Tree
def agregarSIMP(self):
self.SIMP.append(Simp())
class Sentence(object):
def __init__(self):
self.FLAG=True
self.TEXT=""
self.TREE=""
self.SIMP=[]
def agregarTEXT(self,text):
self.TEXT=text
def agregarTREE(self,Tree):
self.TREE=Tree
def agregarSIMP(self):
self.SIMP.append(Simp())
MEMORIAB=[]
MEMORIAA=[]
#---- read the data from the input file
arch=(sys.argv[1])
f = open(arch)
dato = f.read().splitlines()
f.close()
frase=Frase()
for i in range(len(dato)):
if 'TYPE: ' in dato[i][0:6]:
frase.agregarTYPE(dato[i][6:])
elif 'TEXT: ' in dato[i][0:6]:
frase.agregarTEXT(dato[i][6:])
elif 'POS : ' in dato[i][0:6]:
frase.agregarPOS(dato[i][6:])
elif 'TREE: ' in dato[i][0:6]:
frase.agregarTREE(dato[i][6:])
elif 'SIMP:' in dato[i]:
frase.agregarSIMP()
elif ' TYPE: ' in dato[i][0:8]:
frase.SIMP[-1].agregarTYPE(dato[i][8:])
elif ' TEXT: ' in dato[i][0:8]:
frase.SIMP[-1].agregarTEXT(dato[i][8:])
elif ' COMP: ' in dato[i]:
frase.SIMP[-1].agregarCOMP(dato[i][8:])
#------------
#------- Main program
# Algorithm v4
if (frase.TYPE.find('sentence') != -1) and (frase.SIMP != []) and (frase.SIMP[0].TYPE != ''):
y=1
w=1
SIMPworkspace=[]
# copy TREE and each SIMP into Sentence1
Sentence1=Sentence()
Sentence1.TREE=copy.deepcopy(frase.TREE)
Sentence1.TEXT=copy.deepcopy(frase.TEXT)
for i in range(len(frase.SIMP)):
#Sentence1.SIMP.append(Simp())
#Sentence1.SIMP[i]=copy.deepcopy(frase.SIMP[i])
SIMPworkspace.append(Simp())
SIMPworkspace[i]=copy.deepcopy(frase.SIMP[i])
## SORT THE SIMPs
for i in range(len(SIMPworkspace)):
#print SIMPworkspace[i].TEXT
#print SIMPworkspace[i].TYPE
SIMPworkspace[i].TYPEx = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('[')+1:SIMPworkspace[i].TYPE.find('..')])
SIMPworkspace[i].TYPEy = int(SIMPworkspace[i].TYPE[SIMPworkspace[i].TYPE.find('..')+2:SIMPworkspace[i].TYPE.find(']')])
if 'parenthesis' in SIMPworkspace[i].TYPE:
SIMPworkspace[i].TYPEy = SIMPworkspace[i].TYPEy + 2
#print SIMPworkspace[i].TYPEx
#print SIMPworkspace[i].TYPEy
SIMPworkspace.sort(key=lambda x: x.TYPEy, reverse=True)
SIMPworkspace.sort(key=lambda x: x.TYPEx)
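# After these two stable sorts the SIMPs are ordered by start offset (TYPEx) ascending and,
# for equal starts, by end offset (TYPEy) descending, so enclosing constructs are processed
# before the constructs nested inside them.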
# for i in range(len(SIMPworkspace)):
# print "\nSIMP " + str(i) + " :"
# print SIMPworkspace[i].TYPE
# print SIMPworkspace[i].TYPEx
# print SIMPworkspace[i].TYPEy
# print "\n"
for i in range(len(SIMPworkspace)):
Sentence1.SIMP.append(Simp())
Sentence1.SIMP[i]=copy.deepcopy(SIMPworkspace[i])
# Add the original sentence (Sentence1) to memory as the first object to be analysed
MEMORIAB.append(Sentence())
MEMORIAB[0]=copy.deepcopy(Sentence1)
# One pass of loop A for each distinct SIMP in Sentence1
numSimp=len(Sentence1.SIMP)
s = 0
# loop A
while s < numSimp :
#print "\nEntered loop A, pass " + str(s)
#print "Analysing all SIMPs of type: " + MEMORIAB[0].SIMP[s].TYPE
#Loop B runs once for each element currently in MEMORIAB
numMEM = len(MEMORIAB)
t = 0
# loop B
while t < numMEM :
#print "Entered loop B, pass " + str(t)
#Enter only if the sentence has not been analysed before (FLAG==True) and the SIMP text is present in the sentence.
#print "CONDITIONS:"
#print "SIMP " + MEMORIAB[0].SIMP[s].TEXT
#print "SIMP " + MEMORIAB[0].SIMP[s].TYPE
#print "MEMB " + str(MEMORIAB[t].FLAG)
#print "MEMB " + MEMORIAB[t].TEXT
if ( MEMORIAB[0].SIMP[s].TEXT in MEMORIAB[t].TEXT ) and ( MEMORIAB[t].FLAG == True ):
MEMORIAB[t].FLAG = False
#print "False to: " + MEMORIAB[t].TEXT
#print "Entered the conditional"
#Simplification rules
if ( 'coordination' in MEMORIAB[t].SIMP[s].TYPE ) and ( not ('sentence coordination' in MEMORIAB[t].SIMP[s].TYPE ) ) :
#print "Applying coordination rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
c = c - 1
if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'parenthesis' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying parenthesis rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
#print "entered the parenthesis while loop"
c = c - 1
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT + ' )'
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print "replaced: " + replaced
#print "replacer: " + replacer
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'apposition' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying apposition rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
#print "entered the while loop"
c = c - 1
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print "replaced: " + replaced
#print "replacer: " + replacer
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print "Copied to memory: " + MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'member-collection' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying member-collection rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
#print "entered the member-collection while loop"
c = c - 1
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print "replaced: " + replaced
#print "replacer: " + replacer
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print "Copied to memory: " + MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'sentence coordination' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying sentence-coordination rule"
TEMPORALES = []
c = len(MEMORIAB[t].SIMP[s].COMP)
#print "There are " + str(c) + " COMPs in this SIMP"
tt = 0
while c > 0 :
c = c - 1
if ( 'conjunct' in MEMORIAB[0].SIMP[s].COMP[c] ) and ( not ( 'conjunction' in MEMORIAB[0].SIMP[s].COMP[c] ) ) :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t])
#replace the whole content of the temporary TEXT/TREE with the content of the coordinated clause
#replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#print replacer
TEMPORALES[tt].TEXT = replacer
## if the sentence does not end in a period or !
tt = tt + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'full relative clause' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying RelCl rule"
TEMPORALES = []
c = 0
tt = 0
while c < 2 :
if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
if MEMORIAB[0].TEXT[MEMORIAB[0].TEXT.index(TEMPORALES[tt].SIMP[s].TEXT)+len(TEMPORALES[tt].SIMP[s].TEXT)-1] == ',':
replaced = MEMORIAB[0].SIMP[s].TEXT + ',' #possible error; if so, try ' ,' instead
else:
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
indice3 = indice1
indice4 = indice2
if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
TEMPORALES[tt].TEXT = copy.deepcopy(MEMORIAB[0].TEXT[indice3:indice4]+' '+MEMORIAB[0].TEXT[indice1:indice2] ) ##
cad3 = MEMORIAB[0].TEXT[indice1:indice2]
cad4 = cad3.split()
if (cad4[0]+'_WDT') in frase.POS:
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(' '+cad4[0],'')
tt = tt + 1
c = c + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'reduced relative clause' in MEMORIAB[t].SIMP[s].TYPE:
#print "Applying RelCl rule"
TEMPORALES = []
c = 0
tt = 0
while c < 2 :
if 'referred noun phrase' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #ok
replaced = MEMORIAB[0].SIMP[s].TEXT
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
#subj = MEMORIAB[0].TEXT[indice1:(indice2+1)]
subj = MEMORIAB[0].TEXT[indice1:(indice2)]
TEMPORALES[tt].TEXT = TEMPORALES[tt].TEXT.replace(replaced,replacer)
if 'clause' in MEMORIAB[0].SIMP[s].COMP[c] :
TEMPORALES.append(Sentence())
TEMPORALES[tt] = copy.deepcopy(MEMORIAB[t]) #the referent must come before the clause so the order is correct
indice1 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('[')+1:TEMPORALES[tt].SIMP[s].COMP[c].find('..')])
indice2 = (int)(TEMPORALES[tt].SIMP[s].COMP[c][TEMPORALES[tt].SIMP[s].COMP[c].find('..')+2:TEMPORALES[tt].SIMP[s].COMP[c].find(']')])
replacer = MEMORIAB[0].TEXT[indice1:indice2]
TEMPORALES[tt].TEXT = subj + " _ " + replacer #at this point, inserting the copula would require number and tense information
tt = tt + 1
c = c + 1
#copy the simplifications from temporary memory to MEMORIAB
indtempamem = 0
while indtempamem < len(TEMPORALES) :
MEMORIAB.append(Sentence())
MEMORIAB[-1]=copy.deepcopy(TEMPORALES[indtempamem])
MEMORIAB[-1].FLAG = True
#print MEMORIAB[-1].TEXT
indtempamem = indtempamem + 1
elif 'hypernymy' in MEMORIAB[t].SIMP[s].TYPE:
print "**hypernymy detected**"
#print "True to: " + MEMORIAB[t].TEXT
MEMORIAB[t].FLAG = True
else:
print "Error: Unknown simplification construct detected."
#print "True to: " + MEMORIAB[t].TEXT
MEMORIAB[t].FLAG = True
t = t + 1
s = s + 1
#CONDITIONS FOR WRITING THE SIMPLIFICATIONS TO THE OUTPUT TEXT FILES
#print "Sentence simplified. New sentences generated:"
for i in range(len(MEMORIAB)):
#the FLAG is reused to mark the final sentences
MEMORIAB[i].FLAG = True
for j in range(len(MEMORIAB[0].SIMP)):
#NOTE: if a new simplifiable construct is added, also add it to this list:
if ( ('member-collection' in MEMORIAB[0].SIMP[j].TYPE) or ('apposition' in MEMORIAB[0].SIMP[j].TYPE) or ('coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('parenthesis' in MEMORIAB[0].SIMP[j].TYPE) or ('sentence coordination' in MEMORIAB[0].SIMP[j].TYPE) or ('full relative clause' in MEMORIAB[0].SIMP[j].TYPE) or ('reduced relative clause' in MEMORIAB[0].SIMP[j].TYPE) ) and (MEMORIAB[0].SIMP[j].TEXT in MEMORIAB[i].TEXT) :
MEMORIAB[i].FLAG = False
##fix the numbering of the output files, e.g. 011
arcsalnum = 0
for i in range(len(MEMORIAB)):
if MEMORIAB[i].FLAG == True:
arcsalnum = arcsalnum + 1
length = len(str(arcsalnum))
#print('{:03d}'.format(arcsalnum)) # python >= 2.7 + python3
# >>> n = '4'
#>>> print n.zfill(3)
arcsalnum = 0
for i in range(len(MEMORIAB)):
if MEMORIAB[i].FLAG == True:
arcsalnum = arcsalnum + 1
print MEMORIAB[i].TEXT #Output
archSalNombre = sys.argv[2]
archSalNombre=archSalNombre[:-4] + "-" + (str(arcsalnum)).zfill(length) + '.alg'
archivoSalida=open(archSalNombre,"w")
archivoSalida.write(MEMORIAB[i].TEXT+"\n")##
archivoSalida.close()
#WRITE OUTPUT FILE PATH TO INDEX (Arg 3)
index_name = sys.argv[3]
index = open(index_name, "a+")
archSalNombreforIndex=archSalNombre + "\n"
index.write(archSalNombreforIndex)
index.close()
else:
print frase.TEXT #----Output when there were no simplifiable constructs
archSalNombre = sys.argv[2]
archSalNombre = archSalNombre[:-4] + ".alg"
archivoSalida = open(archSalNombre,"a+")
archivoSalida.write(frase.TEXT+"\n")##
archivoSalida.close()
#WRITE OUTPUT FILE PATH TO INDEX (Arg 3)
index_name = sys.argv[3]
index = open(index_name, "a+")
archSalNombreforIndex=archSalNombre + "\n"
index.write(archSalNombreforIndex)
index.close()
#END