cmendezc

Bacterial regulatory interaction extraction system

# Bacterial regulatory interaction extraction system
## Prerequisites
1. Input files must be tokenized and split into sentences.
## Run
### Several files
Set the file names and paths in `run-several-files.sh`.
## Acknowledgments
This work was supported by UNAM-PAPIIT IA203420.
\ No newline at end of file
This diff is collapsed. Click to expand it.
# import fileinput
# import regex as re
# from regex import finditer
import sys
import json
def split_tagged_token(token):
    """Split a ``word/TAG`` token into ``(word, tag)``.

    The split is made at the LAST slash so that a word which itself contains
    a slash (e.g. ``activated/repressed/VBN``) keeps its real tag instead of
    having part of the word leak into it.

    Returns None when the token contains no slash at all, i.e. it carries no
    tag and therefore cannot be classified.
    """
    word, sep, tag = token.rpartition("/")
    if not sep:
        return None
    return word, tag


def classify_tokens(tokens, effects):
    """Classify tagged tokens as deverbal and/or verbal; return the exit code.

    A token is relevant when any string of ``effects`` occurs inside its word
    part. A relevant token counts as:

    * deverbal when its tag contains ``NN`` or is exactly ``VBN``
      (2018-11-30 CMC: nouns and only the past participle are deverbal);
    * verbal when its tag contains ``VB`` (VB, VBZ, VBP, VBD, VBN, VBG).

    A relevant ``VBN`` token therefore counts as both deverbal and verbal.

    Args:
        tokens: iterable of ``word/TAG`` strings.
        effects: iterable of effect strings searched for inside each word.

    Returns:
        11 if deverbal and verbal tokens were found, 12 if only deverbal,
        13 if only verbal, 10 if neither.
    """
    effects = list(effects)
    is_dev = False
    is_vrb = False
    for token in tokens:
        parts = split_tagged_token(token)
        if parts is None:
            continue
        word, tag = parts
        if not any(effect in word for effect in effects):
            continue
        if "NN" in tag or tag == "VBN":
            is_dev = True
        if "VB" in tag:
            is_vrb = True
        if is_dev and is_vrb:
            # Both flags are set; no later token can change the result.
            break

    if is_dev and is_vrb:
        return 11
    if is_dev:
        return 12
    if is_vrb:
        return 13
    return 10


def main(argv):
    """Read the tagged input file and the effects dictionary, then classify.

    Args:
        argv: ``[program, <input_file>, <normalized_Effects>]``. The input
            file holds whitespace-separated ``word/TAG`` tokens; the second
            file is a JSON object whose keys are the effect strings.

    Returns:
        2 when fewer than two arguments are given, otherwise the code
        returned by ``classify_tokens`` (10, 11, 12 or 13).
    """
    if len(argv) != 3:
        sys.stderr.write("E: usage: " + argv[0] + " <input_file> <normalized_Effects> \n")
        sys.stderr.flush()
        if len(argv) < 3:
            # Without both paths the script could only die with an
            # IndexError below; stop with a dedicated status instead.
            # Extra arguments are still tolerated, as before.
            return 2

    # Read the input file.
    with open(argv[1], "r") as text_file:
        dato = text_file.read()

    # Load the normalized effects; only the keys of the JSON object are used.
    with open(argv[2]) as diccFile:
        hashNormalizedEffects = json.load(diccFile)
    DICC = list(hashNormalizedEffects.keys())

    # Split the text once instead of once per dictionary entry.
    return classify_tokens(dato.split(), DICC)


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#!/bin/bash
# Separates sentences by deverbal (.dev) and verbal (.vrb).
#
# Usage:
#   ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>
#
#   path_to_corenlp  directory holding the Stanford CoreNLP jars
#   input_path       folder that contains the files to separate
#   output_path      folder with the dev/ and vrb/ output sub-folders
#   dicc_path        folder that contains normalized_Effects.json
#   if_tag           "TRUE" to POS-tag the input files first
#   if_separate      "TRUE" to copy the input files into dev/ and/or vrb/
#
# Working folders tagged/ and tagged-line/ live next to this script.
#
# NOTE: `set -e` is deliberately not enabled: filter-v03.py reports its
# verdict through the non-zero exit statuses 10-13.

# Validate arguments
if [[ "$#" -ne 6 ]]; then
  echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
  exit 1
fi

SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd)
PATH_TO_CORENLP=$1
INPUT_PATH=$2 # folder that contains the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
# Tag sentences to separate deverbal and verbal sentences: $DEVTAG
TAG=$5
# Do separate deverbal and verbal sentences: $DEVSEPAR
SEP=$6

#######################################
# Deletes the entries directly inside a directory whose name matches a
# pattern. `find -delete` is used instead of `rm dir/*` because the latter
# failed with "Argument list too long".
# Arguments: $1 - directory, $2 - `find -name` pattern
#######################################
purge_files() {
  [[ -d "$1" ]] || return 0
  find "$1" -maxdepth 1 -name "$2" -delete
}

#######################################
# Copies an input file into one output sub-folder, appending the sub-folder
# name as extension.
# Globals:   INPUT_PATH, OUTPUT_PATH (read)
# Arguments: $1 - input file name, $2 - "dev" or "vrb"
#######################################
copy_as() {
  cp -- "$INPUT_PATH/$1" "$OUTPUT_PATH/$2/$1.$2"
}

if [[ "$TAG" == "TRUE" ]]; then
  # Analyse with Stanford CoreNLP: start from empty working folders.
  purge_files "$SCRIPT_PATH/tagged" '*.conll'
  purge_files "$SCRIPT_PATH/tagged-line" '*.spt' # Added by CMC

  for j in "$INPUT_PATH"/*; do
    [[ -e "$j" ]] || continue # unmatched glob: nothing to tag
    # Only tokenize, ssplit and pos are run; earlier versions also ran
    # lemma, ner and parse.
    java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP \
      -annotators tokenize,ssplit,pos -outputFormat conll \
      -file "$j" -outputDirectory "$SCRIPT_PATH/tagged"
  done

  for j in "$SCRIPT_PATH"/tagged/*; do
    [[ -e "$j" ]] || continue
    filename=$(basename "$j")
    # Join columns 2 and 4 of every non-empty row as "col2/col4" on a single
    # line (presumably word and POS tag of the CoNLL output - confirm).
    awk 'NF {print $2 "/" $4}' "$j" | paste -d" " -s \
      > "$SCRIPT_PATH/tagged-line/${filename%.*}.spt"
  done
fi # if TAG == "TRUE"

if [[ "$SEP" == "TRUE" ]]; then
  # Separate files: start from empty output folders (modified by Carlos Méndez).
  purge_files "$OUTPUT_PATH/dev" '*.dev'
  purge_files "$OUTPUT_PATH/vrb" '*.vrb'

  for j in "$SCRIPT_PATH"/tagged-line/*; do
    [[ -e "$j" ]] || continue
    # CMC 2018-12-11: considering only the passive verbal form (VBN, past
    # participle) as deverbal.
    python3 "$SCRIPT_PATH/filter-v03.py" "$j" "$DICC_PATH/normalized_Effects.json"
    VAR=$?
    # Name of the original input file: the tagged-line name minus ".spt".
    name=$(basename "${j%.*}")
    case "$VAR" in
      11) # contains deverbal and verbal
        copy_as "$name" dev
        copy_as "$name" vrb
        ;;
      12) # contains deverbal
        copy_as "$name" dev
        ;;
      13) # contains verbal
        copy_as "$name" vrb
        ;;
      10) # seems to contain neither; still treated as verbal
        echo "Non deverbal and verbal"
        copy_as "$name" vrb
        ;;
    esac
  done
fi # if SEP == "TRUE"
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
import fileinput
import re
import sys
# Text allowed between a delimiter and the matched core. The class excludes
# "(", "|", "^" and ")" - also in the square-bracket rules, which means a
# bracket rule may span from one "[" to a later "]" (kept as in the original).
_INNER = r'[^\(|^\)]*'
_PAREN = (r'[\(]', r'[\)]')
_BRACKET = (r'[\[]', r'[\]]')
_ROMAN_UPPER = r'M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'
_ROMAN_LOWER = r'm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})'


def _citation(delims, core):
    """Pattern: delimiter, free text, ``core``, free text, delimiter."""
    return delims[0] + _INNER + core + _INNER + delims[1]


def _exact(delims, core):
    """Pattern: delimiter, whitespace, ``core``, whitespace, delimiter."""
    return delims[0] + r'\s' + core + r'\s' + delims[1]


# Removal rules, applied in this order. Raw strings keep the regex escapes
# (\( \s \[ ...) from being read as invalid string escape sequences.
_RULES = [
    (_citation(_PAREN, r'\s[0-9]+[a-z]{1}\s'), 0),           # (_NNNNa_)
    (_citation(_BRACKET, r'\s[0-9]+[a-z]{1}\s'), 0),         # [_NNNNa_]
    (_citation(_PAREN, r'\s([0-9]+,?)+\s'), 0),              # (_NN,NN,NN_)
    (_citation(_BRACKET, r'\s([0-9]+,?)+\s'), 0),            # [_NN,NN,NN_]
    (_citation(_PAREN, r'\s[0-9]+\s'), 0),                   # (_num_)
    (_citation(_PAREN, r'\s[0-9]+\.[0-9]+\s'), 0),           # (_num.num_)
    (_citation(_PAREN, r'\s[0-9]+\-[0-9]+\s'), 0),           # (_num-num_)
    (_citation(_BRACKET, r'\s[0-9]+\s'), 0),                 # [_num_]
    (_citation(_BRACKET, r'\s[0-9]+\.[0-9]+\s'), 0),         # [_num.num_]
    (_citation(_BRACKET, r'\s[0-9]+\-[0-9]+\s'), 0),         # [_num-num_]
    (_exact(_PAREN, r'[a-zA-Z]{1}'), 0),                     # (_alpha_)
    (_exact(_BRACKET, r'[a-zA-Z]{1}'), 0),                   # [_alpha_]
    (_exact(_PAREN, _ROMAN_UPPER), 0),                       # (_Roman_)
    (_exact(_PAREN, _ROMAN_UPPER + r'\s\-\s' + _ROMAN_UPPER), 0),    # (_Roman-Roman_)
    (_exact(_PAREN, _ROMAN_LOWER), 0),                       # (_roman_)
    (_exact(_PAREN, _ROMAN_LOWER + r'\s\-\s' + _ROMAN_LOWER), 0),    # (_roman-roman_)
    (_exact(_BRACKET, _ROMAN_UPPER), 0),                     # [_Roman_]
    (_exact(_BRACKET, _ROMAN_UPPER + r'\s\-\s' + _ROMAN_UPPER), 0),  # [_Roman-Roman_]
    (_exact(_BRACKET, _ROMAN_LOWER), 0),                     # [_roman_]
    (_exact(_BRACKET, _ROMAN_LOWER + r'\s\-\s' + _ROMAN_LOWER), 0),  # [_roman-roman_]
    # Parentheticals that mention a figure, table, "see", "i. e." or "e. g.".
    # The original alternation ended in "tab\s\.table" (missing "|"), which
    # could never match a table reference.
    (_citation(_PAREN, r'\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.|table)\s'), re.I),
]
_COMPILED = [re.compile(pattern, flags) for pattern, flags in _RULES]

# NOTE(review): the original last step replaced ' ' with ' ' (a no-op as
# written); collapsing the space runs left behind by the removals is
# presumably what was intended - confirm.
_SPACES = re.compile(r' {2,}')


def sanitize_line(line):
    """Remove citation-like parentheticals from one line of tokenized text.

    Trailing whitespace is stripped before every rule, every match of each
    rule in ``_RULES`` is deleted, and finally runs of two or more spaces are
    collapsed into a single space.

    Args:
        line: one line of text, without its line terminator.

    Returns:
        The cleaned line.
    """
    for rule in _COMPILED:
        line = rule.sub('', line.rstrip())
    return _SPACES.sub(' ', line.rstrip())


def main(argv):
    """Sanitize every line of the input file into a ``.san`` file.

    Args:
        argv: ``[program, <input_file>, <output_file>]``. The last four
            characters of ``<output_file>`` are dropped and ``.san`` is
            appended to build the name of the file that is written.

    Returns:
        2 when arguments are missing, 0 otherwise.
    """
    if len(argv) < 3:
        sys.stderr.write("E: usage: " + argv[0] + " <input_file> <output_file> \n")
        sys.stderr.flush()
        return 2
    print("Ok.")

    # Read the input file.
    with open(argv[1], "r") as text_file:
        dato = text_file.read().splitlines()

    # Replace the extension of the output file name.
    file_name = argv[2][:-4] + ".san"

    # Open the output once (truncating it) instead of once per line.
    with open(file_name, "w") as save_file:
        for line in dato:
            save_file.write(sanitize_line(line))
            save_file.write("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
#!/bin/bash
###### Automatic extraction of TRN from several files ######
#
# For every file in $PMIDS_HOME/original/text whose name contains a dot, runs
# ./automatic-extraction-ris-gcs.sh (looked up in the current directory) with
# the paths derived from the file name without its last extension. Standard
# output of each run goes to uno-<name>.txt and standard error to
# dos-<name>.txt in the current directory.
#
# Set the paths below before running.
BRIES_HOME=/myhome/bries
PMIDS_HOME=/myhome/preprocessed-files
# We don't use REFERENCE_HOME because we don't evaluate. Path /reference-data-set doesn't exist. File no-reference.txt doesn't exist.
REFERENCE_HOME=/myhome/reference-data-set

for f in "$PMIDS_HOME"/original/text/*.*; do
  # An unmatched glob stays as the literal pattern; skip it instead of
  # running the extraction on a file called "*".
  [[ -e "$f" ]] || continue
  FILE_NAME=$(basename "$f")
  FILE_NAME="${FILE_NAME%.*}"
  echo "File: $FILE_NAME"
  ./automatic-extraction-ris-gcs.sh \
    "$PMIDS_HOME/features/$FILE_NAME.tra.word.txt" \
    "$PMIDS_HOME/transformed/$FILE_NAME.tra.txt" \
    "$BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt" \
    "$BRIES_HOME/predicted-ris-gcs" \
    Y Y FILT1 \
    "$REFERENCE_HOME" no-reference.txt \
    "$BRIES_HOME/evaluation-reports" no-evaluation.txt \
    diccionario-SYNONYMS.json \
    "$PMIDS_HOME/original/tsv" \
    1>"uno-$FILE_NAME.txt" 2>"dos-$FILE_NAME.txt"
done
This diff is collapsed. Click to expand it.
#!/bin/bash
# Simplifies sentences in two passes: every file of <input_path> is run
# through iSimp's simplify.sh into ./iSimp_sentences, then every file produced
# there is run through simplifier.py into ./algorithm_sentences. Both working
# folders live next to this script and are emptied before use.
#
# Usage:
#   ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>
#
# Relative <input_path> and <output_file_path> are resolved from the script's
# own directory, because the script changes into it before using them.

# Validate arguments
if [[ "$#" -ne 3 ]]; then
  echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>'
  exit 1
fi

SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd)
# Define here the keyword of the group of sentences to simplify.
INPUT_PATH=$1
OUTPUT_INDEX_FILE_PATH=$2
ISIMP_PATH=$3

#######################################
# Removes the non-hidden, non-directory entries directly inside a directory
# (what `rm dir/*` removes), using find so that a large folder cannot fail
# with "Argument list too long".
# Arguments: $1 - directory to empty
#######################################
clear_dir() {
  [[ -d "$1" ]] || return 0
  find "$1" -mindepth 1 -maxdepth 1 ! -type d ! -name '.*' -delete
}

# Everything below uses paths relative to the script directory; abort rather
# than delete files from whatever directory we happen to be in.
cd "$SCRIPT_PATH" || exit 1

# Analyse in iSimp
echo "Analysing in iSimp..."
clear_dir ./iSimp_sentences

for j in "$INPUT_PATH"/*; do
  [[ -e "$j" ]] || continue # unmatched glob: no input files
  echo "$j"
  "$ISIMP_PATH/simplify.sh" "$j" "$SCRIPT_PATH/iSimp_sentences/$(basename "$j")"
done
cd "$SCRIPT_PATH" || exit 1

# Create (or truncate) the index of simplified files; >| overrides noclobber.
>| "$OUTPUT_INDEX_FILE_PATH"

# Feed the algorithm
echo "Analysing in Algorithm..."
clear_dir ./algorithm_sentences

for k in "$SCRIPT_PATH"/iSimp_sentences/*; do
  [[ -e "$k" ]] || continue
  echo "$k"
  python2 "$SCRIPT_PATH/simplifier.py" "$k" \
    "$SCRIPT_PATH/algorithm_sentences/$(basename "$k")" \
    "$OUTPUT_INDEX_FILE_PATH"
done
cd "$SCRIPT_PATH" || exit 1
This diff is collapsed. Click to expand it.
Delete me
\ No newline at end of file