cmendezc

Bacterial regulatory interaction extraction system

1 +# Bacterial regulatory interaction extraction system
2 +
3 +## Prerequisites
4 +1. The input file must be tokenized and sentence-split
5 +
6 +
7 +
8 +
9 +## Run
10 +### Several files
11 +Set filenames and paths in run-several-files.sh
12 +
13 +## Acknowledgments
14 +This work was supported by UNAM-PAPIIT IA203420.
...\ No newline at end of file ...\ No newline at end of file
This diff is collapsed. Click to expand it.
# Classifies a POS-tagged sentence file as deverbal and/or verbal and reports
# the result through the process exit code, which the caller (separator.sh)
# uses to dispatch the file:
#   11 = deverbal and verbal, 12 = deverbal only, 13 = verbal only, 10 = neither.
import sys
import json


def classify(tagged_text, effects):
    """Return a (is_dev, is_vrb) pair for *tagged_text*.

    tagged_text -- whitespace-separated tokens in "word/TAG" format
                   (flattened Stanford CoreNLP POS output).
    effects     -- iterable of normalized effect words; a token counts when
                   an effect word is a substring of the token's word part.

    Tag policy (2018-11-30 CMC): nouns (any tag containing "NN") and the
    past participle VBN count as deverbal; any tag containing "VB"
    (VB, VBZ, VBP, VBD, VBG, ...) counts as verbal.
    NOTE(review): "VB" in tag also matches VBN, so a matching VBN token sets
    BOTH flags -- preserved from the original logic; confirm this is intended.
    """
    is_dev = False
    is_vrb = False
    tokens = tagged_text.split()
    for effect in effects:
        for token in tokens:
            # Split "word/TAG" at the first slash (as the original did).
            slash = token.find("/")
            word = token[:slash]
            tag = token[slash + 1:]
            if effect in word:
                if ("NN" in tag) or (tag == "VBN"):
                    is_dev = True
                if "VB" in tag:
                    is_vrb = True
    return is_dev, is_vrb


def main(argv):
    """Run the classifier; return the exit code for the shell caller."""
    if len(argv) != 3:
        sys.stderr.write("E: usage: " + argv[0] + " <input_file> <normalized_Effects> \n")
        sys.stderr.flush()
        # The original had this exit commented out, so a bad invocation
        # printed usage and then crashed on argv[1]; restore the intended
        # early exit (2 is outside the 10-13 dispatch range, so the caller
        # copies nothing).
        return 2

    # Read the tagged input sentence(s).
    with open(argv[1], "r") as text_file:
        dato = text_file.read()

    # Load the normalized-effects dictionary (JSON object; the keys are the
    # effect words).
    with open(argv[2]) as dicc_file:
        hash_normalized_effects = json.load(dicc_file)
    dicc = list(hash_normalized_effects.keys())

    is_dev, is_vrb = classify(dato, dicc)
    if is_dev and is_vrb:
        return 11
    elif is_dev:
        return 12
    elif is_vrb:
        return 13
    else:
        return 10


if __name__ == "__main__":
    sys.exit(main(sys.argv))
71 +
#!/bin/bash
# Separates sentences by deverbal (.dev) and verbal (.vrb).
#
# Pipeline:
#   1. (TAG=TRUE) POS-tag every file in INPUT_PATH with Stanford CoreNLP
#      (conll output into ./tagged), then flatten each CoNLL file
#      (column 2 = word, column 4 = POS) to one "word/TAG word/TAG ..."
#      line in ./tagged-line (*.spt).
#   2. (SEP=TRUE) Run filter-v03.py on each flattened file; its exit code
#      (11 both, 12 deverbal, 13 verbal, 10 neither) decides where the
#      ORIGINAL input file is copied: OUTPUT_PATH/dev and/or OUTPUT_PATH/vrb.

# Validate arguments
if [[ ! ("$#" == 6) ]]; then
    echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>'
    exit 1
fi

SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd)
PATH_TO_CORENLP=$1
INPUT_PATH=$2        # directory containing the files to separate
OUTPUT_PATH=$3
DICC_PATH=$4
TAG=$5               # "TRUE" => run the CoreNLP tagging stage
SEP=$6               # "TRUE" => run the dev/vrb separation stage

if [ "$TAG" == "TRUE" ]; then
    # Clear previous tagging output.  find ... -delete is used instead of
    # "rm dir/*" to avoid "/bin/rm: Argument list too long" on big corpora.
    if [ -n "$(ls -A "$SCRIPT_PATH/tagged/")" ]; then
        find "$SCRIPT_PATH/tagged" -maxdepth 1 -name '*.conll' -delete
    fi
    if [ -n "$(ls -A "$SCRIPT_PATH/tagged-line/")" ]; then
        find "$SCRIPT_PATH/tagged-line" -maxdepth 1 -name '*.spt' -delete
    fi

    # POS-tag each input file (tokenize, sentence-split, POS only).
    for j in "$INPUT_PATH"/*
    do
        java -cp "$PATH_TO_CORENLP/*" -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos -outputFormat conll -file "$j" -outputDirectory "$SCRIPT_PATH/tagged"
    done

    # Flatten each CoNLL file to a single word/TAG line.
    for j in "$SCRIPT_PATH"/tagged/*
    do
        filename=$(basename "$j")
        awk 'NF {print $2 "/" $4}' "$j" | paste -d" " -s > "$SCRIPT_PATH/tagged-line/${filename%.*}.spt"
    done
fi # if [ "$TAG" == "TRUE" ]

if [ "$SEP" == "TRUE" ]; then
    # Clear previous separation output (same find-over-rm rationale as above).
    if [ -n "$(ls -A "$OUTPUT_PATH/dev")" ]; then
        find "$OUTPUT_PATH/dev" -maxdepth 1 -name '*.dev' -delete
    fi
    if [ -n "$(ls -A "$OUTPUT_PATH/vrb")" ]; then
        find "$OUTPUT_PATH/vrb" -maxdepth 1 -name '*.vrb' -delete
    fi

    for j in "$SCRIPT_PATH"/tagged-line/*
    do
        # CMC 2018-12-11: only the passive verbal form (VBN, past participle)
        # counts as deverbal -- see filter-v03.py for the exit-code contract.
        python3 "$SCRIPT_PATH/filter-v03.py" "$j" "$DICC_PATH/normalized_Effects.json"
        VAR=$?
        base=$(basename "${j%.*}")
        if [ "$VAR" == 11 ]; then
            # Contains both deverbal and verbal forms.
            cp "$INPUT_PATH/$base" "$OUTPUT_PATH/dev/$base.dev"
            cp "$INPUT_PATH/$base" "$OUTPUT_PATH/vrb/$base.vrb"
        elif [ "$VAR" == 12 ]; then
            # Deverbal only.
            cp "$INPUT_PATH/$base" "$OUTPUT_PATH/dev/$base.dev"
        elif [ "$VAR" == 13 ]; then
            # Verbal only.
            cp "$INPUT_PATH/$base" "$OUTPUT_PATH/vrb/$base.vrb"
        elif [ "$VAR" == 10 ]; then
            # Neither detected; routed to vrb so the file is not dropped.
            echo "Non deverbal and verbal"
            cp "$INPUT_PATH/$base" "$OUTPUT_PATH/vrb/$base.vrb"
        fi
    done
fi # if [ "$SEP" == "TRUE" ]
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
# Removes parenthesised/bracketed noise -- citation numbers, figure/table
# references, single letters, roman numerals -- from every line of the input
# file and writes the sanitized lines to "<argv[2] minus last 4 chars>.san".
import re
import sys

# Substitutions applied in order to every (right-stripped) line.
# Patterns are reproduced byte-for-byte from the original script.
# NOTE(review): the class "[^\(|^\)]" also excludes literal '|' and '^'
# characters -- probably "[^()]" was intended; kept verbatim.
# NOTE(review): "tab\s\.table" in the figure-reference pattern looks like a
# missing '|' (tab\s\.|table); kept verbatim.
_SUBS = [
    (re.compile(r'[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]'), ''),   # removes (_NNNNa_)
    (re.compile(r'[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]'), ''),   # removes [_NNNNa_]
    (re.compile(r'[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]'), ''),      # removes (_NN,NN,NN_)
    (re.compile(r'[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]'), ''),      # removes [_NN,NN,NN_]
    (re.compile(r'[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]'), ''),           # removes (_num_)
    (re.compile(r'[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]'), ''),   # removes (_num.num_)
    (re.compile(r'[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]'), ''),   # removes (_num-num_)
    (re.compile(r'[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]'), ''),           # removes [_num_]
    (re.compile(r'[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]'), ''),   # removes [_num.num_]
    (re.compile(r'[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]'), ''),   # removes [_num-num_]
    (re.compile(r'[\(]\s[a-zA-Z]{1}\s[\)]'), ''),                          # removes (_alpha_)
    (re.compile(r'[\[]\s[a-zA-Z]{1}\s[\]]'), ''),                          # removes [_alpha_]
    (re.compile(r'[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]'), ''),  # removes (_Roman_)
    (re.compile(r'[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]'), ''),  # removes (_Roman-Roman_)
    (re.compile(r'[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]'), ''),  # removes (_roman_)
    (re.compile(r'[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]'), ''),  # removes (_roman-roman_)
    (re.compile(r'[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]'), ''),  # removes [_Roman_]
    (re.compile(r'[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]'), ''),  # removes [_Roman-Roman_]
    (re.compile(r'[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]'), ''),  # removes [_roman_]
    (re.compile(r'[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]'), ''),  # removes [_roman-roman_]
    (re.compile(r'[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.table)\s[^\(|^\)]*[\)]', re.I), ''),  # removes figure/table/see references
    # NOTE(review): space -> space, a no-op as written; this looks like a
    # collapse-double-space step whose pattern lost a character -- confirm.
    (re.compile(' '), ' '),
]


def sanitize_line(line):
    """Apply every removal pattern to *line* (right-stripped before each
    substitution, exactly as the original script did) and return the result."""
    for pattern, repl in _SUBS:
        line = pattern.sub(repl, line.rstrip())
    return line


def main(argv):
    """Run the sanitizer; return the exit code."""
    if len(argv) < 3:
        sys.stderr.write("E: usage: " + argv[0] + " <input_file> <output_file> \n")
        sys.stderr.flush()
        return 2
    print("Ok.")

    # Read the input file.
    with open(argv[1], "r") as text_file:
        lines = text_file.read().splitlines()

    # Build the output name by dropping the last 4 characters of argv[2]
    # (assumes a dot plus a 3-character extension, e.g. ".txt").
    # NOTE(review): a non-3-char extension would be mangled -- confirm inputs.
    file_name = argv[2][:-4] + ".san"

    # Open the output once instead of reopening it for every line.
    with open(file_name, "w") as save_file:
        for line in lines:
            save_file.write(sanitize_line(line))
            save_file.write("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
#!/bin/bash

###### Automatic extraction of TRN from several files ######
# For every preprocessed text file, runs the single-file extraction pipeline
# (automatic-extraction-ris-gcs.sh), redirecting its stdout/stderr to
# per-file logs (uno-<file>.txt / dos-<file>.txt).

# NOTE(review): hard-coded paths; set these for the local machine.
BRIES_HOME=/myhome/bries
PMIDS_HOME=/myhome/preprocessed-files
# We don't use REFERENCE_HOME because we don't evaluate. Path
# /reference-data-set doesn't exist. File no-reference.txt doesn't exist.
REFERENCE_HOME=/myhome/reference-data-set

for f in "$PMIDS_HOME"/original/text/*.*
do
    FILE_NAME=$(basename "$f")
    FILE_NAME="${FILE_NAME%.*}"   # drop the extension
    echo "File: $FILE_NAME"
    ./automatic-extraction-ris-gcs.sh "$PMIDS_HOME/features/$FILE_NAME.tra.word.txt" "$PMIDS_HOME/transformed/$FILE_NAME.tra.txt" "$BRIES_HOME/ri-openie-extraction/$FILE_NAME.txt" "$BRIES_HOME/predicted-ris-gcs" Y Y FILT1 "$REFERENCE_HOME" no-reference.txt "$BRIES_HOME/evaluation-reports" no-evaluation.txt diccionario-SYNONYMS.json "$PMIDS_HOME/original/tsv" 1>"uno-$FILE_NAME.txt" 2>"dos-$FILE_NAME.txt"
done
This diff is collapsed. Click to expand it.
#!/bin/bash
# Sentence-simplification driver: runs iSimp over every file in
# <input_path>, then feeds the iSimp output to simplifier.py, which writes
# the simplified sentences plus an index file at <output_file_path>.

# Validate arguments
if [[ ! ("$#" == 3) ]]; then
    echo 'Usage: ./sentence-simplification-main.sh <input_path> <output_file_path> <isimp_path>'
    exit 1
fi

SCRIPT_PATH=$(cd "$(dirname "$0")" && pwd)
INPUT_PATH=$1                # directory with the sentence group to simplify
OUTPUT_INDEX_FILE_PATH=$2    # index of simplified files (recreated below)
ISIMP_PATH=$3
cd "$SCRIPT_PATH"

# Analyse with iSimp.
echo "Analysing in iSimp..."
# Empty the working directory.  find ... -delete avoids
# "/bin/rm: Argument list too long" on large corpora (same approach as
# separator.sh uses for its working directories).
if [ -n "$(ls -A ./iSimp_sentences/)" ]; then
    find ./iSimp_sentences -maxdepth 1 -type f -delete
fi
for j in "$INPUT_PATH"/*
do
    echo "$j"
    "$ISIMP_PATH/simplify.sh" "$j" "$SCRIPT_PATH/iSimp_sentences/$(basename "$j")"
done
cd "$SCRIPT_PATH"

# Create (truncate) the index of simplified files.
>| "$OUTPUT_INDEX_FILE_PATH"

# Feed the iSimp output to the simplification algorithm.
echo "Analysing in Algorithm..."
if [ -n "$(ls -A ./algorithm_sentences/)" ]; then
    find ./algorithm_sentences -maxdepth 1 -type f -delete
fi
for k in "$SCRIPT_PATH"/iSimp_sentences/*
do
    echo "$k"
    python2 "$SCRIPT_PATH/simplifier.py" "$k" "$SCRIPT_PATH/algorithm_sentences/$(basename "$k")" "$OUTPUT_INDEX_FILE_PATH"
done
cd "$SCRIPT_PATH"
This diff is collapsed. Click to expand it.
1 +Delete me
...\ No newline at end of file ...\ No newline at end of file