separator-v02.sh
5.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/bin/bash
# Separates sentences by deverbal (.dev) and verbal (.vrb)
# Original Daniel: PATH_TO_CORENLP=/home/elwe/Documents/temporal/CoreNLP
# Validate arguments: exactly six positional parameters are required.
if (( $# != 6 )); then
  # Usage text is a diagnostic, so it goes to stderr.
  echo 'Usage: ./separator.sh <path_to_corenlp> <input_path> <output_path> <dicc_path> <if_tag> <if_separate>' >&2
  exit 1
fi

# Absolute directory that holds this script. The tagged/ and tagged-line/
# work directories and the filter script are addressed relative to it.
# Every expansion is quoted so a script path containing spaces still resolves,
# and a failed lookup aborts: an empty SCRIPT_PATH would make the later
# "find ... -delete" cleanups operate on /tagged and /tagged-line.
SCRIPT_PATH=$(cd -- "$(dirname -- "$0")" && pwd) || {
  echo "separator: cannot resolve the script directory from '$0'" >&2
  exit 1
}

# Original Daniel: INPUT_PATH=$1 and OUTPUT_PATH=$2 (the CoreNLP path was
# hard-coded at that time).
PATH_TO_CORENLP=$1   # CoreNLP directory; its contents go on the Java classpath
INPUT_PATH=$2        # folder containing the files to separate
OUTPUT_PATH=$3       # output root; the dev/ and vrb/ subfolders are used
DICC_PATH=$4         # dictionary folder (normalized_Effects.json is read from it)
# Tag sentences so deverbal and verbal sentences can be separated.
# The tagging stage runs only when this is exactly "TRUE" (caller: $DEVTAG).
TAG=$5
# Do separate deverbal and verbal sentences.
# The separation stage runs only when this is exactly "TRUE" (caller: $DEVSEPAR).
SEP=$6
#######################################
# Flattens a CoNLL file into a single line of "col2/col4" tokens.
# For every non-empty row, column 2 and column 4 are joined with "/"
# (presumably the word and its POS tag in CoreNLP's CoNLL output - confirm),
# and all resulting tokens are joined with single spaces.
# Arguments: $1 - CoNLL file to read
#            $2 - file to write (overwritten)
# Returns:   exit status of the awk | paste pipeline
#######################################
conll_to_tagged_line() {
  awk 'NF {print $2 "/" $4}' "$1" | paste -s -d ' ' - > "$2"
}

# Tagging stage: runs only when the fifth argument was exactly "TRUE".
# [[ ]] with a quoted operand keeps an empty TAG from raising a test error.
if [[ "$TAG" == "TRUE" ]]; then
  # Analyze with Stanford CoreNLP.
  # ${SCRIPT_PATH:?} aborts instead of touching /tagged if the variable is empty.
  tagged_dir="${SCRIPT_PATH:?SCRIPT_PATH is not set}/tagged"
  tagged_line_dir="${SCRIPT_PATH}/tagged-line"

  # Make sure both work directories exist so the output redirection below
  # cannot fail on a fresh checkout.
  mkdir -p -- "$tagged_dir" "$tagged_line_dir"

  # Drop the results of a previous run. find -delete is used because
  # "rm dir/*" failed with "Argument list too long" on large runs; it is a
  # no-op on an empty directory, so no emptiness check is needed.
  find "$tagged_dir" -maxdepth 1 -name '*.conll' -delete
  # Added by CMC
  find "$tagged_line_dir" -maxdepth 1 -name '*.spt' -delete

  # History: Daniel originally called LexicalizedParser with the
  # "wordsAndTags" output format; later versions ran the StanfordCoreNLP
  # pipeline with tokenize,ssplit,pos,lemma,ner,parse and then with
  # tokenize,ssplit,pos,parse. Only tokenize,ssplit,pos is run now.
  for j in "$INPUT_PATH"/*; do
    # An unmatched glob stays literal; skip it instead of handing it to java.
    [[ -e "$j" ]] || continue
    java -cp "$PATH_TO_CORENLP/*" -Xmx2g \
      edu.stanford.nlp.pipeline.StanfordCoreNLP \
      -annotators tokenize,ssplit,pos \
      -outputFormat conll \
      -file "$j" \
      -outputDirectory "$tagged_dir" \
      || echo "separator: CoreNLP failed on '$j'" >&2
  done

  # Convert every tagged file into its one-line form. The last extension of
  # the tagged file name is replaced by ".spt" in tagged-line/.
  for j in "$tagged_dir"/*; do
    [[ -f "$j" ]] || continue
    filename=$(basename -- "$j")
    conll_to_tagged_line "$j" "$tagged_line_dir/${filename%.*}.spt"
  done
fi # if TAG == "TRUE"
#######################################
# Copies one input file into the dev/ and/or vrb/ output folder according to
# the exit status returned by the filter script.
#   11 - contains deverbal and verbal forms -> dev/<name>.dev and vrb/<name>.vrb
#   12 - contains deverbal forms            -> dev/<name>.dev
#   13 - contains verbal forms              -> vrb/<name>.vrb
#   10 - appears to contain neither         -> vrb/<name>.vrb (with a notice)
# Arguments: $1 - exit status of the filter script
#            $2 - source file to copy
#            $3 - output root that holds the dev/ and vrb/ folders
# Outputs:   "Non deverbal and verbal" on stdout for status 10;
#            a warning on stderr for any other unlisted status
# Returns:   0 on success, the failing cp status on a copy error,
#            2 for a status outside 10-13
#######################################
route_by_filter_code() {
  local code=$1 src=$2 out_root=$3
  local name=${src##*/}
  local rc=0
  case "$code" in
    11)
      # Both copies are attempted even if the first one fails.
      cp -- "$src" "$out_root/dev/$name.dev" || rc=$?
      cp -- "$src" "$out_root/vrb/$name.vrb" || rc=$?
      ;;
    12)
      cp -- "$src" "$out_root/dev/$name.dev" || rc=$?
      ;;
    13)
      cp -- "$src" "$out_root/vrb/$name.vrb" || rc=$?
      ;;
    10)
      echo "Non deverbal and verbal"
      cp -- "$src" "$out_root/vrb/$name.vrb" || rc=$?
      ;;
    *)
      # Anything else means the filter itself failed (crash, missing
      # interpreter, ...); report it instead of silently dropping the file.
      echo "separator: unexpected filter exit status '$code' for '$src'; file skipped" >&2
      rc=2
      ;;
  esac
  return "$rc"
}

# Separation stage: runs only when the sixth argument was exactly "TRUE".
# [[ ]] with a quoted operand keeps an empty SEP from raising a test error.
if [[ "$SEP" == "TRUE" ]]; then
  # Separate files.
  # Modified by Carlos Méndez: the dev/ and vrb/ subfolders are cleaned,
  # not the output root itself (Daniel's original checked $OUTPUT_PATH).
  dev_dir="${OUTPUT_PATH:?OUTPUT_PATH is not set}/dev"
  vrb_dir="${OUTPUT_PATH}/vrb"
  mkdir -p -- "$dev_dir" "$vrb_dir"

  # Drop the results of a previous run. find -delete is used because
  # "rm dir/*" failed with "Argument list too long" on large runs.
  find "$dev_dir" -maxdepth 1 -name '*.dev' -delete
  find "$vrb_dir" -maxdepth 1 -name '*.vrb' -delete

  # Filter history:
  #   Original Daniel: filter.py with names_EFFECT_ONTOGENE.txt
  #   CMC 2018-12-04:  filter.py with normalized_Effects.json
  #                    (without separating verbal forms)
  #   CMC 2018-12-11:  filter-v02.py (with separating verbal forms)
  #   CMC 2018-12-11:  filter-v03.py, considering only the passive verbal
  #                    form as deverbal (VBN verb, past participle)
  for j in "${SCRIPT_PATH:?SCRIPT_PATH is not set}/tagged-line"/*; do
    # An unmatched glob stays literal; skip it instead of handing it to python.
    [[ -f "$j" ]] || continue
    python3 "$SCRIPT_PATH/filter-v03.py" "$j" "$DICC_PATH/normalized_Effects.json"
    VAR=$?
    # The tagged-line file name minus its last extension is the name of the
    # corresponding file in the input folder.
    input_name=$(basename -- "${j%.*}")
    route_by_filter_code "$VAR" "$INPUT_PATH/$input_name" "$OUTPUT_PATH"
  done
fi # if SEP == "TRUE"