nlp-preprocessing-pipeline.sh
1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/bin/sh
# NLP preprocessing pipeline driver.
#
# Runs up to five stages in order. Each stage is a Python script named
# relative to the current working directory, so this driver must be started
# from the directory that contains those scripts. Each stage reads the
# directory written by the previous one:
#
#   original -> preprocessed -> pos -> lemma -> term -> transformed
#
# The stdout of every stage is captured in an output*.txt file in the
# current working directory.
#
# Every setting below can be overridden from the environment; when nothing
# is overridden the script uses the same paths, interpreter and stage
# selection it always has. Example:
#
#   CORPUS_PATH=/tmp/corpus RUN_PRE=FALSE sh <this script>

# Stop at the first failing command and reject unset variables, so a broken
# stage cannot feed missing or stale data into the stages after it.
set -eu

echo 'Preprocessing files...'

# --- Locations ---------------------------------------------------------------
# Root directory that holds one sub-directory per pipeline stage.
CORPUS_PATH=${CORPUS_PATH:-/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets}
# Raw input corpus; defaults to the "original" sub-directory of CORPUS_PATH.
ORIGINAL_CORPUS_PATH=${ORIGINAL_CORPUS_PATH:-$CORPUS_PATH/original}
# Directory passed as --termPath to the term detection and tagging stages.
TERM_PATH=${TERM_PATH:-/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries}
# Directory passed as --taggerPath to the POS tagging stage.
POST_PATH=${POST_PATH:-/export/space1/users/compu2/bionlp/stanford-corenlp-full-2018-02-27}
# Directory passed as --biolemmatizerPath to the lemmatization stage.
LEMMA_PATH=${LEMMA_PATH:-/export/space1/users/compu2/bionlp/biolemmatizer}
# Interpreter used to launch every stage.
PYTHON_BIN=${PYTHON_BIN:-python3.4}

# --- Stage switches ----------------------------------------------------------
# A stage runs only when its switch is exactly "TRUE". The switches carry a
# RUN_ prefix on purpose: a switch named TERM would overwrite the terminal
# type variable inherited from the environment, for this shell and for every
# child process it starts.
RUN_PRE=${RUN_PRE:-TRUE}
echo " Preprocessing: $RUN_PRE"
RUN_POS=${RUN_POS:-TRUE}
echo " POS Tagging: $RUN_POS"
RUN_LEMMA=${RUN_LEMMA:-TRUE}
echo " Lemmatization: $RUN_LEMMA"
RUN_TERM=${RUN_TERM:-TRUE}
echo " Terminological tagging: $RUN_TERM"
RUN_TRANS=${RUN_TRANS:-TRUE}
echo " Transformation: $RUN_TRANS"

# die MESSAGE... -- print MESSAGE on stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# is_enabled SWITCH -- succeed only when SWITCH is exactly the string TRUE.
is_enabled() {
  [ "$1" = "TRUE" ]
}

# run_stage LABEL LOGFILE SCRIPT [ARG...]
#   Print "LABEL...", then run SCRIPT with ARGs under $PYTHON_BIN, sending its
#   stdout to LOGFILE (truncated first). Aborts the whole pipeline through
#   die() when the stage exits non-zero.
run_stage() {
  stage_label=$1
  stage_log=$2
  shift 2
  echo "$stage_label..."
  "$PYTHON_BIN" "$@" > "$stage_log" || die "$stage_label failed"
}

# Stage 1: original corpus -> preprocessed
if is_enabled "$RUN_PRE"; then
  run_stage "Preprocessing" outputPreprocessing.txt \
    preprocessingTermDetection.py \
    --inputPath "$ORIGINAL_CORPUS_PATH" \
    --outputPath "$CORPUS_PATH/preprocessed" \
    --termDetection \
    --termPath "$TERM_PATH" \
    --termFiles termFilesLength.json
fi

# Stage 2: preprocessed -> pos
if is_enabled "$RUN_POS"; then
  run_stage "POS Tagging" outputPOST.txt \
    posTaggingStanford.py \
    --inputPath "$CORPUS_PATH/preprocessed" \
    --outputPath "$CORPUS_PATH/pos" \
    --taggerPath "$POST_PATH" \
    --biolemmatizer
fi

# Stage 3: pos -> lemma
if is_enabled "$RUN_LEMMA"; then
  run_stage "Lemmatization" outputLemma.txt \
    biolemmatizing.py \
    --inputPath "$CORPUS_PATH/pos" \
    --outputPath "$CORPUS_PATH/lemma" \
    --biolemmatizerPath "$LEMMA_PATH"
fi

# Stage 4: lemma -> term
if is_enabled "$RUN_TERM"; then
  run_stage "Terminological tagging" outputTerm.txt \
    biologicalTermTagging.py \
    --inputPath "$CORPUS_PATH/lemma" \
    --outputPath "$CORPUS_PATH/term" \
    --termPath "$TERM_PATH" \
    --termFiles termFilesTag.json
fi

# Stage 5: term -> transformed
if is_enabled "$RUN_TRANS"; then
  run_stage "Transformation" outputTransformation.txt \
    transforming.py \
    --inputPath "$CORPUS_PATH/term" \
    --outputPath "$CORPUS_PATH/transformed" \
    --minWordsInLine 5
fi