Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

...@@ -6,11 +6,11 @@ TERM_PATH=/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictiona ...@@ -6,11 +6,11 @@ TERM_PATH=/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictiona
6 POST_PATH=/export/space1/users/compu2/bionlp/stanford-postagger-2018-02-27 6 POST_PATH=/export/space1/users/compu2/bionlp/stanford-postagger-2018-02-27
7 LEMMA_PATH=/export/space1/users/compu2/bionlp/biolemmatizer 7 LEMMA_PATH=/export/space1/users/compu2/bionlp/biolemmatizer
8 8
9 -PRE=TRUE 9 +PRE=FALSE
10 echo " Preprocessing: $PRE" 10 echo " Preprocessing: $PRE"
11 -POS=TRUE 11 +POS=FALSE
12 echo " POS Tagging: $POS" 12 echo " POS Tagging: $POS"
13 -LEMMA=TRUE 13 +LEMMA=FALSE
14 echo " Lemmatization: $LEMMA" 14 echo " Lemmatization: $LEMMA"
15 TERM=TRUE 15 TERM=TRUE
16 echo " Terminological tagging: $TERM" 16 echo " Terminological tagging: $TERM"
...@@ -42,12 +42,12 @@ if [ "$TERM" = "TRUE" ]; then ...@@ -42,12 +42,12 @@ if [ "$TERM" = "TRUE" ]; then
42 echo "Terminological tagging..." 42 echo "Terminological tagging..."
43 INPUT_PATH=$CORPUS_PATH/lemma 43 INPUT_PATH=$CORPUS_PATH/lemma
44 OUTPUT_PATH=$CORPUS_PATH/term 44 OUTPUT_PATH=$CORPUS_PATH/term
45 -python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt 45 +python3.4 biologicalTermTagging-CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
46 fi 46 fi
47 47
48 if [ "$TRANS" = "TRUE" ]; then 48 if [ "$TRANS" = "TRUE" ]; then
49 echo "Transformation..." 49 echo "Transformation..."
50 INPUT_PATH=$CORPUS_PATH/term 50 INPUT_PATH=$CORPUS_PATH/term
51 OUTPUT_PATH=$CORPUS_PATH/transformed 51 OUTPUT_PATH=$CORPUS_PATH/transformed
52 -python3.4 transforming.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation.txt 52 +python3.4 transforming-CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation.txt
53 fi 53 fi
......
1 +# -*- coding: UTF-8 -*-
2 +import re
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +
8 +__author__ = 'CMendezC'
9 +
10 +# Objective: Transforming BIOLemmatized files:
11 +# 1) Transformed files
12 +# 2) Text files to extract aspects
13 +
14 +# Parameters:
15 +# 1) --inputPath Path to read input files.
16 +# 2) --outputPath Path to place output files.
17 +# 3) --textPath Path to place output files.
18 +# 4) --minWordsInLine Minimum length sentence in number of words
19 +# 5) --classes Classes to indicate final of sentence when line contains: PMID\tNUMSENT\tSENT\tCLASS
20 +
21 +# Output:
22 +# 1) transformed files
23 +# 2) text files
24 +
25 +# Execution:
26 +# GntR
27 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
28 +
29 +# FhlA
30 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
31 +
32 +# MarA
33 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
34 +
35 +# ArgR
36 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
37 +
38 +# CytR
39 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
40 +
41 +# Rob
42 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
43 +
44 +# EXTRACTING REGULATORY INTERACTIONS
45 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
46 +
47 +
48 +def length(listWords):
49 + regexWord = re.compile('[a-zA-Z]')
50 + words = 0
51 + chars = 0
52 + for word in listWords:
53 + listTemp = word.split('|')
54 + if regexWord.search(listTemp[1]) is not None:
55 + words += 1
56 + chars += len(listTemp[0])
57 + return words, chars
58 +
59 +###########################################################
60 +# MAIN PROGRAM #
61 +###########################################################
62 +
if __name__ == "__main__":
    # Script entry point: walks --inputPath, converts each term-tagged file
    # (lines of TEXT\tPOS\t"lemma tag [TermTag]") into one 'word|lemma|pos|tag'
    # token stream per sentence, writing *.tra.txt files to --outputPath.

    # Command-line parameter definition
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
                      help="Minimum length sentence in number of words", metavar="NUM")
    parser.add_option("--classes", dest="classes",
                      help="Classes to indicate final of sentence when line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")

    (options, args) = parser.parse_args()

    # Reject positional arguments; parser.error() already exits, so the
    # sys.exit(1) below is effectively unreachable (kept as-is).
    if len(args) > 0:
        parser.error("None parameters indicated.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Minimum length sentence in number of words: " + str(options.minWordsInLine))
    print("Classes to indicate final of sentence: " + str(options.classes))

    # We realized that POS tags from BioLemmatizer are very specific, therefore
    # we decided to use Stanford tags (bioPOST stays False: POS is taken from
    # column 2 of the input line, not from the lemmatizer output).
    bioPOST = False
    filesProcessed = 0
    # minWordsInLine = 3
    if not options.classes is None:
        listClasses = options.classes.split(',')
    t0 = time()
    print("Transforming files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("  Transforming file..." + str(file))
            # Example input lines (tab-separated):
            #TrpR	NN	TrpR	NN	PennPOS
            # ,	,	,	,	NUPOS
            # tryptophan	NN	tryptophan	NN	PennPOS
            listLine1 = []
            listLine2 = []
            text = ''
            lemma = ''
            pos = ''
            textTransformed = ''   # accumulates 'word|lemma|pos|tag ' tokens of the current sentence
            textText = ''          # accumulates plain words (built but never written — kept as-is)
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write; input name *.term.txt -> output *.tra.txt
                with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
                    for line in iFile:
                        if line == '\n':
                            # Blank line = sentence boundary (only meaningful
                            # when no --classes list was given).
                            if options.classes is None:
                                # Emit the sentence only if it has more than
                                # minWordsInLine alphabetic words and at most
                                # 1000 characters of word text.
                                if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
                                    transformedFile.write(textTransformed + '\n')
                                textTransformed = ''
                                textText = ''
                            else:
                                continue
                        else:
                            line = line.strip('\n')
                            #print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            listLine1 = line.split('\t')
                            # Skip lines that do not have exactly 3 tab-separated fields
                            if len(listLine1) != 3:
                                continue
                            text = listLine1[0]
                            # Replacing a strange space character
                            # NOTE(review): replaced character appears to be a plain space;
                            # presumably it was a non-breaking space originally — confirm.
                            text = text.replace(' ', '-')
                            # Third field holds space-separated lemmatizer output: lemma, pos/tag, marker
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            # Replacing a strange space character
                            lemma = lemma.replace(' ', '-')
                            if bioPOST:
                                # BioLemmatizer POS (unused: bioPOST is hard-coded False above)
                                pos = listLine2[1]
                                #print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            else:
                                pos = listLine1[1]
                            # Terminological tag: present only when the third
                            # subfield is the "TermTag" marker; otherwise "O".
                            # NOTE(review): listLine2[2] raises IndexError if the
                            # field has fewer than 3 parts — confirm input format.
                            if listLine2[2] == "TermTag":
                                tag = listLine2[1]
                            else:
                                tag = "O"
                            textText = textText + text + ' '
                            textTransformed = textTransformed + text + '|' + lemma + '|' + pos + '|' + tag + ' '
                            # RI+GC	NN	RI+GC	NN	PennPOS
                            # With --classes, a class word in the text marks the
                            # end of a sentence instead of a blank line.
                            if not options.classes is None:
                                if text in listClasses:
                                    # if length(textTransformed.split()) > options.minWordsInLine:
                                    if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
                                        transformedFile.write(textTransformed + '\n')
                                        # print(textTransformed)
                                        textTransformed = ''
                                        textText = ''
            filesProcessed += 1

    # Print number of processed files and elapsed time
    print()
    print("Files processed: " + str(filesProcessed))
    print("In: %fs" % (time() - t0))