Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

1 +# -*- coding: UTF-8 -*-
2 +import json
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +from nltk.corpus import words
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: Tag biological terms using lists of terms related to aspects of interest:
12 +# 1) Keep the POS tag of each token and append the term tag (or 'O' when the token is not a listed term)
13 +
14 +# Parameters:
15 +# 1) --inputPath Path to read input files.
16 +# 2) --outputPath Path to place output files.
17 +# 3) --termPath Path to read term lists
18 +# 4) --termFiles JSON file with terms files and tags
19 +# 5) --crf Keep the POS tag instead of replacing it with the term or freq tag
20 +
21 +# Output:
22 +# 1) Files with biological terms tagged
23 +
24 +# Execution:
25 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
26 +
27 +# FhlA
28 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
29 +
30 +# MarA
31 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
32 +
33 +# ArgR
34 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
35 +
36 +# CytR
37 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
38 +
39 +# Rob
40 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
41 +
42 +###########################################################
43 +# MAIN PROGRAM #
44 +###########################################################
45 +
def loadTermLists(termPath, hashTermFiles, hashTerms, termLower=False, termCapitalize=False):
    """Read the term files of every tag and index their terms for lookup.

    ``hashTerms`` is updated in place: each value is converted to a set and
    extended with the hyphenated form (spaces replaced by '-') of every term
    read. Sets make the per-token membership tests constant time.

    Parameters:
        termPath: directory that contains the term files.
        hashTermFiles: mapping tag -> list of term file names.
        hashTerms: mapping tag -> iterable of already known hyphenated terms.
        termLower: also register the lower-cased variant of each term.
        termCapitalize: also register the capitalized variant of each term.

    Returns:
        Mapping tag -> set of terms exactly as written in the files (spaces
        kept). It is used to tell multi-word terms from terms whose hyphen
        is part of the name.
    """
    hashTermsOrig = {}
    for key in hashTerms:
        hashTerms[key] = set(hashTerms[key])
        hashTermsOrig[key] = set()

    for key, fileNames in hashTermFiles.items():
        # setdefault tolerates a tag that has term files but no "hashTerms" entry
        terms = hashTerms.setdefault(key, set())
        termsOrig = hashTermsOrig.setdefault(key, set())
        for fileName in fileNames:
            with open(os.path.join(termPath, fileName), "r", encoding="utf-8", errors="replace") as iFile:
                for line in iFile:
                    term = line.strip('\n')
                    if not term:
                        # A blank line is not a term
                        continue
                    termHyp = term.replace(' ', '-')
                    if termHyp in terms:
                        continue
                    terms.add(termHyp)
                    termsOrig.add(term)
                    if termLower:
                        terms.add(termHyp.lower())
                        termsOrig.add(term.lower())
                    if termCapitalize:
                        terms.add(termHyp.capitalize())
                        termsOrig.add(term.capitalize())
        print(' Terms read {} size: {}'.format(key, len(terms)))
    return hashTermsOrig


def tagTokenLine(line, hashTerms, hashTermsOrig):
    """Return the tagged version of one token line.

    ``line`` is expected as ``word<TAB>pos<TAB>lemma[ extra...]`` without the
    trailing newline. The result is ``word<TAB>pos<TAB>lemma TAG TermTag``
    where TAG is the first tag of ``hashTerms`` that lists the word, or 'O'.
    A multi-word term (its hyphens stand for spaces of the original term) is
    split into one output line per word, joined by '\\n', all with the same
    POS and tag.

    Returns None for lines with fewer than three tab-separated columns;
    such lines are dropped by the caller.
    """
    columns = line.split('\t')
    if len(columns) < 3:
        return None
    word = columns[0]
    pos = columns[1]
    lemma = columns[2].split(' ')[0]

    if len(word) <= 1:
        # NOTE(review): single-character tokens are written back untouched
        # (no tag columns), as the original length guard appears to do.
        # Confirm that downstream steps expect this.
        return line

    template = '{}\t{}\t{} {} TermTag'
    for termTag in hashTerms:
        if word not in hashTerms[termTag]:
            continue
        # First matching tag wins; later tags must not overwrite it.
        if word.replace('-', ' ') in hashTermsOrig.get(termTag, ()):
            # zip() stops at the shorter side if word and lemma have a
            # different number of '-' separated parts.
            pairs = zip(word.split('-'), lemma.split('-'))
            return '\n'.join(template.format(w, pos, l, termTag) for w, l in pairs)
        # The hyphen belongs to the term itself: keep it as a single token
        return template.format(word, pos, lemma, termTag)
    return template.format(word, pos, lemma, 'O')


if __name__ == "__main__":
    # Local import: keeps the tag order of the JSON file so that tag priority
    # is deterministic on interpreters with unordered dicts.
    from collections import OrderedDict

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with terms files and tags", metavar="FILE")
    # --crf is accepted and printed, but nothing below reads it
    parser.add_option("--crf", default=False,
                      action="store_true", dest="crf",
                      help="Let POS tag instead of substituting it by term or freq tag?")
    parser.add_option("--termLower", default=False,
                      action="store_true", dest="termLower",
                      help="Compare with terms in lower case?")
    parser.add_option("--termCapitalize", default=False,
                      action="store_true", dest="termCapitalize",
                      help="Compare with capitalize terms?")

    (options, args) = parser.parse_args()

    # parser.error() prints the message and exits, so no sys.exit() is needed
    if args:
        parser.error("Unexpected positional arguments: " + ' '.join(args))
    for required in ("inputPath", "outputPath", "termPath", "termFiles"):
        if getattr(options, required) is None:
            parser.error("Missing required option --" + required)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Path to read term files: " + str(options.termPath))
    print("Let POS tag instead of substituting it by term or freq tag? " + str(options.crf))
    print("Compare with terms in lower case? " + str(options.termLower))
    print("Compare with capitalize terms? " + str(options.termCapitalize))

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles), encoding="utf-8") as data_file:
        lists = json.load(data_file, object_pairs_hook=OrderedDict)

    hashTermFiles = lists["hashTermFiles"]
    hashTerms = lists["hashTerms"]
    hashTermsOrig = loadTermLists(options.termPath, hashTermFiles, hashTerms,
                                  termLower=options.termLower,
                                  termCapitalize=options.termCapitalize)

    print()

    filesPreprocessed = 0
    t0 = time()
    print("Biological term tagging files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for fileName in files:
            print(" Biological term tagging file..." + str(fileName))
            outputName = fileName.replace('lem.txt', 'term.txt')
            with open(os.path.join(path, fileName), "r", encoding="utf-8", errors="replace") as iFile, \
                    open(os.path.join(options.outputPath, outputName), "w", encoding="utf-8") as oFile:
                for line in iFile:
                    if line == '\n':
                        # Blank lines are copied through unchanged
                        oFile.write(line)
                        continue
                    tagged = tagTokenLine(line.strip('\n'), hashTerms, hashTermsOrig)
                    if tagged is not None:
                        oFile.write(tagged + '\n')
            filesPreprocessed += 1

    # Print the number of processed files
    print()
    print("Files preprocessed: " + str(filesPreprocessed))
    print("In: %fs" % (time() - t0))
...@@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then ...@@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then
42 echo "Terminological tagging..." 42 echo "Terminological tagging..."
43 INPUT_PATH=$CORPUS_PATH/lemma 43 INPUT_PATH=$CORPUS_PATH/lemma
44 OUTPUT_PATH=$CORPUS_PATH/term 44 OUTPUT_PATH=$CORPUS_PATH/term
45 -python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt 45 +python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
46 fi 46 fi
47 47
48 if [ "$TRANS" = "TRUE" ]; then 48 if [ "$TRANS" = "TRUE" ]; then
......