Showing 2 changed files with 157 additions and 1 deletion
biologicalTermTagging_CRF.py
0 → 100644
# -*- coding: UTF-8 -*-
import json
from optparse import OptionParser
import os
import sys
from time import time
from nltk.corpus import words

__author__ = 'CMendezC'

# Objective: tag biological terms taken from term lists related to the aspects of interest:
# 1) Replace the POS tag with the term tag (unless --crf is used)

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --termPath Path to read term lists.
# 4) --termFiles JSON file with term-list files and tags.
# 5) --crf Keep the POS tag instead of replacing it with the term or frequency tag.

# Output:
# 1) Files with biological terms tagged

# Execution:
# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# FhlA
# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# MarA
# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# ArgR
# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# CytR
# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# Rob
# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with term-list files and tags", metavar="FILE")
    parser.add_option("--crf", default=False,
                      action="store_true", dest="crf",
                      help="Keep the POS tag instead of replacing it with the term or frequency tag?")
    parser.add_option("--termLower", default=False,
                      action="store_true", dest="termLower",
                      help="Compare with lower-case terms?")
    parser.add_option("--termCapitalize", default=False,
                      action="store_true", dest="termCapitalize",
                      help="Compare with capitalized terms?")

    (options, args) = parser.parse_args()

    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Path to read term files: " + str(options.termPath))
    print("Keep the POS tag instead of replacing it with the term or frequency tag? " + str(options.crf))
    print("Compare with lower-case terms? " + str(options.termLower))
    print("Compare with capitalized terms? " + str(options.termCapitalize))

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        lists = json.load(data_file)

    hashTermFiles = lists["hashTermFiles"]
    hashTerms = lists["hashTerms"]
    # Original (space-separated) form of each term, per tag
    hashTermsOrig = {key: [] for key in hashTerms}

    for key in hashTermFiles.keys():
        for f in hashTermFiles[key]:
            # print('File: ' + f)
            with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
                for line in iFile:
                    line = line.strip('\n')
                    lineHyp = line.replace(' ', '-')
                    if lineHyp not in hashTerms[key]:
                        hashTerms[key].append(lineHyp)
                        hashTermsOrig[key].append(line)
                        if options.termLower:
                            hashTerms[key].append(lineHyp.lower())
                            hashTermsOrig[key].append(line.lower())
                        if options.termCapitalize:
                            hashTerms[key].append(lineHyp.capitalize())
                            hashTermsOrig[key].append(line.capitalize())
        print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))

    # regularWords = words.words('en')
    print()

    filesPreprocessed = 0
    t0 = time()
    print("Biological term tagging files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Biological term tagging file..." + str(file))
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write
                with open(os.path.join(options.outputPath, file.replace('lem.txt', 'term.txt')), "w", encoding="utf-8") as oFile:
                    for line in iFile:
                        if line == '\n':
                            oFile.write(line)
                        else:
                            # Expected input format: word <TAB> POS <TAB> lemma [extra fields]
                            line = line.strip('\n')
                            listLine1 = line.split('\t')
                            if len(listLine1) < 3:
                                continue
                            word = listLine1[0]
                            pos = listLine1[1]
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            if len(word) > 1:
                                for termTag in hashTerms:
                                    if word in hashTerms[termTag]:
                                        wordOrig = word.replace('-', ' ')
                                        if wordOrig in hashTermsOrig[termTag]:
                                            # Multiword term: expand it token by token
                                            line = ''
                                            for w, l in zip(word.split('-'), lemma.split('-')):
                                                line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
                                            line = line.rstrip('\n')
                                        else:
                                            line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                            # line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                        # Stop at the first matching term list
                                        break
                                    else:
                                        line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + 'O' + ' TermTag'
                                        # line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                        # line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
                            oFile.write(line + '\n')
            filesPreprocessed += 1

    # Print processed files
    print()
    print("Files preprocessed: " + str(filesPreprocessed))
    print("In: %fs" % (time() - t0))
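
Note: the tagging loop above expects per-token input lines of the form word <TAB> POS <TAB> lemma (the lemmatized *lem.txt files) and appends the term tag plus the literal marker TermTag to the lemma column. A minimal sketch of that per-line transformation (not part of the commit; the token "gntR", POS tag "NN", lemma "gntr", and term tag "TF" are hypothetical examples):

# Sketch of the single-token case handled by the script above
input_line = "gntR\tNN\tgntr"             # word <TAB> POS <TAB> lemma
word, pos, lemma = input_line.split('\t')
term_tag = "TF"                           # tag of the term list that contained the word
output_line = word + '\t' + pos + '\t' + lemma + ' ' + term_tag + ' TermTag'
print(output_line)                        # gntR	NN	gntr TF TermTag
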
...
@@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then
 echo "Terminological tagging..."
 INPUT_PATH=$CORPUS_PATH/lemma
 OUTPUT_PATH=$CORPUS_PATH/term
-python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
+python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
 fi

 if [ "$TRANS" = "TRUE" ]; then
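
Note: both the script and the pipeline step above load termFilesTag.json from --termPath. From the way the script reads it, the file needs two top-level keys: "hashTermFiles", mapping each term tag to a list of term-list file names (one term per line, located under --termPath), and "hashTerms", mapping the same tags to lists that the script fills at run time. A minimal sketch of a compatible file (not part of the commit; the tag names "GENE" and "TF" and the file names are hypothetical examples):

# Sketch: generate a termFilesTag.json with the structure the script expects
import json

config = {
    "hashTermFiles": {
        "GENE": ["gene_terms.txt"],                  # hypothetical term-list files
        "TF": ["transcription_factor_terms.txt"]
    },
    "hashTerms": {
        "GENE": [],                                  # filled by the script at run time
        "TF": []
    }
}

with open("termFilesTag.json", "w", encoding="utf-8") as oFile:
    json.dump(config, oFile, indent=4)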