Showing
2 changed files
with
157 additions
and
1 deletions
biologicalTermTagging_CRF.py
0 → 100644
| 1 | +# -*- coding: UTF-8 -*- | ||
| 2 | +import json | ||
| 3 | +from optparse import OptionParser | ||
| 4 | +import os | ||
| 5 | +import sys | ||
| 6 | +from time import time | ||
| 7 | +from nltk.corpus import words | ||
| 8 | + | ||
| 9 | +__author__ = 'CMendezC' | ||
| 10 | + | ||
| 11 | +# Objective: Tagging biological terms from lists of terms related to aspects of interest: | ||
| 12 | +# 1) Changing POS tag by term tag | ||
| 13 | + | ||
| 14 | +# Parameters: | ||
| 15 | +# 1) --inputPath Path to read input files. | ||
| 16 | +# 2) --outputPath Path to place output files. | ||
| 17 | +# 3) --termPath Path to read term lists | ||
| 18 | +# 4) --termFiles JSON file with terms files and tags | ||
| 19 | +# 5) --crf Keep the POS tag instead of replacing it with the term or freq tag | ||
| 20 | + | ||
| 21 | +# Output: | ||
| 22 | +# 1) Files with biological terms tagged | ||
| 23 | + | ||
| 24 | +# Execution: | ||
| 25 | +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
| 26 | + | ||
| 27 | +# FhlA | ||
| 28 | +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
| 29 | + | ||
| 30 | +# MarA | ||
| 31 | +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
| 32 | + | ||
| 33 | +# ArgR | ||
| 34 | +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
| 35 | + | ||
| 36 | +# CytR | ||
| 37 | +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
| 38 | + | ||
| 39 | +# Rob | ||
| 40 | +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json | ||
| 41 | + | ||
| 42 | +########################################################### | ||
| 43 | +# MAIN PROGRAM # | ||
| 44 | +########################################################### | ||
| 45 | + | ||
def loadTermLists(termPath, hashTermFiles, hashTerms, termLower=False, termCapitalize=False):
    """Read the term files of every tag and extend ``hashTerms`` in place.

    Each line of a term file is one term. The term is stored twice: with its
    spaces replaced by hyphens (appended to ``hashTerms[tag]``) and verbatim
    (appended to the returned mapping). A term whose hyphenated form is
    already known for the tag is skipped.

    Parameters:
        termPath: directory that contains the term files.
        hashTermFiles: mapping tag -> list of term file names.
        hashTerms: mapping tag -> list of hyphenated terms; modified in place.
        termLower: also store the lower-case variant of every new term.
        termCapitalize: also store the capitalized variant of every new term.

    Returns:
        Mapping tag -> list of terms as written in the files (plus the
        requested case variants), index-aligned with what was appended to
        ``hashTerms[tag]``.
    """
    # One entry per tag: the tagging step looks up every key of hashTerms here.
    hashTermsOrig = {key: [] for key in hashTerms}
    for key in hashTermFiles:
        terms = hashTerms.setdefault(key, [])
        termsOrig = hashTermsOrig.setdefault(key, [])
        # Set mirror of `terms` so the duplicate check is O(1) per line.
        seen = set(terms)
        for f in hashTermFiles[key]:
            with open(os.path.join(termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
                for line in iFile:
                    line = line.strip('\n')
                    lineHyp = line.replace(' ', '-')
                    if lineHyp in seen:
                        continue
                    variants = [(lineHyp, line)]
                    if termLower:
                        variants.append((lineHyp.lower(), line.lower()))
                    if termCapitalize:
                        variants.append((lineHyp.capitalize(), line.capitalize()))
                    for hyp, orig in variants:
                        terms.append(hyp)
                        termsOrig.append(orig)
                        seen.add(hyp)
        print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
    return hashTermsOrig


def tagTokenLine(line, hashTerms, hashTermsOrig):
    """Return the tagged text for one token line, or None if it must be skipped.

    ``line`` is one input line without its trailing newline, expected as
    ``word<TAB>pos<TAB>lemma[ extra...]``. Lines with fewer than three
    tab-separated columns yield None.

    The result is ``word<TAB>pos<TAB>lemma <tag> TermTag`` where ``<tag>`` is
    the first key of ``hashTerms`` (in iteration order) whose collection
    contains the word, or ``O`` when no list matches or the word is a single
    character. The POS column is always kept. When the word with hyphens
    turned back into spaces is a known original term, the token is split on
    '-' into one output line per piece (joined by newlines), each carrying
    the same tag.

    NOTE(review): the nesting of the original tagging branches could not be
    recovered from the source; labelling every token with a term tag or 'O'
    is the inferred intent - confirm against the consumer of these files.
    """
    listLine1 = line.split('\t')
    if len(listLine1) < 3:
        return None
    word = listLine1[0]
    pos = listLine1[1]
    lemma = listLine1[2].split(' ')[0]

    matchedTag = None
    if len(word) > 1:
        for termTag in hashTerms:
            if word in hashTerms[termTag]:
                # Stop at the first hit so a later, non-matching list cannot
                # overwrite the tag that was found.
                matchedTag = termTag
                break

    if matchedTag is None:
        return word + '\t' + pos + '\t' + lemma + ' ' + 'O' + ' TermTag'

    wordOrig = word.replace('-', ' ')
    if wordOrig in hashTermsOrig.get(matchedTag, ()):
        # Term whose hyphens stand for the spaces of the listed term: emit
        # one line per piece. Joining avoids a trailing newline, which would
        # otherwise produce a spurious blank line in the output.
        pieces = [w + '\t' + pos + '\t' + l + ' ' + matchedTag + ' TermTag'
                  for w, l in zip(word.split('-'), lemma.split('-'))]
        return '\n'.join(pieces)
    return word + '\t' + pos + '\t' + lemma + ' ' + matchedTag + ' TermTag'


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with terms files and tags", metavar="FILE")
    parser.add_option("--crf", default=False,
                      action="store_true", dest="crf",
                      help="Let POS tag instead of substituting it by term or freq tag?")
    parser.add_option("--termLower", default=False,
                      action="store_true", dest="termLower",
                      help="Compare with terms in lower case?")
    parser.add_option("--termCapitalize", default=False,
                      action="store_true", dest="termCapitalize",
                      help="Compare with capitalize terms?")

    (options, args) = parser.parse_args()

    # Positional arguments are not accepted. parser.error() prints the usage
    # message and terminates the process itself.
    if len(args) > 0:
        parser.error("None parameters indicated.")

    # Fail with a usage message instead of a TypeError from os.path.join
    # when a path option was left out.
    for required in ("inputPath", "outputPath", "termPath", "termFiles"):
        if getattr(options, required) is None:
            parser.error("Missing required option --" + required)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Path to read term files: " + str(options.termPath))
    print("Let POS tag instead of substituting it by term or freq tag? " + str(options.crf))
    print("Compare with terms in lower case? " + str(options.termLower))
    print("Compare with capitalize terms? " + str(options.termCapitalize))

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        lists = json.load(data_file)

    hashTermFiles = lists["hashTermFiles"]
    hashTerms = lists["hashTerms"]
    hashTermsOrig = loadTermLists(options.termPath, hashTermFiles, hashTerms,
                                  termLower=options.termLower,
                                  termCapitalize=options.termCapitalize)

    # Sets give O(1) membership tests for the per-token lookups below.
    termSets = {tag: set(terms) for tag, terms in hashTerms.items()}
    termOrigSets = {tag: set(terms) for tag, terms in hashTermsOrig.items()}

    print()

    filesPreprocessed = 0
    t0 = time()
    print("Biological term tagging files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for fileName in files:
            print(" Biological term tagging file..." + str(fileName))
            outName = os.path.join(options.outputPath, fileName.replace('lem.txt', 'term.txt'))
            with open(os.path.join(path, fileName), "r", encoding="utf-8", errors="replace") as iFile, \
                    open(outName, "w", encoding="utf-8") as oFile:
                for line in iFile:
                    if line == '\n':
                        # Blank lines are copied through unchanged.
                        oFile.write(line)
                        continue
                    tagged = tagTokenLine(line.strip('\n'), termSets, termOrigSets)
                    if tagged is not None:
                        oFile.write(tagged + '\n')
            filesPreprocessed += 1

    # Print the number of processed files and the elapsed time
    print()
    print("Files preprocessed: " + str(filesPreprocessed))
    print("In: %fs" % (time() - t0))
| ... | @@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then | ... | @@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then |
| 42 | echo "Terminological tagging..." | 42 | echo "Terminological tagging..." |
| 43 | INPUT_PATH=$CORPUS_PATH/lemma | 43 | INPUT_PATH=$CORPUS_PATH/lemma |
| 44 | OUTPUT_PATH=$CORPUS_PATH/term | 44 | OUTPUT_PATH=$CORPUS_PATH/term |
| 45 | -python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt | 45 | +python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt |
| 46 | fi | 46 | fi |
| 47 | 47 | ||
| 48 | if [ "$TRANS" = "TRUE" ]; then | 48 | if [ "$TRANS" = "TRUE" ]; then | ... | ... |
-
Please register or login to post a comment