Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

# -*- coding: UTF-8 -*-
import json
from optparse import OptionParser
import os
import sys
from time import time
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: tag biological terms using lists of terms related to the aspects of interest:
# 1) Replacing the POS tag with a term tag
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --termPath Path to read term lists
# 4) --termFiles JSON file with terms files and tags
# 5) --crf Keep the POS tag instead of substituting it with the term or freq tag
# Output:
# 1) Files with biological terms tagged
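# An illustrative sketch of the structure expected in the --termFiles JSON (parameter 4);
# the tag names and file names below are hypothetical examples, not the actual lists:
# {
#     "hashTermFiles": {"GENE": ["genes.txt"], "TU": ["transcription_units.txt"]},
#     "hashTerms": {"GENE": [], "TU": []}
# }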
# Execution:
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# FhlA
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# MarA
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# ArgR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# CytR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# Rob
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path to read input files", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Path to place transformed files", metavar="PATH")
parser.add_option("--termPath", dest="termPath",
help="Path to read term files", metavar="PATH")
parser.add_option("--termFiles", dest="termFiles",
help="JSON file with terms files and tags", metavar="FILE")
parser.add_option("--crf", default=False,
action="store_true", dest="crf",
                      help="Keep the POS tag instead of substituting it with the term or freq tag?")
parser.add_option("--termLower", default=False,
action="store_true", dest="termLower",
help="Compare with terms in lower case?")
parser.add_option("--termCapitalize", default=False,
action="store_true", dest="termCapitalize",
                      help="Compare with capitalized terms?")
(options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments; only named options are accepted.")
        sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + str(options.inputPath))
print("Path to place transformed files: " + str(options.outputPath))
print("Path to read term files: " + str(options.termPath))
    print("Keep the POS tag instead of substituting it with the term or freq tag? " + str(options.crf))
print("Compare with terms in lower case? " + str(options.termLower))
    print("Compare with capitalized terms? " + str(options.termCapitalize))
print('Loading biological term files...')
with open(os.path.join(options.termPath, options.termFiles)) as data_file:
lists = json.load(data_file)
hashTermFiles = lists["hashTermFiles"]
hashTerms = lists["hashTerms"]
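    # hashTermFiles maps each term tag to the list of term-list file names to read;
    # hashTerms maps each term tag to the list of terms, filled from those files below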
    # hashTermsOrig keeps the original (space-separated) form of each term,
    # parallel to the hyphen-joined form stored in hashTerms
    hashTermsOrig = {}
    for key in hashTermFiles.keys():
        hashTermsOrig[key] = []
        for f in hashTermFiles[key]:
            # print('File: ' + f)
            with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
                for line in iFile:
                    line = line.strip('\n')
                    # Multiword terms are stored hyphen-joined so they can match hyphen-joined tokens
                    lineHyp = line.replace(' ', '-')
                    if lineHyp not in hashTerms[key]:
                        hashTerms[key].append(lineHyp)
                        hashTermsOrig[key].append(line)
                        if options.termLower:
                            hashTerms[key].append(lineHyp.lower())
                            hashTermsOrig[key].append(line.lower())
                        if options.termCapitalize:
                            hashTerms[key].append(lineHyp.capitalize())
                            hashTermsOrig[key].append(line.capitalize())
print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
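    # For illustration (hypothetical tag and terms): after loading, hashTerms["GENE"] could hold
    # ["gntR", "gntR-promoter", ...] while hashTermsOrig["GENE"] keeps the verbatim forms
    # ["gntR", "gntR promoter", ...]; multiword terms are hyphen-joined only in hashTerms.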
#regularWords = words.words('en')
print()
filesPreprocessed = 0
t0 = time()
print("Biological term tagging files...")
# Walk directory to read files
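    # Each line of the input lemma files is assumed to be word<TAB>POS<TAB>lemma[ extra fields];
    # the output keeps those columns and appends the term tag, e.g. (hypothetical token and tag):
    #   input : GntR<TAB>NN<TAB>gntR
    #   output: GntR<TAB>NN<TAB>gntR GENE TermTag
    # Tokens not found in any term list receive the placeholder tag O.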
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Biological term tagging file..." + str(file))
with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
# Create output file to write
with open(os.path.join(options.outputPath, file.replace('lem.txt', 'term.txt')), "w", encoding="utf-8") as oFile:
for line in iFile:
if line == '\n':
oFile.write(line)
else:
line = line.strip('\n')
listLine1 = line.split('\t')
if len(listLine1) < 3:
continue
word = listLine1[0]
pos = listLine1[1]
listLine2 = listLine1[2].split(' ')
lemma = listLine2[0]
                            # Look up the token in every term list; tag it with the first matching term tag
                            termFound = False
                            if len(word) > 1:
                                for termTag in hashTerms:
                                    if word in hashTerms[termTag]:
                                        wordOrig = word.replace('-', ' ')
                                        if wordOrig in hashTermsOrig[termTag]:
                                            # Multiword term: expand the hyphen-joined token into one output line per word
                                            line = ''
                                            for w, l in zip(word.split('-'), lemma.split('-')):
                                                line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
                                            line = line.rstrip('\n')
                                        else:
                                            line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                            # line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                        termFound = True
                                        break
                            if not termFound:
                                # Tokens not found in any term list get the placeholder tag O
                                line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + 'O' + ' TermTag'
                            oFile.write(line + '\n')
filesPreprocessed += 1
    # Print number of processed files
print()
print("Files preprocessed: " + str(filesPreprocessed))
print("In: %fs" % (time() - t0))
@@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then
     echo "Terminological tagging..."
     INPUT_PATH=$CORPUS_PATH/lemma
     OUTPUT_PATH=$CORPUS_PATH/term
-    python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
+    python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
 fi
 if [ "$TRANS" = "TRUE" ]; then