New terminological tagging for CRFs

Carlos-Francisco Méndez-Cruz
Commit 81a1f15bc70e2e77f7371e8aab65c8aefb9ec979 81a1f15b 1 parent 51cbcfbb
Showing 2 changed files with 157 additions and 1 deletions
biologicalTermTagging_CRF.py
nlp-preprocessing-pipeline.sh
--- a/biologicalTermTagging_CRF.py 0 → 100644
View file @81a1f15
+++ b/biologicalTermTagging_CRF.py 0 → 100644
View file @81a1f15
+# -*- coding: UTF-8 -*-
+import json
+from optparse import OptionParser
+import os
+import sys
+from time import time
+from nltk.corpus import words
+
+__author__ = 'CMendezC'
+
+# Objective: Tag biological terms using lists of terms related to aspects of interest:
+#   1) Keep the POS tag and append the term tag (or O when no term matches) after the lemma, for CRFs
+
+# Parameters:
+#   1) --inputPath Path to read input files.
+#   2) --outputPath Path to place output files.
+#   3) --termPath Path to read term lists.
+#   4) --termFiles JSON file with term files and tags.
+#   5) --crf Keep the POS tag instead of replacing it with the term or freq tag.
+#   6) --termLower Also compare with terms in lower case.
+#   7) --termCapitalize Also compare with capitalized terms.
+
+# Output:
+#   1) Files with biological terms tagged
+
+# Execution:
+# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
+
+# FhlA
+# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
+
+# MarA
+# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
+
+# ArgR
+# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
+
+# CytR
+# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
+
+# Rob
+# python biologicalTermTagging_CRF.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
+
+###########################################################
+#                       MAIN PROGRAM                      #
+###########################################################
+
+if __name__ == "__main__":
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Path to read input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Path to place transformed files", metavar="PATH")
+    parser.add_option("--termPath", dest="termPath",
+                      help="Path to read term files", metavar="PATH")
+    parser.add_option("--termFiles", dest="termFiles",
+                      help="JSON file with terms files and tags", metavar="FILE")
+    parser.add_option("--crf", default=False,
+                      action="store_true", dest="crf",
+                      help="Let POS tag instead of substituting it by term or freq tag?")
+    parser.add_option("--termLower", default=False,
+                      action="store_true", dest="termLower",
+                      help="Compare with terms in lower case?")
+    parser.add_option("--termCapitalize", default=False,
+                      action="store_true", dest="termCapitalize",
+                      help="Compare with capitalize terms?")
+
+    (options, args) = parser.parse_args()
+
+    if len(args) > 0:
+        parser.error("None parameters indicated.")
+        sys.exit(1)
+
+    # Printing parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + str(options.inputPath))
+    print("Path to place transformed files: " + str(options.outputPath))
+    print("Path to read term files: " + str(options.termPath))
+    print("Let POS tag instead of substituting it by term or freq tag? " + str(options.crf))
+    print("Compare with terms in lower case? " + str(options.termLower))
+    print("Compare with capitalize terms? " + str(options.termCapitalize))
+
+    print('Loading biological term files...')
+    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
+        lists = json.load(data_file)
+
+    hashTermFiles = lists["hashTermFiles"]
+    hashTerms = lists["hashTerms"]
+    hashTermsOrig = []
+
+    for key in hashTermFiles.keys():
+        for f in hashTermFiles[key]:
+            # print('File: ' + f)
+            with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
+                for line in iFile:
+                    line = line.strip('\n')
+                    lineHyp = line.replace(' ', '-')
+                    if lineHyp not in hashTerms[key]:
+                        hashTerms[key].append(lineHyp)
+                        hashTermsOrig[key].append(line)
+                        if options.termLower:
+                            hashTerms[key].append(lineHyp.lower())
+                            hashTermsOrig[key].append(line.lower())
+                        if options.termCapitalize:
+                            hashTerms[key].append(lineHyp.capitalize())
+                            hashTermsOrig[key].append(line.capitalize())
+        print('   Terms read {} size: {}'.format(key, len(hashTerms[key])))
+
+    #regularWords =  words.words('en')
+    print()
+
+    filesPreprocessed = 0
+    t0 = time()
+    print("Biological term tagging files...")
+    # Walk directory to read files
+    for path, dirs, files in os.walk(options.inputPath):
+        # For each file in dir
+        for file in files:
+            print("   Biological term tagging file..." + str(file))
+            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
+                # Create output file to write
+                with open(os.path.join(options.outputPath, file.replace('lem.txt', 'term.txt')), "w", encoding="utf-8") as oFile:
+                    for line in iFile:
+                        if line == '\n':
+                            oFile.write(line)
+                        else:
+                            line = line.strip('\n')
+                            listLine1 = line.split('\t')
+                            if len(listLine1) < 3:
+                                continue
+                            word = listLine1[0]
+                            pos = listLine1[1]
+                            listLine2 = listLine1[2].split(' ')
+                            lemma = listLine2[0]
+                            if len(word) > 1:
+                                for termTag in hashTerms:
+                                    if word in hashTerms[termTag]:
+                                        wordOrig = word.replace('-', ' ')
+                                        if wordOrig in hashTermsOrig[termTag]:
+                                            line = ''
+                                            for w, l in zip(word.split('-'), lemma.split('-')):
+                                                line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
+                                            line.rstrip('\n')
+                                        else:
+                                            line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
+                                        #line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
+                                    else:
+                                        line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + 'O' + ' TermTag'
+                                        # line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
+                                        #line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
+                            oFile.write(line + '\n')
+            filesPreprocessed += 1
+
+    # Imprime archivos procesados
+    print()
+    print("Files preprocessed: " + str(filesPreprocessed))
+    print("In: %fs" % (time() - t0))
--- a/nlp-preprocessing-pipeline.sh
View file @81a1f15
+++ b/nlp-preprocessing-pipeline.sh
View file @81a1f15
@@ -42,7 +42,7 @@ if [ "$TERM" = "TRUE" ]; then
 echo "Terminological tagging..."
 INPUT_PATH=$CORPUS_PATH/lemma
 OUTPUT_PATH=$CORPUS_PATH/term
-python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
+python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
 fi
 if [ "$TRANS" = "TRUE" ]; then