Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

...@@ -6,11 +6,11 @@ TERM_PATH=/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictiona ...@@ -6,11 +6,11 @@ TERM_PATH=/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictiona
6 POST_PATH=/export/space1/users/compu2/bionlp/stanford-postagger-2018-02-27 6 POST_PATH=/export/space1/users/compu2/bionlp/stanford-postagger-2018-02-27
7 LEMMA_PATH=/export/space1/users/compu2/bionlp/biolemmatizer 7 LEMMA_PATH=/export/space1/users/compu2/bionlp/biolemmatizer
8 8
9 -PRE=TRUE 9 +PRE=FALSE
10 echo " Preprocessing: $PRE" 10 echo " Preprocessing: $PRE"
11 -POS=TRUE 11 +POS=FALSE
12 echo " POS Tagging: $POS" 12 echo " POS Tagging: $POS"
13 -LEMMA=TRUE 13 +LEMMA=FALSE
14 echo " Lemmatization: $LEMMA" 14 echo " Lemmatization: $LEMMA"
15 TERM=TRUE 15 TERM=TRUE
16 echo " Terminological tagging: $TERM" 16 echo " Terminological tagging: $TERM"
...@@ -42,12 +42,12 @@ if [ "$TERM" = "TRUE" ]; then ...@@ -42,12 +42,12 @@ if [ "$TERM" = "TRUE" ]; then
42 echo "Terminological tagging..." 42 echo "Terminological tagging..."
43 INPUT_PATH=$CORPUS_PATH/lemma 43 INPUT_PATH=$CORPUS_PATH/lemma
44 OUTPUT_PATH=$CORPUS_PATH/term 44 OUTPUT_PATH=$CORPUS_PATH/term
45 -python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt 45 +python3.4 biologicalTermTagging-CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
46 fi 46 fi
47 47
48 if [ "$TRANS" = "TRUE" ]; then 48 if [ "$TRANS" = "TRUE" ]; then
49 echo "Transformation..." 49 echo "Transformation..."
50 INPUT_PATH=$CORPUS_PATH/term 50 INPUT_PATH=$CORPUS_PATH/term
51 OUTPUT_PATH=$CORPUS_PATH/transformed 51 OUTPUT_PATH=$CORPUS_PATH/transformed
52 -python3.4 transforming.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation.txt 52 +python3.4 transforming-CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation.txt
53 fi 53 fi
......
1 +# -*- coding: UTF-8 -*-
2 +import re
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +
8 +__author__ = 'CMendezC'
9 +
10 +# Objective: Transforming BIOLemmatized files:
11 +# 1) Transformed files
12 +# 2) Text files to extract aspects
13 +
14 +# Parameters:
15 +# 1) --inputPath Path to read input files.
16 +# 2) --outputPath Path to place output files.
17 +# 3) --textPath Path to place output files.
18 +# 4) --minWordsInLine Minimum length sentence in number of words
19 +# 5) --classes Classes to indicate final of sentence when line contains: PMID\tNUMSENT\tSENT\tCLASS
20 +
21 +# Output:
22 +# 1) transformed files
23 +# 2) text files
24 +
25 +# Execution:
26 +# GntR
27 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
28 +
29 +# FhlA
30 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
31 +
32 +# MarA
33 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
34 +
35 +# ArgR
36 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
37 +
38 +# CytR
39 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
40 +
41 +# Rob
42 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
43 +
44 +# EXTRACTING REGULATORY INTERACTIONS
45 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
46 +
47 +
48 +def length(listWords):
49 + regexWord = re.compile('[a-zA-Z]')
50 + words = 0
51 + chars = 0
52 + for word in listWords:
53 + listTemp = word.split('|')
54 + if regexWord.search(listTemp[1]) is not None:
55 + words += 1
56 + chars += len(listTemp[0])
57 + return words, chars
58 +
59 +###########################################################
60 +# MAIN PROGRAM #
61 +###########################################################
62 +
if __name__ == "__main__":
    # Script entry point: walks --inputPath, converts each term-tagged file
    # (lines of TEXT\tPOS\t"lemma tag [TermTag]") into one 'word|lemma|pos|tag'
    # token stream per sentence, writing *.tra.txt files to --outputPath.

    # Command-line parameter definition
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
                      help="Minimum length sentence in number of words", metavar="NUM")
    parser.add_option("--classes", dest="classes",
                      help="Classes to indicate final of sentence when line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")

    (options, args) = parser.parse_args()

    # Reject positional arguments; parser.error() already exits, so the
    # sys.exit(1) below is effectively unreachable (kept as-is).
    if len(args) > 0:
        parser.error("None parameters indicated.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Minimum length sentence in number of words: " + str(options.minWordsInLine))
    print("Classes to indicate final of sentence: " + str(options.classes))

    # We realized that POS tags from BioLemmatizer are very specific, therefore
    # we decided to use Stanford tags (bioPOST stays False: POS is taken from
    # column 2 of the input line, not from the lemmatizer output).
    bioPOST = False
    filesProcessed = 0
    # minWordsInLine = 3
    if not options.classes is None:
        listClasses = options.classes.split(',')
    t0 = time()
    print("Transforming files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("  Transforming file..." + str(file))
            # Example input lines (tab-separated):
            #TrpR	NN	TrpR	NN	PennPOS
            # ,	,	,	,	NUPOS
            # tryptophan	NN	tryptophan	NN	PennPOS
            listLine1 = []
            listLine2 = []
            text = ''
            lemma = ''
            pos = ''
            textTransformed = ''   # accumulates 'word|lemma|pos|tag ' tokens of the current sentence
            textText = ''          # accumulates plain words (built but never written — kept as-is)
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write; input name *.term.txt -> output *.tra.txt
                with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
                    for line in iFile:
                        if line == '\n':
                            # Blank line = sentence boundary (only meaningful
                            # when no --classes list was given).
                            if options.classes is None:
                                # Emit the sentence only if it has more than
                                # minWordsInLine alphabetic words and at most
                                # 1000 characters of word text.
                                if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
                                    transformedFile.write(textTransformed + '\n')
                                textTransformed = ''
                                textText = ''
                            else:
                                continue
                        else:
                            line = line.strip('\n')
                            #print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            listLine1 = line.split('\t')
                            # Skip lines that do not have exactly 3 tab-separated fields
                            if len(listLine1) != 3:
                                continue
                            text = listLine1[0]
                            # Replacing a strange space character
                            # NOTE(review): replaced character appears to be a plain space;
                            # presumably it was a non-breaking space originally — confirm.
                            text = text.replace(' ', '-')
                            # Third field holds space-separated lemmatizer output: lemma, pos/tag, marker
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            # Replacing a strange space character
                            lemma = lemma.replace(' ', '-')
                            if bioPOST:
                                # BioLemmatizer POS (unused: bioPOST is hard-coded False above)
                                pos = listLine2[1]
                                #print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            else:
                                pos = listLine1[1]
                            # Terminological tag: present only when the third
                            # subfield is the "TermTag" marker; otherwise "O".
                            # NOTE(review): listLine2[2] raises IndexError if the
                            # field has fewer than 3 parts — confirm input format.
                            if listLine2[2] == "TermTag":
                                tag = listLine2[1]
                            else:
                                tag = "O"
                            textText = textText + text + ' '
                            textTransformed = textTransformed + text + '|' + lemma + '|' + pos + '|' + tag + ' '
                            # RI+GC	NN	RI+GC	NN	PennPOS
                            # With --classes, a class word in the text marks the
                            # end of a sentence instead of a blank line.
                            if not options.classes is None:
                                if text in listClasses:
                                    # if length(textTransformed.split()) > options.minWordsInLine:
                                    if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
                                        transformedFile.write(textTransformed + '\n')
                                        # print(textTransformed)
                                        textTransformed = ''
                                        textText = ''
            filesProcessed += 1

    # Print number of processed files and elapsed time
    print()
    print("Files processed: " + str(filesProcessed))
    print("In: %fs" % (time() - t0))