Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

......@@ -6,11 +6,11 @@ TERM_PATH=/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictiona
POST_PATH=/export/space1/users/compu2/bionlp/stanford-postagger-2018-02-27
LEMMA_PATH=/export/space1/users/compu2/bionlp/biolemmatizer
PRE=TRUE
PRE=FALSE
echo " Preprocessing: $PRE"
POS=TRUE
POS=FALSE
echo " POS Tagging: $POS"
LEMMA=TRUE
LEMMA=FALSE
echo " Lemmatization: $LEMMA"
TERM=TRUE
echo " Terminological tagging: $TERM"
......@@ -42,12 +42,12 @@ if [ "$TERM" = "TRUE" ]; then
echo "Terminological tagging..."
INPUT_PATH=$CORPUS_PATH/lemma
OUTPUT_PATH=$CORPUS_PATH/term
python3.4 biologicalTermTagging_CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
python3.4 biologicalTermTagging-CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag.json > outputTerm.txt
fi
if [ "$TRANS" = "TRUE" ]; then
echo "Transformation..."
INPUT_PATH=$CORPUS_PATH/term
OUTPUT_PATH=$CORPUS_PATH/transformed
python3.4 transforming.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation.txt
python3.4 transforming-CRF.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation.txt
fi
......
# -*- coding: UTF-8 -*-
import re
from optparse import OptionParser
import os
import sys
from time import time
__author__ = 'CMendezC'
# Objective: Transforming BIOLemmatized files:
# 1) Transformed files
# 2) Text files to extract aspects
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --textPath Path to place output files.
# 4) --minWordsInLine Minimum length sentence in number of words
# 5) --classes Classes to indicate final of sentence when line contains: PMID\tNUMSENT\tSENT\tCLASS
# Output:
# 1) transformed files
# 2) text files
# Execution:
# GntR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
# FhlA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
# MarA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
# ArgR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
# CytR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
# Rob
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
# EXTRACTING REGULATORY INTERACTIONS
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
def length(listWords):
    """Count content words and surface characters in a token list.

    Each token is expected in pipe-separated form, as produced by this
    script: surface|lemma|pos|tag. A token counts as a word when its
    second field (the lemma) contains at least one ASCII letter; for each
    counted token, the length of the first field (the surface form) is
    added to the character total. Punctuation-only tokens contribute to
    neither count.

    Parameters:
        listWords: list of pipe-separated token strings.

    Returns:
        (words, chars) tuple of int counters.
    """
    regexWord = re.compile('[a-zA-Z]')
    words = 0
    chars = 0
    for word in listWords:
        listTemp = word.split('|')
        # Skip malformed tokens with no lemma field instead of raising
        # IndexError (the original assumed every token contained '|').
        if len(listTemp) < 2:
            continue
        if regexWord.search(listTemp[1]) is not None:
            words += 1
            chars += len(listTemp[0])
    return words, chars
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # Script entry point: walks --inputPath, and for each tab-separated
    # term-tagged file writes a "transformed" file where every token is
    # rendered as surface|lemma|pos|tag, one sentence per line.
    # Parameter definition
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
                      help="Minimum length sentence in number of words", metavar="NUM")
    parser.add_option("--classes", dest="classes",
                      help="Classes to indicate final of sentence when line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("None parameters indicated.")
        # NOTE(review): parser.error() already exits with status 2, so this
        # sys.exit(1) is unreachable dead code — confirm before removing.
        sys.exit(1)
    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Minimum length sentence in number of words: " + str(options.minWordsInLine))
    print("Classes to indicate final of sentence: " + str(options.classes))
    # We realized that POS tags from Biolemmatizer are very specific, therefore we decided to use Standford tags
    bioPOST = False
    filesProcessed = 0
    # minWordsInLine = 3
    if not options.classes is None:
        listClasses = options.classes.split(',')
    t0 = time()
    print("Transforming files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("   Transforming file..." + str(file))
            # Input line layout (tab-separated, 3 columns), e.g.:
            #    TrpR	NN	TrpR NN PennPOS
            #    ,	,	, , NUPOS
            #    tryptophan	NN	tryptophan NN PennPOS
            listLine1 = []
            listLine2 = []
            text = ''
            lemma = ''
            pos = ''
            # Accumulators for the current sentence; a blank line (or a
            # --classes token) marks the sentence boundary and flushes them.
            textTransformed = ''
            textText = ''
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write
                with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
                    for line in iFile:
                        if line == '\n':
                            if options.classes is None:
                                # Keep only sentences longer than the word
                                # threshold and at most 1000 surface chars.
                                if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
                                    transformedFile.write(textTransformed + '\n')
                                textTransformed = ''
                                textText = ''
                            else:
                                # With --classes, sentence ends are detected
                                # by class tokens below, not blank lines.
                                continue
                        else:
                            line = line.strip('\n')
                            # print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            listLine1 = line.split('\t')
                            # Skip lines that do not have exactly 3 columns.
                            if len(listLine1) != 3:
                                continue
                            text = listLine1[0]
                            # Replacing a strange space character
                            text = text.replace(' ', '-')
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            # Replacing a strange space character
                            lemma = lemma.replace(' ', '-')
                            if bioPOST:
                                # BioLemmatizer POS (second field of column 3)
                                pos = listLine2[1]
                                # print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            else:
                                # Stanford POS (column 2) — preferred, see note above.
                                pos = listLine1[1]
                            # Terminological tag: when column 3 carries a
                            # "TermTag" marker, its second field is the tag.
                            if listLine2[2] == "TermTag":
                                tag = listLine2[1]
                            else:
                                tag = "O"
                            textText = textText + text + ' '
                            textTransformed = textTransformed + text + '|' + lemma + '|' + pos + '|' + tag + ' '
                            # Class token example:  RI+GC	NN	RI+GC NN PennPOS
                            if not options.classes is None:
                                # A token from --classes ends the sentence.
                                if text in listClasses:
                                    # if length(textTransformed.split()) > options.minWordsInLine:
                                    if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
                                        transformedFile.write(textTransformed + '\n')
                                        # print(textTransformed)
                                    textTransformed = ''
                                    textText = ''
            filesProcessed += 1
    # Print processed files
    print()
    print("Files processed: " + str(filesProcessed))
    print("In: %fs" % (time() - t0))