Carlos-Francisco Méndez-Cruz

Setting up project

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
# -*- coding: UTF-8 -*-
import json
from optparse import OptionParser
import os
import sys
from time import time
from nltk.corpus import words
__author__ = 'CMendezC'
# Objective: Tagging biological terms from lists of terms related to aspects of interest:
# 1) Replacing the POS tag with the term tag
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --termPath Path to read term lists
# 4) --termFiles JSON file with terms files and tags
# 5) --crf Keep the POS tag instead of substituting it with the term or frequency tag
# Output:
# 1) Files with biological terms tagged
# Execution:
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# FhlA
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# MarA
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# ArgR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# CytR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# Rob
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path to read input files", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Path to place transformed files", metavar="PATH")
parser.add_option("--termPath", dest="termPath",
help="Path to read term files", metavar="PATH")
parser.add_option("--termFiles", dest="termFiles",
help="JSON file with terms files and tags", metavar="FILE")
parser.add_option("--crf", default=False,
action="store_true", dest="crf",
help="Let POS tag instead of substituting it by term or freq tag?")
parser.add_option("--termLower", default=False,
action="store_true", dest="termLower",
help="Compare with terms in lower case?")
parser.add_option("--termCapitalize", default=False,
action="store_true", dest="termCapitalize",
help="Compare with capitalize terms?")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + str(options.inputPath))
print("Path to place transformed files: " + str(options.outputPath))
print("Path to read term files: " + str(options.termPath))
print("Let POS tag instead of substituting it by term or freq tag? " + str(options.crf))
print("Compare with terms in lower case? " + str(options.termLower))
print("Compare with capitalize terms? " + str(options.termCapitalize))
##### LOADING BIOLOGICAL TERM FILES #####
# hashTermFiles = {
# 'DFAM': ['domain_families_1grams.txt', 'domain_families_2grams.txt', 'domain_families_3grams.txt', 'domain_families_4grams.txt', 'domain_families_5Moregrams.txt'],
# 'MF': ['domain_function_1grams.txt', 'domain_function_2grams.txt', 'domain_function_3grams.txt', 'domain_function_4grams.txt' , 'domain_function_5Moregrams.txt'],
# 'RP': ['regulatory_Processes_GO_1grams.txt', 'regulatory_Processes_GO_2grams.txt', 'regulatory_Processes_GO_3grams.txt', 'regulatory_Processes_GO_4grams.txt', 'regulatory_Processes_GO_5Moregrams.txt'],
# 'DPOS': ['domain_position_1grams.txt', 'domain_position_2grams.txt', 'domain_position_5Moregrams.txt'],
# 'DMOT': ['domain_structural_motif_1grams.txt', 'domain_structural_motif_2grams.txt'],
# 'TF': ['tfs.txt']
# }
# hashTerms = {
# 'DFAM': [],
# 'MF': [],
# 'RP': [],
# 'DPOS': [],
# 'DMOT': [],
# 'TF': []
# }
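# A minimal sketch of the expected JSON layout in the --termFiles file
# (hypothetical tag and file names, mirroring the commented-out
# dictionaries above; the real file ships with the term lists):
# {
#     "hashTermFiles": {"TF": ["tfs.txt"], "GENE": ["genes.txt"]},
#     "hashTerms": {"TF": [], "GENE": []}
# }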
print('Loading biological term files...')
with open(os.path.join(options.termPath, options.termFiles)) as data_file:
lists = json.load(data_file)
hashTermFiles = lists["hashTermFiles"]
hashTerms = lists["hashTerms"]
for key in hashTermFiles.keys():
for f in hashTermFiles[key]:
# print('File: ' + f)
with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
for line in iFile:
line = line.strip('\n')
line = line.replace(' ', '-')
if line not in hashTerms[key]:
hashTerms[key].append(line)
if options.termLower:
hashTerms[key].append(line.lower())
if options.termCapitalize:
hashTerms[key].append(line.capitalize())
print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
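# English word list from NLTK, used by the TF heuristic below to avoid
# tagging ordinary English words as transcription factors (requires the
# 'words' corpus: nltk.download('words'))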
regularWords = words.words('en')
print()
filesPreprocessed = 0
t0 = time()
print("Biological term tagging files...")
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Biological term tagging file..." + str(file))
with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
# Create output file to write
with open(os.path.join(options.outputPath, file.replace('lem.txt', 'term.txt')), "w", encoding="utf-8") as oFile:
for line in iFile:
if line == '\n':
oFile.write(line)
else:
line = line.strip('\n')
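# Input lines are tab-separated: word<TAB>POS<TAB>"lemma POS source",
# e.g. "TrpR<TAB>NN<TAB>TrpR NN PennPOS" (see the example in transforming.py)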
listLine1 = line.split('\t')
if len(listLine1) < 3:
continue
word = listLine1[0]
pos = listLine1[1]
listLine2 = listLine1[2].split(' ')
lemma = listLine2[0]
if len(word) > 1:
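# Tagging policy implemented below: TF matches by equality or prefix
# (filtered with the English word list) and is tagged only for NN* POS;
# EFFECT matches the lowercased word; DIS matches by lemma prefix with a
# POS stop list; GENE/TU require NN*; GC excludes function-word POS tags;
# FWDOM/FWRP receive FreqTag instead of TermTag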
for termTag in hashTerms:
if termTag == "TF":
for term in hashTerms[termTag]:
if (word == term) or (word.startswith(term) and lemma not in regularWords):
print(" TAG WORD {} AS TF CAUSE START WITH TF {} OR IT IS EQUAL".format(word, term))
if listLine1[1].startswith("NN"):
# line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
elif termTag == "EFFECT":
if word.lower() in hashTerms[termTag]:
line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
elif termTag == "DIS":
for term in hashTerms[termTag]:
if lemma.startswith(term) and (pos not in ["CC", "DT", "FW", "CD", "IN", "PRP$", "JJ", "JJR", "JJS", "VBN", "RB"]):
line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
else:
if word in hashTerms[termTag]:
# listLine2 = listLine1[2].split(' ')
if termTag in ["GENE", "TU"]:
if listLine1[1].startswith("NN"):
# line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
elif termTag in ["GC"]:
if pos not in ["CC", "DT", "FW", "CD", "IN", "PRP$", "NNP"]:
# line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
else:
if termTag in ['FWDOM', 'FWRP']:
# line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' FreqTag'
line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' FreqTag'
else:
# line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
# line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
oFile.write(line + '\n')
filesPreprocessed += 1
# Print processed files
print()
print("Files preprocessed: " + str(filesPreprocessed))
print("In: %fs" % (time() - t0))
#!/bin/sh
echo 'Preprocessing files...'
ORIGINAL_CORPUS_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
CORPUS_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
TERM_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/dictionaries
PRE=TRUE
echo " Preprocessing: $PRE"
POS=TRUE
echo " POS Tagging: $POS"
LEMMA=TRUE
echo " Lemmatization: $LEMMA"
TERM=TRUE
echo " Terminological tagging: $TERM"
TRANS=TRUE
echo " Transformation: $TRANS"
if [ "$PRE" = "TRUE" ]; then
echo "Preprocessing..."
INPUT_PATH=$ORIGINAL_CORPUS_PATH
OUTPUT_PATH=$CORPUS_PATH/preprocessed
python3.4 preprocessingTermDetection.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termDetection --termPath $TERM_PATH --termFiles termFilesLength_LREGULONDB.json > outputPreprocessing_lregulondb.txt
# python3.4 preprocessingTermDetection.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH > outputPreprocessing_lregulondb.txt
fi
if [ "$POS" = "TRUE" ]; then
echo "POS Tagging..."
INPUT_PATH=$CORPUS_PATH/preprocessed
OUTPUT_PATH=$CORPUS_PATH/pos
python3.4 posTaggingStanford.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --taggerPath /home/cmendezc/STANFORD_POSTAGGER/stanford-postagger-2015-12-09 --biolemmatizer > outputPOST_lregulondb.txt
fi
if [ "$LEMMA" = "TRUE" ]; then
echo "Lemmatization..."
INPUT_PATH=$CORPUS_PATH/pos
OUTPUT_PATH=$CORPUS_PATH/lemma
python3.4 biolemmatizing.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --biolemmatizerPath /home/cmendezc/BIO_LEMMATIZER > outputLemma_lregulondb.txt
fi
if [ "$TERM" = "TRUE" ]; then
echo "Terminological tagging..."
INPUT_PATH=$CORPUS_PATH/lemma
OUTPUT_PATH=$CORPUS_PATH/term
python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag_LREGULONDB.json > outputTerm_lregulondb.txt
fi
if [ "$TRANS" = "TRUE" ]; then
echo "Transformation..."
INPUT_PATH=$CORPUS_PATH/term
OUTPUT_PATH=$CORPUS_PATH/transformed
python3.4 transforming.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation_lregulondb.txt
fi
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
from subprocess import call
__author__ = 'CMendezC'
# Objective: Part-of-Speech Tagging of several files with Stanford POS Tagger.
# Parameters:
# 1) --inputPath Path to read TXT files.
# 2) --outputPath Path to place POST files.
# 3) --taggerPath Path POS Tagger command.
# 4) --biolemmatizer Format output for BioLemmatizer?
# Output:
# 1) POS Tagged files.
# 2) If --biolemmatizer with format:
# Rob NNP
# is VBZ
# a DT
# transcriptional JJ
# dual JJ
# regulator NN
# . .
#
# Its PRP$
# N-terminal JJ ...
# Execution:
# GntR
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
# FhlA
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
# MarA
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
# ArgR
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
# CytR
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
# Rob
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
# EXTRACTING REGULATORY INTERACTIONS
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("-i", "--inputPath", dest="inputPath",
help="Path to read TXT files", metavar="PATH")
parser.add_option("-o", "--outputPath", dest="outputPath",
help="Path to place POST files", metavar="PATH")
parser.add_option("-a", "--taggerPath", dest="taggerPath", default="",
help="Path FreeLing analyzer files", metavar="PATH")
parser.add_option("-p", "--biolemmatizer", default=False,
action="store_true", dest="biolemmatizer",
help="Format for biolemmatizer?")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + str(options.inputPath))
print("Path to place output files: " + str(options.outputPath))
print("Path POS Tagger command: " + str(options.taggerPath))
print("Format for biolemmatizer?: " + str(options.biolemmatizer))
filesTagged = 0
t0 = time()
print("Tagging corpus...")
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Tagging file..." + str(file))
try:
# FREELING: taggerPath = os.path.join(options.taggerPath, "analyzer.ex")
# FREELING: command = taggerPath + " -f " + os.path.join("%FREELINGSHARE%", "config", "en.cfg") + " <" + os.path.join(path, file) + "> " + os.path.join(options.outputPath, file) + ".post.txt"
# stanford-postagger models\english-left3words-distsim.tagger
# C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TFsummaries_tagged_SGC_aspectRP-DOM\ECK120011190.Rob.sum.txt
# >
# C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectsOfInterest_TrainingSet\testingTaggers\ECK120011190.Rob.sum.txt
import platform
plat = platform.system()
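# The Java classpath separator is OS-dependent (':' on Linux, ';' on
# Windows), hence the two command variants below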
if plat == 'Linux':
# FOR LINUX
# java -mx300m -cp 'stanford-postagger.jar:lib/*' edu.stanford.nlp.tagger.maxent.MaxentTagger
# -model $1 -textFile $2
command = "java -mx300m -cp " + os.path.join(options.taggerPath, 'stanford-postagger.jar:') + \
os.path.join(options.taggerPath, 'lib/*') + \
' edu.stanford.nlp.tagger.maxent.MaxentTagger -model ' + \
os.path.join(options.taggerPath, 'models', 'english-left3words-distsim.tagger') + \
' -textFile ' + os.path.join(options.inputPath, file) + \
' > ' + os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt'))
else:
# C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\preprocessingCorpus>java -mx300m
# -cp "C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\stanford-postagger.jar;
# C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\lib/*"
# edu.stanford.nlp.tagger.maxent.MaxentTagger -model
# C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\models\english-left3words-distsim.tagger
# -textFile C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\preprocessed\ECK120011190.Rob.sum.pre.txt
#taggerPath = os.path.join('java')
command = "java -mx300m -cp " + os.path.join(options.taggerPath, 'stanford-postagger.jar;') + \
os.path.join(options.taggerPath, 'lib/*') + \
' edu.stanford.nlp.tagger.maxent.MaxentTagger -model ' + \
os.path.join(options.taggerPath, 'models', 'english-left3words-distsim.tagger') + \
' -textFile ' + os.path.join(options.inputPath, file) + \
' > ' + os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')) #print(command)
retcode = call(command, shell=True)
if retcode < 0:
print(" Child was terminated by signal", -retcode, file=sys.stderr)
else:
print(" Child returned", retcode, file=sys.stderr)
filesTagged += 1
except OSError as e:
print(" Execution failed:", e, file=sys.stderr)
text = ""
if options.biolemmatizer:
with open(os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')), "r", encoding="utf-8", errors="replace") as iFile:
text = iFile.read()
# -LRB-_-LRB- PTS_NN -RRB-_-RRB-
# for_IN Mlc_NN inactivation_NN ._.
text = text.replace('-LRB-', '(')
text = text.replace('-RRB-', ')')
text = text.replace('-LSB-', '[')
text = text.replace('-RSB-', ']')
text = text.replace('_', '\t')
text = text.replace(' ', '\n')
text = text.replace('.\n', '.\n\n')
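# After these replacements, a tagged sentence such as "Rob_NNP is_VBZ ._."
# becomes one word<TAB>tag pair per line, with a blank line after each
# sentence (the biolemmatizer format shown in the header comment)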
with open(os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')), "w", encoding="utf-8", errors="replace") as oFile:
oFile.write(text)
# Print processed files
print()
print("Files POS Tagged: " + str(filesTagged))
print("Files POS Tagged in: %fs" % (time() - t0))
# -*- coding: UTF-8 -*-
import json
import re
from optparse import OptionParser
import os
import sys
from time import time
import nltk
__author__ = 'CMendezC'
# Objective: Preprocessing paper files:
# Eliminate lines beginning with:
# Copyright � 1997
# © 1997 Elsevier
# Copyright © 1998,
# Keywords: GntR; cAMP-CRP; GntP family
# Received 21 October 1996/Accepted 27 December 1996
# Received 6 January 1997; accepted 5 June 1997; Received by A. Nakazawa
# (Received 29 June 1998/Accepted 3 August 1998)
# REFERENCES: Eisenberg, R.C., Dobrogosz, W.J., 1967 | Hung, A., Orozco, A., Zwaig, N., 1970.
# Shine, J. & Dalgarno, L. (1974).
# 34. Saier, M. H., T. M. Ramseier, and J. Reizer. 1996.
# * Corresponding author. Mailing address: Department of Microbiology,
# Phone: (614) 688-3518.
# Fax: (614) 688-3519.
# E-mail: conway.51@osu.edu.
# Downloaded from
# Selecting lines until ACKNOWLEDGMENTS or REFERENCES or Acknowledgements or References
# Biological term detection
# Parameters:
# 1) --inputPath Path to read TXT files.
# 2) --outputPath Path to place POST files.
# 3) --termPath Path to read term lists
# 4) --termFiles JSON file with terms files and length
# 5) --termDetection If term detection is performed
# 6) --multiDocument Processing multidocuments within input file?
# 7) --tabFormat File with format PMID\tNUMSENT\tSENT\tCLASS?
# 8) --joinPunctuation Join separated punctuation (it comes separated from ODIN-XML files)
# Output:
# 1) preprocessed files with biological term detection
# Execution:
# GntR
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
# FhlA
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
# MarA
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
# ArgR
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
# CytR
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
# Rob
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
# EXTRACTING REGULATORY INTERACTIONS
# python preprocessingTermDetection.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\original
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\preprocessed
# --termPath C:\Users\cmendezc\Documents\GENOMICAS\preprocessingTermTagging_v1.0\termLists
# --termFiles termFilesLength.json
# def addEndPeriod(cad):
# if cad.endswith('.'):
# return cad
# else:
# return cad + '.'
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path to read input files", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Path to place output files", metavar="PATH")
parser.add_option("--termPath", dest="termPath",
help="Path of term files", metavar="PATH")
parser.add_option("--termFiles", dest="termFiles",
help="JSON file with terms files and length", metavar="PATH")
parser.add_option("--termDetection", default=False,
action="store_true", dest="termDetection",
help="Perform term detection?")
parser.add_option("--multiDocument", default=False,
action="store_true", dest="multiDocument",
help="Processing multidocuments within input file?")
parser.add_option("--tabFormat", default=False,
action="store_true", dest="tabFormat",
help="File with format PMID\tNUMSENT\tSENT\tCLASS?")
parser.add_option("--joinPunctuation", default=False,
action="store_true", dest="joinPunctuation",
help="Join separated punctuation?")
parser.add_option("--termLower", default=False,
action="store_true", dest="termLower",
help="Compare with terms in lower case?")
parser.add_option("--termCapitalize", default=False,
action="store_true", dest="termCapitalize",
help="Compare with capitalize terms?")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + str(options.inputPath))
print("Path to place output files: " + str(options.outputPath))
print("Perform term detection?: " + str(options.termDetection))
if options.termDetection:
print("Path to read terminological resources: " + str(options.termPath))
print("JSON file with terms files and length: " + str(options.termFiles))
print("Processing multidocuments within input file?: " + str(options.multiDocument))
print("File with format PMID\tNUMSENT\tSENT\tCLASS?: " + str(options.tabFormat))
print("Join separated punctuation?: " + str(options.joinPunctuation))
# #### REGEX DEFINITION FOR UNNECESSARY LINES #####
regexEmptyLine = re.compile('^\s*$')
# Copyright � 1997
# © 1997 Elsevier
# Copyright © 1998,
# Keywords: GntR; cAMP-CRP; GntP family
# Received 21 October 1996/Accepted 27 December 1996
# Received 6 January 1997; accepted 5 June 1997; Received by A. Nakazawa
# (Received 29 June 1998/Accepted 3 August 1998)
# * Corresponding author. Mailing address: Department of Microbiology,
# Phone: (614) 688-3518.
# Fax: (614) 688-3519.
# E-mail: conway.51@osu.edu.
# Downloaded from
# www.sciencedirect.com Current Opinion in Microbiology 2008, 11:87–93 88 Cell regulation
# DOI 10.1016 / j.mib .2008.02.007
# Correspondence to J
# journal homepage: www.elsevier.com/locate/biotechadv
# Research review paper
# Article history:
# Accepted 18 April 2014
# Available online 26 April 2014
# Abbreviations : ROS ,
# JOURNAL OF
# 0021-9193/02
# Mailing address : CSIC - Estación Experimental del Zaidín , Apdo .
# Correos 419 , E - 18008 Granada , Spain .
# Phone : 34 - 58 - 121011 .
# Fax : 34 - 58 - 129600 .
# Present address : Department of Biology , Imperial College of Science ,
expression = '^(Copyright|© [0-9][0-9][0-9][0-9]|Keywords:|\(?Received [0-9]?[0-9]|\*?\s?Corresponding author|' + \
'Phone:|Fax:|E-mail:|Phone\s:|Fax\s:|E-mail\s:|Mailing\saddress\s:|Present\saddress\s:|' + \
'Downloaded\sfrom|DOI|www\.sciencedirect\.com|Correspondence to [A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]|' + \
'journal homepage:|Research review paper|Article history:|\(?Accepted [0-9]?[0-9]|' + \
'Available online|Abbreviations:|ACKNOWLEDGMENTS\s|REFERENCES\s|' + \
'All rights reserved|Published by Elsevier|' + \
'Verbatim copying and redistribution of this article|J Bacteriol [0-9][0-9][0-9][0-9]|' + \
'Mol Microbiol [0-9][0-9][0-9][0-9]|Nucleic Acids Res [0-9][0-9][0-9][0-9]|' + \
'JOURNAL OF|[0-9][0-9][0-9][0-9]\-[0-9][0-9][0-9]/[0-9][0-9]|[0-9][0-9][0-9] – [0-9][0-9][0-9] Vol)'
regexUnnecessaryLines = re.compile(expression)
#regexUnnecessaryLines = re.compile('^(Copyright)')
# REFERENCES: Eisenberg, R.C., Dobrogosz, W.J., 1967
# Hung, A., Orozco, A., Zwaig, N., 1970.
# Shine, J. & Dalgarno, L. (1974).
# 34. Saier, M. H., T. M. Ramseier, and J. Reizer. 1996.
# 1. Pesavento, C. & Hengge, R. Bacterial nucleotide-based
# Battesti , N .
# Aiba , H . , T .
# Yamamoto , and M .
# regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+.*([0-9][0-9][0-9][0-9])')
# regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+')
regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+($|.*\(\s?[0-9][0-9][0-9][0-9]\s?\))')
# Lines without words, with only symbols
# --.-,.;....a...........c....
# .........
# 2.;
# ..~......: ........................
# ::..:.< -.;-.:.;L.:.5 %..-.-...;..;..,:
# ?........., .....,: ........,,::, , ...
# ..
# .J
# L,.
# 2
# i
# regexLinesNoText = re.compile('^[^a-zA-Z0-9]')
# regexUnderscoreWord = re.compile(r'\b_\b')
# 40 or more dots, which appear in index lines
regexIndexLine = re.compile('\.{40}')
# e-mails
regexEmail = re.compile(
'(e-mail : |e-mail: |e-mail )?([a-zA-Z0-9\._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+ |[a-zA-Z0-9\._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+ )')
### DETECT A CONTENTS (INDEX) SECTION AND REMOVE EVERYTHING UP TO INTRODUCTION (?): Overview of oxidative stress response ... ... 28 2 .
### IF IT IS INTRODUCTION, ACKNOWLEDGMENTS, OR ANOTHER TITLE, ADD A PERIOD TO IT, OR REMOVE IT IF IT STANDS ALONE WITH NO OTHER WORD AFTER IT.
# SOMETIMES Summary IS USED
# Join separated punctuation
if options.joinPunctuation:
# 1) join to right: (, [, “, ‘, ±, ~
regexPuncRight = re.compile('(?P<punct>[\(\[“‘±~])\s')
# 2) join to left: ), ], ., ,, ”, ´, ;, %, :, ’, '
regexPuncLeft = re.compile('\s(?P<punct>[\)\]\.,”´;%:’\'])')
# 3) join both sides: -, /, –, —
regexPuncBoth = re.compile('\s(?P<punct>[-/–—])\s')
# 4) genitive: ArgP ’ s
regexPuncGenitive = re.compile('(?P<before>[a-zA-Z])\s’\ss\s')
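# A sanity sketch of the joins above (hypothetical input):
# "the ( gntR ) promoter ; ArgP ’ s site" -> "the (gntR) promoter; ArgP’s site"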
# #### LOADING BIOLOGICAL TERM FILES #####
if options.termDetection:
with open(os.path.join(options.termPath, options.termFiles)) as data_file:
hashes = json.load(data_file)
hashTermFiles = hashes["hashTermFiles"]
hashTerms = hashes["hashTerms"]
for key in hashTermFiles.keys():
for f in hashTermFiles[key]:
with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
for line in iFile:
line = line.strip('\n')
if line not in hashTerms[key]:
hashTerms[key].append(line)
if options.termLower:
hashTerms[key].append(line.lower())
if options.termCapitalize:
hashTerms[key].append(line.capitalize())
print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
filesProcessed = 0
t0 = time()
print("Preprocessing files...")
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Preprocessing file..." + str(file))
text = ''
listSentences = []
references = 0
with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
# Create output file to write
# with open(os.path.join(options.outputPath, file.replace('.txt', '.pre.txt')), "w", encoding="utf-8") as oFile:
for line in iFile:
originalLine = line.strip('\n')
if options.joinPunctuation:
# The genitive pattern consumes the trailing space, so the replacement restores it
originalLine = regexPuncGenitive.sub(r'\g<before>’s ', originalLine)
originalLine = regexPuncRight.sub(r'\g<punct>', originalLine)
originalLine = regexPuncLeft.sub(r'\g<punct>', originalLine)
originalLine = regexPuncBoth.sub(r'\g<punct>', originalLine)
if options.tabFormat:
listLine = originalLine.split('\t')
line = listLine[2]
### DETECT ACKNOWLEDGMENTS AND REMOVE EVERYTHING THAT FOLLOWS
# This would eliminate useful parts of papers: if line.upper().startswith('ACKNOWLEDGMENT') or line.upper().startswith('REFERENCES') or references > 2:
# Do not eliminate references because within them there are RIs
# if not options.multiDocument:
# if line.upper() == 'ACKNOWLEDGMENTS' or line.upper() == 'REFERENCES' or references > 2:
# break
if not options.multiDocument:
if line.upper() == 'ACKNOWLEDGMENTS':
break
# if line == '' or line == None:
if regexEmptyLine.match(line) is not None:
print('Empty line ' + line)
continue
# Do not eliminate references because within them there are RIs
# if regexReferences.match(line) != None:
# print('Reference line ' + str(line.encode(encoding='UTF-8', errors='replace')))
# references += 1
# continue
# if regexUnnecessaryLines.match(line) != None:
if regexUnnecessaryLines.search(line) is not None:
print('Unnecessary line ' + str(line.encode(encoding='UTF-8', errors='replace')))
continue
if regexIndexLine.search(line) is not None:
print('Index line ' + line)
continue
if regexEmail.search(line) is not None:
print('Line with email: ' + line)
line = regexEmail.sub(' ', line)
# print(line)
text += originalLine + '\n'
if options.termDetection:
# #### BIOLOGICAL TERM DETECTION #####
print(' Detecting biological terms...')
for key in sorted(hashTerms.keys(), reverse=True):
#print(' length: ' + str(key))
for term in hashTerms[key]:
#print(str(term.encode(encoding='UTF-8', errors='replace')))
text = text.replace(term, term.replace(' ', '-'))
#regexTerm = re.compile(r'' + term)
#regexTerm.sub(term.replace(' ', '_TERM_'), text)
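# e.g. a hypothetical multi-word term "DNA binding site" becomes
# "DNA-binding-site", so it survives whitespace tokenization as one token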
filesProcessed += 1
with open(os.path.join(options.outputPath, file.replace(' ', '').replace('.txt', '.pre.txt')), "w", encoding="utf-8") as oFile:
oFile.write(text)
# Print processed files
print()
print("Files preprocessed: " + str(filesProcessed))
print("In: %fs" % (time() - t0))
# -*- coding: UTF-8 -*-
import re
from optparse import OptionParser
import os
import sys
from time import time
__author__ = 'CMendezC'
# Objective: Transforming BIOLemmatized files:
# 1) Transformed files
# 2) Text files to extract aspects
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --textPath Path to place text files
# 4) --minWordsInLine Minimum sentence length in number of words
# 5) --classes Classes indicating end of sentence when line contains: PMID\tNUMSENT\tSENT\tCLASS
# Output:
# 1) transformed files
# 2) text files
# Execution:
# GntR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
# FhlA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
# MarA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
# ArgR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
# CytR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
# Rob
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
# EXTRACTING REGULATORY INTERACTIONS
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
def length(listWords):
regexWord = re.compile('[a-zA-Z]')
words = 0
chars = 0
for word in listWords:
listTemp = word.split('|')
if regexWord.search(listTemp[1]) is not None:
words += 1
chars += len(listTemp[0])
return words, chars
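# Sketch (hypothetical tokens): length("TrpR|TrpR|NN ,|,|, binds|bind|VBZ".split())
# returns (2, 9): two tokens whose lemma contains a letter, plus the summed
# surface-form lengths of those tokens (4 + 5)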
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("-i", "--inputPath", dest="inputPath",
help="Path to read input files", metavar="PATH")
parser.add_option("-o", "--outputPath", dest="outputPath",
help="Path to place transformed files", metavar="PATH")
parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
help="Minimum length sentence in number of words", metavar="NUM")
parser.add_option("--classes", dest="classes",
help="Classes to indicate final of sentence when line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + str(options.inputPath))
print("Path to place transformed files: " + str(options.outputPath))
print("Minimum length sentence in number of words: " + str(options.minWordsInLine))
print("Classes to indicate final of sentence: " + str(options.classes))
# We realized that POS tags from BioLemmatizer are very specific, so we decided to use Stanford tags
bioPOST = False
filesProcessed = 0
# minWordsInLine = 3
if options.classes is not None:
listClasses = options.classes.split(',')
t0 = time()
print("Transforming files...")
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Transforming file..." + str(file))
#TrpR NN TrpR NN PennPOS
# , , , , NUPOS
# tryptophan NN tryptophan NN PennPOS
listLine1 = []
listLine2 = []
text = ''
lemma = ''
pos = ''
textTransformed = ''
textText = ''
with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
# Create output file to write
with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
for line in iFile:
if line == '\n':
if options.classes is None:
if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
transformedFile.write(textTransformed + '\n')
textTransformed = ''
textText = ''
else:
continue
else:
line = line.strip('\n')
#print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
listLine1 = line.split('\t')
if len(listLine1) != 3:
continue
text = listLine1[0]
# Replacing a strange space character
text = text.replace(' ', '-')
listLine2 = listLine1[2].split(' ')
lemma = listLine2[0]
# Replacing a strange space character
lemma = lemma.replace(' ', '-')
if bioPOST:
pos = listLine2[1]
#print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
else:
pos = listLine1[1]
textText = textText + text + ' '
textTransformed = textTransformed + text + '|' + lemma + '|' + pos + ' '
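# Each token is emitted as surface|lemma|POS, e.g. "TrpR|TrpR|NN"
# (using the input format shown in the header comment above)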
# RI+GC NN RI+GC NN PennPOS
if options.classes is not None:
if text in listClasses:
# if length(textTransformed.split()) > options.minWordsInLine:
if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
transformedFile.write(textTransformed + '\n')
# print(textTransformed)
textTransformed = ''
textText = ''
filesProcessed += 1
# Print processed files
print()
print("Files processed: " + str(filesProcessed))
print("In: %fs" % (time() - t0))