Showing 6 changed files with 884 additions and 0 deletions.
.idea/vcs.xml
0 → 100644
biologicalTermTagging.py
0 → 100644
# -*- coding: UTF-8 -*-
import json
from optparse import OptionParser
import os
import sys
from time import time
from nltk.corpus import words

__author__ = 'CMendezC'

# Objective: tag biological terms from lists of terms related to aspects of interest:
# 1) Replace the POS tag with the term tag

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --termPath Path to read term lists.
# 4) --termFiles JSON file with term files and tags.
# 5) --crf Keep the POS tag instead of substituting it with the term or freq tag.

# Output:
# 1) Files with biological terms tagged.

# Execution:
# GntR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# FhlA
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# MarA
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# ArgR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# CytR
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

# Rob
# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json

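# Expected input format (inferred from the parsing code in the main loop below;
# the sample tokens are illustrative): one token per line, tab-separated as
# WORD<TAB>POS<TAB>LEMMA POS TAGSET, with a blank line between sentences, e.g.:
#   GntR<TAB>NN<TAB>GntR NN PennPOS
#   represses<TAB>VBZ<TAB>repress VBZ PennPOS
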
###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with term files and tags", metavar="FILE")
    parser.add_option("--crf", default=False,
                      action="store_true", dest="crf",
                      help="Keep POS tag instead of substituting it with term or freq tag?")
    parser.add_option("--termLower", default=False,
                      action="store_true", dest="termLower",
                      help="Compare with terms in lower case?")
    parser.add_option("--termCapitalize", default=False,
                      action="store_true", dest="termCapitalize",
                      help="Compare with capitalized terms?")

    (options, args) = parser.parse_args()

    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Path to read term files: " + str(options.termPath))
    print("Keep POS tag instead of substituting it with term or freq tag? " + str(options.crf))
    print("Compare with terms in lower case? " + str(options.termLower))
    print("Compare with capitalized terms? " + str(options.termCapitalize))

    ##### LOADING BIOLOGICAL TERM FILES #####
    # hashTermFiles = {
    #     'DFAM': ['domain_families_1grams.txt', 'domain_families_2grams.txt', 'domain_families_3grams.txt', 'domain_families_4grams.txt', 'domain_families_5Moregrams.txt'],
    #     'MF': ['domain_function_1grams.txt', 'domain_function_2grams.txt', 'domain_function_3grams.txt', 'domain_function_4grams.txt', 'domain_function_5Moregrams.txt'],
    #     'RP': ['regulatory_Processes_GO_1grams.txt', 'regulatory_Processes_GO_2grams.txt', 'regulatory_Processes_GO_3grams.txt', 'regulatory_Processes_GO_4grams.txt', 'regulatory_Processes_GO_5Moregrams.txt'],
    #     'DPOS': ['domain_position_1grams.txt', 'domain_position_2grams.txt', 'domain_position_5Moregrams.txt'],
    #     'DMOT': ['domain_structural_motif_1grams.txt', 'domain_structural_motif_2grams.txt'],
    #     'TF': ['tfs.txt']
    # }

    # hashTerms = {
    #     'DFAM': [],
    #     'MF': [],
    #     'RP': [],
    #     'DPOS': [],
    #     'DMOT': [],
    #     'TF': []
    # }

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        lists = json.load(data_file)

    hashTermFiles = lists["hashTermFiles"]
    hashTerms = lists["hashTerms"]

    for key in hashTermFiles.keys():
        for f in hashTermFiles[key]:
            # print('File: ' + f)
            with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
                for line in iFile:
                    line = line.strip('\n')
                    # Multiword terms come hyphenated from the preprocessing stage
                    line = line.replace(' ', '-')
                    if line not in hashTerms[key]:
                        hashTerms[key].append(line)
                        if options.termLower:
                            hashTerms[key].append(line.lower())
                        if options.termCapitalize:
                            hashTerms[key].append(line.capitalize())
        print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))

    regularWords = words.words('en')
    print()

    filesPreprocessed = 0
    t0 = time()
    print("Biological term tagging files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Biological term tagging file..." + str(file))
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write
                with open(os.path.join(options.outputPath, file.replace('lem.txt', 'term.txt')), "w", encoding="utf-8") as oFile:
                    for line in iFile:
                        if line == '\n':
                            oFile.write(line)
                        else:
                            line = line.strip('\n')
                            listLine1 = line.split('\t')
                            if len(listLine1) < 3:
                                continue
                            word = listLine1[0]
                            pos = listLine1[1]
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            if len(word) > 1:
                                for termTag in hashTerms:
                                    if termTag == "TF":
                                        for term in hashTerms[termTag]:
                                            if (word == term) or (word.startswith(term) and lemma not in regularWords):
                                                print(" Tagging word {} as TF: it equals or starts with the TF term {}".format(word, term))
                                                if listLine1[1].startswith("NN"):
                                                    # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                                    line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                    elif termTag == "EFFECT":
                                        if word.lower() in hashTerms[termTag]:
                                            line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
                                    elif termTag == "DIS":
                                        for term in hashTerms[termTag]:
                                            if lemma.startswith(term) and (pos not in ["CC", "DT", "FW", "CD", "IN", "PRP$", "JJ", "JJR", "JJS", "VBN", "RB"]):
                                                line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
                                    else:
                                        if word in hashTerms[termTag]:
                                            # listLine2 = listLine1[2].split(' ')
                                            if termTag in ["GENE", "TU"]:
                                                if listLine1[1].startswith("NN"):
                                                    # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                                    line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                            elif termTag in ["GC"]:
                                                if pos not in ["CC", "DT", "FW", "CD", "IN", "PRP$", "NNP"]:
                                                    # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                                    line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
                                            else:
                                                if termTag in ['FWDOM', 'FWRP']:
                                                    # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' FreqTag'
                                                    line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' FreqTag'
                                                else:
                                                    # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                                    # line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                                    line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
                            oFile.write(line + '\n')
            filesPreprocessed += 1

    # Print processed files
    print()
    print("Files preprocessed: " + str(filesPreprocessed))
    print("In: %fs" % (time() - t0))
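Note on the --termFiles JSON: the loader above expects two parallel objects, "hashTermFiles" mapping each tag to its term-list file names and "hashTerms" mapping each tag to an initially empty list that is filled at load time. A minimal sketch of such a file, with tag keys and file names taken from the commented-out defaults above (illustrative only):

{
    "hashTermFiles": {
        "TF": ["tfs.txt"],
        "DFAM": ["domain_families_1grams.txt", "domain_families_2grams.txt"]
    },
    "hashTerms": {
        "TF": [],
        "DFAM": []
    }
}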
nlp-preprocessing-pipeline.sh
0 → 100644
#!/bin/sh
echo 'Preprocessing files...'
ORIGINAL_CORPUS_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
CORPUS_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
TERM_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/dictionaries

PRE=TRUE
echo " Preprocessing: $PRE"
POS=TRUE
echo " POS Tagging: $POS"
LEMMA=TRUE
echo " Lemmatization: $LEMMA"
TERM=TRUE
echo " Terminological tagging: $TERM"
TRANS=TRUE
echo " Transformation: $TRANS"

if [ "$PRE" = "TRUE" ]; then
  echo "Preprocessing..."
  INPUT_PATH=$ORIGINAL_CORPUS_PATH
  OUTPUT_PATH=$CORPUS_PATH/preprocessed
  python3.4 preprocessingTermDetection.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termDetection --termPath $TERM_PATH --termFiles termFilesLength_LREGULONDB.json > outputPreprocessing_lregulondb.txt
  # python3.4 preprocessingTermDetection.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH > outputPreprocessing_lregulondb.txt
fi

if [ "$POS" = "TRUE" ]; then
  echo "POS Tagging..."
  INPUT_PATH=$CORPUS_PATH/preprocessed
  OUTPUT_PATH=$CORPUS_PATH/pos
  python3.4 posTaggingStanford.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --taggerPath /home/cmendezc/STANFORD_POSTAGGER/stanford-postagger-2015-12-09 --biolemmatizer > outputPOST_lregulondb.txt
fi

if [ "$LEMMA" = "TRUE" ]; then
  echo "Lemmatization..."
  INPUT_PATH=$CORPUS_PATH/pos
  OUTPUT_PATH=$CORPUS_PATH/lemma
  python3.4 biolemmatizing.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --biolemmatizerPath /home/cmendezc/BIO_LEMMATIZER > outputLemma_lregulondb.txt
fi

if [ "$TERM" = "TRUE" ]; then
  echo "Terminological tagging..."
  INPUT_PATH=$CORPUS_PATH/lemma
  OUTPUT_PATH=$CORPUS_PATH/term
  python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag_LREGULONDB.json > outputTerm_lregulondb.txt
fi

if [ "$TRANS" = "TRUE" ]; then
  echo "Transformation..."
  INPUT_PATH=$CORPUS_PATH/term
  OUTPUT_PATH=$CORPUS_PATH/transformed
  python3.4 transforming.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation_lregulondb.txt
fi
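Note: each stage reads the previous stage's output directory under $CORPUS_PATH, so the data flows original -> preprocessed -> pos -> lemma -> term -> transformed (directory names as set in the script). A stage can be skipped by setting its flag to FALSE, e.g. TRANS=FALSE.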
posTaggingStanford.py
0 → 100644
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import platform
import sys
from time import time
from subprocess import call

__author__ = 'CMendezC'

# Objective: part-of-speech tagging of several files with the Stanford POS Tagger.

# Parameters:
# 1) --inputPath Path to read TXT files.
# 2) --outputPath Path to place POST files.
# 3) --taggerPath Path to the POS Tagger command.
# 4) --biolemmatizer Format output for BioLemmatizer?

# Output:
# 1) POS-tagged files.
# 2) If --biolemmatizer, with format:
#      Rob NNP
#      is VBZ
#      a DT
#      transcriptional JJ
#      dual JJ
#      regulator NN
#      . .
#
#      Its PRP$
#      N-terminal JJ ...

# Execution:
# GntR
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

# FhlA
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

# MarA
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

# ArgR
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

# CytR
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

# Rob
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

# EXTRACTING REGULATORY INTERACTIONS
# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read TXT files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place POST files", metavar="PATH")
    parser.add_option("-a", "--taggerPath", dest="taggerPath", default="",
                      help="Path to the Stanford POS Tagger files", metavar="PATH")
    parser.add_option("-p", "--biolemmatizer", default=False,
                      action="store_true", dest="biolemmatizer",
                      help="Format for BioLemmatizer?")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Path to POS Tagger command: " + str(options.taggerPath))
    print("Format for BioLemmatizer?: " + str(options.biolemmatizer))

    filesTagged = 0
    t0 = time()
    print("Tagging corpus...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Tagging file..." + str(file))
            try:
                # FREELING: taggerPath = os.path.join(options.taggerPath, "analyzer.ex")
                # FREELING: command = taggerPath + " -f " + os.path.join("%FREELINGSHARE%", "config", "en.cfg") + " <" + os.path.join(path, file) + "> " + os.path.join(options.outputPath, file) + ".post.txt"

                # stanford-postagger models\english-left3words-distsim.tagger
                # C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TFsummaries_tagged_SGC_aspectRP-DOM\ECK120011190.Rob.sum.txt
                # >
                # C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectsOfInterest_TrainingSet\testingTaggers\ECK120011190.Rob.sum.txt

                plat = platform.system()
                if plat == 'Linux':
                    # FOR LINUX (':' classpath separator):
                    # java -mx300m -cp 'stanford-postagger.jar:lib/*' edu.stanford.nlp.tagger.maxent.MaxentTagger
                    #   -model $1 -textFile $2
                    command = "java -mx300m -cp " + os.path.join(options.taggerPath, 'stanford-postagger.jar:') + \
                              os.path.join(options.taggerPath, 'lib/*') + \
                              ' edu.stanford.nlp.tagger.maxent.MaxentTagger -model ' + \
                              os.path.join(options.taggerPath, 'models', 'english-left3words-distsim.tagger') + \
                              ' -textFile ' + os.path.join(options.inputPath, file) + \
                              ' > ' + os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt'))
                else:
                    # FOR WINDOWS (';' classpath separator), e.g.:
                    # C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\preprocessingCorpus>java -mx300m
                    #   -cp "C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\stanford-postagger.jar;
                    #   C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\lib/*"
                    #   edu.stanford.nlp.tagger.maxent.MaxentTagger -model
                    #   C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\models\english-left3words-distsim.tagger
                    #   -textFile C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\preprocessed\ECK120011190.Rob.sum.pre.txt
                    # taggerPath = os.path.join('java')
                    command = "java -mx300m -cp " + os.path.join(options.taggerPath, 'stanford-postagger.jar;') + \
                              os.path.join(options.taggerPath, 'lib/*') + \
                              ' edu.stanford.nlp.tagger.maxent.MaxentTagger -model ' + \
                              os.path.join(options.taggerPath, 'models', 'english-left3words-distsim.tagger') + \
                              ' -textFile ' + os.path.join(options.inputPath, file) + \
                              ' > ' + os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt'))
                # print(command)

                retcode = call(command, shell=True)
                if retcode < 0:
                    print(" Child was terminated by signal", -retcode, file=sys.stderr)
                else:
                    print(" Child returned", retcode, file=sys.stderr)
                    filesTagged += 1
            except OSError as e:
                print(" Execution failed:", e, file=sys.stderr)

            text = ""
            if options.biolemmatizer:
                with open(os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')), "r", encoding="utf-8", errors="replace") as iFile:
                    text = iFile.read()
                # -LRB-_-LRB- PTS_NN -RRB-_-RRB-
                # for_IN Mlc_NN inactivation_NN ._.
                text = text.replace('-LRB-', '(')
                text = text.replace('-RRB-', ')')

                text = text.replace('-LSB-', '[')
                text = text.replace('-RSB-', ']')

                text = text.replace('_', '\t')
                text = text.replace(' ', '\n')
                text = text.replace('.\n', '.\n\n')
                with open(os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')), "w", encoding="utf-8", errors="replace") as oFile:
                    oFile.write(text)

    # Print processed files
    print()
    print("Files POS tagged: " + str(filesTagged))
    print("Files POS tagged in: %fs" % (time() - t0))
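To make the --biolemmatizer post-processing above concrete, here is a minimal standalone sketch of the same replace chain applied to one tagged string (the sample sentence is illustrative):

# Minimal sketch of the BioLemmatizer reformatting (sample input is illustrative).
text = "Mlc_NN binds_VBZ -LRB-_-LRB- PTS_NN -RRB-_-RRB- ._. It_PRP works_VBZ ._."
text = text.replace('-LRB-', '(').replace('-RRB-', ')')  # restore round brackets
text = text.replace('-LSB-', '[').replace('-RSB-', ']')  # restore square brackets
text = text.replace('_', '\t')       # word_TAG -> word<TAB>TAG
text = text.replace(' ', '\n')       # one token per line
text = text.replace('.\n', '.\n\n')  # blank line between sentences
print(text)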
preprocessingTermDetection.py
0 → 100644
# -*- coding: UTF-8 -*-
import json
import re
from optparse import OptionParser
import os
import sys
from time import time

import nltk

__author__ = 'CMendezC'


# Objective: preprocessing paper files:
# Eliminate lines beginning with:
#   Copyright © 1997
#   © 1997 Elsevier
#   Copyright © 1998,
#   Keywords: GntR; cAMP-CRP; GntP family
#   Received 21 October 1996/Accepted 27 December 1996
#   Received 6 January 1997; accepted 5 June 1997; Received by A. Nakazawa
#   (Received 29 June 1998/Accepted 3 August 1998)
#   REFERENCES: Eisenberg, R.C., Dobrogosz, W.J., 1967 | Hung, A., Orozco, A., Zwaig, N., 1970.
#   Shine, J. & Dalgarno, L. (1974).
#   34. Saier, M. H., T. M. Ramseier, and J. Reizer. 1996.
#   * Corresponding author. Mailing address: Department of Microbiology,
#   Phone: (614) 688-3518.
#   Fax: (614) 688-3519.
#   E-mail: conway.51@osu.edu.
#   Downloaded from
# Select lines until ACKNOWLEDGMENTS or REFERENCES or Acknowledgements or References
# Biological term detection

# Parameters:
# 1) --inputPath Path to read TXT files.
# 2) --outputPath Path to place preprocessed files.
# 3) --termPath Path to read term lists.
# 4) --termFiles JSON file with term files and length.
# 5) --termDetection Perform term detection?
# 6) --multiDocument Processing multidocuments within input file?
# 7) --tabFormat File with format PMID\tNUMSENT\tSENT\tCLASS?
# 8) --joinPunctuation Join separated punctuation (it comes separated from ODIN-XML files).

# Output:
# 1) Preprocessed files with biological term detection.

# Execution:
# GntR
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json

# FhlA
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json

# MarA
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json

# ArgR
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json

# CytR
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json

# Rob
# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json

# EXTRACTING REGULATORY INTERACTIONS
# python preprocessingTermDetection.py
#   --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\original
#   --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\preprocessed
#   --termPath C:\Users\cmendezc\Documents\GENOMICAS\preprocessingTermTagging_v1.0\termLists
#   --termFiles termFilesLength.json

# def addEndPeriod(cad):
#     if cad.endswith('.'):
#         return cad
#     else:
#         return cad + '.'


###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path of term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with term files and length", metavar="PATH")
    parser.add_option("--termDetection", default=False,
                      action="store_true", dest="termDetection",
                      help="Perform term detection?")
    parser.add_option("--multiDocument", default=False,
                      action="store_true", dest="multiDocument",
                      help="Processing multidocuments within input file?")
    parser.add_option("--tabFormat", default=False,
                      action="store_true", dest="tabFormat",
                      help="File with format PMID\tNUMSENT\tSENT\tCLASS?")
    parser.add_option("--joinPunctuation", default=False,
                      action="store_true", dest="joinPunctuation",
                      help="Join separated punctuation?")
    parser.add_option("--termLower", default=False,
                      action="store_true", dest="termLower",
                      help="Compare with terms in lower case?")
    parser.add_option("--termCapitalize", default=False,
                      action="store_true", dest="termCapitalize",
                      help="Compare with capitalized terms?")

    (options, args) = parser.parse_args()

    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Perform term detection?: " + str(options.termDetection))
    if options.termDetection:
        print("Path to read terminological resources: " + str(options.termPath))
        print("JSON file with term files and length: " + str(options.termFiles))
    print("Processing multidocuments within input file?: " + str(options.multiDocument))
    print("File with format PMID\tNUMSENT\tSENT\tCLASS?: " + str(options.tabFormat))
    print("Join separated punctuation?: " + str(options.joinPunctuation))

    # #### REGEX DEFINITION FOR UNNECESSARY LINES #####
    regexEmptyLine = re.compile('^\s*$')
    # Copyright © 1997
    # © 1997 Elsevier
    # Copyright © 1998,
    # Keywords: GntR; cAMP-CRP; GntP family
    # Received 21 October 1996/Accepted 27 December 1996
    # Received 6 January 1997; accepted 5 June 1997; Received by A. Nakazawa
    # (Received 29 June 1998/Accepted 3 August 1998)
    # * Corresponding author. Mailing address: Department of Microbiology,
    # Phone: (614) 688-3518.
    # Fax: (614) 688-3519.
    # E-mail: conway.51@osu.edu.
    # Downloaded from
    # www.sciencedirect.com Current Opinion in Microbiology 2008, 11:87–9388 Cell regulation
    # DOI 10.1016 / j.mib .2008.02.007
    # Correspondence to J

    # journal homepage: www.elsevier.com/locate/biotechadv
    # Research review paper
    # Article history:
    # Accepted 18 April 2014
    # Available online 26 April 2014
    # Abbreviations : ROS ,
    # JOURNAL OF
    # 0021-9193/02

    # Mailing address : CSIC - Estación Experimental del Zaidín , Apdo .
    # Correos 419 , E - 18008 Granada , Spain .
    # Phone : 34 - 58 - 121011 .
    # Fax : 34 - 58 - 129600 .
    # Present address : Department of Biology , Imperial College of Science ,

    expression = '^(Copyright|© [0-9][0-9][0-9][0-9]|Keywords:|\(?Received [0-9]?[0-9]|\*?\s?Corresponding author|' + \
                 'Phone:|Fax:|E-mail:|Phone\s:|Fax\s:|E-mail\s:|Mailing\saddress\s:|Present\saddress\s:|' + \
                 'Downloaded\sfrom|DOI|www\.sciencedirect\.com|Correspondence to [A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]|' + \
                 'journal homepage:|Research review paper|Article history:|\(?Accepted [0-9]?[0-9]|' + \
                 'Available online|Abbreviations:|ACKNOWLEDGMENTS\s|REFERENCES\s|' + \
                 'All rights reserved|Published by Elsevier|' + \
                 'Verbatim copying and redistribution of this article|J Bacteriol [0-9][0-9][0-9][0-9]|' + \
                 'Mol Microbiol [0-9][0-9][0-9][0-9]|Nucleic Acids Res [0-9][0-9][0-9][0-9]|' + \
                 'JOURNAL OF|[0-9][0-9][0-9][0-9]\-[0-9][0-9][0-9]/[0-9][0-9]|[0-9][0-9][0-9] – [0-9][0-9][0-9] Vol)'
    regexUnnecessaryLines = re.compile(expression)
    # regexUnnecessaryLines = re.compile('^(Copyright)')
    # REFERENCES: Eisenberg, R.C., Dobrogosz, W.J., 1967
    # Hung, A., Orozco, A., Zwaig, N., 1970.
    # Shine, J. & Dalgarno, L. (1974).
    # 34. Saier, M. H., T. M. Ramseier, and J. Reizer. 1996.
    # 1. Pesavento, C. & Hengge, R. Bacterial nucleotide-based
    # Battesti , N .
    # Aiba , H . , T .
    # Yamamoto , and M .
    # regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+.*([0-9][0-9][0-9][0-9])')
    # regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+')
    regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+($|.*\(\s?[0-9][0-9][0-9][0-9]\s?\))')
    # Lines without words, with only symbols:
    # --.-,.;....a...........c....
    # .........
    # 2.;
    # ..~......: ........................
    # ::..:.< -.;-.:.;L.:.5 %..-.-...;..;..,:
    # ?........., .....,: ........,,::, , ...
    # ..
    # .J
    # L,.
    # 2
    # i
    # regexLinesNoText = re.compile('^[^a-zA-Z0-9]')

    # regexUnderscoreWord = re.compile(r'\b_\b')

    # 40 or more dots, which appear in index lines
    regexIndexLine = re.compile('\.{40}')

    # e-mails
    regexEmail = re.compile(
        '(e-mail : |e-mail: |e-mail )?([a-zA-Z0-9\._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+ |[a-zA-Z0-9\._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+ )')

    ### Detect a CONTENTS section and remove everything up to INTRODUCTION (?): Overview of oxidative stress response ... ... 28 2 .
    ### If the heading is INTRODUCTION, ACKNOWLEDGMENTS, or another title, add a period to it, or remove it when no other word follows on the line.
    # Sometimes 'Summary' is used instead.

    # Join separated punctuation
    if options.joinPunctuation:
        # 1) Join to the right: (, [, “, ‘, ±, ~
        regexPuncRight = re.compile('(?P<punct>[\(\[“‘±~])\s')
        # 2) Join to the left: ), ], ., ,, ”, ´, ;, %, :, ’, '
        regexPuncLeft = re.compile('\s(?P<punct>[\)\]\.,”´;%:’\'])')
        # 3) Join on both sides: -, /, –, —
        regexPuncBoth = re.compile('\s(?P<punct>[-/–—])\s')
        # 4) Genitive: ArgP ’ s
        regexPuncGenitive = re.compile('(?P<before>[a-zA-Z])\s’\ss\s')
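        # Illustration of the four joining rules above (the example string is invented):
        #   'ArgP ’ s binding ( site ) , repressed' -> 'ArgP’s binding (site), repressed'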

    # #### LOADING BIOLOGICAL TERM FILES #####
    if options.termDetection:
        with open(os.path.join(options.termPath, options.termFiles)) as data_file:
            hashes = json.load(data_file)

        hashTermFiles = hashes["hashTermFiles"]
        hashTerms = hashes["hashTerms"]

        for key in hashTermFiles.keys():
            for f in hashTermFiles[key]:
                with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\n')
                        if line not in hashTerms[key]:
                            hashTerms[key].append(line)
                            if options.termLower:
                                hashTerms[key].append(line.lower())
                            if options.termCapitalize:
                                hashTerms[key].append(line.capitalize())
            print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))

    filesProcessed = 0
    t0 = time()
    print("Preprocessing files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Preprocessing file..." + str(file))
            text = ''
            listSentences = []
            references = 0
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write
                # with open(os.path.join(options.outputPath, file.replace('.txt', '.pre.txt')), "w", encoding="utf-8") as oFile:
                for line in iFile:
                    originalLine = line.strip('\n')
                    if options.joinPunctuation:
                        # Keep a trailing space after the genitive so the next word stays separated
                        originalLine = regexPuncGenitive.sub(r'\g<before>’s ', originalLine)
                        originalLine = regexPuncRight.sub(r'\g<punct>', originalLine)
                        originalLine = regexPuncLeft.sub(r'\g<punct>', originalLine)
                        originalLine = regexPuncBoth.sub(r'\g<punct>', originalLine)
                    if options.tabFormat:
                        listLine = originalLine.split('\t')
                        line = listLine[2]
                    ### Detect ACKNOWLEDGMENTS and discard everything after it
                    # This removed useful parts of papers: if line.upper().startswith('ACKNOWLEDGMENT') or line.upper().startswith('REFERENCES') or references > 2:

                    # Do not remove references, because regulatory interactions (RIs) occur within them
                    # if not options.multiDocument:
                    #     if line.upper() == 'ACKNOWLEDGMENTS' or line.upper() == 'REFERENCES' or references > 2:
                    #         break
                    if not options.multiDocument:
                        if line.upper() == 'ACKNOWLEDGMENTS':
                            break
                    # if line == '' or line == None:
                    if regexEmptyLine.match(line) != None:
                        print('Empty line ' + line)
                        continue
                    # Do not remove references, because within them there are RIs
                    # if regexReferences.match(line) != None:
                    #     print('Reference line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                    #     references += 1
                    #     continue
                    # if regexUnnecessaryLines.match(line) != None:
                    if regexUnnecessaryLines.search(line) != None:
                        print('Unnecessary line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                        continue
                    if regexIndexLine.search(line) != None:
                        print('Index line ' + line)
                        continue
                    if regexEmail.search(line) != None:
                        print('Line with email: ' + line)
                        line = regexEmail.sub(' ', line)
                        # print(line)

                    text += originalLine + '\n'

            if options.termDetection:
                # #### BIOLOGICAL TERM DETECTION #####
                print(' Detecting biological terms...')
                for key in sorted(hashTerms.keys(), reverse=True):
                    # print(' length: ' + str(key))
                    for term in hashTerms[key]:
                        # print(str(term.encode(encoding='UTF-8', errors='replace')))
                        # Join multiword terms with hyphens so they survive tokenization
                        text = text.replace(term, term.replace(' ', '-'))
                        # regexTerm = re.compile(r'' + term)
                        # regexTerm.sub(term.replace(' ', '_TERM_'), text)

            filesProcessed += 1
            with open(os.path.join(options.outputPath, file.replace(' ', '').replace('.txt', '.pre.txt')), "w", encoding="utf-8") as oFile:
                oFile.write(text)

    # Print processed files
    print()
    print("Files preprocessed: " + str(filesProcessed))
    print("In: %fs" % (time() - t0))
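The term-detection step above only hyphenates known multiword terms in place, so they survive later tokenization as single tokens. A minimal standalone sketch of that joining (the term entries are invented examples):

# Minimal sketch of multiword-term joining (term entries are invented examples).
text = "The GntP family transporter acts at the promoter region."
for term in ["GntP family", "promoter region"]:
    text = text.replace(term, term.replace(' ', '-'))
print(text)  # -> The GntP-family transporter acts at the promoter-region.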
transforming.py
0 → 100644
# -*- coding: UTF-8 -*-
import re
from optparse import OptionParser
import os
import sys
from time import time

__author__ = 'CMendezC'

# Objective: transforming BioLemmatized files:
# 1) Transformed files
# 2) Text files to extract aspects

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --textPath Path to place text files.
# 4) --minWordsInLine Minimum sentence length in number of words.
# 5) --classes Classes that mark the end of a sentence when the line contains: PMID\tNUMSENT\tSENT\tCLASS

# Output:
# 1) Transformed files
# 2) Text files

# Execution:
# GntR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5

# FhlA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5

# MarA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5

# ArgR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5

# CytR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5

# Rob
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5

# EXTRACTING REGULATORY INTERACTIONS
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5


def length(listWords):
    # Count alphabetic tokens and total characters in a list of word|lemma|pos tokens
    regexWord = re.compile('[a-zA-Z]')
    words = 0
    chars = 0
    for word in listWords:
        listTemp = word.split('|')
        if regexWord.search(listTemp[1]) is not None:
            words += 1
        chars += len(listTemp[0])
    return words, chars

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
                      help="Minimum sentence length in number of words", metavar="NUM")
    parser.add_option("--classes", dest="classes",
                      help="Classes that mark the end of a sentence when the line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")

    (options, args) = parser.parse_args()

    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Minimum sentence length in number of words: " + str(options.minWordsInLine))
    print("Classes that mark the end of a sentence: " + str(options.classes))

    # We realized that POS tags from BioLemmatizer are very specific, so we decided to use the Stanford tags
    bioPOST = False
    filesProcessed = 0
    # minWordsInLine = 3
    if options.classes is not None:
        listClasses = options.classes.split(',')
    t0 = time()
    print("Transforming files...")
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Transforming file..." + str(file))
            # TrpR NN TrpR NN PennPOS
            # , , , , NUPOS
            # tryptophan NN tryptophan NN PennPOS
            listLine1 = []
            listLine2 = []
            text = ''
            lemma = ''
            pos = ''
            textTransformed = ''
            textText = ''
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write
                with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
                    for line in iFile:
                        if line == '\n':
                            if options.classes is None:
                                wordsInLine, charsInLine = length(textTransformed.split())
                                if wordsInLine > options.minWordsInLine and charsInLine <= 1000:
                                    transformedFile.write(textTransformed + '\n')
                                textTransformed = ''
                                textText = ''
                            else:
                                continue
                        else:
                            line = line.strip('\n')
                            # print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            listLine1 = line.split('\t')
                            if len(listLine1) != 3:
                                continue
                            text = listLine1[0]
                            # Replacing a strange space character
                            text = text.replace(' ', '-')
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            # Replacing a strange space character
                            lemma = lemma.replace(' ', '-')
                            if bioPOST:
                                pos = listLine2[1]
                                # print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
                            else:
                                pos = listLine1[1]
                            textText = textText + text + ' '
                            textTransformed = textTransformed + text + '|' + lemma + '|' + pos + ' '
                            # RI+GC NN RI+GC NN PennPOS
                            if options.classes is not None:
                                if text in listClasses:
                                    # if length(textTransformed.split()) > options.minWordsInLine:
                                    wordsInLine, charsInLine = length(textTransformed.split())
                                    if wordsInLine > options.minWordsInLine and charsInLine <= 1000:
                                        transformedFile.write(textTransformed + '\n')
                                        # print(textTransformed)
                                    textTransformed = ''
                                    textText = ''
            filesProcessed += 1

    # Print processed files
    print()
    print("Files processed: " + str(filesProcessed))
    print("In: %fs" % (time() - t0))
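For concreteness, a minimal sketch of the per-token transformation implemented above (the sample line is illustrative):

# Sketch of the per-token transformation (sample line is illustrative).
line = "GntR\tNN\tGntR NN PennPOS"      # WORD<TAB>POS<TAB>LEMMA POS TAGSET
word, pos, rest = line.split('\t')
lemma = rest.split(' ')[0]
print(word + '|' + lemma + '|' + pos)   # -> GntR|GntR|NN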