Carlos-Francisco Méndez-Cruz

Setting up project

1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="VcsDirectoryMappings">
4 + <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 + </component>
6 +</project>
\ No newline at end of file
1 +# -*- coding: UTF-8 -*-
2 +import json
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +from nltk.corpus import words
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: Tagging biological terms from lists of terms related to aspects of interest:
12 +# 1) Changing POS tag by term tag
13 +
14 +# Parameters:
15 +# 1) --inputPath Path to read input files.
16 +# 2) --outputPath Path to place output files.
17 +# 3) --termPath Path to read term lists
18 +# 4) --termFiles JSON file with terms files and tags
19 +# 5) --crf Keep the POS tag instead of replacing it with the term or freq tag
20 +
21 +# Output:
22 +# 1) Files with biological terms tagged
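#
# Illustrative line formats (assumed, based on the tab-separated lemma files this
# script reads: WORD<TAB>POS<TAB>LEMMA plus extra space-separated fields):
#   input : GntR<TAB>NN<TAB>GntR NN PennPOS
#   output: GntR<TAB>TF<TAB>GntR TF TermTag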
23 +
24 +# Execution:
25 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
26 +
27 +# FhlA
28 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
29 +
30 +# MarA
31 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
32 +
33 +# ArgR
34 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
35 +
36 +# CytR
37 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
38 +
39 +# Rob
40 +# python biologicalTermTagging.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
41 +
42 +###########################################################
43 +# MAIN PROGRAM #
44 +###########################################################
45 +
46 +if __name__ == "__main__":
47 + # Parameter definition
48 + parser = OptionParser()
49 + parser.add_option("--inputPath", dest="inputPath",
50 + help="Path to read input files", metavar="PATH")
51 + parser.add_option("--outputPath", dest="outputPath",
52 + help="Path to place transformed files", metavar="PATH")
53 + parser.add_option("--termPath", dest="termPath",
54 + help="Path to read term files", metavar="PATH")
55 + parser.add_option("--termFiles", dest="termFiles",
56 + help="JSON file with terms files and tags", metavar="FILE")
57 + parser.add_option("--crf", default=False,
58 + action="store_true", dest="crf",
59 +                      help="Keep the POS tag instead of replacing it with the term or freq tag?")
60 + parser.add_option("--termLower", default=False,
61 + action="store_true", dest="termLower",
62 + help="Compare with terms in lower case?")
63 + parser.add_option("--termCapitalize", default=False,
64 + action="store_true", dest="termCapitalize",
65 +                      help="Compare with capitalized terms?")
66 +
67 + (options, args) = parser.parse_args()
68 +
69 + if len(args) > 0:
70 +        parser.error("No positional arguments expected.")
71 + sys.exit(1)
72 +
73 + # Printing parameter values
74 + print('-------------------------------- PARAMETERS --------------------------------')
75 + print("Path to read input files: " + str(options.inputPath))
76 + print("Path to place transformed files: " + str(options.outputPath))
77 + print("Path to read term files: " + str(options.termPath))
78 +    print("Keep the POS tag instead of replacing it with the term or freq tag? " + str(options.crf))
79 + print("Compare with terms in lower case? " + str(options.termLower))
80 +    print("Compare with capitalized terms? " + str(options.termCapitalize))
81 +
82 + ##### LOADING BIOLOGICAL TERM FILES #####
83 + # hashTermFiles = {
84 + # 'DFAM': ['domain_families_1grams.txt', 'domain_families_2grams.txt', 'domain_families_3grams.txt', 'domain_families_4grams.txt', 'domain_families_5Moregrams.txt'],
85 + # 'MF': ['domain_function_1grams.txt', 'domain_function_2grams.txt', 'domain_function_3grams.txt', 'domain_function_4grams.txt' , 'domain_function_5Moregrams.txt'],
86 + # 'RP': ['regulatory_Processes_GO_1grams.txt', 'regulatory_Processes_GO_2grams.txt', 'regulatory_Processes_GO_3grams.txt', 'regulatory_Processes_GO_4grams.txt', 'regulatory_Processes_GO_5Moregrams.txt'],
87 + # 'DPOS': ['domain_position_1grams.txt', 'domain_position_2grams.txt', 'domain_position_5Moregrams.txt'],
88 + # 'DMOT': ['domain_structural_motif_1grams.txt', 'domain_structural_motif_2grams.txt'],
89 + # 'TF': ['tfs.txt']
90 + # }
91 +
92 + # hashTerms = {
93 + # 'DFAM': [],
94 + # 'MF': [],
95 + # 'RP': [],
96 + # 'DPOS': [],
97 + # 'DMOT': [],
98 + # 'TF': []
99 + # }
100 +
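    # A minimal sketch (assumed) of the JSON file given by --termFiles; only the two
    # top-level keys read below are required, mapping each term tag to its term-list
    # files and to an (initially empty) list of loaded terms:
    # {
    #   "hashTermFiles": {"TF": ["tfs.txt"], "GENE": ["gene_list.txt"]},
    #   "hashTerms": {"TF": [], "GENE": []}
    # }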
101 + print('Loading biological term files...')
102 + with open(os.path.join(options.termPath, options.termFiles)) as data_file:
103 + lists = json.load(data_file)
104 +
105 + hashTermFiles = lists["hashTermFiles"]
106 + hashTerms = lists["hashTerms"]
107 +
108 + for key in hashTermFiles.keys():
109 + for f in hashTermFiles[key]:
110 + # print('File: ' + f)
111 + with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
112 + for line in iFile:
113 + line = line.strip('\n')
114 + line = line.replace(' ', '-')
115 + if line not in hashTerms[key]:
116 + hashTerms[key].append(line)
117 + if options.termLower:
118 + hashTerms[key].append(line.lower())
119 + if options.termCapitalize:
120 + hashTerms[key].append(line.capitalize())
121 + print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
122 +
123 + regularWords = words.words('en')
124 + print()
125 +
126 + filesPreprocessed = 0
127 + t0 = time()
128 + print("Biological term tagging files...")
129 + # Walk directory to read files
130 + for path, dirs, files in os.walk(options.inputPath):
131 + # For each file in dir
132 + for file in files:
133 + print(" Biological term tagging file..." + str(file))
134 + with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
135 + # Create output file to write
136 + with open(os.path.join(options.outputPath, file.replace('lem.txt', 'term.txt')), "w", encoding="utf-8") as oFile:
137 + for line in iFile:
138 + if line == '\n':
139 + oFile.write(line)
140 + else:
141 + line = line.strip('\n')
142 + listLine1 = line.split('\t')
143 + if len(listLine1) < 3:
144 + continue
145 + word = listLine1[0]
146 + pos = listLine1[1]
147 + listLine2 = listLine1[2].split(' ')
148 + lemma = listLine2[0]
149 + if len(word) > 1:
150 + for termTag in hashTerms:
151 + if termTag == "TF":
152 + for term in hashTerms[termTag]:
153 + if (word == term) or (word.startswith(term) and lemma not in regularWords):
154 +                                        print("    TAG WORD {} AS TF BECAUSE IT EQUALS OR STARTS WITH TF {}".format(word, term))
155 + if listLine1[1].startswith("NN"):
156 + # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
157 + line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
158 + elif termTag == "EFFECT":
159 + if word.lower() in hashTerms[termTag]:
160 + line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
161 + elif termTag == "DIS":
162 + for term in hashTerms[termTag]:
163 + if lemma.startswith(term) and (pos not in ["CC", "DT", "FW", "CD", "IN", "PRP$", "JJ", "JJR", "JJS", "VBN", "RB"]):
164 + line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
165 + else:
166 + if word in hashTerms[termTag]:
167 + # listLine2 = listLine1[2].split(' ')
168 + if termTag in ["GENE", "TU"]:
169 + if listLine1[1].startswith("NN"):
170 + # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
171 + line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
172 + elif termTag in ["GC"]:
173 + if pos not in ["CC", "DT", "FW", "CD", "IN", "PRP$", "NNP"]:
174 + # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
175 + line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
176 + else:
177 + if termTag in ['FWDOM', 'FWRP']:
178 + # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' FreqTag'
179 + line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' FreqTag'
180 + else:
181 + # line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
182 + # line = listLine1[0] + '\t' + termTag + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
183 + line = word + '\t' + termTag + '\t' + lemma + ' ' + termTag + ' TermTag'
184 + oFile.write(line + '\n')
185 + filesPreprocessed += 1
186 +
187 +    # Print processed files
188 + print()
189 + print("Files preprocessed: " + str(filesPreprocessed))
190 + print("In: %fs" % (time() - t0))
1 +#!/bin/sh
2 +echo 'Preprocessing files...'
3 +ORIGINAL_CORPUS_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
4 +CORPUS_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
5 +TERM_PATH=/export/space1/users/compu2/bionlp/conditional-random-fields/dictionaries
6 +
7 +PRE=TRUE
8 +echo " Preprocessing: $PRE"
9 +POS=TRUE
10 +echo " POS Tagging: $POS"
11 +LEMMA=TRUE
12 +echo " Lemmatization: $LEMMA"
13 +TERM=TRUE
14 +echo " Terminological tagging: $TERM"
15 +TRANS=TRUE
16 +echo " Transformation: $TRANS"
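# Each stage below runs only when its flag is TRUE; set a flag to FALSE
# (e.g. PRE=FALSE) to skip that stage on a re-run.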
17 +
18 +if [ "$PRE" = "TRUE" ]; then
19 +echo "Preprocessing..."
20 +INPUT_PATH=$ORIGINAL_CORPUS_PATH
21 +OUTPUT_PATH=$CORPUS_PATH/preprocessed
22 +python3.4 preprocessingTermDetection.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termDetection --termPath $TERM_PATH --termFiles termFilesLength_LREGULONDB.json > outputPreprocessing_lregulondb.txt
23 +# python3.4 preprocessingTermDetection.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH > outputPreprocessing_lregulondb.txt
24 +fi
25 +
26 +if [ "$POS" = "TRUE" ]; then
27 +echo "POS Tagging..."
28 +INPUT_PATH=$CORPUS_PATH/preprocessed
29 +OUTPUT_PATH=$CORPUS_PATH/pos
30 +python3.4 posTaggingStanford.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --taggerPath /home/cmendezc/STANFORD_POSTAGGER/stanford-postagger-2015-12-09 --biolemmatizer > outputPOST_lregulondb.txt
31 +fi
32 +
33 +if [ "$LEMMA" = "TRUE" ]; then
34 +echo "Lemmatization..."
35 +INPUT_PATH=$CORPUS_PATH/pos
36 +OUTPUT_PATH=$CORPUS_PATH/lemma
37 +python3.4 biolemmatizing.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --biolemmatizerPath /home/cmendezc/BIO_LEMMATIZER > outputLemma_lregulondb.txt
38 +fi
39 +
40 +if [ "$TERM" = "TRUE" ]; then
41 +echo "Terminological tagging..."
42 +INPUT_PATH=$CORPUS_PATH/lemma
43 +OUTPUT_PATH=$CORPUS_PATH/term
44 +python3.4 biologicalTermTagging.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --termPath $TERM_PATH --termFiles termFilesTag_LREGULONDB.json > outputTerm_lregulondb.txt
45 +fi
46 +
47 +if [ "$TRANS" = "TRUE" ]; then
48 +echo "Transformation..."
49 +INPUT_PATH=$CORPUS_PATH/term
50 +OUTPUT_PATH=$CORPUS_PATH/transformed
51 +python3.4 transforming.py --inputPath $INPUT_PATH --outputPath $OUTPUT_PATH --minWordsInLine 5 > outputTransformation_lregulondb.txt
52 +fi
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +from subprocess import call
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: Part-of-Speech Tagging of several files with Stanford POS Tagger.
12 +
13 +# Parameters:
14 +# 1) --inputPath Path to read TXT files.
15 +# 2) --outputPath Path to place POST files.
16 +# 3) --taggerPath Path POS Tagger command.
17 +# 4) --biolemmatizer Format for biolemmatizer?
18 +
19 +# Output:
20 +# 1) POS Tagged files.
21 +# 2) If --biolemmatizer with format:
22 +# Rob NNP
23 +# is VBZ
24 +# a DT
25 +# transcriptional JJ
26 +# dual JJ
27 +# regulator NN
28 +# . .
29 +#
30 +# Its PRP$
31 +# N-terminal JJ ...
32 +
33 +# Execution:
34 +# GntR
35 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
36 +
37 +# FhlA
38 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
39 +
40 +# MarA
41 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
42 +
43 +# ArgR
44 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
45 +
46 +# CytR
47 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
48 +
49 +# Rob
50 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
51 +
52 +# EXTRACTING REGULATORY INTERACTIONS
53 +# python posTaggingStanford.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\preprocessed --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\post --taggerPath C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09 --biolemmatizer
54 +
55 +###########################################################
56 +# MAIN PROGRAM #
57 +###########################################################
58 +
59 +if __name__ == "__main__":
60 + # Parameter definition
61 + parser = OptionParser()
62 + parser.add_option("-i", "--inputPath", dest="inputPath",
63 + help="Path to read TXT files", metavar="PATH")
64 + parser.add_option("-o", "--outputPath", dest="outputPath",
65 + help="Path to place POST files", metavar="PATH")
66 + parser.add_option("-a", "--taggerPath", dest="taggerPath", default="",
67 +                      help="Path to Stanford POS Tagger directory", metavar="PATH")
68 + parser.add_option("-p", "--biolemmatizer", default=False,
69 + action="store_true", dest="biolemmatizer",
70 + help="Format for biolemmatizer?")
71 +
72 + (options, args) = parser.parse_args()
73 + if len(args) > 0:
74 +        parser.error("No positional arguments expected.")
75 + sys.exit(1)
76 +
77 + # Printing parameter values
78 + print('-------------------------------- PARAMETERS --------------------------------')
79 + print("Path to read input files: " + str(options.inputPath))
80 + print("Path to place output files: " + str(options.outputPath))
81 + print("Path POS Tagger command: " + str(options.taggerPath))
82 + print("Format for biolemmatizer?: " + str(options.biolemmatizer))
83 +
84 + filesTagged = 0
85 + t0 = time()
86 + print("Tagging corpus...")
87 + # Walk directory to read files
88 + for path, dirs, files in os.walk(options.inputPath):
89 + # For each file in dir
90 + for file in files:
91 + print(" Tagging file..." + str(file))
92 + try:
93 + # FREELING: taggerPath = os.path.join(options.taggerPath, "analyzer.ex")
94 + # FREELING: command = taggerPath + " -f " + os.path.join("%FREELINGSHARE%", "config", "en.cfg") + " <" + os.path.join(path, file) + "> " + os.path.join(options.outputPath, file) + ".post.txt"
95 +
96 + # stanford-postagger models\english-left3words-distsim.tagger
97 + # C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TFsummaries_tagged_SGC_aspectRP-DOM\ECK120011190.Rob.sum.txt
98 + # >
99 + # C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectsOfInterest_TrainingSet\testingTaggers\ECK120011190.Rob.sum.txt
100 +
101 + import platform
102 + plat = platform.system()
103 + if plat == 'Linux':
104 + # FOR LINUX
105 + # java -mx300m -cp 'stanford-postagger.jar:lib/*' edu.stanford.nlp.tagger.maxent.MaxentTagger
106 + # -model $1 -textFile $2
107 + command = "java -mx300m -cp " + os.path.join(options.taggerPath, 'stanford-postagger.jar:') + \
108 + os.path.join(options.taggerPath, 'lib/*') + \
109 + ' edu.stanford.nlp.tagger.maxent.MaxentTagger -model ' + \
110 + os.path.join(options.taggerPath, 'models', 'english-left3words-distsim.tagger') + \
111 + ' -textFile ' + os.path.join(options.inputPath, file) + \
112 + ' > ' + os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt'))
113 + else:
114 + # C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\preprocessingCorpus>java -mx300m
115 + # -cp "C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\stanford-postagger.jar;
116 + # C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\lib/*"
117 + # edu.stanford.nlp.tagger.maxent.MaxentTagger -model
118 + # C:\Users\cmendezc\Documents\GENOMICAS\STANFORD_POSTAGGER\stanford-postagger-2015-12-09\models\english-left3words-distsim.tagger
119 + # -textFile C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\preprocessed\ECK120011190.Rob.sum.pre.txt
120 + #taggerPath = os.path.join('java')
121 + command = "java -mx300m -cp " + os.path.join(options.taggerPath, 'stanford-postagger.jar;') + \
122 + os.path.join(options.taggerPath, 'lib/*') + \
123 + ' edu.stanford.nlp.tagger.maxent.MaxentTagger -model ' + \
124 + os.path.join(options.taggerPath, 'models', 'english-left3words-distsim.tagger') + \
125 + ' -textFile ' + os.path.join(options.inputPath, file) + \
126 + ' > ' + os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')) #print(command)
127 +
128 + retcode = call(command, shell=True)
129 + if retcode < 0:
130 + print(" Child was terminated by signal", -retcode, file=sys.stderr)
131 + else:
132 + print(" Child returned", retcode, file=sys.stderr)
133 + filesTagged += 1
134 + except OSError as e:
135 + print(" Execution failed:", e, file=sys.stderr)
136 +
137 + text = ""
138 + if options.biolemmatizer:
139 + with open(os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')), "r", encoding="utf-8", errors="replace") as iFile:
140 + text = iFile.read()
141 + # -LRB-_-LRB- PTS_NN -RRB-_-RRB-
142 + # for_IN Mlc_NN inactivation_NN ._.
143 + text = text.replace('-LRB-', '(')
144 + text = text.replace('-RRB-', ')')
145 +
146 + text = text.replace('-LSB-', '[')
147 + text = text.replace('-RSB-', ']')
148 +
149 + text = text.replace('_', '\t')
150 + text = text.replace(' ', '\n')
151 + text = text.replace('.\n', '.\n\n')
152 + with open(os.path.join(options.outputPath, file.replace('pre.txt', 'pos.txt')), "w", encoding="utf-8", errors="replace") as oFile:
153 + oFile.write(text)
154 +
155 +    # Print processed files
156 + print()
157 + print("Files POS Tagged: " + str(filesTagged))
158 + print("Files POS Tagged in: %fs" % (time() - t0))
1 +# -*- coding: UTF-8 -*-
2 +import json
3 +import re
4 +from optparse import OptionParser
5 +import os
6 +import sys
7 +from time import time
8 +
9 +import nltk
10 +
11 +__author__ = 'CMendezC'
12 +
13 +
14 +# Objective: Preprocessing paper files:
15 +# Eliminate lines beginning with:
16 +#      Copyright © 1997
17 +# © 1997 Elsevier
18 +# Copyright © 1998,
19 +# Keywords: GntR; cAMP-CRP; GntP family
20 +# Received 21 October 1996/Accepted 27 December 1996
21 +# Received 6 January 1997; accepted 5 June 1997; Received by A. Nakazawa
22 +# (Received 29 June 1998/Accepted 3 August 1998)
23 +# REFERENCES: Eisenberg, R.C., Dobrogosz, W.J., 1967 | Hung, A., Orozco, A., Zwaig, N., 1970.
24 +# Shine, J. & Dalgarno, L. (1974).
25 +# 34. Saier, M. H., T. M. Ramseier, and J. Reizer. 1996.
26 +# * Corresponding author. Mailing address: Department of Microbiology,
27 +# Phone: (614) 688-3518.
28 +# Fax: (614) 688-3519.
29 +# E-mail: conway.51@osu.edu.
30 +# Downloaded from
31 +# Selecting lines until ACKNOWLEDGMENTS or REFERENCES or Acknowledgements or References
32 +# Biological term detection
33 +
34 +# Parameters:
35 +# 1) --inputPath Path to read TXT files.
36 +# 2) --outputPath Path to place POST files.
37 +# 3) --termPath Path to read term lists
38 +# 4) --termFiles JSON file with terms files and length
39 +# 5) --termDetection If term detection is performed
40 +# 6) --multiDocument Processing multidocuments within input file?
41 +# 7) --tabFormat File with format PMID\tNUMSENT\tSENT\tCLASS?
42 +# 8) --joinPunctuation Join separated punctuation (it comes separated from ODIN-XML files)
43 +
44 +# Output:
45 +# 1) preprocessed files with biological term detection
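# Note: when --termDetection is given, every multi-word term found in the text is
# joined with hyphens (e.g., an assumed term "gntR promoter" would be rewritten as
# "gntR-promoter"), so later stages can treat it as a single token.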
46 +
47 +# Execution:
48 +# GntR
49 +# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
50 +
51 +# FhlA
52 +# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
53 +
54 +# MarA
55 +# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
56 +
57 +# ArgR
58 +# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
59 +
60 +# CytR
61 +# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
62 +
63 +# Rob
64 +# python preprocessingTermDetection.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\original --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\preprocessed --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesLength.json
65 +
66 +# EXTRACTING REGULATORY INTERACTIONS
67 +# python preprocessingTermDetection.py
68 +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\original
69 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\preprocessed
70 +# --termPath C:\Users\cmendezc\Documents\GENOMICAS\preprocessingTermTagging_v1.0\termLists
71 +# --termFiles termFilesLength.json
72 +
73 +# def addEndPeriod(cad):
74 +# if cad.endswith('.'):
75 +# return cad
76 +# else:
77 +# return cad + '.'
78 +
79 +
80 +###########################################################
81 +# MAIN PROGRAM #
82 +###########################################################
83 +
84 +if __name__ == "__main__":
85 + # Parameter definition
86 + parser = OptionParser()
87 + parser.add_option("--inputPath", dest="inputPath",
88 + help="Path to read input files", metavar="PATH")
89 + parser.add_option("--outputPath", dest="outputPath",
90 + help="Path to place output files", metavar="PATH")
91 + parser.add_option("--termPath", dest="termPath",
92 + help="Path of term files", metavar="PATH")
93 + parser.add_option("--termFiles", dest="termFiles",
94 + help="JSON file with terms files and length", metavar="PATH")
95 + parser.add_option("--termDetection", default=False,
96 + action="store_true", dest="termDetection",
97 + help="Perform term detection?")
98 + parser.add_option("--multiDocument", default=False,
99 + action="store_true", dest="multiDocument",
100 + help="Processing multidocuments within input file?")
101 + parser.add_option("--tabFormat", default=False,
102 + action="store_true", dest="tabFormat",
103 + help="File with format PMID\tNUMSENT\tSENT\tCLASS?")
104 + parser.add_option("--joinPunctuation", default=False,
105 + action="store_true", dest="joinPunctuation",
106 + help="Join separated punctuation?")
107 + parser.add_option("--termLower", default=False,
108 + action="store_true", dest="termLower",
109 + help="Compare with terms in lower case?")
110 + parser.add_option("--termCapitalize", default=False,
111 + action="store_true", dest="termCapitalize",
112 + help="Compare with capitalize terms?")
113 +
114 + (options, args) = parser.parse_args()
115 +
116 + if len(args) > 0:
117 +        parser.error("No positional arguments expected.")
118 + sys.exit(1)
119 +
120 + # Printing parameter values
121 + print('-------------------------------- PARAMETERS --------------------------------')
122 + print("Path to read input files: " + str(options.inputPath))
123 + print("Path to place output files: " + str(options.outputPath))
124 + print("Perform term detection?: " + str(options.termDetection))
125 + if options.termDetection:
126 + print("Path to read terminological resources: " + str(options.termPath))
127 + print("JSON file with terms files and length: " + str(options.termFiles))
128 + print("Processing multidocuments within input file?: " + str(options.multiDocument))
129 + print("File with format PMID\tNUMSENT\tSENT\tCLASS?: " + str(options.tabFormat))
130 + print("Join separated punctuation?: " + str(options.joinPunctuation))
131 +
132 + # #### REGEX DEFINITION FOR UNNECESSARY LINES #####
133 + regexEmptyLine = re.compile('^\s*$')
134 +    # Copyright © 1997
135 + # © 1997 Elsevier
136 + # Copyright © 1998,
137 + # Keywords: GntR; cAMP-CRP; GntP family
138 + # Received 21 October 1996/Accepted 27 December 1996
139 + # Received 6 January 1997; accepted 5 June 1997; Received by A. Nakazawa
140 + # (Received 29 June 1998/Accepted 3 August 1998)
141 + # * Corresponding author. Mailing address: Department of Microbiology,
142 + # Phone: (614) 688-3518.
143 + # Fax: (614) 688-3519.
144 + # E-mail: conway.51@osu.edu.
145 + # Downloaded from
146 + # www.sciencedirect.com Current Opinion in Microbiology 2008, 11:87–93 88 Cell regulation
147 + # DOI 10.1016 / j.mib .2008.02.007
148 + # Correspondence to J
149 +
150 + # journal homepage: www.elsevier.com/locate/biotechadv
151 + # Research review paper
152 + # Article history:
153 + # Accepted 18 April 2014
154 + # Available online 26 April 2014
155 + # Abbreviations : ROS ,
156 + # JOURNAL OF
157 + # 0021-9193/02
158 +
159 + # Mailing address : CSIC - Estación Experimental del Zaidín , Apdo .
160 + # Correos 419 , E - 18008 Granada , Spain .
161 + # Phone : 34 - 58 - 121011 .
162 + # Fax : 34 - 58 - 129600 .
163 + # Present address : Department of Biology , Imperial College of Science ,
164 +
165 + expression = '^(Copyright|© [0-9][0-9][0-9][0-9]|Keywords:|\(?Received [0-9]?[0-9]|\*?\s?Corresponding author|' + \
166 + 'Phone:|Fax:|E-mail:|Phone\s:|Fax\s:|E-mail\s:|Mailing\saddress\s:|Present\saddress\s:|' + \
167 + 'Downloaded\sfrom|DOI|www\.sciencedirect\.com|Correspondence to [A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]|' + \
168 + 'journal homepage:|Research review paper|Article history:|\(?Accepted [0-9]?[0-9]|' + \
169 + 'Available online|Abbreviations:|ACKNOWLEDGMENTS\s|REFERENCES\s|' + \
170 + 'All rights reserved|Published by Elsevier|' + \
171 + 'Verbatim copying and redistribution of this article|J Bacteriol [0-9][0-9][0-9][0-9]|' + \
172 + 'Mol Microbiol [0-9][0-9][0-9][0-9]|Nucleic Acids Res [0-9][0-9][0-9][0-9]|' + \
173 + 'JOURNAL OF|[0-9][0-9][0-9][0-9]\-[0-9][0-9][0-9]/[0-9][0-9]|[0-9][0-9][0-9] – [0-9][0-9][0-9] Vol)'
174 + regexUnnecessaryLines = re.compile(expression)
175 + #regexUnnecessaryLines = re.compile('^(Copyright)')
176 + # REFERENCES: Eisenberg, R.C., Dobrogosz, W.J., 1967
177 + # Hung, A., Orozco, A., Zwaig, N., 1970.
178 + # Shine, J. & Dalgarno, L. (1974).
179 + # 34. Saier, M. H., T. M. Ramseier, and J. Reizer. 1996.
180 + # 1. Pesavento, C. & Hengge, R. Bacterial nucleotide-based
181 + # Battesti , N .
182 + # Aiba , H . , T .
183 + # Yamamoto , and M .
184 + # regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+.*([0-9][0-9][0-9][0-9])')
185 + # regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+')
186 + regexReferences = re.compile('^([0-9]?[0-9]\.\s)?[A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ][a-záéíóúàèìòùüâêîôû\-]+\s?,\s([A-ZÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛ]\s?\.\s?)+($|.*\(\s?[0-9][0-9][0-9][0-9]\s?\))')
187 + # Lines without words, with only symbols
188 + # --.-,.;....a...........c....
189 + # .........
190 + # 2.;
191 + # ..~......: ........................
192 + # ::..:.< -.;-.:.;L.:.5 %..-.-...;..;..,:
193 + # ?........., .....,: ........,,::, , ...
194 + # ..
195 + # .J
196 + # L,.
197 + # 2
198 + # i
199 + # regexLinesNoText = re.compile('^[^a-zA-Z0-9]')
200 +
201 + # regexUnderscoreWord = re.compile(r'\b_\b')
202 +
203 + # 40 o more dots which appear in index lines
204 + regexIndexLine = re.compile('\.{40}')
205 +
206 + # e-mails
207 + regexEmail = re.compile(
208 + '(e-mail : |e-mail: |e-mail )?([a-zA-Z0-9\._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+ |[a-zA-Z0-9\._\-]+@[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+ )')
209 +
210 +    ### DETECT TABLE-OF-CONTENTS LINES AND REMOVE EVERYTHING UP TO INTRODUCTION (?): Overview of oxidative stress response ... ... 28 2 .
211 +    ### IF IT IS INTRODUCTION, ACKNOWLEDGMENTS, OR ANOTHER HEADING, ADD A PERIOD TO IT, OR REMOVE IT IF IT IS AT THE BEGINNING AND NO OTHER WORD FOLLOWS.
212 +    # SOMETIMES Summary IS USED
213 +
214 + # Join separated punctuation
215 + if options.joinPunctuation:
216 + # 1) join to right: (, [, “, ‘, ±, ~
217 + regexPuncRigth = re.compile('(?P<punct>[\(\[“‘±~])\s')
218 + # 2) join to left: ), ], ., ,, ”, ´, ;, %, :, ’, '
219 + regexPuncLeft = re.compile('\s(?P<punct>[\)\]\.,”´;%:’\'])')
220 + # 3) join both sides: -, /, –, —
221 + regexPuncBoth = re.compile('\s(?P<punct>[-/–—])\s')
222 + # 4) genitive: ArgP ’ s
223 + regexPuncGenitive = re.compile('(?P<before>[a-zA-Z])\s’\ss\s')
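        # Illustrative effect of these substitutions on an assumed sample line:
        #   "( the gntT promoter ) , and cAMP - CRP ."  ->  "(the gntT promoter), and cAMP-CRP."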
224 +
225 + # #### LOADING BIOLOGICAL TERM FILES #####
226 + if options.termDetection:
227 + with open(os.path.join(options.termPath, options.termFiles)) as data_file:
228 + hashes = json.load(data_file)
229 +
230 + hashTermFiles = hashes["hashTermFiles"]
231 + hashTerms = hashes["hashTerms"]
232 +
233 + for key in hashTermFiles.keys():
234 + for f in hashTermFiles[key]:
235 + with open(os.path.join(options.termPath, f), "r", encoding="utf-8", errors="replace") as iFile:
236 + for line in iFile:
237 + line = line.strip('\n')
238 + if line not in hashTerms[key]:
239 + hashTerms[key].append(line)
240 + if options.termLower:
241 + hashTerms[key].append(line.lower())
242 + if options.termCapitalize:
243 + hashTerms[key].append(line.capitalize())
244 + print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
245 +
246 + filesProcessed = 0
247 + t0 = time()
248 + print("Preprocessing files...")
249 + # Walk directory to read files
250 + for path, dirs, files in os.walk(options.inputPath):
251 + # For each file in dir
252 + for file in files:
253 + print(" Preprocessing file..." + str(file))
254 + text = ''
255 + listSentences = []
256 + references = 0
257 + with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
258 + # Create output file to write
259 + # with open(os.path.join(options.outputPath, file.replace('.txt', '.pre.txt')), "w", encoding="utf-8") as oFile:
260 + for line in iFile:
261 + originalLine = line.strip('\n')
262 + if options.joinPunctuation:
263 + originalLine = regexPuncGenitive.sub(r'\g<before>’s', originalLine)
264 + originalLine = regexPuncRigth.sub(r'\g<punct>', originalLine)
265 + originalLine = regexPuncLeft.sub(r'\g<punct>', originalLine)
266 + originalLine = regexPuncBoth.sub(r'\g<punct>', originalLine)
267 + if options.tabFormat:
268 + listLine = originalLine.split('\t')
269 + line = listLine[2]
270 +                    ### DETECT ACKNOWLEDGMENTS AND REMOVE EVERYTHING THAT FOLLOWS
271 +                    # This eliminated useful parts of papers: if line.upper().startswith('ACKNOWLEDGMENT') or line.upper().startswith('REFERENCES') or references > 2:
272 +
273 + # Do not eliminate references because within them there are RIs
274 + # if not options.multiDocument:
275 + # if line.upper() == 'ACKNOWLEDGMENTS' or line.upper() == 'REFERENCES' or references > 2:
276 + # break
277 + if not options.multiDocument:
278 + if line.upper() == 'ACKNOWLEDGMENTS':
279 + break
280 + # if line == '' or line == None:
281 + if regexEmptyLine.match(line) != None:
282 + print('Empty line ' + line)
283 + continue
284 + # Do not eliminate references because within them there are RIs
285 + # if regexReferences.match(line) != None:
286 + # print('Reference line ' + str(line.encode(encoding='UTF-8', errors='replace')))
287 + # references += 1
288 + # continue
289 + # if regexUnnecessaryLines.match(line) != None:
290 + if regexUnnecessaryLines.search(line) != None:
291 + print('Unnecessary line ' + str(line.encode(encoding='UTF-8', errors='replace')))
292 + continue
293 + if regexIndexLine.search(line) != None:
294 + print('Index line ' + line)
295 + continue
296 + if regexEmail.search(line) != None:
297 + print('Line with email: ' + line)
298 + line = regexEmail.sub(' ', line)
299 + # print(line)
300 +
301 + text += originalLine + '\n'
302 +
303 + if options.termDetection:
304 + # #### BIOLOGICAL TERM DETECTION #####
305 + print(' Detecting biological terms...')
306 + for key in sorted(hashTerms.keys(), reverse=True):
307 + #print(' length: ' + str(key))
308 + for term in hashTerms[key]:
309 + #print(str(term.encode(encoding='UTF-8', errors='replace')))
310 + text = text.replace(term, term.replace(' ', '-'))
311 + #regexTerm = re.compile(r'' + term)
312 + #regexTerm.sub(term.replace(' ', '_TERM_'), text)
313 +
314 + filesProcessed += 1
315 + with open(os.path.join(options.outputPath, file.replace(' ', '').replace('.txt', '.pre.txt')), "w", encoding="utf-8") as oFile:
316 + oFile.write(text)
317 +
318 +    # Print processed files
319 + print()
320 + print("Files preprocessed: " + str(filesProcessed))
321 + print("In: %fs" % (time() - t0))
1 +# -*- coding: UTF-8 -*-
2 +import re
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +
8 +__author__ = 'CMendezC'
9 +
10 +# Objective: Transforming BIOLemmatized files:
11 +# 1) Transformed files
12 +# 2) Text files to extract aspects
13 +
14 +# Parameters:
15 +# 1) --inputPath Path to read input files.
16 +# 2) --outputPath Path to place output files.
17 +# 3) --textPath Path to place text files.
18 +# 4) --minWordsInLine Minimum length sentence in number of words
19 +# 5) --classes Classes to indicate end of sentence when line contains: PMID\tNUMSENT\tSENT\tCLASS
20 +
21 +# Output:
22 +# 1) transformed files
23 +# 2) text files
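#
# Each transformed line is a sequence of WORD|LEMMA|POS triples separated by spaces,
# e.g. (assumed sample): TrpR|TrpR|NN ,|,|, tryptophan|tryptophan|NN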
24 +
25 +# Execution:
26 +# GntR
27 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
28 +
29 +# FhlA
30 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
31 +
32 +# MarA
33 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
34 +
35 +# ArgR
36 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
37 +
38 +# CytR
39 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
40 +
41 +# Rob
42 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
43 +
44 +# EXTRACTING REGULATORY INTERACTIONS
45 +# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
46 +
47 +
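# length: given a list of WORD|LEMMA|POS tokens, returns (number of tokens whose
# lemma contains at least one letter, total characters of those tokens' word forms).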
48 +def length(listWords):
49 + regexWord = re.compile('[a-zA-Z]')
50 + words = 0
51 + chars = 0
52 + for word in listWords:
53 + listTemp = word.split('|')
54 + if regexWord.search(listTemp[1]) is not None:
55 + words += 1
56 + chars += len(listTemp[0])
57 + return words, chars
58 +
59 +###########################################################
60 +# MAIN PROGRAM #
61 +###########################################################
62 +
63 +if __name__ == "__main__":
64 + # Parameter definition
65 + parser = OptionParser()
66 + parser.add_option("-i", "--inputPath", dest="inputPath",
67 + help="Path to read input files", metavar="PATH")
68 + parser.add_option("-o", "--outputPath", dest="outputPath",
69 + help="Path to place transformed files", metavar="PATH")
70 + parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
71 + help="Minimum length sentence in number of words", metavar="NUM")
72 + parser.add_option("--classes", dest="classes",
73 +                      help="Classes to indicate end of sentence when line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")
74 +
75 + (options, args) = parser.parse_args()
76 +
77 + if len(args) > 0:
78 +        parser.error("No positional arguments expected.")
79 + sys.exit(1)
80 +
81 + # Printing parameter values
82 + print('-------------------------------- PARAMETERS --------------------------------')
83 + print("Path to read input files: " + str(options.inputPath))
84 + print("Path to place transformed files: " + str(options.outputPath))
85 + print("Minimum length sentence in number of words: " + str(options.minWordsInLine))
86 +    print("Classes to indicate end of sentence: " + str(options.classes))
87 +
88 +    # We realized that POS tags from BioLemmatizer are very specific, therefore we decided to use Stanford tags
89 + bioPOST = False
90 + filesProcessed = 0
91 + # minWordsInLine = 3
92 + if not options.classes is None:
93 + listClasses = options.classes.split(',')
94 + t0 = time()
95 + print("Transforming files...")
96 + # Walk directory to read files
97 + for path, dirs, files in os.walk(options.inputPath):
98 + # For each file in dir
99 + for file in files:
100 + print(" Transforming file..." + str(file))
101 + #TrpR NN TrpR NN PennPOS
102 + # , , , , NUPOS
103 + # tryptophan NN tryptophan NN PennPOS
104 + listLine1 = []
105 + listLine2 = []
106 + text = ''
107 + lemma = ''
108 + pos = ''
109 + textTransformed = ''
110 + textText = ''
111 + with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
112 + # Create output file to write
113 + with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
114 + for line in iFile:
115 + if line == '\n':
116 + if options.classes is None:
117 + if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
118 + transformedFile.write(textTransformed + '\n')
119 + textTransformed = ''
120 + textText = ''
121 + else:
122 + continue
123 + else:
124 + line = line.strip('\n')
125 + #print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
126 + listLine1 = line.split('\t')
127 + if len(listLine1) != 3:
128 + continue
129 + text = listLine1[0]
130 +                            # Replacing a strange space character
131 + text = text.replace(' ', '-')
132 + listLine2 = listLine1[2].split(' ')
133 + lemma = listLine2[0]
134 +                            # Replacing a strange space character
135 + lemma = lemma.replace(' ', '-')
136 + if bioPOST:
137 + pos = listLine2[1]
138 + #print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
139 + else:
140 + pos = listLine1[1]
141 + textText = textText + text + ' '
142 + textTransformed = textTransformed + text + '|' + lemma + '|' + pos + ' '
143 + # RI+GC NN RI+GC NN PennPOS
144 + if not options.classes is None:
145 + if text in listClasses:
146 + # if length(textTransformed.split()) > options.minWordsInLine:
147 + if length(textTransformed.split())[0] > options.minWordsInLine and length(textTransformed.split())[1] <= 1000:
148 + transformedFile.write(textTransformed + '\n')
149 + # print(textTransformed)
150 + textTransformed = ''
151 + textText = ''
152 + filesProcessed += 1
153 +
154 +    # Print processed files
155 + print()
156 + print("Files processed: " + str(filesProcessed))
157 + print("In: %fs" % (time() - t0))