Carlos-Francisco Méndez-Cruz

Setting up project

1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +from subprocess import call
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: Lemmatizing several files with BIOLemmatizer.
12 +
13 +# Parameters:
14 +# 1) --inputPath Path to read POS-tagged files.
15 +# 2) --outputPath Path to place lemmatized files.
16 +# 3) --biolemmatizerPath Path to the directory containing the BIOLemmatizer JAR.
17 +
18 +# Input:
19 +# 1) POS Tagged files in format:
20 +# Rob NNP
21 +# is VBZ
22 +# a DT
23 +# transcriptional JJ
24 +# dual JJ
25 +# regulator NN
26 +# . .
27 +#
28 +# Its PRP$
29 +# N-terminal JJ ...
30 +
31 +# Output:
32 +# 1) BIOLemmatized files.
33 +
34 +# Execution:
35 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT\ECK120012096_GntR\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
36 +
37 +# FhlA
38 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
39 +
40 +# MarA
41 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
42 +
43 +# ArgR
44 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
45 +
46 +# CytR
47 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
48 +
49 +# Rob
50 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
51 +
52 +# EXTRACTING REGULATORY INTERACTIONS
53 +# python biolemmatizing.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\post --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --biolemmatizerPath C:\Users\cmendezc\Documents\GENOMICAS\BIO_LEMMATIZER
54 +
55 +###########################################################
56 +# MAIN PROGRAM #
57 +###########################################################
58 +
def build_lemmatizer_command(biolemmatizer_path, input_file, output_path):
    """Build the argument list that runs BIOLemmatizer on one file.

    Parameters:
        biolemmatizer_path: Directory holding
            biolemmatizer-core-1.2-jar-with-dependencies.jar.
        input_file: Full path of the POS-tagged file to lemmatize.
        output_path: Directory where the lemmatized file is written.

    Returns:
        A list of arguments meant to be passed to subprocess.call() without
        a shell, so paths containing spaces or shell metacharacters are
        handed to java verbatim. The output file keeps the input base name
        with 'pos.txt' replaced by 'lem.txt'.
    """
    jar_file = os.path.join(biolemmatizer_path,
                            'biolemmatizer-core-1.2-jar-with-dependencies.jar')
    output_name = os.path.basename(input_file).replace('pos.txt', 'lem.txt')
    return ['java', '-Xmx1G', '-jar', jar_file,
            '-i', input_file,
            '-o', os.path.join(output_path, output_name)]


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read TXT files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place POST files", metavar="PATH")
    parser.add_option("-a", "--biolemmatizerPath", dest="biolemmatizerPath", default="",
                      help="Path BIOLemmatizer", metavar="PATH")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the program
        # itself, so no explicit sys.exit() is needed after it.
        parser.error("None parameters indicated.")
    # Fail early with a usage message instead of a TypeError traceback
    # once the paths are used below.
    if options.inputPath is None:
        parser.error("--inputPath is required.")
    if options.outputPath is None:
        parser.error("--outputPath is required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Path BIOLemmatizer command: " + str(options.biolemmatizerPath))

    filesTagged = 0
    t0 = time()
    print("Lemmatizing corpus...")
    # Walk directory to read files
    for path, _dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for fileName in files:
            print(" Lemmatizing file..." + str(fileName))
            # Join with the directory currently being walked (not the root)
            # so files found in subdirectories resolve to existing paths.
            command = build_lemmatizer_command(options.biolemmatizerPath,
                                               os.path.join(path, fileName),
                                               options.outputPath)
            try:
                # Argument list, no shell: nothing in the paths is
                # interpreted by a command interpreter.
                retcode = call(command)
            except OSError as e:
                # Raised, for example, when the java executable is not found.
                print(" Execution failed:", e, file=sys.stderr)
                continue
            if retcode < 0:
                print(" Child was terminated by signal", -retcode, file=sys.stderr)
            else:
                print(" Child returned", retcode, file=sys.stderr)
                # NOTE(review): any normal exit is counted, including a
                # non-zero status - confirm whether only status 0 should count.
                filesTagged += 1

    # Report processed files
    print()
    print("Files BIOLemmatized: " + str(filesTagged))
    print("Files BIOLemmatized in: %fs" % (time() - t0))