transforming.py
7.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding: UTF-8 -*-
import re
from optparse import OptionParser
import os
import sys
from time import time
__author__ = 'CMendezC'
# Objective: Transforming BIOLemmatized files:
# 1) Transformed files
# 2) Text files to extract aspects
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --textPath Path to place plain-text output files.
# 4) --minWordsInLine Minimum length sentence in number of words
# 5) --classes Classes that mark the end of a sentence when a line contains: PMID\tNUMSENT\tSENT\tCLASS
# Output:
# 1) transformed files
# 2) text files
# Execution:
# GntR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
# FhlA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
# MarA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
# ArgR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
# CytR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
# Rob
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
# EXTRACTING REGULATORY INTERACTIONS
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
def length(listWords):
    """Measure a tokenized sentence given as 'surface|lemma|pos' tokens.

    A token counts as a word only when its lemma (the middle field)
    contains at least one ASCII letter; punctuation and pure numbers
    are skipped entirely, so their surface characters are not counted.

    :param listWords: list of 'surface|lemma|pos' strings
    :return: (word_count, surface_char_count) tuple
    """
    hasLetter = re.compile('[a-zA-Z]').search
    wordCount, charCount = 0, 0
    for token in listWords:
        parts = token.split('|')
        # Only lemmas with an alphabetic character contribute to either count.
        if hasLetter(parts[1]) is not None:
            wordCount += 1
            charCount += len(parts[0])
    return wordCount, charCount
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # ---------------- Parameter definition ----------------
    parser = OptionParser()
    parser.add_option("-i", "--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("-o", "--outputPath", dest="outputPath",
                      help="Path to place transformed files", metavar="PATH")
    parser.add_option("--minWordsInLine", type="int", dest="minWordsInLine", default=3,
                      help="Minimum length sentence in number of words", metavar="NUM")
    parser.add_option("--classes", dest="classes",
                      help="Classes to indicate final of sentence when line contains: PMID-NUMSENT-SENT-CLASS", metavar="CLASS,CLASS")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message to stderr and exits with
        # status 2, so no explicit sys.exit() is needed afterwards.
        parser.error("None parameters indicated.")

    # ---------------- Printing parameter values ----------------
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place transformed files: " + str(options.outputPath))
    print("Minimum length sentence in number of words: " + str(options.minWordsInLine))
    print("Classes to indicate final of sentence: " + str(options.classes))

    # POS tags from BioLemmatizer are very specific, so the Stanford tags
    # (second tab-separated column of the input) are used instead.
    bioPOST = False
    filesProcessed = 0
    # Class tokens that mark the end of a sentence (empty list when the
    # blank-line sentence-boundary mode is used instead).
    listClasses = []
    if options.classes is not None:
        listClasses = options.classes.split(',')
    t0 = time()
    print("Transforming files...")
    # Walk the input directory tree and transform every file found.
    for path, _dirs, files in os.walk(options.inputPath):
        for file in files:
            print("   Transforming file..." + str(file))
            # Input format (one token per line, tab-separated):
            #   surface <TAB> StanfordPOS <TAB> "lemma bioPOS tagger"
            # e.g.  TrpR  NN  TrpR NN PennPOS
            #       ,     ,   ,    ,  NUPOS
            # Accumulators for the sentence currently being assembled.
            textTransformed = ''
            textText = ''
            with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                # Create output file to write
                with open(os.path.join(options.outputPath, file.replace('term.txt', 'tra.txt')), "w", encoding="utf-8") as transformedFile:
                    for line in iFile:
                        if line == '\n':
                            if options.classes is None:
                                # Blank line marks a sentence boundary: flush the
                                # accumulated sentence if it passes the length checks.
                                nWords, nChars = length(textTransformed.split())
                                if nWords > options.minWordsInLine and nChars <= 1000:
                                    transformedFile.write(textTransformed + '\n')
                                textTransformed = ''
                                textText = ''
                            else:
                                # In class-marker mode, blank lines are ignored.
                                continue
                        else:
                            line = line.strip('\n')
                            listLine1 = line.split('\t')
                            # Skip malformed lines that do not have exactly 3 columns.
                            if len(listLine1) != 3:
                                continue
                            text = listLine1[0]
                            # Replace a strange space character
                            text = text.replace(' ', '-')
                            listLine2 = listLine1[2].split(' ')
                            lemma = listLine2[0]
                            # Replace a strange space character
                            lemma = lemma.replace(' ', '-')
                            if bioPOST:
                                pos = listLine2[1]
                            else:
                                pos = listLine1[1]
                            textText = textText + text + ' '
                            textTransformed = textTransformed + text + '|' + lemma + '|' + pos + ' '
                            # A class token (e.g. "RI+GC") marks the end of a sentence.
                            if options.classes is not None and text in listClasses:
                                nWords, nChars = length(textTransformed.split())
                                if nWords > options.minWordsInLine and nChars <= 1000:
                                    transformedFile.write(textTransformed + '\n')
                                textTransformed = ''
                                textText = ''
            filesProcessed += 1
    # Report processed files and elapsed time.
    print()
    print("Files processed: " + str(filesProcessed))
    print("In: %fs" % (time() - t0))