prepare-abstracts.py
2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import re
__author__ = 'CMendezC'
# Objective: Take text-annotated-abstracts-original.txt as input,
# split it into one file per abstract without tags, and collect a dictionary of genes
# to be used for tagging after the NLP pipeline.
# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path
# Execution:
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
# Matches the "<PMID>|a|" prefix that starts an abstract line. The whole run
# of digits is captured in group 1. (A repeated one-digit group such as
# "([\d])+" would keep only the LAST digit, so every abstract would be
# written to a one-character file name and overwrite the others.)
rePmid = re.compile(r'(\d+)\|a\|')
# Text enclosed in <g>...</g> gene tags.
reGene = re.compile(r'<g>([^<]+)</g>')


def extract_pmid(line):
    """Return the PMID prefixing an abstract line, or None.

    A line is an abstract line when it starts with "<digits>|a|".

    :param line: One line of the annotated input file.
    :return: The full PMID as a string, or None when the line does not
        start with the abstract prefix.
    """
    match = rePmid.match(line)
    return match.group(1) if match else None


def split_abstracts(inputFilePath, outputPath):
    """Write every abstract line of the input file to "<PMID>.txt".

    Each gene mention found between <g> tags is printed as it is
    encountered. Lines that do not start with "<PMID>|a|" are skipped.
    The line is written as read (minus its trailing newline), i.e. the
    PMID prefix and the tags are kept.

    :param inputFilePath: Path of the annotated abstracts file.
    :param outputPath: Existing directory that receives the per-abstract files.
    :return: Number of output files written.
    :raises OSError: If the input file or an output file cannot be opened.
    """
    filesWritten = 0
    with open(inputFilePath, "r", encoding="utf-8", errors="replace") as iFile:
        for line in iFile:
            line = line.strip('\n')
            # TODO: the file header mentions collecting a gene dictionary;
            # for now the gene mentions are only printed.
            for gene in reGene.findall(line):
                print("genes: {}".format(gene))
            pmid = extract_pmid(line)
            if pmid is None:
                continue
            with open(os.path.join(outputPath, pmid + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
                oFile.write(line)
            filesWritten += 1
    return filesWritten


def main():
    """Parse the command line and split the input file into abstracts."""
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the usage text and exits with status 2,
        # so no explicit sys.exit() is needed after it.
        parser.error("None parameters indicated.")
    # Fail early with a usage message instead of a TypeError from
    # os.path.join() when an option was not supplied.
    missing = [flag for flag, value in (("--inputPath", options.inputPath),
                                        ("--inputFile", options.inputFile),
                                        ("--outputPath", options.outputPath))
               if value is None]
    if missing:
        parser.error("Missing required option(s): " + ", ".join(missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file", str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    t0 = time()
    print("Reading file..." + options.inputFile)
    filesWritten = split_abstracts(
        os.path.join(options.inputPath, options.inputFile), options.outputPath)
    print("Files written: {}".format(filesWritten))
    print("Processing done in: {:.3f}s".format(time() - t0))


if __name__ == "__main__":
    main()