# prepare-abstracts.py
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import re
__author__ = 'CMendezC'
# Objective: Take text-annotated-abstracts-original.txt as input, write each
# abstract to its own file with the annotation tags removed, and collect a
# dictionary of genes for tagging after the NLP pipeline.
# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path.
# 4) --dicPath Dictionary path.
# 5) --dicFile Dictionary file.
# Execution:
# python3 prepare-abstracts.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --inputFile text-annotated-abstracts-original.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
# --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries
# --dicFile genes.txt
# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
# Patterns for the annotated input: a leading "<digits>|a|" marker whose digits
# name the output file, gene mentions wrapped in <g>...</g>, and the
# <g>/<d>/<i> opening and closing tags that are removed from the text.
rePmid = re.compile(r'([\d]+)\|a\|')
reGene = re.compile(r'<g>([^<]+)</g>')
reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')


def _split_abstracts(input_file, output_path):
    """Write one tag-free file per PMID line and count the gene mentions.

    Every line of ``input_file`` is scanned for ``<g>...</g>`` mentions, then
    stripped of its tags. A line that starts with ``<digits>|a|`` is written,
    without a trailing newline, to ``<output_path>/<digits>.txt``; any other
    line only triggers a warning on stdout.

    Returns a ``(gene_counts, files_written)`` tuple, where ``gene_counts``
    maps each gene string to its number of mentions in first-seen order.
    """
    gene_counts = {}
    files_written = 0
    with open(input_file, "r", encoding="utf-8", errors="replace") as iFile:
        for line in iFile:
            line = line.strip('\n')
            # Genes must be collected before the tags are removed.
            for gene in reGene.findall(line):
                gene_counts[gene] = gene_counts.get(gene, 0) + 1
            line = reTags.sub('', line)
            result = rePmid.match(line)
            if not result:
                print("Warning: line without PMID")
                continue
            abstract_file = os.path.join(output_path, result.group(1) + ".txt")
            with open(abstract_file, "w", encoding="utf-8", errors="replace") as oFile:
                oFile.write(line)
            files_written += 1
    return gene_counts, files_written


def _write_gene_dictionary(dictionary_file, genes):
    """Write every gene in ``genes`` to ``dictionary_file``, one per line."""
    with open(dictionary_file, "w", encoding="utf-8", errors="replace") as dFile:
        dFile.writelines("{}\n".format(gene) for gene in genes)


def main(argv=None):
    """Run the abstract preparation from command-line style arguments.

    ``argv`` is the list of arguments to parse; ``None`` means
    ``sys.argv[1:]``. ``--inputPath``, ``--inputFile`` and ``--outputPath``
    are required. ``--dicPath`` and ``--dicFile`` are optional but must be
    given together; without them no gene dictionary is written. Invalid
    usage ends the program through ``parser.error``.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--dicPath", dest="dicPath",
                      help="Dictionary path", metavar="PATH")
    parser.add_option("--dicFile", dest="dicFile",
                      help="Dictionary file", metavar="FILE")
    (options, args) = parser.parse_args(argv)

    # parser.error() prints the usage message and exits by itself.
    if len(args) > 0:
        parser.error("None parameters indicated.")
    missing = [name for name in ("inputPath", "inputFile", "outputPath")
               if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): "
                     + ", ".join("--" + name for name in missing))
    if (options.dicPath is None) != (options.dicFile is None):
        parser.error("--dicPath and --dicFile must be given together.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file", str(options.inputFile))
    print("Output path: " + str(options.outputPath))
    print("Dictionary path: " + str(options.dicPath))
    print("Dictionary file", str(options.dicFile))

    t0 = time()
    # Create the output directory up front so the first write cannot fail on it.
    os.makedirs(options.outputPath, exist_ok=True)
    print("Reading file..." + options.inputFile)
    hashGenes, filesWritten = _split_abstracts(
        os.path.join(options.inputPath, options.inputFile), options.outputPath)

    if options.dicPath is not None:
        os.makedirs(options.dicPath, exist_ok=True)
        _write_gene_dictionary(
            os.path.join(options.dicPath, options.dicFile), hashGenes)

    print("Files written: {}".format(filesWritten))
    print("Processing done in: %fs" % (time() - t0))


if __name__ == "__main__":
    main()