Carlos-Francisco Méndez-Cruz

Setting up project

1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="VcsDirectoryMappings">
4 + <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 + </component>
6 +</project>
...\ No newline at end of file ...\ No newline at end of file
1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +import re
8 +
9 +__author__ = 'CMendezC'
10 +
11 +# Objective: Take text-annotated-abstracts-original.txt as input,
12 +# write each abstract to a separate file without tags, and collect a dictionary of genes
13 +# for tagging after the NLP pipeline.
14 +
15 +# Parameters:
16 +# 1) --inputPath Input path.
17 +# 2) --inputFile Input file.
18 +# 3) --outputPath Output path
19 +
20 +# Execution:
21 +#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
22 +
# Matches the numeric identifier that opens an abstract line, followed by the
# literal marker "|a|". The quantifier is INSIDE the group so that group 1
# holds the whole digit run; with the quantifier outside the group, only the
# last digit would be captured and output files would overwrite each other.
RE_PMID = re.compile(r'(\d+)\|a\|')
# Matches one gene mention wrapped in <g>...</g>; group 1 is the tagged text.
RE_GENE = re.compile(r'<g>([^<]+)</g>')


def extract_pmid(line):
    """Return the numeric identifier that starts *line*, or None.

    A line qualifies when it begins with one or more digits immediately
    followed by ``|a|``. The complete digit run is returned as a string.

    :param line: one line of the annotated input file.
    :return: the leading digits as ``str``, or ``None`` when the line does
        not start with ``<digits>|a|``.
    """
    found = RE_PMID.match(line)
    return found.group(1) if found else None


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the process itself,
        # so no explicit sys.exit() is needed after it.
        parser.error("None parameters indicated.")

    # All three options are needed to build the input and output paths; fail
    # with a usage message instead of a TypeError from os.path.join(None, ...).
    missing = [name for name in ("inputPath", "inputFile", "outputPath")
               if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): "
                     + ", ".join("--" + name for name in missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file", str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    with open(os.path.join(options.inputPath, options.inputFile), "r",
              encoding="utf-8", errors="replace") as iFile:
        print("Reading file..." + options.inputFile)
        for line in iFile:
            line = line.strip('\n')
            # Report every gene mention tagged on this line.
            for gene in RE_GENE.findall(line):
                print("genes: {}".format(gene))
            # Lines that start with "<digits>|a|" are written, unchanged, to
            # a file named after the full identifier.
            pmid = extract_pmid(line)
            if pmid is not None:
                with open(os.path.join(options.outputPath, pmid + ".txt"), "w",
                          encoding="utf-8", errors="replace") as oFile:
                    oFile.write(line)
60 +
61 +
62 +
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.