Carlos-Francisco Méndez-Cruz

Setting up project

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<!-- Version-control mapping: associates the project root directory ($PROJECT_DIR$) with Git. -->
<!-- NOTE(review): this looks like an IDE-generated settings file; confirm whether manual edits are preserved. -->
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import re
__author__ = 'CMendezC'
# Objective: Take text-annotated-abstracts-original.txt as input,
# write each abstract to a separate file without tags, and collect a dictionary of genes
# to be used for tagging after the NLP pipeline.
# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path
# Execution:
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
# Patterns are compiled once at module level so the per-line loop only matches.
# An abstract line has the form "<pmid>|a|<text>". The "+" quantifier sits
# inside the capture group so that group(1) holds the whole PMID rather than
# only its last digit.
rePmid = re.compile(r'(\d+)\|a\|')
# Gene mentions are annotated inline as <g>name</g>.
reGene = re.compile(r'<g>([^<]+)</g>')


def find_pmid(line):
    """Return the PMID that opens an abstract line, or None.

    Args:
        line: One line of the input file.

    Returns:
        The leading run of digits, as a string, when the line starts with
        ``<digits>|a|``; None for any other line.
    """
    result = rePmid.match(line)
    return result.group(1) if result else None


def split_abstracts(input_path, input_file, output_path):
    """Write every abstract line of the input file to ``<PMID>.txt``.

    Each line is stripped of its trailing newline. Every ``<g>...</g>``
    mention found on the line is printed, and lines that start with
    ``<digits>|a|`` are written unchanged to ``<digits>.txt`` inside
    output_path.

    Args:
        input_path: Directory that contains the input file.
        input_file: Name of the input file.
        output_path: Existing directory where the per-abstract files go.

    Returns:
        The number of files written. A PMID that appears on more than one
        abstract line overwrites its file and is counted each time.

    Raises:
        OSError: If the input file cannot be read or an output file cannot
            be created.
    """
    files_written = 0
    source = os.path.join(input_path, input_file)
    with open(source, "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file..." + input_file)
        for line in iFile:
            line = line.strip('\n')
            for gene in reGene.findall(line):
                print("genes: {}".format(gene))
            pmid = find_pmid(line)
            if pmid is None:
                # Not an abstract line: nothing to write.
                continue
            target = os.path.join(output_path, pmid + ".txt")
            with open(target, "w", encoding="utf-8", errors="replace") as oFile:
                oFile.write(line)
            files_written += 1
    return files_written


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the usage message and exits with status 2,
        # so no explicit sys.exit() is needed after it.
        parser.error("None parameters indicated.")
    # All three options are needed to build the paths below; reject a missing
    # one here instead of failing later inside os.path.join() with a TypeError.
    missing = [name for name in ("inputPath", "inputFile", "outputPath")
               if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): "
                     + ", ".join("--" + name for name in missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file", str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    # t0 and hashGenes are not used in the visible part of the script; they
    # are kept in case code beyond this section relies on them.
    t0 = time()
    hashGenes = {}
    filesWritten = split_abstracts(options.inputPath, options.inputFile,
                                   options.outputPath)
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.