# prepare-abstracts.py
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import re

__author__ = 'CMendezC'

# Objective: Take text-annotated-abstracts-original.txt as input,
# write each abstract to a separate file without tags, and collect a dictionary of genes
# to be used for tagging after the NLP pipeline.

# Parameters:
#   1) --inputPath      Input path.
#   2) --inputFile      Input file.
#   3) --outputPath     Output path.

# Execution:
#   python prepare-abstracts.py --inputPath <PATH> --inputFile <FILE> --outputPath <PATH>
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original

# Matches the "<PMID>|a|" prefix that starts an abstract line. The whole digit
# run must sit inside the group: a repeated one-digit group such as ([\d])+
# retains only its last repetition, i.e. the final digit of the PMID.
rePmid = re.compile(r'(\d+)\|a\|')
# Matches a gene mention wrapped in <g>...</g>; group 1 is the text between the tags.
reGene = re.compile(r'<g>([^<]+)</g>')


def extractPmid(line):
    """Return the PMID that prefixes an abstract line.

    Parameters:
        line: One line of the input file, without its trailing newline.

    Returns:
        The digits preceding ``|a|`` at the very start of ``line`` as a string,
        or None when the line does not start with that prefix.
    """
    result = rePmid.match(line)
    if result is None:
        return None
    return result.group(1)


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage text and exits with status 2 by itself,
    # so no explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))

    # All three options are needed to build the input and output file names;
    # fail with a usage message instead of a TypeError inside os.path.join.
    missingOptions = [name for name in ("inputPath", "inputFile", "outputPath")
                      if getattr(options, name) is None]
    if missingOptions:
        parser.error("Missing required option(s): "
                     + ", ".join("--" + name for name in missingOptions))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file", str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file..." + options.inputFile)
        for line in iFile:
            line = line.strip('\n')
            # Report every gene mention found on the line.
            for gene in reGene.findall(line):
                print("genes: {}".format(gene))
            pmid = extractPmid(line)
            if pmid is not None:
                # One output file per abstract line, named after its PMID.
                # NOTE(review): the file header says abstracts should be written
                # without tags, but the line is written verbatim (prefix and
                # <g> tags included) -- confirm the intended output format.
                with open(os.path.join(options.outputPath, pmid + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
                    oFile.write(line)