Showing
5 changed files
with
68 additions
and
0 deletions
.idea/vcs.xml
0 → 100644
prepare-abstracts.py
0 → 100644
| 1 | +# -*- coding: UTF-8 -*- | ||
| 2 | + | ||
| 3 | +from optparse import OptionParser | ||
| 4 | +import os | ||
| 5 | +import sys | ||
| 6 | +from time import time | ||
| 7 | +import re | ||
| 8 | + | ||
| 9 | +__author__ = 'CMendezC' | ||
| 10 | + | ||
| 11 | +# Objective: take text-annotated-abstracts-original.txt as input, write each | ||
| 12 | +# abstract to its own file without tags, and collect a dictionary of genes | ||
| 13 | +# for tagging after the NLP pipeline. | ||
| 14 | + | ||
| 15 | +# Parameters: | ||
| 16 | +# 1) --inputPath Input path. | ||
| 17 | +# 2) --inputFile Input file. | ||
| 18 | +# 3) --outputPath Output path | ||
| 19 | + | ||
| 20 | +# Execution: | ||
| 21 | +#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original | ||
| 22 | + | ||
# Matches the numeric identifier (PMID) that prefixes an abstract line,
# e.g. "12345|a|...". The quantifier is inside the group so that group(1)
# holds the whole identifier; with "([\d])+" only the last digit is kept.
PMID_PATTERN = re.compile(r'(\d+)\|a\|')
# Matches a gene mention wrapped in <g>...</g> tags.
GENE_PATTERN = re.compile(r'<g>([^<]+)</g>')


def extract_pmid(line):
    """Return the identifier that prefixes an abstract line.

    Args:
        line: One line of the annotated input file.

    Returns:
        The full run of digits preceding ``|a|`` at the start of ``line``,
        or ``None`` when the line does not start with that prefix.
    """
    found = PMID_PATTERN.match(line)
    return found.group(1) if found else None


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself, so no
    # explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    # All three options are needed to build the paths below; fail with a
    # usage message instead of a TypeError from os.path.join(None, ...).
    for required in ("inputPath", "inputFile", "outputPath"):
        if getattr(options, required) is None:
            parser.error("Missing required parameter --" + required)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    input_file = os.path.join(options.inputPath, options.inputFile)
    with open(input_file, "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file..." + options.inputFile)
        for line in iFile:
            line = line.strip('\n')
            # Report every gene mention found on the line.
            for gene in GENE_PATTERN.findall(line):
                print("genes: {}".format(gene))
            # Only lines starting with "<digits>|a|" are written out, one
            # file per identifier, named "<identifier>.txt".
            pmid = extract_pmid(line)
            if pmid is not None:
                output_file = os.path.join(options.outputPath, pmid + ".txt")
                with open(output_file, "w", encoding="utf-8", errors="replace") as oFile:
                    oFile.write(line)
| 60 | + | ||
| 61 | + | ||
| 62 | + |
preparing-training-validation-test.py
0 → 100644
This diff is collapsed. Click to expand it.
tagging_Sklearn_crfsuite.py
0 → 100644
This diff is collapsed. Click to expand it.
training-validation.py
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment