Showing
5 changed files
with
68 additions
and
0 deletions
.idea/vcs.xml
0 → 100644
prepare-abstracts.py
0 → 100644
# -*- coding: UTF-8 -*-
"""Split an annotated-abstracts file into one text file per abstract line.

Every input line that starts with ``<digits>|a|`` is written to
``<outputPath>/<digits>.txt``. Every ``<g>...</g>`` span found on any line is
printed to standard output.
"""

from optparse import OptionParser
import os
import sys
from time import time
import re

__author__ = 'CMendezC'

# Objective: take text-annotated-abstracts-original.txt as input to obtain
# the abstracts separated into individual files and to collect the gene
# mentions, so that they can be tagged after the NLP pipeline.
# NOTE(review): the abstracts are currently written WITH their <g> tags and
# the gene mentions are only printed, not stored -- confirm whether tag
# removal and the gene dictionary are still to be implemented.

# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path.

# Execution (path recorded by the original author):
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original

# Leading "<digits>|a|" marker of an abstract line. The whole digit run is
# captured (presumably the PubMed ID) because it becomes the output file name.
# A repeated one-character group such as ([\d])+ would keep only the last digit.
RE_PMID = re.compile(r'(\d+)\|a\|')
# Text enclosed in <g>...</g> tags (gene mentions, going by the printed label).
RE_GENE = re.compile(r'<g>([^<]+)</g>')


def main(argv=None):
    """Run the abstract splitter.

    Args:
        argv: Command-line arguments without the program name. When ``None``
            (the default), optparse reads them from ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by ``parser.error`` when positional arguments are
            given or when one of the three options is missing.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    if len(args) > 0:
        # parser.error() prints the usage message and exits by itself, so no
        # explicit sys.exit() is needed afterwards.
        parser.error("None parameters indicated.")

    # All three options are needed to build the paths below; report a missing
    # one as a usage error instead of failing inside os.path.join(None, ...).
    missing = [
        "--" + name
        for name in ("inputPath", "inputFile", "outputPath")
        if getattr(options, name) is None
    ]
    if missing:
        parser.error("Missing required option(s): " + ", ".join(missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file", str(options.inputFile))
    print("Output path: " + str(options.outputPath))

    inputFileName = os.path.join(options.inputPath, options.inputFile)
    with open(inputFileName, "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file..." + options.inputFile)
        for line in iFile:
            line = line.strip('\n')
            for gene in RE_GENE.findall(line):
                print("genes: {}".format(gene))
            result = RE_PMID.match(line)
            if result:
                # One file per matching line, named after the full digit run.
                # The line is written as read, without a trailing newline.
                outputFileName = os.path.join(options.outputPath,
                                              result.group(1) + ".txt")
                with open(outputFileName, "w", encoding="utf-8",
                          errors="replace") as oFile:
                    oFile.write(line)


if __name__ == "__main__":
    main()
preparing-training-validation-test.py
0 → 100644
This diff is collapsed. Click to expand it.
tagging_Sklearn_crfsuite.py
0 → 100644
This diff is collapsed. Click to expand it.
training-validation.py
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment