Showing 1 changed file with 15 additions and 3 deletions
... | @@ -18,7 +18,11 @@ __author__ = 'CMendezC' | ... | @@ -18,7 +18,11 @@ __author__ = 'CMendezC' |
18 | # 3) --outputPath Output path | 18 | # 3) --outputPath Output path |
19 | 19 | ||
20 | # Execution: | 20 | # Execution: |
21 | -#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original | 21 | +# python3 prepare-abstracts.py |
22 | +# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets | ||
23 | +# --inputFile text-annotated-abstracts-original.txt | ||
24 | +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original | ||
25 | +# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original | ||
22 | 26 | ||
23 | if __name__ == "__main__": | 27 | if __name__ == "__main__": |
24 | # Parameter definition | 28 | # Parameter definition |
... | @@ -45,18 +49,26 @@ if __name__ == "__main__": | ... | @@ -45,18 +49,26 @@ if __name__ == "__main__": |
45 | t0 = time() | 49 | t0 = time() |
46 | hashGenes = {} | 50 | hashGenes = {} |
47 | 51 | ||
48 | - rePmid = re.compile(r'([\d])+\|a\|') | 52 | + rePmid = re.compile(r'([\d]+)\|a\|') |
49 | reGene = re.compile(r'<g>([^<]+)</g>') | 53 | reGene = re.compile(r'<g>([^<]+)</g>') |
54 | + reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)') | ||
50 | with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: | 55 | with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: |
51 | print("Reading file..." + options.inputFile) | 56 | print("Reading file..." + options.inputFile) |
52 | for line in iFile: | 57 | for line in iFile: |
53 | line = line.strip('\n') | 58 | line = line.strip('\n') |
54 | for gene in reGene.findall(line): | 59 | for gene in reGene.findall(line): |
55 | - print("genes: {}".format(gene)) | 60 | + # print("genes: {}".format(gene)) |
61 | + if gene not in hashGenes: | ||
62 | + hashGenes[gene] = 1 | ||
63 | + else: | ||
64 | + hashGenes[gene] += 1 | ||
65 | + line = reTags.sub('', line) | ||
56 | result = rePmid.match(line) | 66 | result = rePmid.match(line) |
57 | if result: | 67 | if result: |
58 | with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: | 68 | with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: |
59 | oFile.write(line) | 69 | oFile.write(line) |
70 | + else: | ||
71 | + print("Warning: line without PMID") | ||
60 | 72 | ||
61 | 73 | ||
62 | 74 | ... | ... |
-
Please register or login to post a comment