Showing 1 changed file with 15 additions and 3 deletions
| ... | @@ -18,7 +18,11 @@ __author__ = 'CMendezC' | ... | @@ -18,7 +18,11 @@ __author__ = 'CMendezC' |
| 18 | # 3) --outputPath Output path | 18 | # 3) --outputPath Output path |
| 19 | 19 | ||
| 20 | # Execution: | 20 | # Execution: |
| 21 | -#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original | 21 | +# python3 prepare-abstracts.py |
| 22 | +# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets | ||
| 23 | +# --inputFile text-annotated-abstracts-original.txt | ||
| 24 | +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original | ||
| 25 | +# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original | ||
| 22 | 26 | ||
| 23 | if __name__ == "__main__": | 27 | if __name__ == "__main__": |
| 24 | # Parameter definition | 28 | # Parameter definition |
| ... | @@ -45,18 +49,26 @@ if __name__ == "__main__": | ... | @@ -45,18 +49,26 @@ if __name__ == "__main__": |
| 45 | t0 = time() | 49 | t0 = time() |
| 46 | hashGenes = {} | 50 | hashGenes = {} |
| 47 | 51 | ||
| 48 | - rePmid = re.compile(r'([\d])+\|a\|') | 52 | + rePmid = re.compile(r'([\d]+)\|a\|') |
| 49 | reGene = re.compile(r'<g>([^<]+)</g>') | 53 | reGene = re.compile(r'<g>([^<]+)</g>') |
| 54 | + reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)') | ||
| 50 | with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: | 55 | with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: |
| 51 | print("Reading file..." + options.inputFile) | 56 | print("Reading file..." + options.inputFile) |
| 52 | for line in iFile: | 57 | for line in iFile: |
| 53 | line = line.strip('\n') | 58 | line = line.strip('\n') |
| 54 | for gene in reGene.findall(line): | 59 | for gene in reGene.findall(line): |
| 55 | - print("genes: {}".format(gene)) | 60 | + # print("genes: {}".format(gene)) |
| 61 | + if gene not in hashGenes: | ||
| 62 | + hashGenes[gene] = 1 | ||
| 63 | + else: | ||
| 64 | + hashGenes[gene] += 1 | ||
| 65 | + line = reTags.sub('', line) | ||
| 56 | result = rePmid.match(line) | 66 | result = rePmid.match(line) |
| 57 | if result: | 67 | if result: |
| 58 | with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: | 68 | with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: |
| 59 | oFile.write(line) | 69 | oFile.write(line) |
| 70 | + else: | ||
| 71 | + print("Warning: line without PMID") | ||
| 60 | 72 | ||
| 61 | 73 | ||
| 62 | 74 | ... | ... |
-
Please register or log in to post a comment