Carlos-Francisco Méndez-Cruz

Add data-sets

......@@ -18,7 +18,11 @@ __author__ = 'CMendezC'
# 3) --outputPath Output path
# Execution:
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
# python3 prepare-abstracts.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --inputFile text-annotated-abstracts-original.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
if __name__ == "__main__":
# Parameter definition
......@@ -45,18 +49,26 @@ if __name__ == "__main__":
# Split the annotated input file into one tag-free text file per abstract,
# counting every gene mention (<g>...</g>) along the way.
t0 = time()
# Gene mention text -> number of times it was seen in the input file.
hashGenes = {}
# A line must start with the PMID digits followed by the literal "|a|".
# The quantifier is inside the capture group so group(1) is the whole digit
# run; with '([\d])+' only the last digit would be captured and the output
# files of different abstracts would overwrite each other.
rePmid = re.compile(r'([\d]+)\|a\|')
# Text enclosed by a <g>...</g> annotation.
reGene = re.compile(r'<g>([^<]+)</g>')
# Opening/closing <g>, <d> and <i> annotation tags, removed before writing.
reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
    print("Reading file..." + options.inputFile)
    for line in iFile:
        line = line.strip('\n')
        # Count gene mentions while the tags are still present in the line.
        for gene in reGene.findall(line):
            hashGenes[gene] = hashGenes.get(gene, 0) + 1
        # Drop the annotation tags, keeping the text they enclosed.
        line = reTags.sub('', line)
        # match() anchors at the start of the line, so the PMID must lead it.
        result = rePmid.match(line)
        if result:
            # One output file per abstract, named "<PMID>.txt"; the line is
            # written without a trailing newline.
            with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
                oFile.write(line)
        else:
            print("Warning: line without PMID")
......