Carlos-Francisco Méndez-Cruz

Add data-sets

...@@ -18,7 +18,11 @@ __author__ = 'CMendezC' ...@@ -18,7 +18,11 @@ __author__ = 'CMendezC'
18 # 3) --outputPath Output path 18 # 3) --outputPath Output path
19 19
20 # Execution: 20 # Execution:
21 -#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original 21 +# python3 prepare-abstracts.py
22 +# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
23 +# --inputFile text-annotated-abstracts-original.txt
24 +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
25 +# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
22 26
23 if __name__ == "__main__": 27 if __name__ == "__main__":
24 # Parameter definition 28 # Parameter definition
...@@ -45,18 +49,26 @@ if __name__ == "__main__": ...@@ -45,18 +49,26 @@ if __name__ == "__main__":
45 t0 = time() 49 t0 = time()
46 hashGenes = {} 50 hashGenes = {}
47 51
48 - rePmid = re.compile(r'([\d])+\|a\|') 52 + rePmid = re.compile(r'([\d]+)\|a\|')
49 reGene = re.compile(r'<g>([^<]+)</g>') 53 reGene = re.compile(r'<g>([^<]+)</g>')
54 + reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
50 with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile: 55 with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
51 print("Reading file..." + options.inputFile) 56 print("Reading file..." + options.inputFile)
52 for line in iFile: 57 for line in iFile:
53 line = line.strip('\n') 58 line = line.strip('\n')
54 for gene in reGene.findall(line): 59 for gene in reGene.findall(line):
55 - print("genes: {}".format(gene)) 60 + # print("genes: {}".format(gene))
61 + if gene not in hashGenes:
62 + hashGenes[gene] = 1
63 + else:
64 + hashGenes[gene] += 1
65 + line = reTags.sub('', line)
56 result = rePmid.match(line) 66 result = rePmid.match(line)
57 if result: 67 if result:
58 with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile: 68 with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
59 oFile.write(line) 69 oFile.write(line)
70 + else:
71 + print("Warning: line without PMID")
60 72
61 73
62 74
......