Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
conditional-random-fields
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-03-06 18:33:59 -0600
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
90224aa935f089062b51113b31cfa66f6e74a634
90224aa9
1 parent
dccc6977
Add data-sets
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
15 additions
and
3 deletions
prepare-abstracts.py
prepare-abstracts.py
View file @
90224aa
...
...
@@ -18,7 +18,11 @@ __author__ = 'CMendezC'
# 3) --outputPath Output path
# Execution:
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
# python3 prepare-abstracts.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --inputFile text-annotated-abstracts-original.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
if
__name__
==
"__main__"
:
# Parameter definition
...
...
@@ -45,18 +49,26 @@ if __name__ == "__main__":
t0
=
time
()
hashGenes
=
{}
rePmid
=
re
.
compile
(
r'([\d]
)+
\|a\|'
)
rePmid
=
re
.
compile
(
r'([\d]
+)
\|a\|'
)
reGene
=
re
.
compile
(
r'<g>([^<]+)</g>'
)
reTags
=
re
.
compile
(
r'(<g>|</g>|<d>|</d>|<i>|</i>)'
)
with
open
(
os
.
path
.
join
(
options
.
inputPath
,
options
.
inputFile
),
"r"
,
encoding
=
"utf-8"
,
errors
=
"replace"
)
as
iFile
:
print
(
"Reading file..."
+
options
.
inputFile
)
for
line
in
iFile
:
line
=
line
.
strip
(
'
\n
'
)
for
gene
in
reGene
.
findall
(
line
):
print
(
"genes: {}"
.
format
(
gene
))
# print("genes: {}".format(gene))
if
gene
not
in
hashGenes
:
hashGenes
[
gene
]
=
1
else
:
hashGenes
[
gene
]
+=
1
line
=
reTags
.
sub
(
''
,
line
)
result
=
rePmid
.
match
(
line
)
if
result
:
with
open
(
os
.
path
.
join
(
options
.
outputPath
,
result
.
group
(
1
)
+
".txt"
),
"w"
,
encoding
=
"utf-8"
,
errors
=
"replace"
)
as
oFile
:
oFile
.
write
(
line
)
else
:
print
(
"Warning: line without PMID"
)
...
...
Please
register
or
login
to post a comment