Carlos-Francisco Méndez-Cruz / automatic-extraction-growth-conditions
Authored by cmendezc, 2023-03-24 23:28:00 -0600
Commit ecabc02573ad1ee2c1cf62c6e8d0b8e116c4a63c
1 parent 12e7dd61
New processing for retraining
Showing 6 changed files with 413 additions and 139 deletions
data-sets/bin/check_mco_terms_in_sentences_v1.py
data-sets/bin/extract-sentences-from-softfiles_v1.py
data-sets/bin/extract-sentences-from-softfiles_v2.py
data-sets/bin/extract-sentences-from-softfiles_v3.py
data-sets/bin/mco_terms.csv
data-sets/bin/transform_sentences_to_check_to_XML.py
data-sets/bin/check_mco_terms_in_sentences_v1.py
0 → 100644
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Check whether MCO terms appear in the raw sentences extracted from the soft files
#
# Input parameters
# --inputPath=PATH Path to geo_sentences_to_check_fixed.csv
# /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile=FILE      File geo_sentences_to_check_fixed.csv
# --inputPathMco Path to MCO term file
# /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco File with MCO terms GC_Terms.txt (tsv)
# --outputPath=PATH Path to place MCO terms that appeared in input file
#
# Output
# Files with MCO terms that appeared in input file
#
# _v1
# python check_mco_terms_in_sentences_v1.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile geo_sentences_to_check_fixed.csv
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco GC_Terms.txt
# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt
'''
REVIEW RESULT:
Only the following terms were found in the sentences extracted from the curation of the soft files:
TERM_NAME
L broth (also annotated by the curator)
MOPS (annotated by the curator as <Med> MOPS minimal glucose media </Med>)
glucose (also annotated by the curator, but not as an isolated word)
nitrate (also annotated by the curator, isolated as <Supp> nitrate </Supp> and also as part of several supplements annotated by the curator)
M9 minimal medium (also annotated by the curator)
OD600 of 0.3 (also annotated by the curator)
Escherichia coli (we are not considering organisms)
LB medium (not annotated by the curator)
'''
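# Note: each 'transformed_sentence' cell is a space-separated run of
# token|lemma|xpos|tag items (e.g. "grown|grow|VBN|O"); the check below keeps
# only the surface token via split("|")[0] before searching for the MCO terms.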
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(prog='check_mco_terms_in_sentences_v1-py',
                                     description='Check if MCO terms appear in raw sentences from extracted sentences from softfiles.',
                                     epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to extracted sentences from softfiles", metavar="PATH")
    parser.add_argument("--inputFile", dest="inputFile",
                        help="Input extracted sentences from softfiles", metavar="FILE")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path to place MCO terms that appeared in input file", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to extracted sentences from softfiles: " + args.inputPath)
    print("Input extracted sentences from softfiles: " + args.inputFile)
    print("Path to place MCO terms that appeared in input file: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')

    df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
    print(df_sentences_to_check.head(3))
    print(df_sentences_to_check.shape)
    df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
    print(df_mco_terms.head(3))
    print(df_mco_terms.shape)

    df_mco_terms_found = pd.DataFrame(columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
    # Rebuild the plain-text sentences from the token|lemma|xpos|tag format
    text_sentences = []
    for ind in df_sentences_to_check.index:
        line_trans = df_sentences_to_check['transformed_sentence'][ind]
        list_line = line_trans.split()
        list_sentence = [tokens.split("|")[0] for tokens in list_line]
        text_sentence = " ".join(list_sentence)
        # print(text_sentence)
        if text_sentence not in text_sentences:
            text_sentences.append(text_sentence)

    # Tokenize each MCO term with stanza so term and sentence share the same tokenization
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
    # NOTE: ofile is opened but never written to; results are saved with to_csv below
    with open(os.path.join(args.outputPath, "MCO_terms_found.tsv"), mode='w') as ofile:
        for ind in df_mco_terms.index:
            term_type = df_mco_terms['TERM_TYPE'][ind]
            term_name = df_mco_terms['TERM_NAME'][ind]
            doc = nlp(term_name)
            word_list = [w.text for w in doc.sentences[0].words]
            term_name_new = " ".join(word_list)
            # print(term_name_new)
            sentences_found = [sent for sent in text_sentences if term_name_new in sent]
            for s in sentences_found:
                print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
                new_row = {'TERM_TYPE': term_type, 'TERM_NAME': term_name, 'SENTENCE': s}
                df_mco_terms_found = df_mco_terms_found.append(new_row, ignore_index=True)
    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'), sep="\t")
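A note on df_mco_terms_found.append(...): pandas.DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so this script breaks on current pandas. A minimal sketch of the same row accumulation with a plain list (the example row is hypothetical):

import pandas as pd

# Collect matches in a list and build the DataFrame once at the end.
rows = []
for term_type, term_name, sent in [("Med", "LB medium", "cells grown in LB medium")]:
    rows.append({'TERM_TYPE': term_type, 'TERM_NAME': term_name, 'SENTENCE': sent})
df_mco_terms_found = pd.DataFrame(rows, columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
print(df_mco_terms_found.shape)  # (1, 3)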
data-sets/bin/extract-sentences-from-softfiles_v1.py
...
@@ -15,11 +15,11 @@ import pandas as pd
 # Files with sentences obtained from XML Soft files
 #
 # Examples
-# python extract-sentences-from-softfiles.py
+# python extract-sentences-from-softfiles_v2.py
 # --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
 # --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
 #
-# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
...
data-sets/bin/extract-sentences-from-softfiles_v2.py
...
@@ -15,11 +15,11 @@ import pandas as pd
 # Files with sentences obtained from XML Soft files
 #
 # Examples
-# python extract-sentences-from-softfiles.py
+# python extract-sentences-from-softfiles_v2.py
 # --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
 # --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
 #
-# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
+# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
...
@@ -70,7 +70,7 @@ if __name__ == "__main__":
     tags = ['Gtype', 'Med', 'Phase', 'Supp', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
-    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
+    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
     all_tags = tags + deleted_tags
     # Regex to check if line has a tag
     regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
...
@@ -89,9 +89,11 @@ if __name__ == "__main__":
     testing_file = "GSE54899_family_retagged-05242019_validated.xml"
     # Define stanza pipeline for sentence segmentation
-    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
+    # nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
     # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
-    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
+    # Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
+    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
     # Store field_name (bangline) and field_text
     field_name = ""
...
@@ -117,140 +119,70 @@ if __name__ == "__main__":
     for path, dirs, files in os.walk(args.inputPath):
         # For each file in dir
         for file in files:
-            if file == testing_file:
-                print(" Reading file..." + str(file))
-                with open(os.path.join(args.inputPath, file)) as iFile:
-                    for line in iFile:
-                        line = line.rstrip('\n')
-                        if line.find(" = ") == -1:
-                            continue
-                        list_line = line.split(" = ")
-                        field_name = list_line[0]
-                        #print("field_name: {}".format(field_name))
-                        field_text = list_line[1]
-                        #print("field_text: {}".format(field_text))
-                        if field_name == "^SERIES":
-                            serie = field_text
-                        elif field_name == "!Series_pubmed_id":
-                            serie_pubmed_id = field_text
-                        elif field_name == "^SAMPLE":
-                            sample = field_text
-                        elif regex_has_tag.search(line):
-                            # Contains GC tag
-                            if field_name in hash_field_name:
-                                hash_field_name[field_name] += 1
-                            else:
-                                hash_field_name[field_name] = 1
-                            original_sentence = field_text
-                            # delete GC tags
-                            modified_sentence = regex_delete_tag.sub("", field_text)
-                            modified_sentence = regex_delete_tag.sub("", modified_sentence)
-                            # substitute tags
-                            # p = re.compile(r'blue (?P<animal>dog|cat)')
-                            # p.sub(r'gray \g<animal>', s)
-                            modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
-                            modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
-                            doc = nlp(modified_sentence)
-                            for i, sentence in enumerate(doc.sentences):
-                                # print(sentence.text)
-                                list_transformed_sentence = []
-                                # For GC tag
-                                gc_tag = "O"
-                                in_tag = False
-                                for word in sentence.words:
-                                    result = regex_gc_ini_tag.match(word.text)
-                                    if result:
-                                        gc_tag = result.group("tag")
-                                        in_tag = True
-                                        continue
-                                    else:
-                                        result = regex_gc_end_tag.match(word.text)
-                                        if result:
-                                            gc_tag = result.group("tag")
-                                            in_tag = True
-                                            continue
-                                        else:
-                                            if not in_tag:
-                                                gc_tag = "O"
-                                    list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
-                                transformed_sentence = " ".join(list_transformed_sentence)
-                                new_row = {'serie': serie, 'serie_pubmed_id': serie_pubmed_id, 'sample': sample,
-                                           'field_name': field_name, 'original_sentence': original_sentence,
-                                           'modified_sentence': sentence.text, 'transformed_sentence': transformed_sentence}
-                                df_sentences_to_check = df_sentences_to_check.append(new_row, ignore_index=True)
+            # if file == testing_file:
+            print(" Reading file..." + str(file))
+            with open(os.path.join(args.inputPath, file)) as iFile:
+                for line in iFile:
+                    line = line.rstrip('\n')
+                    if line.find(" = ") == -1:
+                        continue
+                    list_line = line.split(" = ")
+                    field_name = list_line[0]
+                    #print("field_name: {}".format(field_name))
+                    field_text = list_line[1]
+                    #print("field_text: {}".format(field_text))
+                    if field_name == "^SERIES":
+                        serie = field_text
+                    elif field_name == "!Series_pubmed_id":
+                        serie_pubmed_id = field_text
+                    elif field_name == "^SAMPLE":
+                        sample = field_text
+                    elif regex_has_tag.search(line):
+                        # Contains GC tag
+                        if field_name in hash_field_name:
+                            hash_field_name[field_name] += 1
+                        else:
+                            hash_field_name[field_name] = 1
+                        # original_sentence = field_text
+                        # delete GC tags
+                        modified_sentence = regex_delete_tag.sub("", field_text)
+                        modified_sentence = regex_delete_tag.sub("", modified_sentence)
+                        # substitute tags
+                        # p = re.compile(r'blue (?P<animal>dog|cat)')
+                        # p.sub(r'gray \g<animal>', s)
+                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
+                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
+                        doc = nlp(modified_sentence)
+                        for i, sentence in enumerate(doc.sentences):
+                            # print(sentence.text)
+                            list_transformed_sentence = []
+                            # For GC tag
+                            gc_tag = "O"
+                            in_tag = False
+                            for word in sentence.words:
+                                result = regex_gc_ini_tag.match(word.text)
+                                if result:
+                                    gc_tag = result.group("tag")
+                                    in_tag = True
+                                    continue
+                                else:
+                                    result = regex_gc_end_tag.match(word.text)
+                                    if result:
+                                        gc_tag = "O"
+                                        in_tag = False
+                                        continue
+                                    else:
+                                        if not in_tag:
+                                            gc_tag = "O"
+                                list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
+                            transformed_sentence = " ".join(list_transformed_sentence)
+                            original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
+                            original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
+                            new_row = {'serie': serie, 'serie_pubmed_id': serie_pubmed_id, 'sample': sample,
+                                       'field_name': field_name, 'original_sentence': original_sentence,
+                                       'modified_sentence': sentence.text, 'transformed_sentence': transformed_sentence}
+                            df_sentences_to_check = df_sentences_to_check.append(new_row, ignore_index=True)
     df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
-    #print(token)
-    quit()
-    ## End of tagging
-    out_labels = {'</Gtype>': 'O', '</Gversion>': 'O', '</Med>': 'O', '</Phase>': 'O',
-                  '</Substrain>': 'O', '</Supp>': 'O', '</Strain>': 'O', '</Technique>': 'O',
-                  '</Temp>': 'O', '</OD>': 'O', '</Anti>': 'O', '</Agit>': 'O',
-                  '</Air>': 'O', '</Vess>': 'O', '</pH>': 'O'}
-    old_labels = {'<Orgn>': 'O', '</Orgn>': 'O'}
-    # Other label
-    flag = 'O'
-    lista = []
-    # First sentence
-    sentence = ''
-    n = 0
-    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
-        for line in input_file:
-            if len(line.split('\t')) > 1:
-                w = line.split('\t')[1]
-                if w in in_labels or w in out_labels:
-                    # Tagging
-                    if w in in_labels.keys():
-                        flag = in_labels[w]
-                    if w in out_labels:
-                        flag = out_labels[w]
-                else:
-                    if w == "PGCGROWTHCONDITIONS":
-                        n = n + 1
-                        words = sentence.split(' ')
-                        # End of sentence
-                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
-                        # At least one true-tag on sentence
-                        if len(tags) > 0:
-                            lista.append(sentence)
-                        # New setence
-                        sentence = ''
-                    elif w not in old_labels.keys():
-                        # Building and save tagging sentence
-                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')
-    print("Number of sentences with at least one tag: " + str(len(lista)))
-    print("Number of sentences from CoreNLP: " + str(n))
-    # Split 70 30 training and test sentences
-    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
-    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
-    print("Number of sentences for training: " + str(len(trainingIndex)))
-    print("Number of sentences for test: " + str(len(testIndex)))
-    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
-        Data = [lista[i] for i in trainingIndex]
-        oFile.write('\n'.join(Data))
-    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
-        Data = [lista[i] for i in testIndex]
-        oFile.write('\n'.join(Data))
-    print("==================================END===================================")
...
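The largest hunk above rewrites the END-tag branch: on a closing END_<tag> token the label now resets to "O" and in_tag to False, so tokens after a closing marker stop inheriting the tag. A self-contained sketch of that propagation logic (illustrative only: the input sentence is hypothetical, and stanza tokenization is replaced by str.split() to keep it runnable):

import re

tags = ['Med', 'Supp']
regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')

sentence = "grown in INI_Med LB medium END_Med with INI_Supp nitrate END_Supp"
gc_tag, in_tag, labeled = "O", False, []
for token in sentence.split():
    if regex_gc_ini_tag.match(token):
        gc_tag = regex_gc_ini_tag.match(token).group("tag")
        in_tag = True
        continue
    if regex_gc_end_tag.match(token):
        gc_tag = "O"      # the reset behavior introduced by this commit
        in_tag = False
        continue
    if not in_tag:
        gc_tag = "O"
    labeled.append("{}|{}".format(token, gc_tag))
print(" ".join(labeled))
# grown|O in|O LB|Med medium|Med with|O nitrate|Supp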
data-sets/bin/extract-sentences-from-softfiles_v3.py
0 → 100644
import stanza
import argparse
import re
import os
import pandas as pd

# Objective
# Sentence extraction from XML Soft files. _v3 includes dictionary-based NER of MCO conditions
#
# Input parameters
# --inputPath=PATH       Path to XML Soft files
# --outputPath=PATH      Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# _v3
# python extract-sentences-from-softfiles_v3.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(prog='extract-sentences-from-softfiles',
                                     description='Sentences extraction from XML Soft files.',
                                     epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of GCs into consideration
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
    tags = {'<Gtype>': 'Gtype',
            # '<Gversion>': 'Gversion',
            '<Med>': 'Med',
            '<Phase>': 'Phase',
            # '<Substrain>': 'Substrain',
            '<Supp>': 'Supp',
            # '<Strain>': 'Strain',
            # '<Technique>': 'Technique',
            '<Temp>': 'Temp',
            '<OD>': 'OD',
            '<Anti>': 'Anti',
            '<Agit>': 'Agit',
            '<Air>': 'Air',
            '<Vess>': 'Vess',
            '<pH>': 'pH'}
    #tags = ['<Gtype>', '<Med>', '<Phase>', '<Supp>',
    #        '<Temp>', '<OD>', '<Anti>', '<Agit>',
    #        '<Air>', '<Vess>', '<pH>']
    #deleted_tags = ['<Gversion>', '<Substrain>', '<Strain>', '<Technique>']
    tags = ['Gtype', 'Med', 'Phase', 'Supp', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
    all_tags = tags + deleted_tags
    # Regex to check if line has a tag
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete tags
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regex to substitute tags
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    #p = re.compile(r'blue (?P<animal>dog|cat)')
    #p.sub(r'gray \g<animal>', s)
    # Regex to tag GCs
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Testing file: GSE54899_family_retagged-05242019_validated.xml
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # Define stanza pipeline for sentence segmentation
    # nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
    # nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
    # Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
    # Store field_name (bangline) and field_text
    field_name = ""
    field_text = ""
    # Store list of unique field_name
    hash_field_name = {}
    # Store sentences from fields that contained at least one GC tag.
    # We want to use this list for someone to check it
    df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name',
                                                  'original_sentence', 'modified_sentence', 'transformed_sentence'])
    # Store serie number
    # ^SERIES = GSE54899
    serie = ""
    # Store series pubmed id
    # !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # Store sample
    # ^SAMPLE = GSM1326335
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            # if file == testing_file:
            print(" Reading file..." + str(file))
            with open(os.path.join(args.inputPath, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    if line.find(" = ") == -1:
                        continue
                    list_line = line.split(" = ")
                    field_name = list_line[0]
                    #print("field_name: {}".format(field_name))
                    field_text = list_line[1]
                    #print("field_text: {}".format(field_text))
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):
                        # Contains GC tag
                        if field_name in hash_field_name:
                            hash_field_name[field_name] += 1
                        else:
                            hash_field_name[field_name] = 1
                        # original_sentence = field_text
                        # delete GC tags
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        modified_sentence = regex_delete_tag.sub("", modified_sentence)
                        # substitute tags
                        # p = re.compile(r'blue (?P<animal>dog|cat)')
                        # p.sub(r'gray \g<animal>', s)
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for i, sentence in enumerate(doc.sentences):
                            # print(sentence.text)
                            list_transformed_sentence = []
                            # For GC tag
                            gc_tag = "O"
                            in_tag = False
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                else:
                                    result = regex_gc_end_tag.match(word.text)
                                    if result:
                                        gc_tag = "O"
                                        in_tag = False
                                        continue
                                    else:
                                        if not in_tag:
                                            gc_tag = "O"
                                list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
                            original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                            new_row = {'serie': serie, 'serie_pubmed_id': serie_pubmed_id, 'sample': sample,
                                       'field_name': field_name, 'original_sentence': original_sentence,
                                       'modified_sentence': sentence.text, 'transformed_sentence': transformed_sentence}
                            df_sentences_to_check = df_sentences_to_check.append(new_row, ignore_index=True)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
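_v3 rebuilds original_sentence from the segmented text by mapping the INI_/END_ markers back to angle-bracket tags; note that --inputPathMco and --inputFileMco are parsed and printed but not used yet in this version. A minimal round-trip sketch with a hypothetical sentence:

import re

tags = ['Med', 'Supp']
regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')

s = "grown in INI_Med LB medium END_Med"
s = regex_gc_ini_tag.sub(r'<\g<tag>>', s)   # INI_Med -> <Med>
s = regex_gc_end_tag.sub(r'</\g<tag>>', s)  # END_Med -> </Med>
print(s)  # grown in <Med> LB medium </Med>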
data-sets/bin/mco_terms.csv
0 → 100644
This diff could not be displayed because it is too large.
data-sets/bin/transform_sentences_to_check_to_XML.py
0 → 100644
import pandas as pd
import os

def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
    df_sentences_to_check = pd.read_csv(os.path.join(inputPath, inputFile))
    df_sentences_to_check.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
    df_sentences_to_check = df_sentences_to_check.sort_values(by=['original_sentence'])
    print(df_sentences_to_check.head(5))
    with open(os.path.join(outputPath, outputFile), mode='w') as ofile:
        ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
        for ind in df_sentences_to_check.index:
            line = '<row id="{}">\n'.format(df_sentences_to_check['row'][ind])
            line = line + "\t<serie>{}</serie>\n".format(df_sentences_to_check['serie'][ind])
            line = line + "\t<serie_pubmed_id>{}</serie_pubmed_id>\n".format(df_sentences_to_check['serie_pubmed_id'][ind])
            line = line + "\t<sample>{}</sample>\n".format(df_sentences_to_check['sample'][ind])
            line = line + "\t<field_name>{}</field_name>\n".format(df_sentences_to_check['field_name'][ind])
            line = line + "\t<original_sentence>{}</original_sentence>\n".format(df_sentences_to_check['original_sentence'][ind])
            line = line + "\t<corrected_sentence>{}</corrected_sentence>\n".format(df_sentences_to_check['original_sentence'][ind])
            line = line + "</row>\n"
            ofile.write(line)
        ofile.write('</gcs_to_check>\n')

transform_sentence_to_check_to_XML(
    inputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
    outputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
    inputFile='geo_sentences_to_check_fixed.csv',
    #inputFile='geo_sentences_to_check.csv',
    outputFile='geo_sentences_to_check_fixed.xml')
\ No newline at end of file
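The writer above interpolates sentence text straight into XML, so sentences containing &, <, or > would yield malformed markup. A sketch of escaping each field with the standard library before formatting (hypothetical input):

from xml.sax.saxutils import escape

# escape() maps &, <, > to XML entities; apply it to each value before .format().
print(escape('OD600 of 0.3 & <Med> LB medium </Med>'))
# OD600 of 0.3 &amp; &lt;Med&gt; LB medium &lt;/Med&gt;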