Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
automatic-extraction-growth-conditions
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Authored by
Estefani Gaytan Nunez
2020-09-01 16:27:56 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
07d3119f885e2b761f8d0c52ea9cc8ce33820009
07d3119f
1 parent
582f6ed0
upload
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
105 additions
and
124 deletions
predict-annot/bin/tagging/tagging.py
predict-annot/output/annot-input_bg_outputII.txt
predict-annot/reports/output_tagging_report.txt
predict-annot/bin/tagging/tagging.py
View file @
07d3119
# -*- coding: UTF-8 -*-
import
os
import
re
from
pandas
import
DataFrame
as
DF
from
optparse
import
OptionParser
from
time
import
time
...
...
@@ -29,25 +30,37 @@ import training_validation_v14 as training
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# --inputPath=PATH Path of transformed files x|y|z
# --outputPath Output path to place output files
# --outputFileI Output tagged file I
# --outputFileII Output tagged file II
# --modelPath Path to CRF model
# --modelName Model name
# --infoPath Path of GSE-GSM index file
# --infoFile GSE-GSM index file
# --variant Part of S2 variant
# --S1 General features
# --S2 Inner/Complete word features
# --S3 Extended context features
# --S4 Semantic features
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# python3 tagging.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --filterSymbols
# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI annot-input_bg_outputI.txt
# --outputFileII annot-input_bg_outputII.txt
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile bg_sentences_midx.txt
# --variant 13
#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
__author__
=
'egaytan'
...
...
@@ -60,9 +73,13 @@ if __name__ == "__main__":
parser
=
OptionParser
()
parser
.
add_option
(
"--inputPath"
,
dest
=
"inputPath"
,
help
=
"Path of training data set"
,
metavar
=
"PATH"
)
parser
.
add_option
(
"--outputPath"
,
dest
=
"outputPath"
,
help
=
"Output path to place output files"
,
metavar
=
"PATH"
)
parser
.
add_option
(
"--outputFileI"
,
dest
=
"outFileI"
,
help
=
"Output tagged file I"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"--outputFileII"
,
dest
=
"outFileII"
,
help
=
"Output tagged file II"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"--modelPath"
,
dest
=
"modelPath"
,
help
=
"Path to read CRF model"
,
metavar
=
"PATH"
)
parser
.
add_option
(
"--modelName"
,
dest
=
"modelName"
,
help
=
"Model name"
,
metavar
=
"TEXT"
)
parser
.
add_option
(
"--variant"
,
dest
=
"variant"
,
help
=
"Report file"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"--infoPath"
,
dest
=
"infoPath"
,
help
=
"Path of GSE-GSM index file"
,
metavar
=
"PATH"
)
parser
.
add_option
(
"--infoFile"
,
dest
=
"idx"
,
help
=
"GSE-GSM index file"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"--variant"
,
dest
=
"variant"
,
help
=
"Run variant"
,
metavar
=
"FILE"
)
parser
.
add_option
(
"--S1"
,
dest
=
"S1"
,
help
=
"General features"
,
action
=
"store_true"
,
default
=
False
)
parser
.
add_option
(
"--S2"
,
dest
=
"S2"
,
help
=
"Inner/Complete word features"
,
action
=
"store_true"
,
default
=
False
)
parser
.
add_option
(
"--S3"
,
dest
=
"S3"
,
help
=
"Extended context features"
,
action
=
"store_true"
,
default
=
False
)
...
...
@@ -75,14 +92,25 @@ if __name__ == "__main__":
parser
.
error
(
"Any parameter given."
)
sys
.
exit
(
1
)
print
(
'-------------------------------- PARAMETERS --------------------------------'
)
print
(
"Path to read input files: "
+
options
.
inputPath
)
print
(
"Mode name: "
+
str
(
options
.
modelName
))
print
(
"Model path: "
+
options
.
modelPath
)
print
(
"Path to place output files: "
+
options
.
outputPath
)
print
(
"Filtering stop words: "
+
str
(
options
.
filterStopWords
))
print
(
"Levels: "
+
"S1: "
+
str
(
options
.
S1
)
+
"S2: "
+
str
(
options
.
S2
)
+
"S3: "
+
str
(
options
.
S3
)
+
"S4: "
+
str
(
options
.
S4
))
print
(
"Run variant: "
+
str
(
options
.
variant
))
print
(
"--inputPath Path of training data set : "
+
str
(
options
.
inputPath
))
print
(
"--outputPath Output path to place output files: "
+
str
(
options
.
outputPath
))
print
(
"--outputFileI Output tagged file I : "
+
str
(
options
.
outFileI
))
print
(
"--outputFileII Output tagged file II : "
+
str
(
options
.
outFileII
))
print
(
"--modelPath Path to read CRF model : "
+
str
(
options
.
modelPath
))
print
(
"--modelName Model name : "
+
str
(
options
.
modelName
))
print
(
"--infoPath Path of GSE-GSM index file : "
+
str
(
options
.
infoPath
))
print
(
"--infoFile GSE-GSM index file : "
+
str
(
options
.
idx
))
print
(
"--variant Run variant : "
+
str
(
options
.
variant
))
print
(
"--S1 General features : "
+
str
(
options
.
S1
))
print
(
"--S2 Inner/Complete word features : "
+
str
(
options
.
S2
))
print
(
"--S3 Extended context features : "
+
str
(
options
.
S3
))
print
(
"--S4 Semantic features : "
+
str
(
options
.
S4
))
print
(
"--filteringStopWords Filtering stop words : "
+
str
(
options
.
filterStopWords
))
print
(
"--filterSymbols Filtering punctuation marks : "
+
str
(
options
.
filterSymbols
))
symbols
=
[
'.'
,
','
,
':'
,
';'
,
'?'
,
'!'
,
'
\'
'
,
'"'
,
'<'
,
'>'
,
'('
,
')'
,
'-'
,
'_'
,
'/'
,
'
\\
'
,
'¿'
,
'¡'
,
'+'
,
'{'
,
'}'
,
'['
,
']'
,
'*'
,
'
%
'
,
'$'
,
'#'
,
'&'
,
'°'
,
'`'
,
'...'
]
...
...
@@ -92,7 +120,9 @@ if __name__ == "__main__":
print
(
'-------------------------------- PROCESSING --------------------------------'
)
stopwords
=
[
word
for
word
in
stopwords
.
words
(
'english'
)]
# Read index
idx
=
open
(
os
.
path
.
join
(
options
.
infoPath
,
options
.
idx
),
"r"
)
.
readlines
()
# Read CRF model
t0
=
time
()
print
(
'Reading CRF model...'
)
...
...
@@ -108,8 +138,9 @@ if __name__ == "__main__":
# For each file in dir
for
file
in
files
:
print
(
"Preprocessing file..."
+
str
(
file
))
sentencesInputData
=
[]
sentencesOutputData
=
[]
sentencesInputData
=
[]
sentencesOutputDataI
=
[]
sentencesOutputDataII
=
[]
with
open
(
os
.
path
.
join
(
options
.
inputPath
,
file
),
"r"
)
as
iFile
:
lines
=
iFile
.
readlines
()
for
line
in
lines
:
...
...
@@ -142,10 +173,12 @@ if __name__ == "__main__":
# Tagging with CRF model
print
(
"Tagging file"
)
lidx
=
0
for
line
,
tagLine
in
zip
(
lines
,
y_pred
):
Ltags
=
set
(
labels
)
.
intersection
(
set
(
tagLine
))
outputLine
=
''
line
=
line
.
strip
(
'
\n
'
)
line
=
line
.
strip
(
'
\n
'
)
#print("\nLine: " + str(line))
#print ("CRF tagged line: " + str(tagLine))
tb
=
'O'
...
...
@@ -153,20 +186,25 @@ if __name__ == "__main__":
if
len
(
tagLine
)
==
1
:
if
tagLine
[
0
]
in
labels
:
start
=
'<'
+
tagLine
[
0
]
+
'> '
end
=
'<'
+
tagLine
[
0
]
+
'/>'
word
=
line
.
split
(
'|'
)[
0
]
+
' '
end
=
'<
/
'
+
tagLine
[
0
]
+
'/>'
word
=
line
.
split
(
'|'
)[
0
]
+
' '
outputLine
=
start
+
word
+
end
else
:
outputLine
=
line
.
split
(
' '
)[
0
]
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData
.
append
([
outputLine
,
', '
.
join
(
Ltags
)])
sentencesOutputDataI
.
append
([
outputLine
,
', '
.
join
(
Ltags
)])
sentencesOutputDataII
.
append
(
idx
[
lidx
]
.
replace
(
'
\n
'
,
'
\t
'
)
+
word
.
split
(
'|'
)[
0
]
+
'
\t
'
+
tag
)
continue
sentence
=
''
sb
=
False
for
word
,
tag
in
zip
(
line
.
split
(
' '
),
tagLine
):
# start tagging
if
tag
in
labels
and
tb
==
'O'
:
if
tag
in
labels
and
tb
!=
tag
:
# start tagging
outputLine
+=
'<'
+
tag
+
'> '
sb
=
True
sentence
=
word
.
split
(
'|'
)[
0
]
+
' '
tb
=
tag
outputLine
+=
word
.
split
(
'|'
)[
0
]
+
' '
i
+=
1
...
...
@@ -174,40 +212,38 @@ if __name__ == "__main__":
# end tagging
elif
tb
in
labels
:
if
i
+
1
==
len
(
tagLine
):
# end
tagging
# end
sentence
outputLine
+=
word
.
split
(
'|'
)[
0
]
+
' '
outputLine
+=
'<'
+
tag
+
'/> '
outputLine
+=
'</'
+
tag
+
'/> '
sentencesOutputDataII
.
append
(
idx
[
lidx
]
.
replace
(
'
\n
'
,
'
\t
'
)
+
sentence
+
word
.
split
(
'|'
)[
0
]
+
'
\t
'
+
tag
)
sb
=
False
tb
=
'O'
i
+=
1
continue
elif
tag
Line
[
i
+
1
]
==
'O'
:
#
end taggin
g
elif
tag
!=
tagLine
[
i
+
1
]
:
#
start new ta
g
outputLine
+=
word
.
split
(
'|'
)[
0
]
+
' '
outputLine
+=
'<'
+
tag
+
'/> '
outputLine
+=
'</'
+
tag
+
'/> '
sentencesOutputDataII
.
append
(
idx
[
lidx
]
.
replace
(
'
\n
'
,
'
\t
'
)
+
sentence
+
word
.
split
(
'|'
)[
0
]
+
'
\t
'
+
tag
)
sb
=
False
tb
=
'O'
i
+=
1
continue
# word tagged
outputLine
+=
word
.
split
(
'|'
)[
0
]
+
' '
i
+=
1
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData
.
append
([
outputLine
,
', '
.
join
(
Ltags
)])
if
sb
:
sentence
+=
word
.
split
(
'|'
)[
0
]
+
' '
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputDataI
.
append
([
outputLine
,
', '
.
join
(
Ltags
)])
lidx
+=
1
print
(
DF
(
sentencesOutputData
)
)
#print( DF(sentencesOutputDataI
) )
#print( '\n'.join(sentencesOutputDataII) )
# Save tags
'''
with open(os.path.join(options.outputPath, file), "w") as oFile
:
for line in sentencesOutputData:
with
open
(
os
.
path
.
join
(
options
.
outputPath
,
options
.
outFileII
),
"w"
)
as
oFile
:
for
line
in
sentencesOutputDataII
:
#print(line)
oFile
.
write
(
line
+
'
\n
'
)
print
(
"Processing corpus done in:
%
fs"
%
(
time
()
-
t0
))
'''
...
...
predict-annot/output/annot-input_bg_outputII.txt
0 → 100644
View file @
07d3119
This diff could not be displayed because it is too large.
predict-annot/reports/output_tagging_report.txt
View file @
07d3119
-------------------------------- PARAMETERS --------------------------------
Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
Filtering stop words: False
Levels: S1: FalseS2: FalseS3: FalseS4: False
Run variant: None
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI.txt
--outputFileII Output tagged file II : annot-input_bg_outputII.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.0083
42
s
Reading CRF model done in: 0.0083
36
s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in:
0.983480
s
Prediction done in:
1.688127
s
Tagging file
0 1
0 <Gtype> antibody : Flag <Gtype/> Gtype
1 <Gversion> ChIP-Seq <Gversion/> Gversion
2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
3 <Gtype> developmental stage : mixed population... Gtype
4 DNA was isolated using the Qiagen Cell Lysis a...
5 Escherichia coli
6 Escherichia coli AB1157
7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
9 Genome _ build : NC _ 000913.3
10 Genome _ build : NC _ 011916.1
11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
... ... ...
14686 <Phase> ESBL019 Coliform <Phase/> Phase
14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
14688 ESBL019 Reverted
14689 <Phase> ESBL019 Transition <Phase/> Phase
14690 Escherichia coli
14691 Four morphologic states of ESBL019 were used d...
14692 <Gtype> morphology : Coliform <Gtype/> Gtype
14693 <Gtype> morphology : Filamented <Gtype/> Gtype
14694 morphology : Reverted -LRB- reverted back from...
14695 morphology : Transition -LRB- from Coli into F...
14696 RNA isolation was performed using an RNeasy mi...
14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
14698 The E. coli isolate ESBL019 was originally iso...
14699 Escherichia coli
14700 lexA 10 ' after UV vs. 0 ' , MG1655
14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
14703 lexA 20 ' after UV vs. 0 ' , MG1655
14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
14706 lexA 40 ' after UV vs. 0 ' , MG1655
14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
14708 lexA 5 ' after UV vs. 0 ' , MG1655
14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
14711 lexA 60 ' after UV vs. 0 ' , MG1655
14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
14714 lexA vs. wt , before UV treatment , MG1655
14715 untreated cells , 25 ug total RNA
[14716 rows x 2 columns]
Processing corpus done in: 3.948320s
...
...
Please
register
or
login
to post a comment