upload

Estefani Gaytan Nunez
Commit 07d3119f885e2b761f8d0c52ea9cc8ce33820009 07d3119f 1 parent 582f6ed0
Showing 3 changed files with 105 additions and 124 deletions
predict-annot/bin/tagging/tagging.py
predict-annot/output/annot-input_bg_outputII.txt
predict-annot/reports/output_tagging_report.txt
--- a/predict-annot/bin/tagging/tagging.py
View file @07d3119
+++ b/predict-annot/bin/tagging/tagging.py
View file @07d3119
 # -*- coding: UTF-8 -*-
 
 import os
+ import re
 from pandas import DataFrame as DF
 from optparse import OptionParser
 from time import time
@@ -29,25 +30,37 @@ import training_validation_v14 as training
 # Tagging transformed file with CRF model with sklearn-crfsuite.
 #
 # Input parameters
- # --inputPath=PATH      Path of transformed files x|y|z
- # --modelPath           Path to CRF model
- # --modelName           Model name
- # --outputPath=PATH     Output path to place output files
- # --filteringStopWords  Filtering stop words
- # --filterSymbols       Filtering punctuation marks
+ # --inputPath=PATH          Path of transformed files x|y|z
+ # --outputPath              Output path to place output files
+ # --outputFileI             Output tagged file I
+ # --outputFileII            Output tagged file II
+ # --modelPath               Path to CRF model
+ # --modelName               Model name
+ # --infoPath                Path of GSE-GSM index file	
+ # --infoFile	            GSE-GSM index file
+ # --variant	                Part of S2 variant
+ # --S1                      Inner word features set
+ # --S2                      Complete word features
+ # --S3                      Extended context features
+ # --S4                      Semantic features
+ # --filteringStopWords      Filtering stop words    
+ # --filterSymbols           Filtering punctuation marks
 
 # Output
 # 1) Tagged files in transformed format
 
 # Examples
- # python3 tagging.py
- # --inputPath           /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
- # --modelName           model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod
- # --modelPath           /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
- # --outputPath          /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
- # --filterSymbols
- 
- # python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
+ # --inputPath		/home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+ # --outputPath	    /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+ # --outputFileI     annot-input_bg_outputI.txt
+ # --outputFileII    annot-input_bg_outputII.txt
+ # --modelPath		/home/egaytan/automatic-extraction-growth-conditions/CRF/models
+ # --modelName		model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+ # --infoPath		/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
+ # --infoFile		bg_sentences_midx.txt
+ # --variant		    13 
+ 
+ #python3 tagging.py  --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/  --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/  --outputFileI  annot-input_bg_outputI.txt  --outputFileII  annot-input_bg_outputII.txt  --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models  --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10  --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping  --infoFile bg_sentences_midx.txt  --variant 13   --S4   --S1 > ../../reports/output_tagging_report.txt
 
 __author__ = 'egaytan'
 
@@ -60,9 +73,13 @@ if __name__ == "__main__":
     parser = OptionParser()
     parser.add_option("--inputPath",       dest="inputPath",       help="Path of training data set",         metavar="PATH")
     parser.add_option("--outputPath",      dest="outputPath",      help="Output path to place output files", metavar="PATH")
+     parser.add_option("--outputFileI",     dest="outFileI",        help="Output tagged file I",              metavar="FILE")
+     parser.add_option("--outputFileII",    dest="outFileII",       help="Output tagged file II",             metavar="FILE")
     parser.add_option("--modelPath",       dest="modelPath",       help="Path to read CRF model",            metavar="PATH")
     parser.add_option("--modelName",       dest="modelName",       help="Model name",                        metavar="TEXT")
-     parser.add_option("--variant",         dest="variant",         help="Report file",                       metavar="FILE")
+     parser.add_option("--infoPath",        dest="infoPath",        help="Path of GSE-GSM index file",        metavar="PATH")
+     parser.add_option("--infoFile",        dest="idx",             help="GSE-GSM index file",                metavar="FILE")
+     parser.add_option("--variant",         dest="variant",         help="Run variant",                       metavar="FILE")
     parser.add_option("--S1",              dest="S1",              help="General features",                  action="store_true", default=False)
     parser.add_option("--S2",              dest="S2",              help="Inner/Complete word features",      action="store_true", default=False)
     parser.add_option("--S3",              dest="S3",              help="Extended context features",         action="store_true", default=False)
@@ -75,14 +92,25 @@ if __name__ == "__main__":
         parser.error("Any parameter given.")
         sys.exit(1)
 
+     
     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path to read input files: " + options.inputPath)
-     print("Mode name: " + str(options.modelName))
-     print("Model path: " + options.modelPath)
-     print("Path to place output files: " + options.outputPath)
-     print("Filtering stop words: " + str(options.filterStopWords))
-     print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
-     print("Run variant: " + str(options.variant))
+     
+     print("--inputPath          Path of training data set        : " + str(options.inputPath        ))
+     print("--outputPath         Output path to place output files: " + str(options.outputPath       ))
+     print("--outputFileI        Output tagged file I             : " + str(options.outFileI         ))
+     print("--outputFileII       Output tagged file II            : " + str(options.outFileII        ))
+     print("--modelPath          Path to read CRF model           : " + str(options.modelPath        ))
+     print("--modelName          Model name                       : " + str(options.modelName        ))
+     print("--infoPath           Path of GSE-GSM index file       : " + str(options.infoPath         ))
+     print("--infoFile	        GSE-GSM index file               : " + str(options.idx              ))
+     print("--variant	        Run variant                      : " + str(options.variant          ))
+     print("--S1                 General features                 : " + str(options.S1               ))
+     print("--S2                 Inner/Complete word features     : " + str(options.S2               ))
+     print("--S3                 Extended context features        : " + str(options.S3               ))
+     print("--S4                 Semantic features                : " + str(options.S4               ))
+     print("--filteringStopWords Filtering stop words             : " + str(options.filterStopWords  ))
+     print("--filterSymbols      Filtering punctuation marks      : " + str(options.filterSymbols    ))
+            
     
     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
                '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
@@ -92,7 +120,9 @@ if __name__ == "__main__":
     print('-------------------------------- PROCESSING --------------------------------')
 
     stopwords = [word for word in stopwords.words('english')]
- 
+     # Read index
+     idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
+     
     # Read CRF model
     t0 = time()
     print('Reading CRF model...')    
@@ -108,8 +138,9 @@ if __name__ == "__main__":
         # For each file in dir
         for file in files:
             print("Preprocessing file..." + str(file))
-             sentencesInputData = []
-             sentencesOutputData = []
+             sentencesInputData  = []
+             sentencesOutputDataI = []
+             sentencesOutputDataII = []
             with open(os.path.join(options.inputPath, file), "r") as iFile:
                 lines = iFile.readlines()
                 for line in lines:
@@ -142,10 +173,12 @@ if __name__ == "__main__":
                 
                 # Tagging with CRF model
                 print("Tagging file")
+                 lidx = 0
                 for line, tagLine in zip(lines, y_pred):
                     Ltags = set(labels).intersection(set(tagLine))
                     outputLine = ''                    
-                     line = line.strip('\n')                   
+                     line = line.strip('\n')
+                     
                     #print("\nLine: " + str(line))
                     #print ("CRF tagged line: " + str(tagLine))
                     tb = 'O'
@@ -153,20 +186,25 @@ if __name__ == "__main__":
                     if len(tagLine)==1:
                         if tagLine[0] in labels: 
                             start = '<' + tagLine[0] + '> '
-                             end   = '<' + tagLine[0] + '/>'
-                             word  = line.split('|')[0] + ' '                            
+                             end   = '</' + tagLine[0] + '/>'
+                             word  = line.split('|')[0] + ' '
                             outputLine = start + word + end
                         else:                             
                             outputLine = line.split(' ')[0]
                         #print(outputLine + '\t' + ', '.join(Ltags))
-                         sentencesOutputData.append([outputLine, ', '.join(Ltags)])
+                         sentencesOutputDataI.append([outputLine, ', '.join(Ltags)])
+                         sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
                         continue
                         
+                     sentence = ''
+                     sb = False
                     for word,tag in zip(line.split(' '), tagLine):
                         # start tagging
-                         if tag in labels and tb == 'O':
+                         if tag in labels and tb != tag:
                             # start tagging
                             outputLine += '<' + tag + '> '
+                             sb = True
+                             sentence = word.split('|')[0] + ' '
                             tb = tag
                             outputLine += word.split('|')[0] + ' '
                             i += 1
@@ -174,40 +212,38 @@ if __name__ == "__main__":
                         # end tagging
                         elif tb in labels:
                             if i+1==len(tagLine):
-                                 # end tagging
+                                 # end sentence
                                 outputLine += word.split('|')[0] + ' '
-                                 outputLine += '<' + tag + '/> '
+                                 outputLine += '</' + tag + '/> '
+                                 sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
+                                 sb = False
                                 tb = 'O'
                                 i += 1
                                 continue
-                             elif tagLine[i+1]=='O':
-                                 # end tagging
+                             elif tag!=tagLine[i+1]:
+                                 # end tagging: the next token has a different tag
                                 outputLine += word.split('|')[0] + ' '
-                                 outputLine += '<' + tag + '/> '
+                                 outputLine += '</' + tag + '/> '
+                                 sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
+                                 sb = False
                                 tb = 'O'
                                 i += 1
                                 continue
                         # word tagged
                         outputLine += word.split('|')[0] + ' '
                         i += 1
-                     #print(outputLine + '\t' + ', '.join(Ltags))
-                     sentencesOutputData.append([outputLine, ', '.join(Ltags)])
+                         if sb:
+                             sentence+= word.split('|')[0] + ' '
+                     #print(outputLine + '\t' + ', '.join(Ltags))                  
+                     sentencesOutputDataI.append([outputLine, ', '.join(Ltags)])
+                     lidx += 1
                     
-             print( DF(sentencesOutputData) )
-         
+             #print( DF(sentencesOutputDataI) )
+             #print( '\n'.join(sentencesOutputDataII) )        
             # Save tags
-             '''
-             with open(os.path.join(options.outputPath, file), "w") as oFile:
-                 for line in sentencesOutputData:
+             with open(os.path.join(options.outputPath, options.outFileII), "w") as oFile:
+                 for line in sentencesOutputDataII:
+                     #print(line)
                     oFile.write(line + '\n')
 
     print("Processing corpus done in: %fs" % (time() - t0))
- '''
- 
- 
- 
- 
- 
- 
- 
- 
--- a/predict-annot/output/annot-input_bg_outputII.txt 0 → 100644
View file @07d3119
+++ b/predict-annot/output/annot-input_bg_outputII.txt 0 → 100644
View file @07d3119
--- a/predict-annot/reports/output_tagging_report.txt
View file @07d3119
+++ b/predict-annot/reports/output_tagging_report.txt
View file @07d3119
 -------------------------------- PARAMETERS --------------------------------
- Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
- Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
- Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
- Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
- Filtering stop words: False
- Levels: S1: FalseS2: FalseS3: FalseS4: False
- Run variant: None
+ --inputPath          Path of training data set        : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+ --outputPath         Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+ --outputFileI        Output tagged file I             : annot-input_bg_outputI.txt
+ --outputFileII       Output tagged file II            : annot-input_bg_outputII.txt
+ --modelPath          Path to read CRF model           : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
+ --modelName          Model name                       : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+ --infoPath           Path of GSE-GSM index file       : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
+ --infoFile	        GSE-GSM index file               : bg_sentences_midx.txt
+ --variant	        Run variant                      : 13
+ --S1                 General features                 : True
+ --S2                 Inner/Complete word features     : False
+ --S3                 Extended context features        : False
+ --S4                 Semantic features                : True
+ --filteringStopWords Filtering stop words             : False
+ --filterSymbols      Filtering punctuation marks      : False
 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
 -------------------------------- PROCESSING --------------------------------
 Reading CRF model...
- Reading CRF model done in: 0.008342s
+ Reading CRF model done in: 0.008336s
 Processing corpus...
 Preprocessing file...annot-input_bg_v3.txt
 Sentences input data: 14716
 Predicting tags with model
- Prediction done in: 0.983480s
+ Prediction done in: 1.688127s
 Tagging file
-                                                        0         1
- 0                      <Gtype> antibody : Flag <Gtype/>      Gtype
- 1                        <Gversion> ChIP-Seq <Gversion/>  Gversion
- 2      Cultures of Caulobacter -LRB- TLS1631-TLS1633 ...     Gtype
- 3      <Gtype> developmental stage : mixed population...     Gtype
- 4      DNA was isolated using the Qiagen Cell Lysis a...          
- 5                                      Escherichia coli           
- 6                               Escherichia coli AB1157           
- 7      For analysis of ChIP-seq data , Hiseq 2500 Ill...          
- 8      For analysis of IDAP-seq data , Hiseq 2500 Ill...     Gtype
- 9                        Genome _ build : NC _ 000913.3           
- 10                       Genome _ build : NC _ 011916.1           
- 11     <Gtype> genotype : AB1157 ybbD : : parS scramb...     Gtype
- 12     <Gtype> genotype : AB1157 ybbD : : parS scramb...     Gtype
- 13     <Gtype> genotype : AB1157 ybbD : : parS site 1...     Gtype
- 14     <Gtype> genotype : AB1157 ybbD : : parS site 2...     Gtype
- 15     <Gtype> genotype : AB1157 ybbD : : parS site 2...     Gtype
- 16     <Gtype> genotype : AB1157 ybbD : : parS site 3...     Gtype
- 17     <Gtype> genotype : AB1157 ybbD : : parS site 3...     Gtype
- 18     <Gtype> genotype : AB1157 ybbD : : parS site 4...     Gtype
- 19     <Gtype> genotype : AB1157 ybbD : : parS site 4...     Gtype
- 20     <Gtype> genotype : AB1157 ybbD : : parS site 5...     Gtype
- 21     <Gtype> genotype : AB1157 ybbD : : parS site 5...     Gtype
- 22     <Gtype> genotype : AB1157 ybbD : : parS site 6...     Gtype
- 23     <Gtype> genotype : AB1157 ybbD : : parS site 7...     Gtype
- 24     <Gtype> genotype : AB1157 ybbD : : parS site 7...     Gtype
- 25     Hiseq 2500 Illumina short reads -LRB- 50 bp -R...          
- 26           LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG           
- 27           LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG           
- 28           LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG           
- 29           LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG           
- ...                                                  ...       ...
- 14686                 <Phase> ESBL019 Coliform <Phase/>      Phase
- 14687               <Gtype> ESBL019 Filamented <Gtype/>      Gtype
- 14688                                  ESBL019 Reverted           
- 14689               <Phase> ESBL019 Transition <Phase/>      Phase
- 14690                                  Escherichia coli           
- 14691  Four morphologic states of ESBL019 were used d...          
- 14692            <Gtype> morphology : Coliform <Gtype/>      Gtype
- 14693          <Gtype> morphology : Filamented <Gtype/>      Gtype
- 14694  morphology : Reverted -LRB- reverted back from...          
- 14695  morphology : Transition -LRB- from Coli into F...          
- 14696  RNA isolation was performed using an RNeasy mi...          
- 14697  <Gtype> strain : beta-lactamase -LRB- ESBL -RR...     Gtype
- 14698  The E. coli isolate ESBL019 was originally iso...          
- 14699                                  Escherichia coli           
- 14700               lexA 10 ' after UV vs. 0 ' , MG1655           
- 14701  <Gtype> lexA 10 min after UV treatment , 25 ug...     Gtype
- 14702             lexA 20 ' after NOuv vs. 0 ' , MG1655           
- 14703               lexA 20 ' after UV vs. 0 ' , MG1655           
- 14704  lexA 20 min after NOuv , 25 ug total RNA , 2 u...          
- 14705  <Gtype> lexA 20 min after UV treatment , 25 ug...     Gtype
- 14706               lexA 40 ' after UV vs. 0 ' , MG1655           
- 14707  <Gtype> lexA 40 min after UV treatment , 25 ug...     Gtype
- 14708                lexA 5 ' after UV vs. 0 ' , MG1655           
- 14709  <Gtype> lexA 5 min after UV treatment , 25 ug ...     Gtype
- 14710             lexA 60 ' after NOuv vs. 0 ' , MG1655           
- 14711               lexA 60 ' after UV vs. 0 ' , MG1655           
- 14712  lexA 60 min after NOuv , 25 ug total RNA , 2 u...          
- 14713  <Gtype> lexA 60 min after UV treatment , 25 ug...     Gtype
- 14714        lexA vs. wt , before UV treatment , MG1655           
- 14715                 untreated cells , 25 ug total RNA           
- 
- [14716 rows x 2 columns]
+ Processing corpus done in: 3.948320s