Estefani Gaytan Nunez

upload

@@ -9,9 +9,11 @@ import random
 #
 # Input parameters
 # --inputPath=PATH    Path of input file
+# --inputFile=FILE    CoreNLP output file with tagged sentences
 # --outputPath=PATH   Path to place output files
 # --trainingFile=FILE Output training data set
 # --testFile=FILE     Output test data set
+# --index=N           Limit of CoreNLP output columns to include
 #
 # Output
 # training and test data set
@@ -23,7 +25,7 @@ import random
 # --trainingFile training-data-set-70_v4.txt
 # --testFile test-data-set-30_v4.txt
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
-#
+# --index 5
 #
 # python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
 
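
For reference, a minimal sketch of the sentence-level 70/30 split this script is expected to perform (a hypothetical helper, grounded only in the script's `import random` and its documented outputs; the committed implementation may differ in details such as seeding):

    import random

    def split_sentences(sentences, ratio=0.7, seed=42):
        # Shuffle whole sentences so the tokens of one sentence stay together,
        # then cut at the given ratio (70% training / 30% test).
        shuffled = sentences[:]
        random.seed(seed)
        random.shuffle(shuffled)
        cut = int(len(shuffled) * ratio)
        return shuffled[:cut], shuffled[cut:]

    # train, test = split_sentences(sentences)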
@@ -11,6 +11,7 @@ from optparse import OptionParser
 # --outputFile=FILE   Output data set
 # --minWordLen        Minimum word length
 # --minSenLen         Minimum sentence length
+# --index             Limit of CoreNLP output columns to include
 #
 # Output
 # Tagged sentences reconstruction
@@ -23,6 +24,7 @@ from optparse import OptionParser
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
 # --minWordLen 2
 # --minSenLen 1
+# --index 5
 #
 # python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5
 
@@ -39,7 +41,7 @@ if __name__ == "__main__":
     parser.add_option("--outputFile", dest="outputFile", help="Output data set", metavar="FILE")
     parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
     parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int")
-
+    parser.add_option("--index", dest="index", help="Limit of CoreNLP output columns to include", metavar="N", type="int")
 
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -58,23 +60,26 @@ if __name__ == "__main__":
     lista = []
     # First sentence
     sentence = ''
+    # Sentence counter
+    i = 0
     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
         for line in input_file:
             if len(line.split('\t')) > 1:
                 w = line.split('\t')[1]
                 if w == "PGCGROWTHCONDITIONS":
+                    i = i + 1
                     if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
-                        print( "EXCLUDE: " + sentence.lstrip() )
+                        print( "EXCLUDE: line " + str(i) + ": " + sentence.lstrip() )
                     else:
                         # End of sentence
                         lista.append(sentence.lstrip())
                         # Count kept sentence
                         n = n+1
                         # New sentence
-                        sentence = ''
+                        sentence = ''
                 else:
                     # Build and save the tagged sentence
-                    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
+                    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]))
 
     print("Number of sentences: " + str(n))
 
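
The --index option added above caps how many CoreNLP CoNLL columns are joined into each word|lemma|POS|NER token. A minimal sketch of that slicing on one illustrative tab-separated line (the sample line is made up, not from the corpus):

    # CoNLL-style columns: token index, word, lemma, POS, NER tag
    line = "1\tacetate\tacetate\tNN\tSupp"
    index = 5  # --index 5 keeps columns 1..4 after the token index
    token = '|'.join(line.strip().split('\t')[1:index])
    print(token)  # acetate|acetate|NN|Supp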
+# -*- coding: UTF-8 -*-
+
+import os
+from pandas import DataFrame as DF
+from optparse import OptionParser
+from time import time
+from collections import Counter
+
+import nltk
+import sklearn
+import scipy.stats
+import sys
+
+import joblib
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import RandomizedSearchCV
+
+import sklearn_crfsuite
+from sklearn_crfsuite import scorers
+from sklearn_crfsuite import metrics
+
+from nltk.corpus import stopwords
+
+import training_validation_v14 as training
+
+#-------------------------------------------------------------------------------
+# Objective
+# Tag the transformed file with a CRF model built with sklearn-crfsuite.
+#
+# Input parameters
+# --inputPath=PATH    Path of transformed files x|y|z
+# --modelPath         Path to the CRF model
+# --modelName         Model name (without the .mod extension, which is appended on load)
+# --outputPath=PATH   Output path to place output files
+# --filterStopWords   Filter stop words
+# --filterSymbols     Filter punctuation marks
+
+# Output
+# 1) Tagged files in transformed format
+
+# Examples
+# python3 tagging.py
+# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
+# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+# --filterSymbols
+
+# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
+
+__author__ = 'egaytan'
+##########################################
+#              MAIN PROGRAM              #
+##########################################
+
+if __name__ == "__main__":
+    # Defining parameters
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath", help="Path of transformed input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
+    parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
+    parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
+    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
+    parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
+    parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
+    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
+    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
+    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filter stop words", action="store_true", default=False)
+    parser.add_option("--filterSymbols", dest="filterSymbols", help="Filter punctuation marks", action="store_true", default=False)
+
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error("Unexpected arguments given.")
+        sys.exit(1)
+
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + options.inputPath)
+    print("Model name: " + str(options.modelName))
+    print("Model path: " + options.modelPath)
+    print("Path to place output files: " + options.outputPath)
+    print("Filtering stop words: " + str(options.filterStopWords))
+    print("Levels: " + "S1: " + str(options.S1) + ", S2: " + str(options.S2) + ", S3: " + str(options.S3) + ", S4: " + str(options.S4))
+    print("Run variant: " + str(options.variant))
+
+    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
+               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
+
+    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
+
+    print('-------------------------------- PROCESSING --------------------------------')
+
+    stopwords = stopwords.words('english')
+
+    # Read CRF model
+    t0 = time()
+    print('Reading CRF model...')
+    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
+    print("Reading CRF model done in: %fs" % (time() - t0))
+
+    # Reading sentences
+    print('Processing corpus...')
+    t0 = time()
+    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
+    # Walk directory to read files
+    for path, dirs, files in os.walk(options.inputPath):
+        # For each file in dir
+        for file in files:
+            print("Preprocessing file..." + str(file))
+            sentencesInputData = []
+            sentencesOutputData = []
+            with open(os.path.join(options.inputPath, file), "r") as iFile:
+                lines = iFile.readlines()
+                for line in lines:
+                    listLine = []
+                    for token in line.strip('\n').split():
+                        if options.filterStopWords:
+                            listToken = token.split('|')
+                            lemma = listToken[1]
+                            if lemma in stopwords:
+                                continue
+                        if options.filterSymbols:
+                            listToken = token.split('|')
+                            lemma = listToken[1]
+                            if lemma in symbols:
+                                if lemma == ',':
+                                    print("Comma , identified")
+                                continue
+                        listLine.append(token)
+                    sentencesInputData.append(listLine)
+            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
+            print("Sentences input data: " + str(len(sentencesInputData)))
+
+
+            # Predicting tags
+            t1 = time()
+            print("Predicting tags with model")
+            y_pred = crf.predict(X_input)
+            print("Prediction done in: %fs" % (time() - t1))
+
+
+            # Tagging with CRF model
+            print("Tagging file")
+            for line, tagLine in zip(lines, y_pred):
+                Ltags = set(labels).intersection(set(tagLine))
+                outputLine = ''
+                line = line.strip('\n')
+                #print("\nLine: " + str(line))
+                #print("CRF tagged line: " + str(tagLine))
+                tb = 'O'
+                i = 0
+                if len(tagLine) == 1:
+                    if tagLine[0] in labels:
+                        start = '<' + tagLine[0] + '> '
+                        end = '<' + tagLine[0] + '/>'
+                        word = line.split('|')[0] + ' '
+                        outputLine = start + word + end
+                    else:
+                        outputLine = line.split(' ')[0]
+                    #print(outputLine + '\t' + ', '.join(Ltags))
+                    sentencesOutputData.append([outputLine, ', '.join(Ltags)])
+                    continue
+
+                for word, tag in zip(line.split(' '), tagLine):
+                    # start tagging
+                    if tag in labels and tb == 'O':
+                        outputLine += '<' + tag + '> '
+                        tb = tag
+                        outputLine += word.split('|')[0] + ' '
+                        i += 1
+                        continue
+                    # end tagging
+                    elif tb in labels:
+                        if i+1 == len(tagLine):
+                            outputLine += word.split('|')[0] + ' '
+                            outputLine += '<' + tag + '/> '
+                            tb = 'O'
+                            i += 1
+                            continue
+                        elif tagLine[i+1] == 'O':
+                            outputLine += word.split('|')[0] + ' '
+                            outputLine += '<' + tag + '/> '
+                            tb = 'O'
+                            i += 1
+                            continue
+                    # word tagged
+                    outputLine += word.split('|')[0] + ' '
+                    i += 1
+                #print(outputLine + '\t' + ', '.join(Ltags))
+                sentencesOutputData.append([outputLine, ', '.join(Ltags)])
+
+            print( DF(sentencesOutputData) )
+
+            # Save tags
+            '''
+            with open(os.path.join(options.outputPath, file), "w") as oFile:
+                for line in sentencesOutputData:
+                    oFile.write(line + '\n')
+
+            print("Processing corpus done in: %fs" % (time() - t0))
+            '''
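
The tagging loop above wraps each maximal run of a predicted label in <Label> ... <Label/> markers. A simplified standalone variant of that idea, with toy tokens and tags rather than real CRF output:

    def wrap(words, tags, labels):
        # Open a marker when a labelled run starts, close it when the run ends.
        out, open_tag = [], 'O'
        for k, (word, tag) in enumerate(zip(words, tags)):
            if tag in labels and open_tag == 'O':
                out.append('<' + tag + '>')
                open_tag = tag
            out.append(word.split('|')[0])
            run_ends = (k + 1 == len(tags)) or tags[k + 1] != tag
            if open_tag != 'O' and run_ends:
                out.append('<' + open_tag + '/>')
                open_tag = 'O'
        return ' '.join(out)

    # wrap(['growth', 'phase', ':', 'exponential'], ['O', 'O', 'O', 'Phase'], {'Phase'})
    # -> 'growth phase : <Phase> exponential <Phase/>'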
+# -*- coding: UTF-8 -*-
+
+import os
+from optparse import OptionParser
+from time import time
+from collections import Counter
+
+import nltk
+import sklearn
+import scipy.stats
+import sys
+
+#from sklearn.externals import joblib
+import joblib
+from sklearn.metrics import make_scorer
+#from sklearn.cross_validation import cross_val_score
+from sklearn.model_selection import cross_val_score
+#from sklearn.grid_search import RandomizedSearchCV
+from sklearn.model_selection import RandomizedSearchCV
+
+import sklearn_crfsuite
+from sklearn_crfsuite import scorers
+from sklearn_crfsuite import metrics
+
+from nltk.corpus import stopwords
+
+#################################
+-------------------------------- PARAMETERS --------------------------------
+Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+Model name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
+Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+Filtering stop words: False
+Levels: S1: False, S2: False, S3: False, S4: False
+Run variant: None
+Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
+-------------------------------- PROCESSING --------------------------------
+Reading CRF model...
+Reading CRF model done in: 0.008342s
+Processing corpus...
+Preprocessing file...annot-input_bg_v3.txt
+Sentences input data: 14716
+Predicting tags with model
+Prediction done in: 0.983480s
+Tagging file
+ 0 1
+0 <Gtype> antibody : Flag <Gtype/> Gtype
+1 <Gversion> ChIP-Seq <Gversion/> Gversion
+2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
+3 <Gtype> developmental stage : mixed population... Gtype
+4 DNA was isolated using the Qiagen Cell Lysis a...
+5 Escherichia coli
+6 Escherichia coli AB1157
+7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
+8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
+9 Genome _ build : NC _ 000913.3
+10 Genome _ build : NC _ 011916.1
+11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
+12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
+13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
+14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
+15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
+16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
+17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
+18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
+19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
+20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
+21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
+22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
+23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
+24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
+25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
+26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
+27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
+28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
+29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
+... ... ...
+14686 <Phase> ESBL019 Coliform <Phase/> Phase
+14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
+14688 ESBL019 Reverted
+14689 <Phase> ESBL019 Transition <Phase/> Phase
+14690 Escherichia coli
+14691 Four morphologic states of ESBL019 were used d...
+14692 <Gtype> morphology : Coliform <Gtype/> Gtype
+14693 <Gtype> morphology : Filamented <Gtype/> Gtype
+14694 morphology : Reverted -LRB- reverted back from...
+14695 morphology : Transition -LRB- from Coli into F...
+14696 RNA isolation was performed using an RNeasy mi...
+14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
+14698 The E. coli isolate ESBL019 was originally iso...
+14699 Escherichia coli
+14700 lexA 10 ' after UV vs. 0 ' , MG1655
+14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
+14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
+14703 lexA 20 ' after UV vs. 0 ' , MG1655
+14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
+14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
+14706 lexA 40 ' after UV vs. 0 ' , MG1655
+14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
+14708 lexA 5 ' after UV vs. 0 ' , MG1655
+14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
+14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
+14711 lexA 60 ' after UV vs. 0 ' , MG1655
+14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
+14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
+14714 lexA vs. wt , before UV treatment , MG1655
+14715 untreated cells , 25 ug total RNA
+
+[14716 rows x 2 columns]