Estefani Gaytan Nunez

upload

......@@ -9,9 +9,11 @@ import random
#
# Input parameters
# --inputPath=PATH Path of inputfile
# --inputFile CoreNLP output file with tagged sentences
# --outputPath=PATH Path to place output files
# --trainingFile=trainingFile Output training data set
# --testFile=testFile Output test data set
# --index Limit of CoreNLP output columns to include
#
# Output
# training and test data set
......@@ -23,7 +25,7 @@ import random
# --trainingFile training-data-set-70_v4.txt
# --testFile test-data-set-30_v4.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
#
# --index 5
#
# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
......
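For context, a minimal sketch of the 70/30 split this script performs; the function and variable names below are illustrative assumptions, not the script's actual code:

import random

def split_70_30(sentences, seed=42):
    # shuffle the tagged sentences reproducibly (the seed is an illustrative choice)
    random.seed(seed)
    random.shuffle(sentences)
    # first 70% of sentences for training, remaining 30% for test
    cut = int(len(sentences) * 0.7)
    return sentences[:cut], sentences[cut:]

# training_sentences, test_sentences = split_70_30(tagged_sentences)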
......@@ -11,6 +11,7 @@ from optparse import OptionParser
# --outputFile=File Output data set
# --minWordLen Minimum word length
# --minSenLen Minimum sentence length
# --index Limit of CoreNLP output columns to include
#
# Output
# Tagged sentences reconstruction
......@@ -23,6 +24,7 @@ from optparse import OptionParser
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
# --minWordLen 2
# --minSenLen 1
# --index 5
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
......@@ -39,7 +41,7 @@ if __name__ == "__main__":
parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
parser.add_option("--minSenLen", dest="sL", help="Minimum word length", type="int")
parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int)
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -58,23 +60,26 @@ if __name__ == "__main__":
lista = []
#First sentence
sentence = ''
#count
i = 0
with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
for line in input_file:
if len(line.split('\t')) > 1:
w = line.split('\t')[1]
if w == "PGCGROWTHCONDITIONS":
i = i + 1
if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
print( "EXCLUDE: " + sentence.lstrip() )
print( "EXCLUDE: " + str(i) + "line" + sentence.lstrip() )
else:
#End of sentence
lista.append(sentence.lstrip())
#New sentence
n = n+1
#New sentence
sentence = ''
sentence = ''
else:
#Build and save the tagged sentence
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]))
print("Number of sentences: " + str(n))
......
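To make the reconstruction above concrete, a hedged example of the column slicing controlled by --index (the CoNLL line is invented and the column layout -- index, word, lemma, POS, NER -- is an assumption):

line = "3\tEscherichia\tescherichia\tFW\tORGANISM"
index = 5
token = '|'.join(line.split('\t')[1:index])
# token == 'Escherichia|escherichia|FW|ORGANISM'

Tokens are appended to the growing sentence until the PGCGROWTHCONDITIONS marker is reached; a sentence is excluded when its token count is at most --minSenLen and its first word is at most --minWordLen characters long.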
# -*- coding: UTF-8 -*-
import os
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
#-------------------------------------------------------------------------------
# Objective
# Tag a transformed file with a CRF model using sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filterStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# python3 tagging.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --filterSymbols
# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
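# For reference, a hedged illustration of the formats involved (token values invented):
#   input token  : antibody|antibody|NN|O                     (word|lemma|postag|ner)
#   tagged output: <Gtype> antibody : Flag <Gtype/>   Gtype   (entity wrapped in <Tag> ... <Tag/>)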
__author__ = 'egaytan'
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + options.inputPath)
print("Mode name: " + str(options.modelName))
print("Model path: " + options.modelPath)
print("Path to place output files: " + options.outputPath)
print("Filtering stop words: " + str(options.filterStopWords))
print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
print("Run variant: " + str(options.variant))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word for word in stopwords.words('english')]
# Read CRF model
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
# Reading sentences
print('Processing corpus...')
t0 = time()
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print("Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputData = []
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
for line in lines:
listLine = []
for token in line.strip('\n').split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.filterSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
if lemma == ',':
print("Coma , identificada")
continue
listLine.append(token)
sentencesInputData.append(listLine)
X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
print("Sentences input data: " + str(len(sentencesInputData)))
# Predicting tags
t1 = time()
print("Predicting tags with model")
y_pred = crf.predict(X_input)
print("Prediction done in: %fs" % (time() - t1))
# Tagging with CRF model
print("Tagging file")
for line, tagLine in zip(lines, y_pred):
Ltags = set(labels).intersection(set(tagLine))
outputLine = ''
line = line.strip('\n')
#print("\nLine: " + str(line))
#print ("CRF tagged line: " + str(tagLine))
tb = 'O'
i = 0
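# tb keeps the tag of the previous token ('O' = outside any entity);
# i indexes the current position in tagLine as the words of the line are traversed.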
if len(tagLine)==1:
if tagLine[0] in labels:
start = '<' + tagLine[0] + '> '
end = '<' + tagLine[0] + '/>'
word = line.split('|')[0] + ' '
outputLine = start + word + end
else:
outputLine = line.split('|')[0]
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData.append([outputLine, ', '.join(Ltags)])
continue
for word,tag in zip(line.split(' '), tagLine):
# start tagging
if tag in labels and tb == 'O':
# start tagging
outputLine += '<' + tag + '> '
tb = tag
outputLine += word.split('|')[0] + ' '
i += 1
continue
# end tagging
elif tb in labels:
if i+1==len(tagLine):
# end tagging
outputLine += word.split('|')[0] + ' '
outputLine += '<' + tag + '/> '
tb = 'O'
i += 1
continue
elif tagLine[i+1]=='O':
# end tagging
outputLine += word.split('|')[0] + ' '
outputLine += '<' + tag + '/> '
tb = 'O'
i += 1
continue
# word tagged
outputLine += word.split('|')[0] + ' '
i += 1
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData.append([outputLine, ', '.join(Ltags)])
print( DF(sentencesOutputData) )
# Save tags
'''
with open(os.path.join(options.outputPath, file), "w") as oFile:
for line in sentencesOutputData:
oFile.write(line + '\n')
print("Processing corpus done in: %fs" % (time() - t0))
'''
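A hedged sketch of how the commented-out block above could write the tagged sentences; the tab separator between the tagged line and its label summary is an assumption:

with open(os.path.join(options.outputPath, file), "w") as oFile:
    for outputLine, tags in sentencesOutputData:
        # one tagged sentence per line, followed by the entity labels found in it
        oFile.write(outputLine + '\t' + tags + '\n')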
# -*- coding: UTF-8 -*-
import os
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
#from sklearn.externals import joblib
import joblib
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
#################################
# -*- coding: UTF-8 -*-
import os # Access the operating system
#from itertools import chain # Not used
from optparse import OptionParser # Command-line option parsing
from time import time # Return the time in seconds since the epoch as a float
from collections import Counter # Dict subclass for counting hashable objects
#import re # Not used
import nltk # Natural Language Toolkit platform to work with human language data
import sklearn # Free software machine learning
import scipy.stats # library of statistical functions
import sys # to exit from Python.
import joblib # provide lightweight pipelining
from sklearn.metrics import make_scorer # Make a scorer from a performance metric or loss function
from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation
from sklearn.model_selection import RandomizedSearchCV # Randomized search on hyper parameters
import sklearn_crfsuite # Thin CRFsuite
from sklearn_crfsuite import scorers # Added scorers.sequence_accuracy
from sklearn_crfsuite import metrics # Add flat recall score to metrics
from pandas import DataFrame as DF # Construct dataframe objects
from nltk.corpus import stopwords # To exclude stop words
#-------------------------------------------------------------------------------
# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# (1) --inputPath Path of training and test data set
# (2) --outputPath Output path to place output files
# (3) --trainingFile File with training data set
# (4) --testFile File with test data set
# (5) --reportName Name of the run (used in report and model file names)
# (6) --variant Variant of the inner-word feature set (10-13)
# (7) --nrules Number of crf transitions
# (8) --S1 Inner word features set
# (9) --S2 Complete word features
# (10) --S3 Extended context features
# (11) --S4 Semantic features
# (12) --excludeStopWords
# (13) --excludeSymbols
# Output
# 1) Best model
# 2) Report
# Examples
# python3 training_validation_v14.0.1.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/input/
# --trainingFile training-data-set-70-NER.txt
# --testFile test-data-set-30-NER.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --nrules 500
# --reportName Run1
# --variant 11
# --S1
# --S2
# --S3
# --S4
# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt
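# For reference: each token in the training/test files is expected to be a '|'-joined
# CoreNLP column string whose last field is the class label, e.g. (invented token,
# column layout assumed): Escherichia|escherichia|FW|ORGANISM|Strain
# word2features() below reads word, lemma, POS tag and NER from the first four fields,
# and sent2labels() takes the last field as the CRF label.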
##################################################################
# FEATURES #
##################################################################
#================== COMPLETE WORD FEATURES ======================#
def isGreek(word):
## The complete word is a Greek letter
alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
if word in alphabet:
return True
else:
return False
#================ INNER OF THE WORD FEATURES ====================#
def hGreek(word):
## Check whether the word contains at least one Greek letter
alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω','α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
# Greek letters found in the word
matches = [letter for letter in word if letter in alphabet]
if (len(matches) > 0):
return(True)
else: return(False)
def hNumber(word):
## At least one digit
for l in word:
if l.isdigit():
return True
return False
def hUpper(word):
## At least one uppercase letter
for l in word:
if l.isupper(): return True
return False
def hLower(word):
## At least one lowercase letter
for l in word:
if l.islower(): return True
return False
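# Illustrative checks for the predicates above (hypothetical inputs):
#   isGreek('α')     -> True      hGreek('ΔsoxR') -> True
#   hNumber('rpoB2') -> True      hUpper('Fnr')   -> True
#   hLower('LB')     -> False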
#============================FEATURES===========================#
def word2features(sent, i, S1, S2, S3, S4, v): #SA, v
## Getting word features
## Saving CoreNLP annotations
listElem = sent[i].split('|')
## Split CoreNLP output by columns
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
ner = listElem[3]
#=========================== G =============================#
## NAME LEVEL G
## FEATURE TYPE General features
## Adding to features dictionary
features = {
## basal features
'lemma': lemma,
'postag': postag
}
## Previous lemma and postag
## needs more than one word in the sentence
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## Saving CoreNLP annotations
lemma0 = listElem[1]
postag0 = listElem[2]
## Adding features to dictionary
features.update({
# Previous lemma
'-1:lemma': lemma0,
# Previous postag
'-1:postag': postag0,
})
## Next lemma and postag
## not the last word
if i < len(sent) - 1:
## Next word
listElem = sent[i + 1].split('|')
## Saving CoreNLP annotations
lemma2 = listElem[1]
postag2 = listElem[2]
## Adding to features dictionary
features.update({
# Next lemma
'+1:lemma': lemma2,
# Next postag
'+1:postag': postag2,
})
#=========================== S1 =============================#
## NAME LEVEL S1
## FEATURE TYPE Inner word features
if S1:
## Adding features to dictionary
features.update({
'hUpper' : hUpper(word),
'hLower' : hLower(word),
'hGreek' : hGreek(word),
'symb' : word.isalnum()
})
#========== Variants of inner words features ============#
if v == 10:
#word first character
features['word[:1]']= word[:1]
#word second character
if len(word)>1:
features['word[:2]']= word[:2]
if v == 11:
#lemma and postag first character
features['lemma[:1]']= lemma[:1]
features['postag[:1]']= postag[:1]
#lemma and postag second character
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
if len(postag)>1:
features['postag[:2]']= postag[:2]
if v == 12:
#word first character
features['word[:1]']= word[:1]
#word second character
if len(word)>1:
features['word[:2]']= word[:2]
#postag first character
features['postag[:1]']= postag[:1]
#postag second character
if len(postag)>1:
features['postag[:2]']= postag[:2]
if v == 13:
#lemma first character
features['lemma[:1]']= lemma[:1]
#lemma second character
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
#=========================== S2 =============================#
## NAME LEVEL S2
## FEATURE TYPE Complete word features
if S2:
#Add features to dictionary
features.update({
'word' : word,
'isUpper' : word.isupper(),
'isLower' : word.islower(),
'isGreek' : isGreek(word),
'isNumber' : word.isdigit()
})
## Previous word
## sentence needs more than one word
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## Saving CoreNLP annotations
word0 = listElem[0]
features['-1:word']= word0
## Next word
## not the last word
if i < len(sent)-1:
## Split CoreNLP output by columns
listElem = sent[i + 1].split('|')
## Saving CoreNLP annotations
word2 = listElem[0]
features['+1:word']= word2
#=========================== S3 =============================#
## NAME LEVEL S3
## FEATURE TYPE Extended context features
if S3:
## more than two words in sentence
if i > 1:
## Split CoreNLP output by columns
listElem = sent[i - 2].split('|')
## Saving CoreNLP annotations
## lemma and postag two positions back
lemma01 = listElem[1]
postag01 = listElem[2]
features['-2:lemma']= lemma01
features['-2:postag']= postag01
## not among the last two words
if i < len(sent) - 2:
## Split CoreNLP output by columns
listElem = sent[i + 2].split('|')
## Saving CoreNLP annotations
lemma02 = listElem[1]
postag02 = listElem[2]
## lemma and postag two positions ahead
features['+2:lemma']= lemma02
features['+2:postag']= postag02
#=========================== S4 =============================#
## NAME LEVEL S4
## FEATURE TYPE NER
if S4:
## more than one word in sentence
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## ============== Previous ner =====================##
## Saving CoreNLP annotations according column position
ner0 = listElem[3]
## Adding to features dictionary
features['-1:ner'] = ner0
## is not the last word
if i < len(sent) - 1:
## Split CoreNLP output by columns
listElem = sent[i + 1].split('|')
## ================ Next ner ======================##
## Saving CoreNLP annotations according column position
ner2 = listElem[3]
## Adding to features dictionary
features['+1:ner'] = ner2
if i > 1:
## Split CoreNLP output by columns
listElem = sent[i - 2].split('|')
## Saving CoreNLP annotations
## ============ Previous-2 ner ====================##
ner01 = listElem[3]
features['-2:ner']= ner01
## not among the last two words
if i < len(sent) - 2:
## Split CoreNLP output by columns
listElem = sent[i + 2].split('|')
## Saving CoreNLP annotations
ner02 = listElem[3]
## ============== Next-2 ner ======================##
features['+2:ner']= ner02
return features
def sent2features(sent, S1, S2, S3, S4, v):
## Iterate over the sentence and extract features for each word
return [word2features(sent, i, S1, S2, S3, S4, v) for i in range(len(sent))]
def sent2labels(sent):
## The label is the last '|'-separated field of each word token
return [elem.split('|')[-1] for elem in sent]
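# Minimal usage sketch (the token strings are illustrative, not taken from the corpus):
#   sent = ['OD600|od600|NN|NUMBER|OD', 'of|of|IN|O|O', '0.3|0.3|CD|NUMBER|OD']
#   sent2labels(sent)  ->  ['OD', 'O', 'OD']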
def sent2tokens(sent):
return [token for token, postag, label in sent]
def print_transitions(trans_features, f):
for (label_from, label_to), weight in trans_features:
f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
def print_state_features(state_features, f):
for (attr, label), weight in state_features:
f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
__author__ = 'egaytan'
##################################################################
# MAIN PROGRAM #
##################################################################
if __name__ == "__main__":
## Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
parser.add_option("--reportName", dest="reportName", help="Report number run", metavar="FILE")
parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path of test and training data sets: " + options.inputPath)
print("Path of outputs: " + options.outputPath)
print("File with training data set: " + str(options.trainingFile))
print("File with test data set: " + str(options.testFile))
print("reportName: " + str(options.reportName))
print("Exclude stop words: " + str(options.excludeStopWords))
print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
print("Run variant: " + str(options.variant))
print("Number of rules on report file: " + str(options.nrules))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Exclude symbols: " + str(options.excludeSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
t0 = time()
sentencesTrainingData = []
sentencesTestData = []
stopwords = [word for word in stopwords.words('english')]
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
continue
listLine.append(token)
sentencesTrainingData.append(listLine)
print(" Sentences training data: " + str(len(sentencesTrainingData)))
with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
continue
listLine.append(token)
sentencesTestData.append(listLine)
print(" Sentences test data: " + str(len(sentencesTestData)))
print("Reading corpus done in: %fs" % (time() - t0))
print('-------------------------------- FEATURES --------------------------------')
Dtraining = sent2features(sentencesTrainingData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
Dtest = sent2features(sentencesTestData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
print('-------------------------- Features Training ---------------------------')
print(DF(list(Dtraining.items())))
print('--------------------------- Features Test -----------------------------')
print(DF(list(Dtest.items())))
t0 = time()
X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, int(options.variant)) for s in sentencesTrainingData]
y_train = [sent2labels(s) for s in sentencesTrainingData]
X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, int(options.variant)) for s in sentencesTestData]
# print X_test
y_test = [sent2labels(s) for s in sentencesTestData]
'''
Fixed parameters
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=100,
all_possible_transitions=True
)
'''
# Hyperparameter Optimization
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
max_iterations=100,
all_possible_transitions=True
)
params_space = {
'c1': scipy.stats.expon(scale=0.5),
'c2': scipy.stats.expon(scale=0.05),
}
# Original: labels = list(crf.classes_)
# Original: labels.remove('O')
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
# search
rs = RandomizedSearchCV(crf, params_space,
cv=5,
verbose=3,
n_jobs=-1,
n_iter=100,
scoring=f1_scorer,
random_state=42)
rs.fit(X_train, y_train)
# Fixed parameters
# crf.fit(X_train, y_train)
# Best hyperparameters
# crf = rs.best_estimator_
nameReport = str(options.reportName) + '_v'+ str(options.variant) + '.txt'
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
oFile.write("********** TRAINING AND TESTING REPORT **********\n")
oFile.write("Training file: " + options.trainingFile + '\n')
oFile.write('\n')
oFile.write('best params:' + str(rs.best_params_) + '\n')
oFile.write('best CV score:' + str(rs.best_score_) + '\n')
oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
print("Training done in: %fs" % (time() - t0))
t0 = time()
# Update best crf
crf = rs.best_estimator_
# Saving model
print(" Saving training model...")
t1 = time()
nameModel = 'model_' + str(options.reportName) + '_v'+ str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) +'.mod'
joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
print(" Saving training model done in: %fs" % (time() - t1))
# Evaluation against test data
y_pred = crf.predict(X_test)
print("*********************************")
print("Prediction done in: %fs" % (time() - t0))
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
oFile.write('\n')
oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
oFile.write('\n')
# labels = list(crf.classes_)
sorted_labels = sorted(
labels,
key=lambda name: (name[1:], name[0])
)
oFile.write(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3))
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
print_state_features(Counter(crf.state_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop negative:\n")
print_state_features(Counter(crf.state_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
-------------------------------- PARAMETERS --------------------------------
Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
Model name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
Filtering stop words: False
Levels: S1: False S2: False S3: False S4: False
Run variant: None
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008342s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 0.983480s
Tagging file
0 1
0 <Gtype> antibody : Flag <Gtype/> Gtype
1 <Gversion> ChIP-Seq <Gversion/> Gversion
2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
3 <Gtype> developmental stage : mixed population... Gtype
4 DNA was isolated using the Qiagen Cell Lysis a...
5 Escherichia coli
6 Escherichia coli AB1157
7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
9 Genome _ build : NC _ 000913.3
10 Genome _ build : NC _ 011916.1
11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
... ... ...
14686 <Phase> ESBL019 Coliform <Phase/> Phase
14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
14688 ESBL019 Reverted
14689 <Phase> ESBL019 Transition <Phase/> Phase
14690 Escherichia coli
14691 Four morphologic states of ESBL019 were used d...
14692 <Gtype> morphology : Coliform <Gtype/> Gtype
14693 <Gtype> morphology : Filamented <Gtype/> Gtype
14694 morphology : Reverted -LRB- reverted back from...
14695 morphology : Transition -LRB- from Coli into F...
14696 RNA isolation was performed using an RNeasy mi...
14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
14698 The E. coli isolate ESBL019 was originally iso...
14699 Escherichia coli
14700 lexA 10 ' after UV vs. 0 ' , MG1655
14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
14703 lexA 20 ' after UV vs. 0 ' , MG1655
14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
14706 lexA 40 ' after UV vs. 0 ' , MG1655
14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
14708 lexA 5 ' after UV vs. 0 ' , MG1655
14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
14711 lexA 60 ' after UV vs. 0 ' , MG1655
14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
14714 lexA vs. wt , before UV treatment , MG1655
14715 untreated cells , 25 ug total RNA
[14716 rows x 2 columns]