scripts

Estefani Gaytan Nunez
Commit fd616ef9e8fbd1952192b721036b8573f2c26f34 fd616ef9 1 parent b9839453
Showing 5 changed files with 1092 additions and 1 deletions
CRF/bin/label-split_training_test_v2.py
CRF/bin/label-split_training_test_v2.py.save
CRF/bin/training_validation_v3.py
CRF/bin/training_validation_v4.py
CRF/bin/training_validation_v5.py
--- a/CRF/bin/label-split_training_test_v2.py 0 → 100644
View file @fd616ef
+++ b/CRF/bin/label-split_training_test_v2.py 0 → 100644
View file @fd616ef
+ #!/bin/python3    
+ from optparse import OptionParser
+ import re
+ import os
+ import random
+ 
+ 
+ # Objective
+ # Label each token (fields separated by '|') of the CoreNLP tagging output and split the sentences 70/30 into training and test files.
+ # The data sets are built using only sentences with at least one true tag.
+ #
+ # Input parameters
+ # --inputPath=PATH    		Path of the input file
+ # --inputFile=FILE    		File with the CoreNLP-tagged sentences
+ # --outputPath=PATH   		Path to place output files
+ # --trainingFile=FILE  	Output training data set
+ # --testFile=FILE  	  	Output test data set
+ #
+ # Output
+ # training and test data set
+ #
+ # Examples
+ # python label-split_training_test_v2.py
+ # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
+ # --inputFile sentences.tsv_pakal_.conll
+ # --trainingFile training-data-set-70.txt
+ # --testFile test-data-set-30.txt
+ # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ #
+ # 
+ # python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ 
+ 
+ ##########################################
+ #               MAIN PROGRAM             #
+ ##########################################
+ 
+ if __name__ == "__main__":
+     # Reads a tab-separated CoreNLP file, labels every token with the tag
+     # that is open when the token is read, keeps the sentences that contain
+     # at least one true tag and writes a random 70/30 split of them.
+ 
+     # Defining parameters
+     parser = OptionParser()
+     parser.add_option("--inputPath", dest="inputPath",
+                       help="Path of output from CoreNLP", metavar="PATH")
+     parser.add_option("--outputPath", dest="outputPath",
+                       help="Output path to place output files",
+                       metavar="PATH")
+     parser.add_option("--inputFile", dest="inputFile",
+                       help="File with CoreNLP-tagging sentences", metavar="FILE")
+     parser.add_option("--trainingFile", dest="trainingFile",
+                       help="File with training data set", metavar="FILE")
+     parser.add_option("--testFile", dest="testFile",
+                       help="File with test data set", metavar="FILE")
+ 
+     (options, args) = parser.parse_args()
+     if len(args) > 0:
+         # parser.error() reports the problem and terminates the program,
+         # so no explicit exit call is needed afterwards.
+         parser.error("Unexpected positional arguments: " + ' '.join(args))
+     # Every option is used to build a path below; fail early with a clear
+     # message instead of a TypeError on string concatenation.
+     for required in ("inputPath", "inputFile", "outputPath", "trainingFile", "testFile"):
+         if getattr(options, required) is None:
+             parser.error("Missing required option --" + required)
+ 
+     print('-------------------------------- PARAMETERS --------------------------------')
+     print("Path of CoreNLP output: " + options.inputPath)
+     print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
+     print("Path of training data set: " + options.outputPath)
+     print("File with training data set: " + str(options.trainingFile))
+     print("Path of test data set: " + options.outputPath)
+     print("File with test data set: " + str(options.testFile))
+     print('-------------------------------- PROCESSING --------------------------------')
+     ## begin of tagging
+     in_labels = {
+      '<Gtype>': 'Gtype',
+      '<Gversion>': 'Gversion',
+      '<Med>': 'Med',
+      '<Phase>': 'Phase',
+      '<Supp>': 'Supp',
+      '<Technique>': 'Technique',
+      '<Temp>': 'Temp',
+      '<OD>': 'OD',
+      '<Anti>': 'Anti'
+     }
+     ## End of tagging
+     out_labels = {
+      '<Air>': 'O',
+      '</Air>': 'O',
+      '</Gtype>': 'O',
+      '</Gversion>': 'O',
+      '</Med>': 'O',
+      '</Phase>': 'O',
+      '<Sample>': 'O',
+      '</Sample>': 'O',
+      '<Serie>': 'O',
+      '</Serie>': 'O',
+      '<Strain>': 'O',
+      '</Strain>': 'O',
+      '<Substrain>': 'O',
+      '</Substrain>': 'O',
+      '</Supp>': 'O',
+      '</Technique>': 'O',
+      '</Temp>': 'O',
+      '</OD>': 'O',
+      '<Agit>': 'O',
+      '</Agit>': 'O',
+      '<Name>': 'O',
+      '</Name>': 'O',
+      '<Orgn>': 'O',
+      '</Orgn>': 'O',
+      '</Anti>': 'O',
+      '<Vess>': 'O',
+      '</Vess>': 'O'}
+ 
+     # Labels that count as a true tag when selecting sentences
+     true_tags = set(in_labels.values())
+     # Label given to the tokens being read; 'O' (other) until a begin tag is seen
+     flag = 'O'
+     # Sentences kept for the data sets
+     lista = []
+     # Labelled tokens of the sentence being built and whether one of them
+     # carries a true tag
+     tokens = []
+     has_true_tag = False
+     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
+         for line in input_file:
+             columns = line.rstrip('\r\n').split('\t')
+             if len(columns) < 2:
+                 continue
+             w = columns[1]
+             if w in in_labels:
+                 # Begin tag: the following tokens receive this label
+                 flag = in_labels[w]
+             elif w in out_labels:
+                 # End tag or ignored tag: back to the 'O' label
+                 flag = out_labels[w]
+             elif w == "PGCGROWTHCONDITIONS":
+                 # End of sentence: keep it only if it has at least one true tag
+                 if has_true_tag:
+                     # Same layout as before: every token is wrapped in single spaces
+                     lista.append(''.join(' ' + token + ' ' for token in tokens))
+                 # New sentence
+                 tokens = []
+                 has_true_tag = False
+             else:
+                 # Columns 1-3 of the line followed by the current label
+                 tokens.append('|'.join(columns[1:4]) + '|' + flag)
+                 if flag in true_tags:
+                     has_true_tag = True
+ 
+     print("Number of sentences: " + str( len(lista) ) )
+ 
+ 
+     # Split 70 30 training and test sentences.
+     # Sample over every index so the last sentence is not left out of both sets.
+     total = len(lista)
+     trainingIndex = random.sample(range(total), int(total * .70))
+     trainingSet = set(trainingIndex)
+     testIndex = [n for n in range(total) if n not in trainingSet]
+ 
+     with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
+       Data = [lista[i]  for i in trainingIndex]
+       oFile.write('\n'.join(Data))
+ 
+     with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
+       Data = [lista[i]  for i in testIndex]
+       oFile.write('\n'.join(Data))
+ 
+     print("==================================END===================================")
--- a/CRF/bin/label-split_training_test_v2.py.save 0 → 100644
View file @fd616ef
+++ b/CRF/bin/label-split_training_test_v2.py.save 0 → 100644
View file @fd616ef
+ #!/bin/python3    
+ from optparse import OptionParser
+ import re
+ import os
+ import random
+ 
+ 
+ # Objective
+ # Label each token (fields separated by '|') of the CoreNLP tagging output and split the sentences 70/30 into training and test files.
+ # The data sets are built using only sentences with at least one true tag.
+ #
+ # Input parameters
+ # --inputPath=PATH    		Path of the input file
+ # --inputFile=FILE    		File with the CoreNLP-tagged sentences
+ # --outputPath=PATH   		Path to place output files
+ # --trainingFile=FILE  	Output training data set
+ # --testFile=FILE  	  	Output test data set
+ #
+ # Output
+ # training and test data set
+ #
+ # Examples
+ # python label-split_training_test_v2.py
+ # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
+ # --inputFile sentences.tsv_pakal_.conll
+ # --trainingFile training-data-set-70.txt
+ # --testFile test-data-set-30.txt
+ # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ #
+ # 
+ # python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ 
+ 
+ ##########################################
+ #               MAIN PROGRAM             #
+ ##########################################
+ 
+ if __name__ == "__main__":
+     # Reads a tab-separated CoreNLP file, labels every token with the tag
+     # that is open when the token is read, keeps the sentences that contain
+     # at least one true tag and writes a random 70/30 split of them.
+ 
+     # Defining parameters
+     parser = OptionParser()
+     parser.add_option("--inputPath", dest="inputPath",
+                       help="Path of output from CoreNLP", metavar="PATH")
+     parser.add_option("--outputPath", dest="outputPath",
+                       help="Output path to place output files",
+                       metavar="PATH")
+     parser.add_option("--inputFile", dest="inputFile",
+                       help="File with CoreNLP-tagging sentences", metavar="FILE")
+     parser.add_option("--trainingFile", dest="trainingFile",
+                       help="File with training data set", metavar="FILE")
+     parser.add_option("--testFile", dest="testFile",
+                       help="File with test data set", metavar="FILE")
+ 
+     (options, args) = parser.parse_args()
+     if len(args) > 0:
+         # parser.error() reports the problem and terminates the program,
+         # so no explicit exit call is needed afterwards.
+         parser.error("Unexpected positional arguments: " + ' '.join(args))
+     # Every option is used to build a path below; fail early with a clear
+     # message instead of a TypeError on string concatenation.
+     for required in ("inputPath", "inputFile", "outputPath", "trainingFile", "testFile"):
+         if getattr(options, required) is None:
+             parser.error("Missing required option --" + required)
+ 
+     print('-------------------------------- PARAMETERS --------------------------------')
+     print("Path of CoreNLP output: " + options.inputPath)
+     print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
+     print("Path of training data set: " + options.outputPath)
+     print("File with training data set: " + str(options.trainingFile))
+     print("Path of test data set: " + options.outputPath)
+     print("File with test data set: " + str(options.testFile))
+     print('-------------------------------- PROCESSING --------------------------------')
+     ## begin of tagging
+     in_labels = {
+      '<Gtype>': 'Gtype',
+      '<Gversion>': 'Gversion',
+      '<Med>': 'Med',
+      '<Phase>': 'Phase',
+      '<Supp>': 'Supp',
+      '<Technique>': 'Technique',
+      '<Temp>': 'Temp',
+      '<OD>': 'OD',
+      '<Anti>': 'Anti',
+      '<Agit>': 'Agit',
+      '<Vess>': 'Vess'
+     }
+     ## End of tagging
+     out_labels = {
+      '<Air>': 'O',
+      '</Air>': 'O',
+      '</Gtype>': 'O',
+      '</Gversion>': 'O',
+      '</Med>': 'O',
+      '</Phase>': 'O',
+      '<Sample>': 'O',
+      '</Sample>': 'O',
+      '<Serie>': 'O',
+      '</Serie>': 'O',
+      '<Strain>': 'O',
+      '</Strain>': 'O',
+      '<Substrain>': 'O',
+      '</Substrain>': 'O',
+      '</Supp>': 'O',
+      '</Technique>': 'O',
+      '</Temp>': 'O',
+      '</OD>': 'O',
+      '</Anti>': 'O',
+      '</Agit>': 'O',
+      '<Name>': 'O',
+      '</Name>': 'O',
+      '<Orgn>': 'O',
+      '</Orgn>': 'O',
+      '</Vess>': 'O'}
+ 
+     # Labels that count as a true tag when selecting sentences
+     true_tags = set(in_labels.values())
+     # Label given to the tokens being read; 'O' (other) until a begin tag is seen
+     flag = 'O'
+     # Sentences kept for the data sets
+     lista = []
+     # Labelled tokens of the sentence being built and whether one of them
+     # carries a true tag
+     tokens = []
+     has_true_tag = False
+     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
+         for line in input_file:
+             columns = line.rstrip('\r\n').split('\t')
+             if len(columns) < 2:
+                 continue
+             w = columns[1]
+             if w in in_labels:
+                 # Begin tag: the following tokens receive this label
+                 flag = in_labels[w]
+             elif w in out_labels:
+                 # End tag or ignored tag: back to the 'O' label
+                 flag = out_labels[w]
+             elif w == "PGCGROWTHCONDITIONS":
+                 # End of sentence: keep it only if it has at least one true tag
+                 if has_true_tag:
+                     # Every token is wrapped in single spaces
+                     lista.append(''.join(' ' + token + ' ' for token in tokens))
+                 # New sentence, whether or not the previous one was kept
+                 tokens = []
+                 has_true_tag = False
+             else:
+                 # Building the tagged sentence: columns 1-3 followed by the current label
+                 tokens.append('|'.join(columns[1:4]) + '|' + flag)
+                 if flag in true_tags:
+                     has_true_tag = True
+ 
+     print("Number of sentences: " + str(len(lista)))
+ 
+ 
+     # Split 70 30 training and test sentences.
+     # Sample over every index so the last sentence is not left out of both sets.
+     total = len(lista)
+     trainingIndex = random.sample(range(total), int(total * .70))
+     trainingSet = set(trainingIndex)
+     testIndex = [n for n in range(total) if n not in trainingSet]
+ 
+     with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
+       Data = [lista[i]  for i in trainingIndex]
+       oFile.write('\n'.join(Data))
+ 
+     with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
+       Data = [lista[i]  for i in testIndex]
+       oFile.write('\n'.join(Data))
+ 
+     print("==================================END===================================")
--- a/CRF/bin/training_validation_v3.py
View file @fd616ef
+++ b/CRF/bin/training_validation_v3.py
View file @fd616ef
@@ -299,7 +299,7 @@ if __name__ == "__main__":
 
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-     labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess'])
+     labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
 
     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
--- a/CRF/bin/training_validation_v4.py 0 → 100644
View file @fd616ef
+++ b/CRF/bin/training_validation_v4.py 0 → 100644
View file @fd616ef
+ # -*- coding: UTF-8 -*-
+ 
+ import os
+ from itertools import chain
+ from optparse import OptionParser
+ from time import time
+ from collections import Counter
+ import re
+ 
+ import nltk
+ import sklearn
+ import scipy.stats
+ import sys
+ 
+ from sklearn.externals import joblib
+ from sklearn.metrics import make_scorer
+ from sklearn.cross_validation import cross_val_score
+ from sklearn.grid_search import RandomizedSearchCV
+ 
+ import sklearn_crfsuite
+ from sklearn_crfsuite import scorers
+ from sklearn_crfsuite import metrics
+ 
+ from nltk.corpus import stopwords
+ 
+ 
+ # Objective
+ # Training and evaluation of CRFs with sklearn-crfsuite.
+ #
+ # Input parameters
+ # --inputPath=PATH    Path of training and test data set
+ # --trainingFile        File with training data set
+ # --testFile        File with test data set
+ # --outputPath=PATH    Output path to place output files
+ 
+ # Output
+ # 1) Best model
+ 
+ # Examples
+ # python training_validation_v4.py
+ # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ # --trainingFile training-data-set-70.txt
+ # --testFile test-data-set-30.txt
+ # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
+ # python3.4 training_validation_v4.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
+ 
+ #################################
+ #           FUNCTIONS           #
+ #################################
+ 
+ def isGreek(word):
+     """Return True when `word` is exactly one letter of the Greek alphabet.
+ 
+     Upper- and lower-case letters are accepted, including the final sigma.
+     Longer strings and the empty string yield False.
+     """
+     upper_case = ('Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ',
+                   'Ν', 'Ξ', 'Ο', 'Π', 'Ρ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω')
+     lower_case = ('α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
+                   'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω')
+     # Membership is tested against whole elements, so only single letters match.
+     return word in upper_case + lower_case
+ 
+ def word2features(sent, i):
+     """Build the CRF feature dictionary for the token at position `i`.
+ 
+     Every element of `sent` is a '|'-separated string whose first three
+     fields are word, lemma and POS tag.  The features describe the token
+     itself (surface form, lemma and a few orthographic flags) plus the
+     lemma and POS tag of the previous and next tokens when they exist.
+     Affix features and wider context windows are currently disabled.
+     """
+     fields = sent[i].split('|')
+     word = fields[0]
+     lemma = fields[1]
+     # Not used as a feature; the lookup still requires a third field.
+     postag = fields[2]
+ 
+     greek = isGreek(word)
+     features = dict()
+     features['word'] = word
+     features['lemma'] = lemma
+     features['isNumber()'] = word.isdigit()
+     # The result of the Greek-letter test is part of the feature name itself.
+     features['isGreek(){}'.format(greek)] = greek
+     features['isupper()'] = word.isupper()
+     features['islower()'] = word.islower()
+ 
+     # Left neighbour
+     if i > 0:
+         previous = sent[i - 1].split('|')
+         prev_lemma = previous[1]
+         prev_postag = previous[2]
+         features['-1:lemma'] = prev_lemma
+         features['-1:postag'] = prev_postag
+ 
+     # Right neighbour
+     if i < len(sent) - 1:
+         following = sent[i + 1].split('|')
+         next_lemma = following[1]
+         next_postag = following[2]
+         features['+1:lemma'] = next_lemma
+         features['+1:postag'] = next_postag
+ 
+     return features
+ 
+ 
+ def sent2features(sent):
+     """Return the list of feature dictionaries, one per token of `sent`."""
+     feature_dicts = []
+     for position, _ in enumerate(sent):
+         feature_dicts.append(word2features(sent, position))
+     return feature_dicts
+ 
+ 
+ def sent2labels(sent):
+     """Return the label (fourth '|'-separated field) of every token in `sent`."""
+     labels = []
+     for token in sent:
+         fields = token.split('|')
+         labels.append(fields[3])
+     return labels
+ 
+ 
+ def sent2tokens(sent):
+     """Return the surface form of every token in `sent`.
+ 
+     Tokens in this script are 'word|lemma|postag|label' strings, so the
+     word is the first '|'-separated field.  Tuple or list tokens, whose
+     first item is the word, are still accepted.
+     """
+     return [elem.split('|')[0] if isinstance(elem, str) else elem[0]
+             for elem in sent]
+ 
+ 
+ def print_transitions(trans_features, f):
+     """Write one 'from -> to weight' line to `f` per transition feature.
+ 
+     `trans_features` yields ((label_from, label_to), weight) pairs.
+     """
+     for transition, weight in trans_features:
+         source_label, target_label = transition
+         row = "{:6} -> {:7} {:0.6f}\n".format(source_label, target_label, weight)
+         f.write(row)
+ 
+ 
+ def print_state_features(state_features, f):
+     """Write one 'weight label attribute' line to `f` per state feature.
+ 
+     `state_features` yields ((attr, label), weight) pairs.  The attribute
+     is written as text: encoding it to bytes first would make str.format
+     print its b'...' representation on Python 3.
+     """
+     for (attr, label), weight in state_features:
+         if isinstance(attr, bytes):
+             attr = attr.decode("utf-8")
+         f.write("{:0.6f} {:8} {}\n".format(weight, label, attr))
+ 
+ 
+ __author__ = 'CMendezC'
+ 
+ ##########################################
+ #               MAIN PROGRAM             #
+ ##########################################
+ 
+ if __name__ == "__main__":
+     # Script entry point: reads the training and test data sets, runs a
+     # randomized hyperparameter search for a CRF on the training data,
+     # saves the best model and writes prediction and evaluation reports.
+ 
+     # Defining parameters
+     parser = OptionParser()
+     parser.add_option("--inputPath", dest="inputPath",
+                       help="Path of training data set", metavar="PATH")
+     parser.add_option("--outputPath", dest="outputPath",
+                       help="Output path to place output files",
+                       metavar="PATH")
+     parser.add_option("--trainingFile", dest="trainingFile",
+                       help="File with training data set", metavar="FILE")
+     parser.add_option("--testFile", dest="testFile",
+                       help="File with test data set", metavar="FILE")
+     parser.add_option("--excludeStopWords", default=False,
+                       action="store_true", dest="excludeStopWords",
+                       help="Exclude stop words")
+     parser.add_option("--excludeSymbols", default=False,
+                       action="store_true", dest="excludeSymbols",
+                       help="Exclude punctuation marks")
+     parser.add_option("--reportFile", dest="reportFile",
+                       help="Report file", metavar="FILE")
+ 
+     # Positional arguments are not accepted; only the options above.
+     # NOTE(review): optparse documents parser.error() as exiting the program,
+     # so the sys.exit call below is presumably never reached.
+     (options, args) = parser.parse_args()
+     if len(args) > 0:
+         parser.error("Any parameter given.")
+         sys.exit(1)
+ 
+     print('-------------------------------- PARAMETERS --------------------------------')
+     print("Path of training data set: " + options.inputPath)
+     print("File with training data set: " + str(options.trainingFile))
+     print("Path of test data set: " + options.inputPath)
+     print("File with test data set: " + str(options.testFile))
+     print("Exclude stop words: " + str(options.excludeStopWords))
+     print("Report file: " + str(options.reportFile))
+     
+     # Lemmas treated as symbols when --excludeSymbols is given
+     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
+                '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
+     #print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
+     print("Exclude symbols: " + str(options.excludeSymbols))
+ 
+     print('-------------------------------- PROCESSING --------------------------------')
+     print('Reading corpus...')
+     t0 = time()
+ 
+     # Each data set is a list of sentences; each sentence is a list of
+     # '|'-separated token strings.
+     sentencesTrainingData = []
+     sentencesTestData = []
+ 
+     # From here on the name `stopwords` is the list of English stop words,
+     # no longer the nltk corpus object it was imported as.
+     stopwords = [word for word in stopwords.words('english')]
+ 
+     # One sentence per line; tokens are separated by whitespace.  A token is
+     # dropped when its lemma (second field) is a stop word or a symbol and
+     # the matching exclusion option is enabled.
+     with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
+         for line in iFile.readlines():
+             listLine = []
+             line = line.strip('\n')
+             for token in line.split():
+                 if options.excludeStopWords:
+                     listToken = token.split('|')
+                     lemma = listToken[1]
+                     if lemma in stopwords:
+                         continue
+                 if options.excludeSymbols:
+                     listToken = token.split('|')
+                     lemma = listToken[1]
+                     if lemma in symbols:
+                         continue
+                 listLine.append(token)
+             sentencesTrainingData.append(listLine)
+         print("   Sentences training data: " + str(len(sentencesTrainingData)))
+ 
+     # Same reading and filtering for the test data set
+     with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
+         for line in iFile.readlines():
+             listLine = []
+             line = line.strip('\n')
+             for token in line.split():
+                 if options.excludeStopWords:
+                     listToken = token.split('|')
+                     lemma = listToken[1]
+                     if lemma in stopwords:
+                         continue
+                 if options.excludeSymbols:
+                     listToken = token.split('|')
+                     lemma = listToken[1]
+                     if lemma in symbols:
+                         continue
+                 listLine.append(token)
+             sentencesTestData.append(listLine)
+         print("   Sentences test data: " + str(len(sentencesTestData)))
+ 
+     print("Reading corpus done in: %fs" % (time() - t0))
+ 
+     # Show the features of the first token of each data set as a sanity check
+     print(sent2features(sentencesTrainingData[0])[0])
+     print(sent2features(sentencesTestData[0])[0])
+     t0 = time()
+ 
+     # Feature dictionaries (X) and label sequences (y) per sentence
+     X_train = [sent2features(s) for s in sentencesTrainingData]
+     y_train = [sent2labels(s) for s in sentencesTrainingData]
+ 
+     X_test = [sent2features(s) for s in sentencesTestData]
+     # print X_test
+     y_test = [sent2labels(s) for s in sentencesTestData]
+ 
+     # Fixed parameters
+     # crf = sklearn_crfsuite.CRF(
+     #     algorithm='lbfgs',
+     #     c1=0.1,
+     #     c2=0.1,
+     #     max_iterations=100,
+     #     all_possible_transitions=True
+     # )
+ 
+     # Hyperparameter Optimization
+     crf = sklearn_crfsuite.CRF(
+         algorithm='lbfgs',
+         max_iterations=100,
+         all_possible_transitions=True
+     )
+     # c1 and c2 are sampled from exponential distributions during the search
+     params_space = {
+         'c1': scipy.stats.expon(scale=0.5),
+         'c2': scipy.stats.expon(scale=0.05),
+     }
+ 
+     # Original: labels = list(crf.classes_)
+     # Original: labels.remove('O')
+     # Labels taken into account by the F1 scorer and the reports ('O' is left out)
+     labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
+ 
+     # use the same metric for evaluation
+     f1_scorer = make_scorer(metrics.flat_f1_score,
+                             average='weighted', labels=labels)
+ 
+     # search: 20 sampled parameter settings, 10 cross-validation folds each
+     rs = RandomizedSearchCV(crf, params_space,
+                             cv=10,
+                             verbose=3,
+                             n_jobs=-1,
+                             n_iter=20,
+                             # n_iter=50,
+                             scoring=f1_scorer)
+     rs.fit(X_train, y_train)
+ 
+     # Fixed parameters
+     # crf.fit(X_train, y_train)
+ 
+     # Best hyperparameters
+     # crf = rs.best_estimator_
+     #nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.txt')
+     # The report name is the training file name with '.txt' replaced by
+     # '.fStopWords_' followed by the --reportFile value.
+     nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.reportFile))
+     # The report is created here and completed further below in append mode
+     with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
+         oFile.write("********** TRAINING AND TESTING REPORT **********\n")
+         oFile.write("Training file: " + options.trainingFile + '\n')
+         oFile.write('\n')
+         oFile.write('best params:' + str(rs.best_params_) + '\n')
+         oFile.write('best CV score:' + str(rs.best_score_) + '\n')
+         oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
+ 
+     print("Training done in: %fs" % (time() - t0))
+     t0 = time()
+ 
+     # Update best crf
+     crf = rs.best_estimator_
+ 
+     # Saving model
+     print("     Saving training model...")
+     t1 = time()
+     # The model file name records the stop-word and symbol exclusion settings
+     nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+         options.excludeSymbols) + '.mod')
+     joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
+     print("        Saving training model done in: %fs" % (time() - t1))
+ 
+     # Evaluation against test data
+     y_pred = crf.predict(X_test)
+     print("*********************************")
+     # Predicted label sequences, one sentence per line
+     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+         options.excludeSymbols) + '.txt')
+     with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
+         for y in y_pred:
+             oFile.write(str(y) + '\n')
+ 
+     print("*********************************")
+     # Expected label sequences, written in the same layout for comparison
+     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
+         options.excludeSymbols) + '.txt')
+     with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
+         for y in y_test:
+             oFile.write(str(y) + '\n')
+ 
+     print("Prediction done in: %fs" % (time() - t0))
+ 
+     # labels = list(crf.classes_)
+     # labels.remove('O')
+ 
+     # Append the evaluation on the test data to the report created above
+     with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
+         oFile.write('\n')
+         oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
+         oFile.write('\n')
+         # labels = list(crf.classes_)
+         # Labels are ordered by their text after the first character, then by
+         # the first character.
+         sorted_labels = sorted(
+             labels,
+             key=lambda name: (name[1:], name[0])
+         )
+         oFile.write(metrics.flat_classification_report(
+             y_test, y_pred, labels=sorted_labels, digits=3
+         ))
+         oFile.write('\n')
+ 
+         # The 50 highest- and 50 lowest-weighted transition features
+         oFile.write("\nTop likely transitions:\n")
+         print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+         oFile.write('\n')
+ 
+         oFile.write("\nTop unlikely transitions:\n")
+         print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+         oFile.write('\n')
+ 
+         # The 200 highest- and 200 lowest-weighted state features
+         oFile.write("\nTop positive:\n")
+         print_state_features(Counter(crf.state_features_).most_common(200), oFile)
+         oFile.write('\n')
+ 
+         oFile.write("\nTop negative:\n")
+         print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
+         oFile.write('\n')
+ 
--- a/CRF/bin/training_validation_v5.py 0 → 100644
View file @fd616ef
+++ b/CRF/bin/training_validation_v5.py 0 → 100644
View file @fd616ef
+ # -*- coding: UTF-8 -*-
+ 
+ import os
+ from itertools import chain
+ from optparse import OptionParser
+ from time import time
+ from collections import Counter
+ import re
+ 
+ import nltk
+ import sklearn
+ import scipy.stats
+ import sys
+ 
+ from sklearn.externals import joblib
+ from sklearn.metrics import make_scorer
+ from sklearn.cross_validation import cross_val_score
+ from sklearn.grid_search import RandomizedSearchCV
+ 
+ import sklearn_crfsuite
+ from sklearn_crfsuite import scorers
+ from sklearn_crfsuite import metrics
+ 
+ from nltk.corpus import stopwords
+ 
+ 
+ # Objective
+ # Training and evaluation of CRFs with sklearn-crfsuite.
+ #
+ # Input parameters
+ # --inputPath=PATH    	Path of training and test data set
+ # --trainingFile        File with training data set
+ # --testFile        	File with test data set
+ # --outputPath=PATH    	Output path to place output files
+ # --reportFile    		Report filename
+ 
+ # Output
+ # 1) Best model
+ 
+ # Examples
+ # python training_validation_v5.py
+ # --inputPath 			/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ # --trainingFile 		training-data-set-70.txt
+ # --testFile			test-data-set-30.txt
+ # --outputPath 			/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
+ # --reportFile    		report_1
+ # python3.4 training_validation_v5.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
+ 
+ #################################
+ #           FUNCTIONS           #
+ #################################
+ 
+ def isGreek(word):
+     """Tell whether `word` is a single Greek letter (either case).
+ 
+     Only whole elements of the alphabet match, so multi-character strings
+     and the empty string give False.
+     """
+     capitals = ['Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', 'Θ', 'Ι', 'Κ', 'Λ', 'Μ',
+                 'Ν', 'Ξ', 'Ο', 'Π', 'Ρ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', 'Ψ', 'Ω']
+     smalls = ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ',
+               'ν', 'ξ', 'ο', 'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω']
+     return (word in capitals) or (word in smalls)
+ def hUpper(word):
+     """Return True when `word` has at least one upper-case character."""
+     return any(char.isupper() for char in word)
+ 
+ def hLower(word):
+     """Return True when `word` has at least one lower-case character."""
+     return any(char.islower() for char in word)
+ 
+ def hGreek(word):
+     """Return True when at least one character of `word` is a Greek letter."""
+     return any(isGreek(char) for char in word)
+ 
+ 
+ def word2features(sent, i, S1, S2):
+     """Build the CRF feature dictionary for the token at position `i`.
+ 
+     Every element of `sent` is a '|'-separated string whose first three
+     fields are word, lemma and POS tag.
+ 
+     sent -- list of token strings forming one sentence
+     i    -- position of the token to describe
+     S1   -- when true, add the word and the "has upper/lower/Greek
+             character" flags
+     S2   -- when true, add the "is upper/lower/Greek/number" flags of the
+             whole word
+ 
+     The lemma and POS tag of the token, and the word, lemma and POS tag of
+     the previous and next tokens (when they exist), are always included.
+     """
+     listElem = sent[i].split('|')
+     word = listElem[0]
+     lemma = listElem[1]
+     postag = listElem[2]
+ 
+     features = {
+         #General
+         'lemma': lemma,
+         'postag': postag
+         }
+ 
+     # The feature sets must be stored with '='; "features[key]: value" is
+     # only an annotation and leaves the dictionary untouched.
+     if S1:
+         #S1
+         features['word'] = word
+         features['hUpper'] = hUpper(word)
+         features['hLower'] = hLower(word)
+         features['hGreek'] = hGreek(word)
+         #features['hAlfNum']: hAlfNum(word)
+ 
+     if S2:
+         #S2
+         features['isUpper'] = word.isupper()
+         features['isLower'] = word.islower()
+         features['isGreek'] = isGreek(word)
+         features['isNumber'] = word.isdigit()
+ 
+     if i > 0:
+         listElem = sent[i - 1].split('|')
+         word1 = listElem[0]
+         lemma1 = listElem[1]
+         postag1 = listElem[2]
+         features.update({
+             # Previous word
+             '-1:word': word1,
+             # Previous lemma
+             '-1:lemma': lemma1,
+             # Previous POS tag
+             '-1:postag': postag1,
+         })
+ 
+     if i < len(sent) - 1:
+         listElem = sent[i + 1].split('|')
+         word1 = listElem[0]
+         lemma1 = listElem[1]
+         postag1 = listElem[2]
+         features.update({
+             # Next word
+             '+1:word': word1,
+             # Next lemma
+             '+1:lemma': lemma1,
+             # Next POS tag
+             '+1:postag': postag1,
+         })
+     return features
+ 
+ 
+ def sent2features(sent, S1, S2):
+     """Return the feature dictionaries of `sent`, one per token.
+ 
+     S1 and S2 are passed unchanged to word2features for every token.
+     """
+     feature_dicts = []
+     for position, _ in enumerate(sent):
+         feature_dicts.append(word2features(sent, position, S1, S2))
+     return feature_dicts
+ 
+ 
def sent2labels(sent):
    """Return the fourth '|'-separated field (the label) of every token."""
    labels = []
    for token in sent:
        fields = token.split('|')
        labels.append(fields[3])
    return labels
+ 
+ 
def sent2tokens(sent):
    """Return the surface word of every token in *sent*.

    Sentences in this file are lists of 'word|lemma|postag|ner' strings, so
    the word is the first '|'-separated field. Tuple/list elements whose
    first item is the token (the layout the previous implementation
    unpacked) are still accepted.
    """
    return [
        elem.split('|')[0] if isinstance(elem, str) else elem[0]
        for elem in sent
    ]
+ 
+ 
def print_transitions(trans_features, f):
    """Write one 'label_from -> label_to weight' line per transition to *f*.

    *trans_features* is an iterable of ((label_from, label_to), weight)
    pairs; *f* is any object with a write() method.
    """
    row_template = "{:6} -> {:7} {:0.6f}\n"
    for label_pair, weight in trans_features:
        source_label, target_label = label_pair
        f.write(row_template.format(source_label, target_label, weight))
+ 
+ 
def print_state_features(state_features, f):
    """Write one 'weight label attribute' line per state feature to *f*.

    *state_features* is an iterable of ((attr, label), weight) pairs; *f* is
    a text-mode object with a write() method.

    The attribute is written as text. Encoding it to bytes first would make
    Python 3 format it as the bytes repr (b'...') in the report.
    """
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr))
+ 
+ 
__author__ = 'CMendezC'


def _read_sentences(path, excluded_lemmas):
    """Read *path* as one sentence per line and return a list of token lists.

    Each whitespace-separated token is expected to look like
    'word|lemma|postag|ner'. Tokens whose lemma (second field) is in
    *excluded_lemmas* are dropped. A line is kept even when all its tokens
    were dropped, so the result has one entry per line of the file.
    """
    sentences = []
    with open(path, "r") as iFile:
        for line in iFile:
            tokens = line.split()
            if excluded_lemmas:
                tokens = [token for token in tokens
                          if token.split('|')[1] not in excluded_lemmas]
            sentences.append(tokens)
    return sentences


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--excludeStopWords", default=False,
                      action="store_true", dest="excludeStopWords",
                      help="Exclude stop words")
    parser.add_option("--excludeSymbols", default=False,
                      action="store_true", dest="excludeSymbols",
                      help="Exclude punctuation marks")
    parser.add_option("--reportFile", dest="reportFile",
                      help="Report file", metavar="FILE")
    parser.add_option("--S1", default=False,
                      action="store_true", dest="S1",
                      help="Level specificity")
    parser.add_option("--S2", default=False,
                      action="store_true", dest="S2",
                      help="Level specificity")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage text and terminates the program itself,
    # so no explicit exit call is needed after it.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    # These four options are used to build paths below; fail early with a
    # usage message instead of a TypeError on None.
    for required in ("inputPath", "outputPath", "trainingFile", "testFile"):
        if getattr(options, required) is None:
            parser.error("Missing required option: --" + required)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Exclude stop words: " + str(options.excludeStopWords))
    print("Levels: " + str(options.S1) + " " + str(options.S2))
    print("Report file: " + str(options.reportFile))

    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Exclude symbols: " + str(options.excludeSymbols))

    # Create the output folders before the long hyperparameter search, so a
    # missing directory cannot discard the trained model afterwards.
    reportsPath = os.path.join(options.outputPath, "reports")
    modelsPath = os.path.join(options.outputPath, "models")
    os.makedirs(reportsPath, exist_ok=True)
    os.makedirs(modelsPath, exist_ok=True)

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    # Lemmas to drop while reading. A set gives O(1) membership tests, and
    # the stop-word corpus is only loaded when it is actually requested.
    excludedLemmas = set()
    if options.excludeStopWords:
        excludedLemmas.update(stopwords.words('english'))
    if options.excludeSymbols:
        excludedLemmas.update(symbols)

    sentencesTrainingData = _read_sentences(
        os.path.join(options.inputPath, options.trainingFile), excludedLemmas)
    print("   Sentences training data: " + str(len(sentencesTrainingData)))

    sentencesTestData = _read_sentences(
        os.path.join(options.inputPath, options.testFile), excludedLemmas)
    print("   Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    # NOTE(review): passing --S1 / --S2 sets the level to 0, which switches
    # the corresponding feature group OFF in word2features; both groups are
    # on by default. Kept as found -- confirm this inversion is intended.
    S1 = 0 if options.S1 else 1
    S2 = 0 if options.S2 else 1

    # Preview the features of the first token of each data set, when any.
    for sentences in (sentencesTrainingData, sentencesTestData):
        if sentences and sentences[0]:
            print(sent2features(sentences[0], S1, S2)[0])
    t0 = time()

    X_train = [sent2features(s, S1, S2) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s, S1, S2) for s in sentencesTestData]
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Hyperparameter optimization: c1 and c2 are sampled by the randomized
    # search below instead of being fixed on the estimator.
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Fixed label set used for scoring and reporting ('O' is not listed).
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti']

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    nameReport = options.trainingFile.replace('.txt', str(options.reportFile) + '.txt')
    reportPath = os.path.join(reportsPath, "report_" + nameReport)
    with open(reportPath, mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Suffix shared by the model file and the prediction dumps.
    filterSuffix = '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
        options.excludeSymbols)

    # Saving model
    print("     Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', filterSuffix + '.mod')
    joblib.dump(crf, os.path.join(modelsPath, nameModel))
    print("        Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    name = options.trainingFile.replace('.txt', filterSuffix + '.txt')

    print("*********************************")
    with open(os.path.join(reportsPath, "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    with open(os.path.join(reportsPath, "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    # Build each counter once; both the head and the tail are reported.
    transitionCounts = Counter(crf.transition_features_)
    stateCounts = Counter(crf.state_features_)

    with open(reportPath, mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        sorted_labels = sorted(
            labels,
            key=lambda label: (label[1:], label[0])
        )
        oFile.write(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted_labels, digits=3
        ))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(transitionCounts.most_common(50), oFile)
        oFile.write('\n')

        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(transitionCounts.most_common()[-50:], oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(stateCounts.most_common(200), oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(stateCounts.most_common()[-200:], oFile)
        oFile.write('\n')
+ 
+