update

Estefani Gaytan Nunez
Commit 5cea83404b1beda20cbb1f859f60168c3ba8887f 5cea8340 1 parent d7b7af1d
Showing 33 changed files with 138 additions and 837 deletions
CRF/bin/label-split_training_test_v1.py
CRF/bin/label-split_training_test_v2.py
CRF/bin/label-split_training_test_v2.py.save → CRF/bin/label-split_training_test_v3.py
CRF/bin/params.py
CRF/bin/training_validation_v3.py
CRF/bin/training_validation_v5.py → CRF/bin/training_validation_v7.py
CRF/bin/training_validation_v4.py → CRF/bin/training_validation_v8.py
CRF/data-sets/Tags.txt
CRF/data-sets/test-data-set-30.txt
CRF/data-sets/test-data-set-30_v2.txt
CRF/data-sets/test-data-set-30_v3.txt
CRF/data-sets/test-data-set-30_v4.txt
CRF/data-sets/training-data-set-70.txt
CRF/data-sets/training-data-set-70_v2.txt
CRF/data-sets/training-data-set-70_v3.txt
CRF/data-sets/training-data-set-70_v4.txt
CRF/models/model_S1_False_S2_False_v1.mod
CRF/models/model_S1_False_S2_True_v1.mod
CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod → CRF/models/model_S1_True_S2_False_v1.mod
CRF/models/model_S1_True_S2_True_v1.mod
--- a/CRF/bin/label-split_training_test_v1.py deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/bin/label-split_training_test_v1.py deleted 100644 → 0
View file @d7b7af1
- #!/bin/python3
- import os
- from itertools import chain
- from optparse import OptionParser
- from time import time
- from collections import Counter
- import re
- 
- import nltk
- import sklearn
- import scipy.stats
- import sys
- 
- from sklearn.externals import joblib
- from sklearn.metrics import make_scorer
- from sklearn.cross_validation import cross_val_score
- from sklearn.grid_search import RandomizedSearchCV
- 
- import sklearn_crfsuite
- from sklearn_crfsuite import scorers
- from sklearn_crfsuite import metrics
- 
- from nltk.corpus import stopwords
- import random
- 
- 
- # Objective
- # Labaled separated by '|' and split 70/30 sentences on training and tets files from CoreNLP-tagging
- #
- # Input parameters
- # --inputPath=PATH    		Path of inputfile
- # --outputPath=PATH   		Path to place output files
- # --trainingFile=testFile  	Output training data set
- # --testFile=testFile  	  	Output test data set
- #
- # Output
- # training and test data set
- #
- # Examples
- # python label-split_training_test_v1.py
- # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
- # --inputFile sentences.tsv_pakal_.conll
- # --trainingFile training-data-set-70.txt
- # --testFile test-data-set-30.txt
- # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
- #
- # 
- # python label-split_training_test_v1.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile sentences.tsv_pakal_.conll --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
- 
- 
- ##########################################
- #               MAIN PROGRAM             #
- ##########################################
- 
- if __name__ == "__main__":
-     # Defining parameters
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path of output from CoreNLP", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Output path to place output files",
-                       metavar="PATH")
-     parser.add_option("--inputFile", dest="inputFile",
-                       help="File with CoreNLP-tagging sentences", metavar="FILE")
-     parser.add_option("--trainingFile", dest="trainingFile",
-                       help="File with training data set", metavar="FILE")
-     parser.add_option("--testFile", dest="testFile",
-                       help="File with test data set", metavar="FILE")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         parser.error("Any parameter given.")
-         sys.exit(1)
- 
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path of CoreNLP output: " + options.inputPath)
-     print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
-     print("Path of training data set: " + str(options.outputPath))
-     print("File with training data set: " + str(options.trainingFile))
-     print("Path of test data set: " + str(options.outputPath))
-     print("File with test data set: " + str(options.testFile))
-     print('-------------------------------- PROCESSING --------------------------------')
-     ## begin of tagging
-     in_labels = {
-      '<Gtype>': 'Gtype',
-      '<Gversion>': 'Gversion',
-      '<Med>': 'Med',
-      '<Phase>': 'Phase',
-      '<Sample>': 'Sample',
-      '<Serie>': 'Serie',
-      '<Substrain>': 'Substrain',
-      '<Supp>': 'Supp',
-      '<Technique>': 'Technique',
-      '<Temp>': 'Temp',
-      '<OD>': 'OD',
-      '<Anti>': 'Anti',
-      '<Agit>': 'Agit',
-      '<Vess>': 'Vess'
-     }
-     ## End of tagging
-     out_labels = {
-      '</Air>': 'O',
-      '</Gtype>': 'O',
-      '</Gversion>': 'O',
-      '</Med>': 'O',
-      '</Phase>': 'O',
-      '</Sample>': 'O',
-      '</Serie>': 'O',
-      '</Strain>': 'O',
-      '<Strain>': 'O',
-      '</Substrain>': 'O',
-      '</Supp>': 'O',
-      '</Technique>': 'O',
-      '</Temp>': 'O',
-      '</OD>': 'O',
-      '</Anti>': 'O',
-      '</Agit>': 'O',
-      '<Name>': 'O',
-      '</Name>': 'O',
-      '<Orgn>': 'O',
-      '</Orgn>': 'O',
-      '</Vess>': 'O'}
-     
-     # Other label
-     flag = 'O'
-     # sentences counter
-     n=0
-     lista = []
-     #First sentence
-     sentence = ''
-     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
- 	    for line in input_file:
- 		    if len(line.split('\t')) > 1:
- 			    w = line.split('\t')[1]
- 			    if w in in_labels or w in out_labels:
- 			    	#Tagging
- 				    if w in in_labels.keys(): flag = in_labels[w]
- 				    if w in out_labels: flag = out_labels[w]					
- 			    else:
- 				    if w == "PGCGROWTHCONDITIONS":
- 				    	#End of sentence
- 					    lista.append(sentence)
- 					    #New setence
- 					    sentence = ''
- 					    n=n+1
- 				    else:
- 				    	#Building and save tagging sentence
- 					    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
- 
-     print("Number of sentences: " + str(n))		
- 				 
-     # Split 70 30 training and test sentences
-     trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
-     testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
-     print(len(trainingIndex))
-     print(len(testIndex))
- 
-     with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
-       Data = [lista[i]  for i in trainingIndex]
-       oFile.write('\n'.join(Data))
- 
-     with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
-       Data = [lista[i]  for i in testIndex]
-       oFile.write('\n'.join(Data))
- 
-     print("==================================END===================================")
--- a/CRF/bin/label-split_training_test_v2.py deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/bin/label-split_training_test_v2.py deleted 100644 → 0
View file @d7b7af1
- #!/bin/python3    
- from optparse import OptionParser
- import re
- import os
- import random
- 
- 
- # Objective
- # Labaled separated by '|' and split 70/30 sentences on training and tets files from CoreNLP-tagging
- # make data sets using only sentences with at least one true-tag
- #
- # Input parameters
- # --inputPath=PATH    		Path of inputfile
- # --outputPath=PATH   		Path to place output files
- # --trainingFile=testFile  	Output training data set
- # --testFile=testFile  	  	Output test data set
- #
- # Output
- # training and test data set
- #
- # Examples
- # python label-split_training_test_v2.py
- # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
- # --inputFile sentences.tsv_pakal_.conll
- # --trainingFile training-data-set-70.txt
- # --testFile test-data-set-30.txt
- # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
- #
- # 
- # python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
- 
- 
- ##########################################
- #               MAIN PROGRAM             #
- ##########################################
- 
- if __name__ == "__main__":
-     # Defining parameters
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path of output from CoreNLP", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Output path to place output files",
-                       metavar="PATH")
-     parser.add_option("--inputFile", dest="inputFile",
-                       help="File with CoreNLP-tagging sentences", metavar="FILE")
-     parser.add_option("--trainingFile", dest="trainingFile",
-                       help="File with training data set", metavar="FILE")
-     parser.add_option("--testFile", dest="testFile",
-                       help="File with test data set", metavar="FILE")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         parser.error("Any parameter given.")
-         sys.exit(1)
- 
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path of CoreNLP output: " + options.inputPath)
-     print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
-     print("Path of training data set: " + options.outputPath)
-     print("File with training data set: " + str(options.trainingFile))
-     print("Path of test data set: " + options.outputPath)
-     print("File with test data set: " + str(options.testFile))
-     print('-------------------------------- PROCESSING --------------------------------')
-     ## begin of tagging
-     in_labels = {
-      '<Gtype>': 'Gtype',
-      '<Gversion>': 'Gversion',
-      '<Med>': 'Med',
-      '<Phase>': 'Phase',
-      '<Supp>': 'Supp',
-      '<Technique>': 'Technique',
-      '<Temp>': 'Temp',
-      '<OD>': 'OD',
-      '<Anti>': 'Anti'
-     }
-     ## End of tagging
-     out_labels = {
-      '<Air>': 'O',    
-      '</Air>': 'O',
-      '</Gtype>': 'O',
-      '</Gversion>': 'O',
-      '</Med>': 'O',
-      '</Phase>': 'O',
-      '<Sample>': 'O',
-      '</Sample>': 'O',
-      '<Serie>': 'O',
-      '</Serie>': 'O',
-      '<Strain>': 'O',
-      '</Strain>': 'O',
-      '<Substrain>': 'O',
-      '</Substrain>': 'O',
-      '</Supp>': 'O',
-      '</Technique>': 'O',
-      '</Temp>': 'O',
-      '</OD>': 'O',
-      '<Agit>': 'O',
-      '</Agit>': 'O',
-      '<Name>': 'O',
-      '</Name>': 'O',
-      '<Orgn>': 'O',
-      '</Orgn>': 'O',
-      '</Anti>': 'O',
-      '<Vess>': 'O',
-      '</Vess>': 'O'}
-     
-     # Other label
-     flag = 'O'
-     # sentences counter    
-     lista = []
-     #First sentence
-     sentence = ''
-     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
- 	    for line in input_file:
- 		    if len(line.split('\t')) > 1:
- 			    w = line.split('\t')[1]
- 			    if w in in_labels or w in out_labels:
- 			    	#Tagging                    
- 				    if w in in_labels.keys(): flag = in_labels[w]                    
- 				    if w in out_labels: flag = out_labels[w]					
- 			    else:                    
- 				    if w == "PGCGROWTHCONDITIONS":                        
- 			                        words = sentence.split(' ')
- 				    	#End of sentence
- 		                        	tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values() ]
- 	               		        #At least one true-tag on sentence
- 			                        if len(tags)> 0:
-                 			            lista.append(sentence)
- 		                        #New setence
- 	                            		sentence = ''                            
- 				    else:			            	
- 						sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
- 
-     print("Number of sentences: " + str( len(lista) ) )
- 
- 
-     # Split 70 30 training and test sentences
-     trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
-     testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
- 
-     with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
-       Data = [lista[i]  for i in trainingIndex]
-       oFile.write('\n'.join(Data))
- 
-     with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
-       Data = [lista[i]  for i in testIndex]
-       oFile.write('\n'.join(Data))
- 
-     print("==================================END===================================")
--- a/CRF/bin/label-split_training_test_v2.py.save → CRF/bin/label-split_training_test_v3.py
View file @5cea834
+++ b/CRF/bin/label-split_training_test_v2.py.save → CRF/bin/label-split_training_test_v3.py
View file @5cea834
- #!/bin/python3    
 from optparse import OptionParser
 import re
 import os
@@ -7,7 +6,6 @@ import random
 
 # Objective
 # Labels each token (fields separated by '|') and splits the sentences 70/30 into training and test files from the CoreNLP tagging
- # make data sets using only sentences with at least one true-tag
 #
 # Input parameters
 # --inputPath=PATH    		Path of inputfile
@@ -19,15 +17,15 @@ import random
 # training and test data set
 #
 # Examples
- # python label-split_training_test_v2.py
- # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
- # --inputFile sentences.tsv_pakal_.conll
- # --trainingFile training-data-set-70.txt
- # --testFile test-data-set-30.txt
- # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ # python label-split_training_test_v3.py
+ # --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/
+ # --inputFile raw-metadata-senteneces_v2.txt.conll
+ # --trainingFile training-data-set-70_v4.txt
+ # --testFile test-data-set-30_v4.txt
+ # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
 #
 # 
- # python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
+ # python label-split_training_test_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
 
 
 ##########################################
@@ -67,78 +65,79 @@ if __name__ == "__main__":
      '<Gtype>': 'Gtype',
      '<Gversion>': 'Gversion',
      '<Med>': 'Med',
-      '<Phase>': 'Phase',
+      '<Phase>': 'Phase',        
+      '<Substrain>': 'Substrain',
      '<Supp>': 'Supp',
+      '<Strain>': 'Strain',
      '<Technique>': 'Technique',
      '<Temp>': 'Temp',
      '<OD>': 'OD',
      '<Anti>': 'Anti',
      '<Agit>': 'Agit',
-      '<Vess>': 'Vess'
+      '<Air>': 'Air',    
+      '<Vess>': 'Vess',
+      '<pH>': 'pH'
     }
     ## End of tagging
     out_labels = {
-      '<Air>': 'O',    
-      '</Air>': 'O',
      '</Gtype>': 'O',
      '</Gversion>': 'O',
      '</Med>': 'O',
      '</Phase>': 'O',
-      '<Sample>': 'O',
-      '</Sample>': 'O',
-      '<Serie>': 'O',
-      '</Serie>': 'O',
-      '<Strain>': 'O',
-      '</Strain>': 'O',
-      '<Substrain>': 'O',
      '</Substrain>': 'O',
      '</Supp>': 'O',
+      '</Strain>': 'O',
      '</Technique>': 'O',
      '</Temp>': 'O',
      '</OD>': 'O',
      '</Anti>': 'O',
      '</Agit>': 'O',
-      '<Name>': 'O',
-      '</Name>': 'O',
+      '</Air>': 'O',
+      '</Vess>': 'O',
+      '</pH>': 'O'}
+     old_labels = {
      '<Orgn>': 'O',
-      '</Orgn>': 'O',
-      '</Vess>': 'O'}
+      '</Orgn>': 'O'
+      }
     
     # Other label
-     flag = 'O'
-     # sentences counter
-     n=0
+     flag = 'O'    
     lista = []
     #First sentence
     sentence = ''
+     n = 0
     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
 	    for line in input_file:
 		    if len(line.split('\t')) > 1:
- 			    w = line.split('\t')[1]
+ 			    w = line.split('\t')[1]                
 			    if w in in_labels or w in out_labels:
- 			    	#Tagging                    
- 				    if w in in_labels.keys(): flag = in_labels[w]                    
- 				    if w in out_labels: flag = out_labels[w]					
- 			    else:                    
+ 			    	#Tagging
+ 				    if w in in_labels.keys(): flag = in_labels[w]
+ 				    if w in out_labels: flag = out_labels[w]                    
+ 			    else:
 				    if w == "PGCGROWTHCONDITIONS":
- 					words = sentence.split(' ')
-                         tags = [tag for tag in words if word.split('|')[-1] in in_labels.values() ]
-                         #At least one true-tag on sentence
-                         if len(tags)> 0:
-                             lista.append(sentence)
-                             #New setence
-                             sentence = ''
-                             n=n+1
- 				    else:
- 				    	#Building and save tagging sentence
+ 					n=n+1
+ 				    	words = sentence.split(' ')
+         		                #End of sentence
+ 	                	        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values() ]
+ 		                        #At least one true-tag on sentence
+                 		        if len(tags)> 0:
+ 		                            lista.append(sentence)
+ 					    #New setence
+ 					sentence = ''				    
+ 				    elif w not in old_labels.keys():
+                         		#Building and save tagging sentence
 					sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
 
- 	print("Number of sentences: " + str(n) + str(len(lista)+1))
+     print("Number of sentences with at least one tag: " + str(len(lista)))		
+     print("Number of sentences from CoreNLP: " + str(n))
 
 
     # Split 70 30 training and test sentences
-     trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
-     testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
+     trainingIndex = random.sample(range(len(lista)), int(len(lista)*.70))
+     testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
+     print("Number of sentences for training: " + str(len(trainingIndex)))
+     print("Number of sentences for test: " + str(len(testIndex)))
 
     with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
       Data = [lista[i]  for i in trainingIndex]
--- a/CRF/bin/params.py deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/bin/params.py deleted 100644 → 0
View file @d7b7af1
- #!/bin/python3
- import os
- from itertools import chain
- from optparse import OptionParser
- from time import time
- from collections import Counter
- import re
- 
- import nltk
- import sklearn
- import scipy.stats
- import sys
- 
- from sklearn.externals import joblib
- from sklearn.metrics import make_scorer
- from sklearn.cross_validation import cross_val_score
- from sklearn.grid_search import RandomizedSearchCV
- 
- import sklearn_crfsuite
- from sklearn_crfsuite import scorers
- from sklearn_crfsuite import metrics
- 
- from nltk.corpus import stopwords
- 
- import random
- 
- 
- # Objective
- # Labaled separated by '|' and split 70/30 sentences on training and tets files from CoreNLP-tagging
- #
- # Input parameters
- # --inputPath=PATH    		Path of inputfile
- # --outputPath=PATH   		Path to place output files
- # --trainingFile=testFile  	Output training data set
- # --testFile=testFile  	  	Output test data set
- #
- # Output
- # training and test data set
- #
- # Examples
- # python label-split_training_test_v1.py
- # --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
- # --inputFile sentences.tsv_pakal_.conll
- # --trainingFile training-data-set-70.txt
- # --testFile test-data-set-30.txt
- # --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
- #
- # 
- # python label-split_training_test_v1.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile sentences.tsv_pakal_.conll --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
- 
- 
- ##########################################
- #               MAIN PROGRAM             #
- ##########################################
- 
- if __name__ == "__main__":
-     # Defining parameters
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path of output from CoreNLP", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Output path to place output files",
-                       metavar="PATH")
-     parser.add_option("--inputFile", dest="inputFile",
-                       help="File with CoreNLP-tagging sentences", metavar="FILE")
-     parser.add_option("--trainingFile", dest="trainingFile",
-                       help="File with training data set", metavar="FILE")
-     parser.add_option("--testFile", dest="testFile",
-                       help="File with test data set", metavar="FILE")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         parser.error("Any parameter given.")
-         sys.exit(1)
- 
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path of CoreNLP output: " + str(options.inputPath))
-     print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
-     print("Path of training data set: " + str(options.outputPath))
-     print("File with training data set: " + str(options.trainingFile))
-     print("Path of test data set: " + str(options.outputPath))
-     print("File with test data set: " + str(options.testFile))
-     print('-------------------------------- PROCESSING --------------------------------')
-     ## begin of tagging
-     in_labels = {
-      '<Gtype>': 'Gtype',
-      '<Gversion>': 'Gversion',
-      '<Med>': 'Med',
-      '<Phase>': 'Phase',
-      '<Sample>': 'Sample',
-      '<Serie>': 'Serie',
-      '<Substrain>': 'Substrain',
-      '<Supp>': 'Supp',
-      '<Technique>': 'Technique',
-      '<Temp>': 'Temp',
-      '<OD>': 'OD',
-      '<Anti>': 'Anti',
-      '<Agit>': 'Agit',
-      '<Vess>': 'Vess'
-     }
-     ## End of tagging
-     out_labels = {
-      '</Air>': 'O',
-      '</Gtype>': 'O',
-      '</Gversion>': 'O',
-      '</Med>': 'O',
-      '</Phase>': 'O',
-      '</Sample>': 'O',
-      '</Serie>': 'O',
-      '</Strain>': 'O',
-      '<Strain>': 'O',
-      '</Substrain>': 'O',
-      '</Supp>': 'O',
-      '</Technique>': 'O',
-      '</Temp>': 'O',
-      '</OD>': 'O',
-      '</Anti>': 'O',
-      '</Agit>': 'O',
-      '<Name>': 'O',
-      '</Name>': 'O',
-      '<Orgn>': 'O',
-      '</Orgn>': 'O',
-      '</Vess>': 'O'}
-     
-     # Other label
-     flag = 'O'
-     # sentences counter
-     n=0
-     lista = []
-     #First sentence
-     sentence = ''
-     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
- 	    for line in input_file:
- 		    if len(line.split('\t')) > 1:
- 			    w = line.split('\t')[1]
- 			    if w in in_labels or w in out_labels:
- 			    	#Tagging
- 				    if w in in_labels.keys(): flag = in_labels[w]
- 				    if w in out_labels: flag = out_labels[w]					
- 			    else:
- 				    if w == "PGCGROWTHCONDITIONS":
- 				    	#End of sentence
- 					    lista.append(sentence)
- 					    #New setence
- 					    sentence = ''
- 					    n=n+1
- 				    else:
- 				    	#Building and save tagging sentence
- 					    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
- 
-     print("Number of sentences: " + str(n))		
-     print('\n'.join(lista))				 
-     # Split 70 30 training and test sentences
- #    trainingIndex = random.sample(range(len(lista)-1), int(len(lista)*.70))
- #    testIndex = [n for n in range(len(lista)-1) if n not in trainingIndex]
- 
- #    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
- #      Data = [lista[i]  for i in trainingIndex]
- #      oFile.write('\n'.join(Data))
- 
- #    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
- #      Data = [lista[i]  for i in testIndex]
- #      oFile.write('\n'.join(Data))
- 
- #    print("==================================END===================================")
- 
--- a/CRF/bin/training_validation_v3.py deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/bin/training_validation_v3.py deleted 100644 → 0
View file @d7b7af1
--- a/CRF/bin/training_validation_v5.py → CRF/bin/training_validation_v7.py
View file @5cea834
+++ b/CRF/bin/training_validation_v5.py → CRF/bin/training_validation_v7.py
View file @5cea834
@@ -32,7 +32,7 @@ from nltk.corpus import stopwords
 # --trainingFile        File with training data set
 # --testFile        	File with test data set
 # --outputPath=PATH    	Output path to place output files
- # --reportFile    		Report Fileneme
+ # --version    		    Version Report
 
 # Output
 # 1) Best model
@@ -43,31 +43,54 @@ from nltk.corpus import stopwords
 # --trainingFile 		training-data-set-70.txt
 # --testFile			test-data-set-30.txt
 # --outputPath 			/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
- # --reportFile    		report_1
- # python3.4 training-validation_v5.py --inputPatTH /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
+ # --version    		    _v2
+ # python3 training_validation_v7.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --version _v1 
 
 #################################
 #           FUNCTIONS           #
 #################################
 
 def isGreek(word):
+     # All Greek letters, upper- and lower-case (the lower-case row includes final sigma)
     alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
     'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
     if word in alphabet:  # list membership: True only when word is exactly one listed letter
         return True
     else:
         return False 
+ 
+ def hNumber(word):  # True when at least one character of word is a digit
+     for l in word:
+         if l.isdigit():
+             return True
+     return False  # no digit found (also the result for an empty word)
+ 
+ def symb(word):
+     """Return True when word contains at least one non-alphanumeric character.
+ 
+     A character counts as a symbol when it is neither a digit nor a letter.
+     Greek letters are letters for str.isalpha(), so they are never reported
+     as symbols. An empty word has no symbols and yields False.
+     """
+     # The previous version counted alphanumeric characters and added one more
+     # for each Greek letter; since isalpha() is already true for Greek letters
+     # they were counted twice, which hid a real symbol in words like 'α-'.
+     return any(not (l.isdigit() or l.isalpha()) for l in word)
+ 
 def hUpper(word):
+     # True when word has at least one upper-case character
     for l in word:
         if l.isupper(): return True
     return False
 
 def hLower(word):
+     # True when word has at least one lower-case character
     for l in word:
         if l.islower(): return True
     return False 
 
 def hGreek(word):
+     # True when word has at least one Greek letter (per isGreek)
     for l in word:
         if isGreek(l): return True
     return False    
@@ -80,54 +103,69 @@ def word2features(sent, i, S1, S2):
     postag = listElem[2]
     ner = listElem[3]
 
+     #====================== G1 ======================#
+ 
     features = {
         #General
         'lemma': lemma,
         'postag': postag
         }
 
-     if S1:
-         #S1
-         features['word']:    word
-         features['hUpper']:  hUpper(word)
-         features['hLower']:  hUpper(word)
-         features['hGreek']:  hGreek(word)        
-         #features['hAlfNum']: hAlfNum(word)
- 
-     if S2:
-         #S2
-         features['isUpper']:  word.isupper()
-         features['isLower']:  word.isLower()
-         features['isGreek']:  isGreek(word)        
-         features['isNumber']: word.isdigit()    
- 
     if i > 0:        
-         listElem = sent[i - 1].split('|')
-         word1 = listElem[0]
+         listElem = sent[i - 1].split('|')        
         lemma1 = listElem[1]
         postag1 = listElem[2]
-         features.update({
-             #Word anterioir
-             '-1:word': word1,
+         
+         features.update({            
             #LemaG  posterior      
             '-1:lemma': lemma1,
             #PostG posterior 
             '-1:postag': postag1,
         })
 
-     if i < len(sent) - 1:
-         listElem = sent[i + 1].split('|')
-         word1 = listElem[0]
+     if i < len(sent) - 1:        
+         listElem = sent[i + 1].split('|')        
         lemma1 = listElem[1]
         postag1 = listElem[2]
-         features.update({
-             #Word anterioir
-             '+1:word': word1,
+ 
+         features.update({           
             #LemaG  posterior      
             '+1:lemma': lemma1,
             #PostG posterior 
             '+1:postag': postag1,
         })    
+ 
+     #====================== S1 ======================#
+     if S1:
+         listElem = sent[i - 1].split('|')        
+         lemma1 = listElem[1]
+         postag1 = listElem[2]
+         
+         features['hUpper']:  hUpper(word)
+         features['hLower']:  hUpper(word)
+         features['hGreek']:  hGreek(word)        
+         features['symb']: symb(word)
+         #firstChar
+         features['lemma1[:1]']: lemma1[:1]
+         #secondChar
+         features['postag[:1]']: lemma1[:1]
+         features['postag[:2]']: lemma1[:2]
+         features['lemma[:2]']: lemma1[:2]
+ 
+     #====================== S2 ======================#
+     if S2:
+         #S2
+         features['isUpper']:  word.isupper()
+         features['isLower']:  word.isLower()
+         features['isGreek']:  isGreek(word)        
+         features['isNumber']: word.isdigit()    
+ 
+ 
+     '''
+     #====================== S3 ======================#
+     if S3:        
+         features['word']: word
+     '''
     return features
 
 
@@ -153,7 +191,7 @@ def print_state_features(state_features, f):
         f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
 
 
- __author__ = 'CMendezC'
+ __author__ = 'egaytan'
 
 ##########################################
 #               MAIN PROGRAM             #
@@ -177,7 +215,7 @@ if __name__ == "__main__":
     parser.add_option("--excludeSymbols", default=False,
                       action="store_true", dest="excludeSymbols",
                       help="Exclude punctuation marks")
-     parser.add_option("--reportFile", dest="reportFile",
+     parser.add_option("--version", dest="version",
                       help="Report file", metavar="FILE")
     parser.add_option("--S1", default=False,
                       action="store_true", dest="S1",
@@ -198,7 +236,7 @@ if __name__ == "__main__":
     print("File with test data set: " + str(options.testFile))
     print("Exclude stop words: " + str(options.excludeStopWords))
     print("Levels: " + str(options.S1) + " " + str(options.S2))
-     print("Report file: " + str(options.reportFile))
+     print("Report file: " + str(options.version))
 
     
     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
@@ -254,19 +292,14 @@ if __name__ == "__main__":
 
     print("Reading corpus done in: %fs" % (time() - t0))
 
-     if options.S1: S1 = 0
-     else: S1 = 1
-     if options.S2: S2 = 0
-     else: S2 = 1
- 
-     print(sent2features(sentencesTrainingData[0], S1, S2)[0])
-     print(sent2features(sentencesTestData[0], S1, S2)[0])
+     print(sent2features(sentencesTrainingData[0], options.S1, options.S2)[0])
+     print(sent2features(sentencesTestData[0], options.S1, options.S2)[0])
     t0 = time()
 
-     X_train = [sent2features(s, S1, S2) for s in sentencesTrainingData]
+     X_train = [sent2features(s, options.S1, options.S2) for s in sentencesTrainingData]
     y_train = [sent2labels(s) for s in sentencesTrainingData]
 
-     X_test = [sent2features(s, S1, S2) for s in sentencesTestData]
+     X_test = [sent2features(s, options.S1, options.S2) for s in sentencesTestData]
     # print X_test
     y_test = [sent2labels(s) for s in sentencesTestData]
 
@@ -292,7 +325,7 @@ if __name__ == "__main__":
 
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-     labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
+     labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
 
     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
@@ -312,8 +345,10 @@ if __name__ == "__main__":
     # crf.fit(X_train, y_train)
 
     # Best hyperparameters
-     # crf = rs.best_estimator_    
-     nameReport = options.trainingFile.replace('.txt', str(options.reportFile) + '.txt')
+     # crf = rs.best_estimator_
+     
+ 
+     nameReport = str(options.S1) + '_S2_' + str(options.S2) +  str(options.version) + '.txt'
     with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
         oFile.write("********** TRAINING AND TESTING REPORT **********\n")
         oFile.write("Training file: " + options.trainingFile + '\n')
@@ -331,27 +366,13 @@ if __name__ == "__main__":
     # Saving model
     print("     Saving training model...")
     t1 = time()
-     nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
-         options.excludeSymbols) + '.mod')
+     nameModel = 'model_S1_' + str(options.S1) + '_S2_' + str(options.S2) + str(options.version) + '.mod'
     joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
     print("        Saving training model done in: %fs" % (time() - t1))
 
     # Evaluation against test data
     y_pred = crf.predict(X_test)
     print("*********************************")
-     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
-         options.excludeSymbols) + '.txt')
-     with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
-         for y in y_pred:
-             oFile.write(str(y) + '\n')
- 
-     print("*********************************")
-     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
-         options.excludeSymbols) + '.txt')
-     with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
-         for y in y_test:
-             oFile.write(str(y) + '\n')
- 
     print("Prediction done in: %fs" % (time() - t0))
 
     # labels = list(crf.classes_)
@@ -387,4 +408,3 @@ if __name__ == "__main__":
         print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
         oFile.write('\n')
 
- 
--- a/CRF/bin/training_validation_v4.py → CRF/bin/training_validation_v8.py
View file @5cea834
+++ b/CRF/bin/training_validation_v4.py → CRF/bin/training_validation_v8.py
View file @5cea834
--- a/CRF/data-sets/Tags.txt
View file @5cea834
+++ b/CRF/data-sets/Tags.txt
View file @5cea834
@@ -3,8 +3,6 @@ Gtype
 Gversion
 Med
 Phase
- Sample
- Serie
 Strain
 Supp
 Technique
@@ -13,4 +11,5 @@ OD
 Anti
 Agit
 Vess
- 
+ Substrain
+ pH
--- a/CRF/data-sets/test-data-set-30.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/data-sets/test-data-set-30.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/data-sets/test-data-set-30_v2.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/data-sets/test-data-set-30_v2.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/data-sets/test-data-set-30_v3.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/data-sets/test-data-set-30_v3.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/data-sets/test-data-set-30_v4.txt 0 → 100644
View file @5cea834
+++ b/CRF/data-sets/test-data-set-30_v4.txt 0 → 100644
View file @5cea834
--- a/CRF/data-sets/training-data-set-70.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/data-sets/training-data-set-70.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/data-sets/training-data-set-70_v2.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/data-sets/training-data-set-70_v2.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/data-sets/training-data-set-70_v3.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/data-sets/training-data-set-70_v3.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/data-sets/training-data-set-70_v4.txt 0 → 100644
View file @5cea834
+++ b/CRF/data-sets/training-data-set-70_v4.txt 0 → 100644
View file @5cea834
--- a/CRF/models/model_S1_False_S2_False_v1.mod 0 → 100644
View file @5cea834
+++ b/CRF/models/model_S1_False_S2_False_v1.mod 0 → 100644
View file @5cea834
--- a/CRF/models/model_S1_False_S2_True_v1.mod 0 → 100644
View file @5cea834
+++ b/CRF/models/model_S1_False_S2_True_v1.mod 0 → 100644
View file @5cea834
--- a/CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod → CRF/models/model_S1_True_S2_False_v1.mod
View file @5cea834
+++ b/CRF/models/training-data-set-70.fStopWords_False.fSymbols_False.mod → CRF/models/model_S1_True_S2_False_v1.mod
View file @5cea834
--- a/CRF/models/model_S1_True_S2_True_v1.mod 0 → 100644
View file @5cea834
+++ b/CRF/models/model_S1_True_S2_True_v1.mod 0 → 100644
View file @5cea834
--- a/CRF/reports/report_False_S2_False_v1.txt 0 → 100644
View file @5cea834
+++ b/CRF/reports/report_False_S2_False_v1.txt 0 → 100644
View file @5cea834
--- a/CRF/reports/report_False_S2_True_v1.txt 0 → 100644
View file @5cea834
+++ b/CRF/reports/report_False_S2_True_v1.txt 0 → 100644
View file @5cea834
--- a/CRF/reports/report_True_S2_False_v1.txt 0 → 100644
View file @5cea834
+++ b/CRF/reports/report_True_S2_False_v1.txt 0 → 100644
View file @5cea834
--- a/CRF/reports/report_True_S2_True_v1.txt 0 → 100644
View file @5cea834
+++ b/CRF/reports/report_True_S2_True_v1.txt 0 → 100644
View file @5cea834
--- a/CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/reports/report_training-data-set-70.fStopWords_False.fSymbols_False.txt deleted 100644 → 0
View file @d7b7af1
--- a/CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/reports/y_pred_training-data-set-70.fStopWords_False.fSymbols_False.txt deleted 100644 → 0
View file @d7b7af1
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O']
- ['Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Anti', 'Anti', 'Anti']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Supp']
- ['O', 'O', 'O', 'Gtype', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gtype', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'Anti', 'Anti', 'Anti', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Anti', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'Anti', 'Anti']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Anti']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'Substrain']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gtype', 'Gtype', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Med']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['Gversion', 'Gversion']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Anti', 'Anti', 'Anti', 'Anti']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['Med', 'Med', 'Med', 'Med', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--- a/CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt deleted 100644 → 0
View file @d7b7af1
+++ b/CRF/reports/y_test_training-data-set-70.fStopWords_False.fSymbols_False.txt deleted 100644 → 0
View file @d7b7af1
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Technique']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 
'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'Substrain']
- ['O', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'OD', 'OD', 'OD', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype', 'Gtype', 'Gtype']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Anti', 'Anti', 'Anti']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Supp']
- ['O', 'O', 'O', 'Anti', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['Substrain', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O', 'Med', 'Med', 'Med', 'Med', 'O', 'Supp', 'Supp', 'Supp', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'Anti']
- ['O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'Anti', 'Anti', 'Anti', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Anti', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['Substrain', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'Anti', 'Anti', 'Anti', 'Anti']
- ['OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD', 'OD']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'Anti']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'Supp']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gtype', 'Gtype', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'OD', 'OD', 'OD', 'OD', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Med']
- ['O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Supp']
- ['O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Gversion', 'Gversion', 'Gversion', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Phase', 'Phase', 'O', 'O', 'O', 'Temp', 'Temp', 'O', 'Med', 'Med', 'Med', 'O', 'O', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'O', 'O', 'O', 'O', 'O', 'O', 'Supp', 'Supp', 'O']
- ['O', 'O', 'Gtype']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'Anti', 'Anti', 'Anti', 'Anti']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['O', 'O', 'O']
- ['O', 'O', 'O', 'O']
- ['O', 'O', 'O', 'O', 'O', 'O', 'O']
- ['Med', 'Med', 'Med', 'Med', 'O', 'O', 'O', 'O', 'O', 'O', 'Temp', 'Temp', 'Temp', 'O', 'O', 'Agit', 'Agit', 'Agit', 'Agit', 'Agit', 'Agit', 'Agit']
--- a/CoreNLP/bin/get-raw-sentences.sh
View file @5cea834
+++ b/CoreNLP/bin/get-raw-sentences.sh
View file @5cea834
- cd /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data
+ 
+ 
+ # Original files
+ #cd /home/egaytan/automatic-extraction-growth-conditions/data-sets/report-manually-tagged-gcs/
+ 
+ # Re-tagged
+ cd /home/egaytan/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
 echo
 echo
 echo
@@ -18,9 +24,9 @@ echo
 echo
 echo "Filter all paragraphs with tags..."
 echo "Add sentence-end-tag PGCGROWTHCONDITIONS..."
- grep -E "<[^<]*>"  * | grep -E  '!'| cut -f2 -d'='|sort|uniq|awk '{ print $_" PGCGROWTHCONDITIONS"; }'  > /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt
+ grep -E "<[^<]*>"  * | grep -E  '!'| cut -f2 -d'='|sort|uniq|awk '{ print $_" PGCGROWTHCONDITIONS"; }'  > /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt
 echo
- echo "Number of total tag sentences: "$(wc /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt -l);
+ echo "Number of total tag sentences: "$(wc /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt -l);
 echo
 echo
- echo "Saving file: /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt";
+ echo "Saving file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt";
--- a/CoreNLP/bin/single_run.sh
View file @5cea834
+++ b/CoreNLP/bin/single_run.sh
View file @5cea834
@@ -4,8 +4,8 @@ echo "==============================Run CoreNLP=================================
 echo
 echo
 
- input="/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt";
- output="/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/";
+ input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces_v2.txt";
+ output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/";
 echo "input file: "$input;
 echo
 echo "output directory: "$output;
--- a/CoreNLP/input/raw-metadata-senteneces.txt → CoreNLP/input/raw-metadata-senteneces_v1.txt
View file @5cea834
+++ b/CoreNLP/input/raw-metadata-senteneces.txt → CoreNLP/input/raw-metadata-senteneces_v1.txt
View file @5cea834
--- a/CoreNLP/input/raw-metadata-senteneces_v2.txt 0 → 100644
View file @5cea834
+++ b/CoreNLP/input/raw-metadata-senteneces_v2.txt 0 → 100644
View file @5cea834
--- a/CoreNLP/output/raw-metadata-senteneces.txt.conll → CoreNLP/output/raw-metadata-senteneces_v1.txt.conll
View file @5cea834
+++ b/CoreNLP/output/raw-metadata-senteneces.txt.conll → CoreNLP/output/raw-metadata-senteneces_v1.txt.conll
View file @5cea834
--- a/CoreNLP/output/raw-metadata-senteneces_v2.txt.conll 0 → 100644
View file @5cea834
+++ b/CoreNLP/output/raw-metadata-senteneces_v2.txt.conll 0 → 100644
View file @5cea834