Showing 17 changed files with 1388 additions and 1157 deletions
@@ -34,7 +34,7 @@ echo
 echo
 echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
 #cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
-cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sed 's/\\null\\/null/g' | sort | uniq)
+cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq)
 echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
 wc $output
 echo "$cext" | cut -f1-3,5 > $mapping
This diff could not be displayed because it is too large.
This file is too large to display.
@@ -60,8 +60,10 @@ import training_validation_v14 as training
 # --infoFile bg_sentences_midx.txt
 # --variant 13
 
+#Examples
 #python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
 #python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
+#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileIII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
 
 __author__ = 'egaytan'
 
@@ -70,12 +72,13 @@ __author__ = 'egaytan'
 ##########################################
 
 if __name__ == "__main__":
-    # Defining parameters
+    ########################################### Defining parameters ##########################################
    parser = OptionParser()
     parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
     parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
     parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
     parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
+    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
     parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
     parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
     parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
@@ -93,13 +96,14 @@ if __name__ == "__main__":
         parser.error("Any parameter given.")
         sys.exit(1)
 
-
+    ########################################### DISP PARAMETERS ##########################################
     print('-------------------------------- PARAMETERS --------------------------------')
 
     print("--inputPath Path of training data set : " + str(options.inputPath ))
     print("--outputPath Output path to place output files: " + str(options.outputPath ))
     print("--outputFileI Output tagged file I : " + str(options.outFileI ))
     print("--outputFileII Output tagged file II : " + str(options.outFileII ))
+    print("--outputFileIII Output tagged file III : " + str(options.outFileIII ))
     print("--modelPath Path to read CRF model : " + str(options.modelPath ))
     print("--modelName Model name : " + str(options.modelName ))
     print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
@@ -115,25 +119,29 @@ if __name__ == "__main__":
 
     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
                '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-
-    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
-
+    #print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
+    ########################################### PROCESSING ##########################################
     print('-------------------------------- PROCESSING --------------------------------')
 
     stopwords = [word for word in stopwords.words('english')]
-    # Read index
+    # Read index mapping GSE file information
     idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
 
-    # Read CRF model
+
+    ########################################### Read CRF model ##########################################
     t0 = time()
     print('Reading CRF model...')
     crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
     print("Reading CRF model done in: %fs" % (time() - t0))
 
-    # Reading sentences
+
+    ########################################### Reading sentences ##########################################
     print('Processing corpus...')
     t0 = time()
     labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
+
+
+    #####################################################################################
     # Walk directory to read files
     for path, dirs, files in os.walk(options.inputPath):
         # For each file in dir
@@ -165,25 +173,24 @@ if __name__ == "__main__":
             print("Sentences input data: " + str(len(sentencesInputData)))
 
 
-            # Predicting tags
+            ########################################### Predicting tags ###########################################
             t1 = time()
-            print("Predicting tags with model")
+            print("Predicting tags with model...")
             y_pred = crf.predict(X_input)
             print("Prediction done in: %fs" % (time() - t1))
 
 
-            # Tagging with CRF model
-            print("Tagging file")
+            ########################################### Tagging with CRF model ###########################################
+            print("Tagging file...")
             lidx = 0
             for line, tagLine in zip(lines, y_pred):
                 Ltags = set(labels).intersection(set(tagLine))
                 outputLine = ''
                 line = line.strip('\n')
-
-                #print("\nLine: " + str(line))
-                #print ("CRF tagged line: " + str(tagLine))
+
                 tb = 'O'
                 i = 0
+                ########################## one word sentences ##########################
                 if len(tagLine)==1:
                     if tagLine[0] in labels:
                         start = '<' + tagLine[0] + '> '
@@ -192,9 +199,11 @@ if __name__ == "__main__":
                         outputLine = start + word + end
                     else:
                         outputLine = line.split(' ')[0]
-                    #print(outputLine + '\t' + ', '.join(Ltags))
-                    sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + ', '.join(Ltags))
+                    ########################## Saving Sentence Output I ##########################
+                    sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
+                    ########################## Saving Sentence Output II ##########################
                     sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
+                    lidx += 1
                     continue
 
                 sentence = ''
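The added lidx += 1 before continue appears to be the substantive fix in this hunk: without it, one-word sentences left the GSE-GSM index pointer where it was, so every later sentence was paired with the wrong metadata row. A minimal sketch of the alignment issue, with invented data (not taken from the repository):

# Illustrative only: advancing the index pointer on every path keeps the
# sentences aligned with their GSE-GSM metadata rows.
idx = ["GSE1\tGSM1", "GSE1\tGSM2", "GSE1\tGSM3"]
tagged = [["single|O"], ["two|O", "words|O"], ["three|O", "short|O", "words|O"]]

lidx = 0
for tag_line in tagged:
    if len(tag_line) == 1:          # one-word sentence: early continue
        print(idx[lidx], tag_line[0])
        lidx += 1                   # the added line: without it, the next
        continue                    # sentence would reuse this metadata row
    print(idx[lidx], ' '.join(tag_line))
    lidx += 1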
@@ -216,6 +225,7 @@ if __name__ == "__main__":
                         # end sentence
                         outputLine += word.split('|')[0] + ' '
                         outputLine += '</' + tag + '/> '
+                        ########################## Saving Sentence Output II ##########################
                         sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
                         sb = False
                         tb = 'O'
@@ -225,6 +235,7 @@ if __name__ == "__main__":
                         # start new tag
                         outputLine += word.split('|')[0] + ' '
                         outputLine += '</' + tag + '/> '
+                        ########################## Saving Sentence Output II ##########################
                         sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
                         sb = False
                         tb = 'O'
@@ -235,21 +246,32 @@ if __name__ == "__main__":
                     i += 1
                     if sb:
                         sentence+= word.split('|')[0] + ' '
-                #print(outputLine + '\t' + ', '.join(Ltags))
-                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ ', '.join(Ltags))
-                lidx += 1
-
-            #print( DF(sentencesOutputDataI) )
-            #print( '\n'.join(sentencesOutputDataII) )
-            # Save tags
-            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFile:
-                for line in sentencesOutputDataII:
-                    #print(line)
-                    oFile.write(line + '\n')
+                ########################## Saving Sentence Output I ##########################
+                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags))
+                lidx += 1
+
+            ########################################### Save Output I ##########################################
+            print("Saving Output I...")
             with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                 for line in sentencesOutputDataI:
                     if re.findall('</', line):
-                        print(line)
-                        #oFileI.write(line + '\n')
+                        #print(line)
+                        oline = line.replace('LDR','(')
+                        oline = oline.replace('RDR',')')
+                        oFileI.write(oline + '\n')
+            ########################################### Save Output II ##########################################
+            print("Saving Output II...")
+            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
+                for line in sentencesOutputDataII:
+                    #print(line)
+                    oline = line.replace('LDR','(')
+                    oline = oline.replace('RDR',')')
+                    oFileII.write(oline + '\n')
+            ########################################### Save Output III ##########################################
+            print("Saving Output III...")
+            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
+                for line, tagLine in zip(lines, y_pred):
+                    oline = [ w.split('|')[0].replace('LDR','(').replace('RDR',')')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
 
+                    oFileIII.write(' '.join(oline) + '\n')
             print("Processing corpus done in: %fs" % (time() - t0))
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
@@ -9328,7 +9328,7 @@ GSE12006 GSM303526 GPL3154-PMID:18940002 characteristics_ch1.1
 GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.1
 GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.2
 GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.4
-GSE12006 GSM303527 GPL3154-PMID:18940002
+GSE12006 GSM303527 GPL3154-PMID:18940002 extract_protocol_ch1.3
 GSE12006 GSM303527 GPL3154-PMID:18940002 title.1
 GSE12006 GSM303527 GPL3154-PMID:18940002 source_name_ch1.1
 GSE12006 GSM303527 GPL3154-PMID:18940002 organism_ch1.1
@@ -9340,7 +9340,7 @@ GSE12006 GSM303527 GPL3154-PMID:18940002 characteristics_ch1.1
 GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.1
 GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.2
 GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.4
-GSE12006 GSM303528 GPL3154-PMID:18940002
+GSE12006 GSM303528 GPL3154-PMID:18940002 extract_protocol_ch1.3
 GSE12006 GSM303528 GPL3154-PMID:18940002 title.1
 GSE12006 GSM303528 GPL3154-PMID:18940002 source_name_ch1.1
 GSE12006 GSM303528 GPL3154-PMID:18940002 organism_ch1.1
@@ -9352,7 +9352,7 @@ GSE12006 GSM303528 GPL3154-PMID:18940002 characteristics_ch1.1
 GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.1
 GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.2
 GSE12006 GSM303529 GPL3154-PMID:18940002 growth_protocol_ch1.4
-GSE12006 GSM303529 GPL3154-PMID:18940002
+GSE12006 GSM303529 GPL3154-PMID:18940002 extract_protocol_ch1.3
 GSE12006 GSM303529 GPL3154-PMID:18940002 title.1
 GSE12006 GSM303529 GPL3154-PMID:18940002 source_name_ch1.1
 GSE12006 GSM303529 GPL3154-PMID:18940002 organism_ch1.1
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File mode changed
This diff could not be displayed because it is too large.
@@ -1,8 +0,0 @@
--------------------------------- PARAMETERS --------------------------------
-Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
-File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
-Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
-File to save recontrsucted bg-sentences: annot-input_bg_v3.txt
--------------------------------- PROCESSING --------------------------------
-Number of sentences: 14716
-==================================END===================================
@@ -1,8 +1,30 @@
 -------------------------------- PARAMETERS --------------------------------
-Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
-File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
-Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
-File to save recontrsucted bg-sentences: annot-input_bg_v4.txt
+--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+--outputFileI Output tagged file I : annot-input_bg_outputI_v4
+--outputFileII Output tagged file II : annot-input_bg_outputII_v4
+--outputFileIII Output tagged file III : annot-input_bg_outputIII_v4
+--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
+--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
+--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
+--variant Run variant : 13
+--S1 General features : True
+--S2 Inner/Complete word features : False
+--S3 Extended context features : False
+--S4 Semantic features : True
+--filteringStopWords Filtering stop words : False
+--filterSymbols Filtering punctuation marks : False
 -------------------------------- PROCESSING --------------------------------
-Number of sentences: 90904
-==================================END===================================
+Reading CRF model...
+Reading CRF model done in: 0.009463s
+Processing corpus...
+Preprocessing file...annot-input_bg_v4.txt
+Sentences input data: 90688
+Predicting tags with model...
+Prediction done in: 26.367272s
+Tagging file...
+Saving Output I...
+Saving Output II...
+Saving Output III...
+Processing corpus done in: 56.584394s
@@ -1,27 +0,0 @@
--------------------------------- PARAMETERS --------------------------------
---inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
---outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
---outputFileI Output tagged file I : annot-input_bg_outputI.txt
---outputFileII Output tagged file II : annot-input_bg_outputII.txt
---modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
---modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
---infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
---infoFile GSE-GSM index file : bg_sentences_midx.txt
---variant Run variant : 13
---S1 General features : True
---S2 Inner/Complete word features : False
---S3 Extended context features : False
---S4 Semantic features : True
---filteringStopWords Filtering stop words : False
---filterSymbols Filtering punctuation marks : False
-Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
--------------------------------- PROCESSING --------------------------------
-Reading CRF model...
-Reading CRF model done in: 0.008336s
-Processing corpus...
-Preprocessing file...annot-input_bg_v3.txt
-Sentences input data: 14716
-Predicting tags with model
-Prediction done in: 1.688127s
-Tagging file
-Processing corpus done in: 3.948320s
@@ -17,10 +17,16 @@
 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
 -------------------------------- PROCESSING --------------------------------
 Reading CRF model...
-Reading CRF model done in: 0.009804s
+Reading CRF model done in: 0.009363s
 Processing corpus...
 Preprocessing file...annot-input_bg_v3.txt
 Sentences input data: 14716
 Predicting tags with model
-Prediction done in: 1.811103s
+Prediction done in: 1.737334s
 Tagging file
+Preprocessing file...annot-input_bg_v4.txt
+Sentences input data: 90688
+Predicting tags with model
+Prediction done in: 26.434549s
+Tagging file
+Processing corpus done in: 58.304885s