Estefani Gaytan Nunez

upload

......@@ -34,7 +34,7 @@ echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sed 's/\\null\\/null/g' | sort | uniq)
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq)
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
wc $output
echo "$cext" | cut -f1-3,5 > $mapping
......
This diff could not be displayed because it is too large.
......@@ -60,8 +60,10 @@ import training_validation_v14 as training
# --infoFile bg_sentences_midx.txt
# --variant 13
#Examples
#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
__author__ = 'egaytan'
......@@ -70,12 +72,13 @@ __author__ = 'egaytan'
##########################################
if __name__ == "__main__":
# Defining parameters
########################################### Defining parameters ##########################################
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
......@@ -93,13 +96,14 @@ if __name__ == "__main__":
parser.error("Any parameter given.")
sys.exit(1)
########################################### DISP PARAMETERS ##########################################
print('-------------------------------- PARAMETERS --------------------------------')
print("--inputPath Path of training data set : " + str(options.inputPath ))
print("--outputPath Output path to place output files: " + str(options.outputPath ))
print("--outputFileI Output tagged file I : " + str(options.outFileI ))
print("--outputFileII Output tagged file II : " + str(options.outFileII ))
print("--outputFileII Output tagged file III : " + str(options.outFileIII ))
print("--modelPath Path to read CRF model : " + str(options.modelPath ))
print("--modelName Model name : " + str(options.modelName ))
print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
......@@ -115,25 +119,29 @@ if __name__ == "__main__":
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
#print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
########################################### PROCESSING ##########################################
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word for word in stopwords.words('english')]
# Read index
# Read index mapping GSE file information
idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
# Read CRF model
########################################### Read CRF model ##########################################
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
# Reading sentences
########################################### Reading sentences ##########################################
print('Processing corpus...')
t0 = time()
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
#####################################################################################
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
......@@ -165,25 +173,24 @@ if __name__ == "__main__":
print("Sentences input data: " + str(len(sentencesInputData)))
# Predicting tags
########################################### Predicting tags ###########################################
t1 = time()
print("Predicting tags with model")
print("Predicting tags with model...")
y_pred = crf.predict(X_input)
print("Prediction done in: %fs" % (time() - t1))
# Tagging with CRF model
print("Tagging file")
########################################### Tagging with CRF model ###########################################
print("Tagging file...")
lidx = 0
for line, tagLine in zip(lines, y_pred):
Ltags = set(labels).intersection(set(tagLine))
outputLine = ''
line = line.strip('\n')
#print("\nLine: " + str(line))
#print ("CRF tagged line: " + str(tagLine))
tb = 'O'
i = 0
########################## one word sentences ##########################
if len(tagLine)==1:
if tagLine[0] in labels:
start = '<' + tagLine[0] + '> '
......@@ -192,9 +199,11 @@ if __name__ == "__main__":
outputLine = start + word + end
else:
outputLine = line.split(' ')[0]
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + ', '.join(Ltags))
########################## Saving Sentence Ouput I ##########################
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
########################## Saving Sentence Ouput II ##########################
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
lidx += 1
continue
sentence = ''
......@@ -216,6 +225,7 @@ if __name__ == "__main__":
# end sentence
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '/> '
########################## Saving Sentence Ouput II ##########################
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
sb = False
tb = 'O'
......@@ -225,6 +235,7 @@ if __name__ == "__main__":
# start new tag
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '/> '
########################## Saving Sentence Ouput II ##########################
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
sb = False
tb = 'O'
......@@ -235,21 +246,32 @@ if __name__ == "__main__":
i += 1
if sb:
sentence+= word.split('|')[0] + ' '
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ ', '.join(Ltags))
########################## Saving Sentence Ouput I ##########################
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags))
lidx += 1
#print( DF(sentencesOutputDataI) )
#print( '\n'.join(sentencesOutputDataII) )
# Save tags
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFile:
for line in sentencesOutputDataII:
#print(line)
oFile.write(line + '\n')
########################################### Save Output I ##########################################
print("Saving Ouput I...")
with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
for line in sentencesOutputDataI:
if re.findall('</', line):
print(line)
#oFileI.write(line + '\n')
#print(line)
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileI.write(oline + '\n')
########################################### Save Output II ##########################################
print("Saving Ouput II...")
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
for line in sentencesOutputDataII:
#print(line)
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileII.write(oline + '\n')
########################################### Save Output III ##########################################
print("Saving Ouput III...")
with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
for line, tagLine in zip(lines, y_pred):
oline = [ w.split('|')[0].replace('LDR','(').replace('LDR','(')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
oFileIII.write(' '.join(oline) + '\n')
print("Processing corpus done in: %fs" % (time() - t0))
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
......@@ -9328,7 +9328,7 @@ GSE12006 GSM303526 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303527 GPL3154-PMID:18940002
GSE12006 GSM303527 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303527 GPL3154-PMID:18940002 title.1
GSE12006 GSM303527 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 organism_ch1.1
......@@ -9340,7 +9340,7 @@ GSE12006 GSM303527 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303528 GPL3154-PMID:18940002
GSE12006 GSM303528 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303528 GPL3154-PMID:18940002 title.1
GSE12006 GSM303528 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 organism_ch1.1
......@@ -9352,7 +9352,7 @@ GSE12006 GSM303528 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303529 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303529 GPL3154-PMID:18940002
GSE12006 GSM303529 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303529 GPL3154-PMID:18940002 title.1
GSE12006 GSM303529 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303529 GPL3154-PMID:18940002 organism_ch1.1
......
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save reconstructed bg-sentences: annot-input_bg_v3.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 14716
==================================END===================================
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save reconstructed bg-sentences: annot-input_bg_v4.txt
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4
--outputFileII Output tagged file II : annot-input_bg_outputII_v4
--outputFileII Output tagged file III : annot-input_bg_outputIII_v4
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
-------------------------------- PROCESSING --------------------------------
Number of sentences: 90904
==================================END===================================
Reading CRF model...
Reading CRF model done in: 0.009463s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 26.367272s
Tagging file...
Saving Output I...
Saving Output II...
Saving Output III...
Processing corpus done in: 56.584394s
......
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI.txt
--outputFileII Output tagged file II : annot-input_bg_outputII.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008336s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.688127s
Tagging file
Processing corpus done in: 3.948320s
......@@ -17,10 +17,16 @@
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009804s
Reading CRF model done in: 0.009363s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.811103s
Prediction done in: 1.737334s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model
Prediction done in: 26.434549s
Tagging file
Processing corpus done in: 58.304885s
......