Showing
9 changed files
with
271 additions
and
40 deletions
# -*- coding: UTF-8 -*-

import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords

import training_validation_v14 as training

import json

#-------------------------------------------------------------------------------
# Objective
#   Tag a transformed (word|lemma|POS) corpus with a pre-trained CRF model
#   (sklearn-crfsuite), producing three tagged output files plus a JSON dump
#   of the per-token marginal probabilities.
#
# Input parameters
#   --inputPath=PATH     Path of transformed files x|y|z
#   --outputPath         Output path to place output files
#   --outputFileI        Output tagged file I
#   --outputFileII       Output tagged file II
#   --outputFileIII      Output tagged file III
#   --modelPath          Path to CRF model
#   --modelName          Model name
#   --infoPath           Path of GSE-GSM index file
#   --infoFile           GSE-GSM index file
#   --variant            Part of S2 variant
#   --S1                 Inner word features set
#   --S2                 Complete word features
#   --S3                 Extended context features
#   --S4                 Semantic features
#
# Output
#   1) Tagged files in transformed format
#
# Example
#   python tagging.py --inputPath .../predict-annot/input/
#       --outputPath .../predict-annot/output/
#       --outputFileI annot-input_bg_outputI_v5.txt
#       --outputFileII annot-input_bg_outputII_v5
#       --outputFileIII annot-input_bg_outputIII_v5
#       --modelPath .../CRF/models
#       --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
#       --infoPath .../predict-annot/mapping
#       --infoFile bg_sentences_midx_v4.txt
#       --variant 10 --S2
__author__ = 'egaytan'

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    ########################################### Defining parameters ##########################################
    parser = OptionParser()
    parser.add_option("--inputPath",     dest="inputPath",  help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath",    dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI",   dest="outFileI",   help="Output tagged file I", metavar="FILE")
    parser.add_option("--outputFileII",  dest="outFileII",  help="Output tagged file II", metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
    parser.add_option("--modelPath",     dest="modelPath",  help="Path to read CRF model", metavar="PATH")
    parser.add_option("--modelName",     dest="modelName",  help="Model name", metavar="TEXT")
    parser.add_option("--infoPath",      dest="infoPath",   help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile",      dest="idx",        help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant",       dest="variant",    help="Run variant", metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features",       action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features",    action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features",         action="store_true", default=False)

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the message and exits with a non-zero status,
        # so no extra sys.exit() is needed.
        parser.error("Any parameter given.")

    ########################################### DISP PARAMETERS ##########################################
    print('-------------------------------- PARAMETERS --------------------------------')

    print("--inputPath      Path of training data set        : " + str(options.inputPath ))
    print("--outputPath     Output path to place output files: " + str(options.outputPath))
    print("--outputFileI    Output tagged file I             : " + str(options.outFileI  ))
    print("--outputFileII   Output tagged file II            : " + str(options.outFileII ))
    # BUGFIX: this line previously echoed the label "--outputFileII" for the
    # outputFileIII value.
    print("--outputFileIII  Output tagged file III           : " + str(options.outFileIII))
    print("--modelPath      Path to read CRF model           : " + str(options.modelPath ))
    print("--modelName      Model name                       : " + str(options.modelName ))
    print("--infoPath       Path of GSE-GSM index file       : " + str(options.infoPath  ))
    print("--infoFile       GSE-GSM index file               : " + str(options.idx       ))
    print("--variant        Run variant                      : " + str(options.variant   ))
    print("--S1             Inner word features set          : " + str(options.S1        ))
    print("--S2             Complete word features           : " + str(options.S2        ))
    print("--S3             Extended context features        : " + str(options.S3        ))
    print("--S4             Semantic features                : " + str(options.S4        ))

    ########################################### PROCESSING ##########################################
    print('-------------------------------- PROCESSING --------------------------------')

    # Read the index mapping each sentence line to its GSE-GSM source record.
    # (Use a context manager so the handle is closed deterministically.)
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    ########################################### Read CRF model ##########################################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    ########################################### Reading sentences ##########################################
    print('Processing corpus...')
    t0 = time()
    # Entity labels the model may assign; any other predicted tag (e.g. 'O')
    # means the token is untagged.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp',
              'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']

    ########################################### Preprocessing ###########################################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir ('fname' avoids shadowing the builtin 'file')
        for fname in files:
            print("Preprocessing file..." + str(fname))
            sentencesInputData = []
            sentencesOutputDataI = []
            # Preprocessing input sentences
            with open(os.path.join(options.inputPath, fname), "r") as iFile:
                lines = iFile.readlines()
                sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build the feature representation for every sentence.
            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            ########################################### Predicting tags ###########################################
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)
            print("Prediction done in: %fs" % (time() - t1))

            ########################################### Tagging with CRF model ###########################################
            print("Tagging file...")
            lidx = 0
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags present in this sentence.
                Ltags = set(labels).intersection(set(tagLine))
                # Skip sentences with no entity tag at all.
                # BUGFIX: the intersection with `labels` can never contain 'O',
                # so the original test `Ltags == {'O'}` never skipped anything.
                # The index counter must still advance to stay aligned with idx.
                if not Ltags:
                    lidx += 1
                    continue
                line = line.strip('\n')
                outputLine = ''   # tagged sentence under construction
                tb = 'O'          # tag of the previous token ("tag behind")
                i = 0             # token index within the sentence
                # Exception for one-word sentences
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # add start tagging signature
                        start = '<' + tagLine[0] + '> '
                        # add end tagging signature
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        # save output tagged sentence
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving Sentence Output I
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                    # Increase sentence counter and continue with next sentence
                    lidx += 1
                    continue
                # Tagging sentences: walk tokens and open/close tag spans.
                for word, tag in zip(line.split(' '), tagLine):
                    # start tagging: a new span begins
                    if tag in labels and tb != tag:
                        # Close any span still open from the previous token.
                        # BUGFIX: the original emitted an *opening* '<tb>' here
                        # instead of the closing '</tb>'.
                        if tb in labels and not outputLine.endswith('</' + tb + '> '):
                            outputLine += '</' + tb + '> '
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        tb = tag
                        i += 1
                        continue
                    # continuation of the current span (tb == tag here)
                    elif tag in labels:
                        if i + 1 == len(tagLine):
                            # last token of the sentence: close the span
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                        elif tag != tagLine[i + 1]:
                            # span ends before the next token
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                    # Untagged (or mid-span) token: close a span left open by
                    # the previous token if needed.
                    # BUGFIX: the original compared outputLine[-2:] against a
                    # longer string (always unequal) and never reset tb, so a
                    # closing tag was re-emitted after every later 'O' token.
                    if tb != tag and tb in labels and not outputLine.endswith('</' + tb + '> '):
                        outputLine += '</' + tb + '> '
                    # word appended untagged; track the previous tag
                    outputLine += word.split('|')[0] + ' '
                    tb = tag if tag in labels else 'O'
                    i += 1
                # Saving Sentence Output I
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            ########################################### Save Output I ##########################################
            print("Saving Ouput I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    # keep only sentences that actually contain a closed tag
                    if '</' in line:
                        oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            ########################################### Save Output II ##########################################
            print("Saving Ouput II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                    # One row per tagged span: index columns + span text + tag.
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        oFileII.write('\t'.join(lline) + '\n')

            ########################################### Save Output III ##########################################
            print("Saving Ouput III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # BUGFIX: the second replace originally targeted '-LRB-'
                    # again, so '-RRB-' was never converted back to ')'.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                             for w, tag in zip(line.split(' '), tagLine)]
                    oFileIII.write(' '.join(oline) + '\n')

            ########################################### Save Probs ##########################################
            y_probs = crf.predict_marginals(X_input)
            # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)
    print("Passing corpus done in: %fs" % (time() - t0))
This file is too large to display.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File mode changed
1 | -------------------------------- PARAMETERS -------------------------------- | 1 | -------------------------------- PARAMETERS -------------------------------- |
2 | --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | 2 | --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ |
3 | --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | 3 | --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ |
4 | ---outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt | 4 | +--outputFileI Output tagged file I : annot-input_bg_outputI_v6 |
5 | ---outputFileII Output tagged file II : annot-input_bg_outputII_v5 | 5 | +--outputFileII Output tagged file II : annot-input_bg_outputII_v6 |
6 | ---outputFileII Output tagged file III : annot-input_bg_outputIII_v5 | 6 | +--outputFileII Output tagged file III : annot-input_bg_outputIII_v6 |
7 | --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models | 7 | --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models |
8 | --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | 8 | --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 |
9 | --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | 9 | --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping |
... | @@ -15,16 +15,14 @@ | ... | @@ -15,16 +15,14 @@ |
15 | --S4 Semantic features : False | 15 | --S4 Semantic features : False |
16 | -------------------------------- PROCESSING -------------------------------- | 16 | -------------------------------- PROCESSING -------------------------------- |
17 | Reading CRF model... | 17 | Reading CRF model... |
18 | -Reading CRF model done in: 0.009225s | 18 | +Reading CRF model done in: 0.009524s |
19 | Processing corpus... | 19 | Processing corpus... |
20 | Preprocessing file...annot-input_bg_v4.txt | 20 | Preprocessing file...annot-input_bg_v4.txt |
21 | Sentences input data: 90688 | 21 | Sentences input data: 90688 |
22 | Predicting tags with model... | 22 | Predicting tags with model... |
23 | -Prediction done in: 27.733279s | 23 | +Prediction done in: 27.458162s |
24 | Tagging file... | 24 | Tagging file... |
25 | -GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 library_strategy.1 <Technique> ChIP-Seq </Technique> Technique | ||
26 | -GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 growth_protocol_ch1.1 Cultures of Caulobacter -LRB- TLS1631-TLS1633 -RRB- were grown at 30oC in PYE and supplemented with antibiotics , as necessary , at appropriate concentrations . To deplete wild-type non-tagged ParB , exponential-phase cells were washed off xylose and re-introduced to PYE +0.2 % glucose for an additional <Supp> 5 hours </Supp> . After 4 hours , vanillate was added to induce the expression of flag-parB -LRB- WT -RRB- or flag-parB -LRB- G101S/R104A -RRB- for an hour . Cultures of Escherichia coli -LRB- TLS1637-TLS1650 -RRB- were grown at 30oC in LB and supplemented with antibiotics , as necessary , at appropriate concentrations . IPTG -LRB- 0.5 mM -RRB- was added to induce the production of T18-ParB -LRB- WT -RRB- or T18-ParB -LRB- G101S -RRB- . After an hour , formadehyde -LRB- 1 % final concentration -RRB- were added to fix cells for ChIP-seq . Supp | ||
27 | Saving Ouput I... | 25 | Saving Ouput I... |
28 | Saving Ouput II... | 26 | Saving Ouput II... |
29 | Saving Ouput III... | 27 | Saving Ouput III... |
30 | -Pssing corpus done in: 258.328259s | 28 | +Passing corpus done in: 257.970281s | ... | ... |
1 | --------------------------------- PARAMETERS -------------------------------- | ||
2 | ---inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | ||
3 | ---outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | ||
4 | ---outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt | ||
5 | ---outputFileII Output tagged file II : annot-input_bg_outputII_v4 | ||
6 | ---modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models | ||
7 | ---modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | ||
8 | ---infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | ||
9 | ---infoFile GSE-GSM index file : bg_sentences_midx_v4.txt | ||
10 | ---variant Run variant : 13 | ||
11 | ---S1 General features : True | ||
12 | ---S2 Inner/Complete word features : False | ||
13 | ---S3 Extended context features : False | ||
14 | ---S4 Semantic features : True | ||
15 | ---filteringStopWords Filtering stop words : False | ||
16 | ---filterSymbols Filtering punctuation marks : False | ||
17 | -Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False | ||
18 | --------------------------------- PROCESSING -------------------------------- | ||
19 | -Reading CRF model... | ||
20 | -Reading CRF model done in: 0.009363s | ||
21 | -Processing corpus... | ||
22 | -Preprocessing file...annot-input_bg_v3.txt | ||
23 | -Sentences input data: 14716 | ||
24 | -Predicting tags with model | ||
25 | -Prediction done in: 1.737334s | ||
26 | -Tagging file | ||
27 | -Preprocessing file...annot-input_bg_v4.txt | ||
28 | -Sentences input data: 90688 | ||
29 | -Predicting tags with model | ||
30 | -Prediction done in: 26.434549s | ||
31 | -Tagging file | ||
32 | -Processing corpus done in: 58.304885s |
-
Please register or login to post a comment