Carlos-Francisco Méndez-Cruz

Conditional Random Fields

prepare-abstracts.py (deleted):

-# -*- coding: UTF-8 -*-
-
-from optparse import OptionParser
-import os
-import sys
-from time import time
-import re
-
-__author__ = 'CMendezC'
-
-# Objective: take text-annotated-abstracts-original.txt as input,
-# write each abstract to its own file with the annotation tags removed,
-# and collect a dictionary of genes for tagging after the NLP pipeline.
-
-# Parameters:
-#   1) --inputPath   Input path.
-#   2) --inputFile   Input file.
-#   3) --outputPath  Output path.
-#   4) --dicPath     Dictionary path.
-#   5) --dicFile     Dictionary file.
-
-# Execution:
-# python3 prepare-abstracts.py
-#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-#   --inputFile text-annotated-abstracts.txt
-#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
-#   --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries
-#   --dicFile genes.txt
-# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries --dicFile genes.txt
-
-
-if __name__ == "__main__":
-    # Parameter definition
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Input path", metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile",
-                      help="Input file", metavar="FILE")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path", metavar="PATH")
-    parser.add_option("--dicPath", dest="dicPath",
-                      help="Dictionary path", metavar="PATH")
-    parser.add_option("--dicFile", dest="dicFile",
-                      help="Dictionary file", metavar="FILE")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments expected.")
-        sys.exit(1)
-
-    # Printing parameter values
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Input path: " + str(options.inputPath))
-    print("Input file: " + str(options.inputFile))
-    print("Output path: " + str(options.outputPath))
-    print("Dictionary path: " + str(options.dicPath))
-    print("Dictionary file: " + str(options.dicFile))
-
-    filesWritten = 0
-    t0 = time()
-    hashGenes = {}
-
-    rePmid = re.compile(r'([\d]+)\|t\|')
-    reGene = re.compile(r'<g>([^<]+)</g>')
-    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
-    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
-        print("Reading file... " + options.inputFile)
-        for line in iFile:
-            line = line.strip('\r\n')
-            for gene in reGene.findall(line):
-                # print("genes: {}".format(gene))
-                if gene not in hashGenes:
-                    hashGenes[gene] = 1
-                else:
-                    hashGenes[gene] += 1
-            line = reTags.sub('', line)
-            result = rePmid.match(line)
-            if result:
-                line = rePmid.sub('', line)
-                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
-                    oFile.write(line)
-            else:
-                print("Warning: line without PMID")
-    with open(os.path.join(options.dicPath, options.dicFile), "w", encoding="utf-8", errors="replace") as dFile:
-        for gene in hashGenes.keys():
-            dFile.write("{}\n".format(gene))
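For reference, a small demonstration of what the three regular expressions do to one line of input (the line below is made up; the 'PMID|t|text' prefix format is what rePmid expects):

    import re

    rePmid = re.compile(r'([\d]+)\|t\|')
    reGene = re.compile(r'<g>([^<]+)</g>')
    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')

    line = '123456|t|Expression of <g>marA</g> increases under <i>salicylate</i> stress.'
    print(reGene.findall(line))  # ['marA'] -> counted in hashGenes
    line = reTags.sub('', line)  # strip the <g>, <d>, <i> tags
    m = rePmid.match(line)
    print(m.group(1))            # '123456' -> becomes the output file 123456.txt
    print(rePmid.sub('', line))  # 'Expression of marA increases under salicylate stress.'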
prepare-training-test.py (deleted):

-# -*- coding: UTF-8 -*-
-
-from optparse import OptionParser
-import os
-import sys
-from time import time
-
-__author__ = 'CMendezC'
-
-# Objective: join the transformed files to obtain the training and test data sets.
-
-# Parameters:
-#   1) --inputPath     Path to read files.
-#   2) --trainingFile  File name for training data.
-#   3) --testFile      File name for test data.
-#   4) --outputPath    Path to write files.
-
-# Output:
-#   1) Files created.
-
-# Execution:
-# python3.4 prepare-training-test.py
-#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
-#   --trainingFile training-data-set-70.txt
-#   --testFile test-data-set-30.txt
-#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-# python3.4 prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-
-###########################################################
-#                     MAIN PROGRAM                        #
-###########################################################
-
-if __name__ == "__main__":
-    # Parameter definition
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path to read files", metavar="PATH")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File for training examples", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File for test examples", metavar="FILE")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Path to write output files", metavar="PATH")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments expected.")
-        sys.exit(1)
-
-    # Printing parameter values
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path to read files: " + str(options.inputPath))
-    print("File for training examples: " + str(options.trainingFile))
-    print("File for test examples: " + str(options.testFile))
-    print("Path to write output files: " + str(options.outputPath))
-
-    t0 = time()
-    trainingDataset = []
-    testDataset = []
-
-    counter = 1
-    for path, dirs, files in os.walk(options.inputPath):
-        # For each file in dir
-        for file in files:
-            if counter <= 70:
-                print("   Joining file {} {} to training data set".format(counter, file))
-                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
-                    for line in iFile:
-                        line = line.strip('\r\n')
-                        trainingDataset.append(line)
-            elif counter > 70 and counter <= 100:
-                print("   Joining file {} {} to test data set".format(counter, file))
-                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
-                    for line in iFile:
-                        line = line.strip('\r\n')
-                        testDataset.append(line)
-            counter += 1
-    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
-        for line in trainingDataset:
-            oFile.write("{}\n".format(line))
-    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
-        for line in testDataset:
-            oFile.write("{}\n".format(line))
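Note that the 70/30 split is positional: the first 70 files that os.walk yields go to training and the next 30 to test, so the outcome depends on directory order, which os.walk does not guarantee. A minimal sketch of the same split with a deterministic order (the sorted order and the path literal are illustrative additions, not in the script):

    import os

    files = []
    for path, dirs, names in os.walk('data-sets/transformed'):
        files.extend(os.path.join(path, n) for n in names)
    files.sort()  # make the 70/30 assignment reproducible
    trainingFiles, testFiles = files[:70], files[70:100]  # assumes exactly 100 files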
Training script (diff):

@@ -49,12 +49,6 @@ from nltk.corpus import stopwords
 #################################
 #           FUNCTIONS           #
 #################################
-def endsConLow(word):
-    miregex = re.compile(r'[^aeiouA-Z0-9]$')
-    if miregex.search(word):
-        return True
-    else:
-        return False

 def word2features(sent, i):
     listElem = sent[i].split('|')
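The deleted endsConLow() returned True when a word's final character was neither a lowercase vowel, an uppercase letter, nor a digit, i.e., it flagged words ending in a lowercase consonant or a symbol. A quick check of that behavior:

    import re

    miregex = re.compile(r'[^aeiouA-Z0-9]$')
    print(bool(miregex.search('operon')))  # True: ends in lowercase consonant 'n'
    print(bool(miregex.search('marA')))    # False: ends in uppercase 'A'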
@@ -63,21 +57,9 @@ def word2features(sent, i):
     postag = listElem[2]

     features = {
-        # Suffixes
-        'word[-3:]': word[-3:],
-        'word[-2:]': word[-2:],
-        'word[-1:]': word[-1:],
-        #'word.isupper()': word.isupper(),
         'word': word,
-        'lemma': lemma,
-        'postag': postag,
-        'lemma[-3:]': lemma[-3:],
-        'lemma[-2:]': lemma[-2:],
-        'lemma[-1:]': lemma[-1:],
-        'word[:3]': word[:3],
-        'word[:2]': word[:2],
-        'word[:1]': word[:1],
-        'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
+        #'lemma': lemma,
+        #'postag': postag,
     }
     if i > 0:
         listElem = sent[i - 1].split('|')
@@ -86,8 +68,8 @@ def word2features(sent, i):
         postag1 = listElem[2]
         features.update({
             '-1:word': word1,
-            '-1:lemma': lemma1,
-            '-1:postag': postag1,
+            #'-1:lemma': lemma1,
+            #'-1:postag': postag1,
         })

     if i < len(sent) - 1:
@@ -97,55 +79,10 @@ def word2features(sent, i):
         postag1 = listElem[2]
         features.update({
             '+1:word': word1,
-            '+1:lemma': lemma1,
-            '+1:postag': postag1,
+            #'+1:lemma': lemma1,
+            #'+1:postag': postag1,
         })

-    '''
-    if i > 1:
-        listElem = sent[i - 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '-2:word': word2,
-            '-2:lemma': lemma2,
-        })
-
-    if i < len(sent) - 2:
-        listElem = sent[i + 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '+2:word': word2,
-            '+2:lemma': lemma2,
-        })
-
-    trigrams = False
-    if trigrams:
-        if i > 2:
-            listElem = sent[i - 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '-3:word': word3,
-                '-3:lemma': lemma3,
-            })
-
-        if i < len(sent) - 3:
-            listElem = sent[i + 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '+3:word': word3,
-                '+3:lemma': lemma3,
-            })
-    '''
-    return features
-

 def sent2features(sent):
     return [word2features(sent, i) for i in range(len(sent))]
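Note that this hunk removes 'return features' along with the commented-out window and trigram block, which would leave word2features returning None for every token. Presumably the return was meant to survive the cleanup; a minimal sketch under that assumption, plus an illustrative call (tokens use the word|lemma|postag format the function splits on):

    # ... inside word2features, after the +1 window update:
    return features  # still needed, otherwise every token maps to None

    # Illustrative usage:
    sent = ['Expression|expression|NN', 'of|of|IN', 'marA|mara|NN']
    word2features(sent, 0)  # -> {'word': 'Expression', '+1:word': 'of'}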
@@ -271,24 +208,13 @@ if __name__ == "__main__":
     y_test = [sent2labels(s) for s in sentencesTestData]

     # Fixed parameters
-    # crf = sklearn_crfsuite.CRF(
-    #     algorithm='lbfgs',
-    #     c1=0.1,
-    #     c2=0.1,
-    #     max_iterations=100,
-    #     all_possible_transitions=True
-    # )
-
-    # Hyperparameter Optimization
     crf = sklearn_crfsuite.CRF(
         algorithm='lbfgs',
+        c1=0.1,
+        c2=0.1,
         max_iterations=100,
         all_possible_transitions=True
     )
-    params_space = {
-        'c1': scipy.stats.expon(scale=0.5),
-        'c2': scipy.stats.expon(scale=0.05),
-    }

     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
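This hunk reverts from randomized hyperparameter search to fixed regularization; in sklearn-crfsuite, c1 and c2 are the L1 and L2 penalty coefficients of the lbfgs trainer. For reference, the deleted params_space drew candidate values from exponential distributions, roughly:

    import scipy.stats

    params_space = {
        'c1': scipy.stats.expon(scale=0.5),   # L1 penalty candidates
        'c2': scipy.stats.expon(scale=0.05),  # L2 penalty candidates
    }
    print(params_space['c1'].rvs(3))  # e.g. three sampled c1 values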
@@ -298,18 +224,8 @@ if __name__ == "__main__":
     f1_scorer = make_scorer(metrics.flat_f1_score,
                             average='weighted', labels=labels)

-    # search
-    rs = RandomizedSearchCV(crf, params_space,
-                            cv=10,
-                            verbose=3,
-                            n_jobs=-1,
-                            n_iter=20,
-                            # n_iter=50,
-                            scoring=f1_scorer)
-    rs.fit(X_train, y_train)
-
     # Fixed parameters
-    # crf.fit(X_train, y_train)
+    crf.fit(X_train, y_train)

     # Best hyperparameters
     # crf = rs.best_estimator_
@@ -319,16 +235,13 @@ if __name__ == "__main__":
         oFile.write("********** TRAINING AND TESTING REPORT **********\n")
         oFile.write("Training file: " + options.trainingFile + '\n')
         oFile.write('\n')
-        oFile.write('best params:' + str(rs.best_params_) + '\n')
-        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
-        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
+        oFile.write('best params:' + str(crf.best_params_) + '\n')
+        oFile.write('best CV score:' + str(crf.best_score_) + '\n')
+        oFile.write('model size: {:0.2f}M\n'.format(crf.best_estimator_.size_ / 1000000))

     print("Training done in: %fs" % (time() - t0))
     t0 = time()

-    # Update best crf
-    crf = rs.best_estimator_
-
     # Saving model
     print(" Saving training model...")
     t1 = time()
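One caveat this hunk introduces: a plain fitted sklearn_crfsuite.CRF has no best_params_, best_score_, or best_estimator_ attributes; those belonged to the removed RandomizedSearchCV object, so the new report lines would raise AttributeError. A hedged sketch of what the report could write instead with fixed hyperparameters (size_ appears to be the model-size attribute sklearn-crfsuite sets after fit):

        oFile.write('params: c1={}, c2={}\n'.format(crf.c1, crf.c2))
        oFile.write('model size: {:0.2f}M\n'.format(crf.size_ / 1000000))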
@@ -337,7 +250,7 @@ if __name__ == "__main__":
     joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
     print(" Saving training model done in: %fs" % (time() - t1))

-    # Evaluation against test data
+    # Evaluation against evaluation data
     y_pred = crf.predict(X_test)
     print("*********************************")
     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
...
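The evaluation code after the truncation point is not shown; a sketch of how the flat metrics behind f1_scorer above apply to the predictions (assuming y_test, y_pred, and labels as defined in the script):

    from sklearn_crfsuite import metrics

    y_pred = crf.predict(X_test)
    print(metrics.flat_f1_score(y_test, y_pred,
                                average='weighted', labels=labels))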