Carlos-Francisco Méndez-Cruz

Conditional Random Fields

# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import re
__author__ = 'CMendezC'
# Objective: take text-annotated-abstracts-original.txt as input,
# write each abstract to a separate file with annotation tags removed,
# and collect a dictionary of gene names for tagging after the NLP pipeline.
# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path.
# 4) --dicPath Dictionary path.
# 5) --dicFile Dictionary file.
# Execution:
# python3 prepare-abstracts.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --inputFile text-annotated-abstracts.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
# --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries
# --dicFile genes.txt
# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries --dicFile genes.txt
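# Expected input format, one abstract per line (illustrative line with a
# made-up PMID; <g>, <d>, <i> are the annotation tags stripped by reTags below):
# 12345678|t|The <g>marA</g> regulator activates <g>micF</g> in Escherichia coli.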
if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--dicPath", dest="dicPath",
                      help="Dictionary path", metavar="PATH")
    parser.add_option("--dicFile", dest="dicFile",
                      help="Dictionary file", metavar="FILE")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)
    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))
    print("Dictionary path: " + str(options.dicPath))
    print("Dictionary file: " + str(options.dicFile))
    filesWritten = 0
    t0 = time()
    hashGenes = {}
    # PMID prefix such as "12345678|t|" at the start of each line
    rePmid = re.compile(r'([\d]+)\|t\|')
    # Gene mentions annotated as <g>...</g>
    reGene = re.compile(r'<g>([^<]+)</g>')
    # All annotation tags to strip from the text
    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file... " + options.inputFile)
        for line in iFile:
            line = line.strip('\r\n')
            # Count every annotated gene mention
            for gene in reGene.findall(line):
                if gene not in hashGenes:
                    hashGenes[gene] = 1
                else:
                    hashGenes[gene] += 1
            line = reTags.sub('', line)
            result = rePmid.match(line)
            if result:
                line = rePmid.sub('', line)
                # One output file per abstract, named by PMID
                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
                    oFile.write(line)
                    filesWritten += 1
            else:
                print("Warning: line without PMID")
    # Write the collected gene dictionary, one name per line
    with open(os.path.join(options.dicPath, options.dicFile), "w", encoding="utf-8", errors="replace") as dFile:
        for gene in hashGenes.keys():
            dFile.write("{}\n".format(gene))
    print("Files written: {}".format(filesWritten))
    print("Done in: %fs" % (time() - t0))
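As a quick illustration, a minimal sketch (made-up PMID and gene names) of how the three regular expressions above process one input line:

sample = "12345678|t|The <g>marA</g> regulator activates <g>micF</g>."
# reGene.findall(sample)  -> ['marA', 'micF']   (counted into hashGenes)
# reTags.sub('', sample)  -> "12345678|t|The marA regulator activates micF."
# rePmid then captures "12345678", so the untagged text goes to 12345678.txt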
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
__author__ = 'CMendezC'
# Objective: join transformed files to build the training and test data sets
# Parameters:
# 1) --inputPath Path to read files.
# 2) --trainingFile File name for training data.
# 3) --testFile File name for test data.
# 4) --outputPath Path to write files.
# Output:
# 1) Files created.
# Execution:
# python3.4 prepare-training-test.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# python3.4 prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File for training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File for test examples", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)
    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File for training examples: " + str(options.trainingFile))
    print("File for test examples: " + str(options.testFile))
    print("Path to write output files: " + str(options.outputPath))
    t0 = time()
    trainingDataset = []
    testDataset = []
    # The first 70 files found go to training, files 71-100 to test.
    # Note this assumes exactly 100 input files and relies on os.walk order.
    counter = 1
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            if counter <= 70:
                print("   Joining file {} {} to training data set".format(counter, file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        trainingDataset.append(line)
            elif counter > 70 and counter <= 100:
                print("   Joining file {} {} to test data set".format(counter, file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        testDataset.append(line)
            counter += 1
    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in trainingDataset:
            oFile.write("{}\n".format(line))
    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in testDataset:
            oFile.write("{}\n".format(line))
    print("Done in: %fs" % (time() - t0))
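The hard-coded 70/100 boundary gives a true 70/30 split only when the transformed directory holds exactly 100 files, and the assignment depends on os.walk order. A minimal sketch of a proportional alternative (the split_files helper and the 0.7 ratio are illustrative, not part of the original script):

import os

def split_files(inputPath, ratio=0.7):
    # Sort for a deterministic split instead of relying on os.walk order
    allFiles = sorted(
        os.path.join(path, f)
        for path, dirs, files in os.walk(inputPath)
        for f in files
    )
    cut = int(len(allFiles) * ratio)
    return allFiles[:cut], allFiles[cut:]

trainingFiles, testFiles = split_files(options.inputPath)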
@@ -49,12 +49,6 @@ from nltk.corpus import stopwords
#################################
#           FUNCTIONS           #
#################################

# True if the word ends with a character that is not a vowel, an uppercase
# letter, or a digit (e.g., a lowercase consonant)
def endsConLow(word):
    miregex = re.compile(r'[^aeiouA-Z0-9]$')
    if miregex.search(word):
        return True
    else:
        return False
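# Illustrative behavior (made-up tokens):
#   endsConLow('operon')  -> True   (ends with the lowercase consonant 'n')
#   endsConLow('marA')    -> False  (ends with an uppercase letter)
#   endsConLow('sigma70') -> False  (ends with a digit)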
def word2features(sent, i):
    listElem = sent[i].split('|')
@@ -63,21 +57,9 @@ def word2features(sent, i):
    postag = listElem[2]
    features = {
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        # 'word.isupper()': word.isupper(),
        'word': word,
        'lemma': lemma,
        'postag': postag,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
        'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
        # 'lemma': lemma,
        # 'postag': postag,
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
@@ -86,8 +68,8 @@ def word2features(sent, i):
    postag1 = listElem[2]
    features.update({
        '-1:word': word1,
        '-1:lemma': lemma1,
        '-1:postag': postag1,
        # '-1:lemma': lemma1,
        # '-1:postag': postag1,
    })
    if i < len(sent) - 1:
@@ -97,55 +79,10 @@ def word2features(sent, i):
    postag1 = listElem[2]
    features.update({
        '+1:word': word1,
        '+1:lemma': lemma1,
        '+1:postag': postag1,
        # '+1:lemma': lemma1,
        # '+1:postag': postag1,
    })
    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })
    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })
    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })
        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
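For intuition, a minimal sketch (the word|lemma|postag triples are invented) of the feature dictionary built for the middle token; exactly which keys appear depends on which side of the diff above is active:

sent = ['The|the|DT', 'fliA|flia|NN', 'gene|gene|NN']
features = word2features(sent, 1)
# features includes, among others:
# {'word': 'fliA', 'lemma': 'flia', 'postag': 'NN',
#  'word[-3:]': 'liA', 'word[:2]': 'fl',
#  'endsConLow()=False': False,
#  '-1:word': 'The', '+1:word': 'gene', ...}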
@@ -271,24 +208,13 @@ if __name__ == "__main__":
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
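The exponential priors above bias the randomized search toward small regularization values, with means of 0.5 for c1 and 0.05 for c2 (the same priors used in the sklearn-crfsuite tutorial). A quick illustrative check of typical draws (output is random):

import scipy.stats
print(scipy.stats.expon(scale=0.5).rvs(size=5))   # typical c1 candidates, mean ~0.5
print(scipy.stats.expon(scale=0.05).rvs(size=5))  # typical c2 candidates, mean ~0.05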
    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
@@ -298,18 +224,8 @@ if __name__ == "__main__":
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)
    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)
    # Fixed parameters
    # crf.fit(X_train, y_train)
    crf.fit(X_train, y_train)
    # Best hyperparameters
    # crf = rs.best_estimator_
@@ -319,16 +235,13 @@ if __name__ == "__main__":
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
        # A plain CRF has no best_params_/best_score_; report what the fitted
        # model actually exposes
        oFile.write('params:' + str(crf.get_params()) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(crf.size_ / 1000000))
    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_
    # Saving model
    print("    Saving training model...")
    t1 = time()
@@ -337,7 +250,7 @@ if __name__ == "__main__":
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("    Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    # Evaluation against evaluation data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(