Showing 1 changed file with 37 additions and 15 deletions
```diff
@@ -3,7 +3,8 @@ from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
-from sklearn import metrics
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
 from sklearn.svm import SVC
 import numpy as np
 import argparse
```
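After this hunk, two `train_test_split` imports coexist: the file header still aliases it from `sklearn.cross_validation` (deprecated in scikit-learn 0.18, removed in 0.20) as `splitt`, while the new line imports the `sklearn.model_selection` version directly. A minimal sketch of what the consolidated imports could look like, assuming scikit-learn ≥ 0.18:

```python
# Hypothetical cleanup: a single import location for the splitter plus explicit metrics.
# The deprecated `from sklearn.cross_validation import train_test_split as splitt`
# line could then be dropped, since both names refer to the same function.
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, f1_score,
                             precision_score, recall_score)
```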
```diff
@@ -16,18 +17,23 @@ from scipy.stats import expon
 from sklearn.preprocessing import label_binarize
 from sklearn.datasets import load_files
 
+# CMC: Run example
+# python3.4 filter_papers.py --traind /home/cmendezc/gitlab_repositories/useless/data/TEXT_FILES
 
 parser = argparse.ArgumentParser(
     description="This script separates biomedical papers that "
                 "report data from biomedical experiments from those that do not.")
-parser.add_argument("--input", help="Input file containing the to"
+parser.add_argument("--input", help="Input directory containing the papers to "
                     "be predicted.")
 parser.add_argument("--traind", help="Input directory containing the papers of "
                     "two classes to be learned.")
 parser.add_argument("--out", help="Path to the output directory "
                     "(default='./filter_output')", default="filter_output")
 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
-                    "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
+                    "(default='./model_binClass/svm_model.paper.pkl')", default="model_binClass/svm_model.paper.pkl")
+parser.add_argument("--split", default=False,
+                    action="store_true", dest="split",
+                    help="Automatically split input data into training/test sets")
 
 args = parser.parse_args()
 labels = {0: 'useless', 1: 'useful'}
```
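The new `--split` option is a boolean flag: with `action="store_true"`, the mere presence of the flag flips `args.split` to `True`, and no value is required. A small self-contained illustration of that argparse pattern (toy parser, not the script itself):

```python
import argparse

# Minimal parser mirroring the new flag's definition.
parser = argparse.ArgumentParser()
parser.add_argument("--split", default=False, action="store_true", dest="split",
                    help="Automatically split input data into training/test sets")

print(parser.parse_args([]).split)           # False: flag absent
print(parser.parse_args(["--split"]).split)  # True: flag present, takes no value
```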
```diff
@@ -56,10 +62,19 @@ if args.traind and not args.input:
 
     model_params = {k: list(set(model_params[k])) for k in model_params}
 
-    tfidf_model = vectorizer.fit(data.data)
-    X = vectorizer.transform(data.data)
-    #y = [x['topic'] for x in abstracs]
-    y = data.target
+    # CMC: separate into training/validation datasets
+    if args.split:
+        X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)
+        tfidf_model = vectorizer.fit(X_train)
+        X = vectorizer.transform(X_train)
+        y = y_train
+    else:
+        #y = [x['topic'] for x in abstracs]
+        # Original Nacho:
+        tfidf_model = vectorizer.fit(data.data)
+        X = vectorizer.transform(data.data)
+        y = data.target
+
 
     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
```
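The point of the new branch is that the TF-IDF vocabulary and IDF weights are learned from the training split only, so held-out documents never influence the features. A minimal, self-contained sketch of the same pattern (the toy `docs`/`targets` data is illustrative, not from the script):

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Toy corpus standing in for data.data / data.target.
docs = ["reports an experiment", "review without data",
        "new measurements reported", "opinion piece"]
targets = [1, 0, 1, 0]  # 1 = useful, 0 = useless

docs_train, docs_test, y_train, y_test = train_test_split(
    docs, targets, test_size=0.25, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)  # vocabulary/IDF from training text only
X_test = vectorizer.transform(docs_test)        # held-out docs reuse the fitted vocabulary
```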
```diff
@@ -81,15 +96,22 @@ if args.traind and not args.input:
     #print(metrics.f1_score(clf.predict(X_test), y_test))
 
     #joblib.dump(clf, 'model/svm_model.pkl')
-    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
-    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
+    joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.paper.pkl')
+    joblib.dump(tfidf_model, 'model_binClass/tfidf_model.paper.pkl')
 
+    if args.split:
+        X = vectorizer.transform(X_test)
+        y_pred = clf.predict(X)
+        print(precision_score(y_test, y_pred))
+        print(recall_score(y_test, y_pred))
+        print(f1_score(y_test, y_pred))
+        print(classification_report(y_test, y_pred))
 else:
     from pdb import set_trace as st
     data = load_files(container_path=args.input, encoding=None,
                       decode_error='replace')
     clf = joblib.load(args.svcmodel)
-    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
+    vectorizer = joblib.load('model_binClass/tfidf_model.paper.pkl')
     X = vectorizer.transform(data.data)
 
     classes = clf.predict(X)
```
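For reference, with the binary labels used here (0 = useless, 1 = useful) the three score functions default to reporting the positive class (1) only, while `classification_report` covers both classes. A small self-contained illustration (toy arrays, not the script's data):

```python
from sklearn.metrics import (precision_score, recall_score,
                             f1_score, classification_report)

y_test = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]

print(precision_score(y_test, y_pred))  # 1.0: no false positives for class 1
print(recall_score(y_test, y_pred))     # 0.667: one useful paper was missed
print(f1_score(y_test, y_pred))         # 0.8: harmonic mean of the two
print(classification_report(y_test, y_pred,
                            target_names=['useless', 'useful']))
```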
```diff
@@ -97,10 +119,10 @@ else:
     if not os.path.exists(args.out):
         os.makedirs(args.out)
     # Writing predictions to output files
-    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
-         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
-        for c, a in zip(classes, papers):
+    with open(args.out + "/" + labels[0] + "-binClass-paper.out", 'w') as f0, \
+         open(args.out + "/" + labels[1] + "-binClass-paper.out", 'w') as f1:
+        for c, a in zip(classes, data.filenames):
             if c == 0:
-                f0.write("%d\t%s\n" % (a['title'], a['body']))
+                f0.write("%s\n" % a)
             elif c == 1:
-                f1.write("%d\t%s\n" % (a['title'], a['body']))
+                f1.write("%s\n" % a)
```
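Since `load_files` keeps each document's path in `data.filenames`, aligned with `data.data`, the prediction branch routes filenames rather than the old `papers` dicts. A minimal sketch of that path end to end (the `papers_to_filter` directory is hypothetical, and the `model_binClass/` paths assume the models saved by the training branch above):

```python
import joblib
from sklearn.datasets import load_files

# Load the papers to classify; load_files expects at least one subdirectory,
# but for prediction only .data and .filenames are used.
data = load_files(container_path='papers_to_filter', encoding=None,
                  decode_error='replace')

clf = joblib.load('model_binClass/svm_model.paper.pkl')
vectorizer = joblib.load('model_binClass/tfidf_model.paper.pkl')

X = vectorizer.transform(data.data)
for label, path in zip(clf.predict(X), data.filenames):
    print(label, path)  # 0 = useless, 1 = useful
```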