Carlos-Francisco Méndez-Cruz

Final version binClass for papers

@@ -3,7 +3,8 @@ from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
-from sklearn import metrics
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
 from sklearn.svm import SVC
 import numpy as np
 import argparse
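Note on the hunk above: sklearn.cross_validation (still imported at the top of this file, per the hunk header) is the deprecated predecessor of sklearn.model_selection, which is why train_test_split now comes from the latter. A minimal sketch of the replacement imports and the binary metric calls they enable; the label arrays below are toy values, not from this repository:

# Sketch: current-style imports replacing the deprecated
# sklearn.cross_validation module; toy labels for illustration.
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [0, 1, 1, 0, 1]  # hypothetical gold labels
y_pred = [0, 1, 0, 0, 1]  # hypothetical predictions
print(precision_score(y_true, y_pred))  # binary average, pos_label=1 by default
print(recall_score(y_true, y_pred))
print(f1_score(y_true, y_pred))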
@@ -16,18 +17,23 @@ from scipy.stats import expon
 from sklearn.preprocessing import label_binarize
 from sklearn.datasets import load_files
 
+# CMC: Run example
+# python3.4 filter_papers.py --traind /home/cmendezc/gitlab_repositories/useless/data/TEXT_FILES
 
 parser = argparse.ArgumentParser(
     description="This script separates biomedical papers that"
     "report data from biomedical experiments from those that do not.")
-parser.add_argument("--input", help="Input file containing the to"
+parser.add_argument("--input", help="Input directory containing the papers to"
     "be predicted.")
 parser.add_argument("--traind", help="Input directory containing the papers of"
     "two classes to be learned.")
 parser.add_argument("--out", help="Path to the output directory "
     "(default='./filter_output')", default="filter_output")
 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
-    "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
+    "(default='./model_binClass/svm_model.paper.pkl')", default="model_binClass/svm_model.paper.pkl")
+parser.add_argument("--split", default=False,
+    action="store_true", dest="split",
+    help="Automatically split input data into training/test sets")
 
 args = parser.parse_args()
 labels = {0: 'useless', 1: 'useful'}
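With the new --split flag, the run example from the comment above extends to a training run that also holds out a test set (same path as in the comment; the flag is the one added in this hunk):

python3.4 filter_papers.py --traind /home/cmendezc/gitlab_repositories/useless/data/TEXT_FILES --split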
@@ -56,11 +62,20 @@ if args.traind and not args.input:
 
     model_params = {k: list(set(model_params[k])) for k in model_params}
 
-    tfidf_model = vectorizer.fit(data.data)
-    X = vectorizer.transform(data.data)
-    #y = [x['topic'] for x in abstracs]
-    y = data.target
+    # CMC: separate into training and validation datasets
+    if args.split:
+        X_train, X_test, y_train, y_test = train_test_split(
+            data.data, data.target, test_size=0.25, random_state=42)
+        tfidf_model = vectorizer.fit(X_train)
+        X = vectorizer.transform(X_train)
+        y = y_train
+    else:
+        #y = [x['topic'] for x in abstracs]
+        # Original Nacho:
+        tfidf_model = vectorizer.fit(data.data)
+        X = vectorizer.transform(data.data)
+        y = data.target
 
     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
 
     clf = SVC()  # kernel='linear', C=100.0, gamma=0.0001, degree=11, coef0=0.9
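The point of fitting the vectorizer on X_train only, as the new branch above does, is to keep held-out documents out of the IDF statistics. A self-contained sketch of that pattern, using made-up toy documents rather than this repository's data:

# Sketch: fit TF-IDF on the training split only, then reuse the fitted
# vectorizer on held-out text (toy documents, illustrative only).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

docs = ["gene expression assay", "review of prior work",
        "binding site experiment", "historical overview"]
targets = [1, 0, 1, 0]
X_tr, X_te, y_tr, y_te = train_test_split(docs, targets,
                                          test_size=0.25, random_state=42)
vec = TfidfVectorizer()
vec.fit(X_tr)                     # IDF computed from training docs only
X_test_mat = vec.transform(X_te)  # held-out docs mapped to training vocabulary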
@@ -81,15 +96,22 @@ if args.traind and not args.input:
     #print(metrics.f1_score(clf.predict(X_test), y_test))
 
     #joblib.dump(clf, 'model/svm_model.pkl')
-    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
-    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
+    joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.paper.pkl')
+    joblib.dump(tfidf_model, 'model_binClass/tfidf_model.paper.pkl')
 
+    if args.split:
+        X = vectorizer.transform(X_test)
+        y_pred = clf.predict(X)
+        print(precision_score(y_test, y_pred))
+        print(recall_score(y_test, y_pred))
+        print(f1_score(y_test, y_pred))
+        print(classification_report(y_test, y_pred))
 else:
     from pdb import set_trace as st
     data = load_files(container_path=args.input, encoding=None,
                       decode_error='replace')
     clf = joblib.load(args.svcmodel)
-    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
+    vectorizer = joblib.load('model_binClass/tfidf_model.paper.pkl')
     X = vectorizer.transform(data.data)
 
     classes = clf.predict(X)
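One detail about the scores printed in the args.split branch above: with integer labels, precision_score, recall_score and f1_score default to average='binary' with pos_label=1, so each single number describes only the 'useful' class, while classification_report covers both classes. A sketch with toy arrays:

# Sketch: the default binary average reports only class 1 ('useful');
# average=None or classification_report gives per-class scores (toy data).
from sklearn.metrics import f1_score, classification_report

y_test = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]
print(f1_score(y_test, y_pred))                # class 1 only
print(f1_score(y_test, y_pred, average=None))  # one score per class
print(classification_report(y_test, y_pred, target_names=['useless', 'useful']))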
@@ -97,10 +119,10 @@ else:
     if not os.path.exists(args.out):
         os.makedirs(args.out)
     # Writing predictions to output files
-    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
-         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
-        for c, a in zip(classes, papers):
+    with open(args.out + "/" + labels[0] + "-binClass-paper.out", 'w') as f0, \
+         open(args.out + "/" + labels[1] + "-binClass-paper.out", 'w') as f1:
+        for c, fname in zip(classes, data.filenames):
             if c == 0:
-                f0.write("%d\t%s\n" % (a['title'], a['body']))
+                f0.write("%s\n" % fname)
             elif c == 1:
-                f1.write("%d\t%s\n" % (a['title'], a['body']))
+                f1.write("%s\n" % fname)
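The rewritten output loop relies on load_files keeping data.data, data.target and data.filenames aligned (they are shuffled together), so zipping predictions with filenames pairs each predicted class with the document it came from. A sketch of that alignment on a hypothetical directory laid out like the training data:

# Sketch: load_files returns a Bunch whose filenames list is aligned with
# data.data; the directory name here is hypothetical.
from sklearn.datasets import load_files

bunch = load_files(container_path="papers_dir", encoding=None,
                   decode_error='replace')
for label, fname in zip(bunch.target, bunch.filenames):
    print(label, fname)  # e.g. 0 papers_dir/useless/doc1.txt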