Created
December 18, 2017 11:30
-
-
Save yujuwon/17d125bd4af3b85fd6532212e064dc82 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer | |
# Build several kinds of models with scikit-learn: three pipelines that
# differ only in the vectorizer feeding a multinomial naive Bayes classifier.
clf_1 = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# MultinomialNB requires non-negative feature values. The old
# `non_negative=True` flag was deprecated in scikit-learn 0.19 and removed in
# 0.21; `alternate_sign=False` is its replacement and keeps the hashed
# features non-negative.
clf_2 = Pipeline([
    ('vect', HashingVectorizer(alternate_sign=False)),
    ('clf', MultinomialNB()),
])

clf_3 = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])
| # Split the input data into training data and test data. | |
| import csv | |
| import numpy as np | |
# Load labelled documents from tmp.csv. Column 0 is the text, column 1 is the
# class name. Rows whose class is neither CLASS1 nor CLASS2 are ignored.
org_data = []    # raw text documents, in file order
org_label = []   # 1 for CLASS1, 0 for CLASS2 (parallel to org_data)
class1_count = 0
class2_count = 0

with open('tmp.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        # csv.reader yields [] for blank lines, and a malformed row may lack
        # the label column; skip those instead of failing with IndexError.
        if len(row) < 2:
            continue
        if row[1] == 'CLASS1':
            org_label.append(1)
            class1_count += 1
        elif row[1] == 'CLASS2':
            org_label.append(0)
            class2_count += 1
        else:
            continue
        # Only reached for recognised labels, so data and labels stay aligned.
        org_data.append(row[0])

# Sequential (unshuffled) split: the first 75% of rows train, the rest test.
SPLIT_PERC = 0.75
split_size = int(len(org_data) * SPLIT_PERC)
X_train = org_data[:split_size]
X_test = org_data[split_size:]
y_train = org_label[:split_size]
y_test = org_label[split_size:]

ndarr_label = np.array(org_label)

# Report the class balance; guard against a file with no usable rows so the
# summary prints 0.0 instead of raising ZeroDivisionError.
label_total = len(org_label)
class1_rate = class1_count / float(label_total) if label_total else 0.0
class2_rate = class2_count / float(label_total) if label_total else 0.0
print('class1 : %f' % class1_rate)
print('class2 : %f' % class2_rate)
print('test size : %d' % len(X_test))
| # Build the training model. | |
| from sklearn import metrics | |
def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print its evaluation.

    Prints, in order: the score on the training split, the score on the
    testing split, a classification report and a confusion matrix (the last
    two computed from predictions on ``X_test``). Returns ``None``.

    Args:
        clf: Estimator exposing ``fit``, ``score`` and ``predict``.
        X_train: Training inputs.
        X_test: Testing inputs.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
    """
    clf.fit(X_train, y_train)

    splits = (
        ("Accuracy on training set:", X_train, y_train),
        ("Accuracy on testing set:", X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        print(clf.score(features, labels))

    predicted = clf.predict(X_test)
    print("Classification Report:")
    # Report first, then confusion matrix, each printed as soon as computed.
    for summarize in (metrics.classification_report, metrics.confusion_matrix):
        print(summarize(y_test, predicted))
# Fit and evaluate the CountVectorizer-based pipeline on the train/test split.
train_and_evaluate(
    clf=clf_1,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
# Classify inputs as CLASS1 or CLASS2.
def predict(X_train):
    """Predict class labels with the ``clf_1`` pipeline.

    Args:
        X_train: The documents to classify (despite the name, this is not
            training data). Either an iterable of raw text documents or a
            single document given as a ``str``.

    Returns:
        The result of ``clf_1.predict`` for the given documents.
    """
    # A bare string is an iterable of characters, and scikit-learn's text
    # vectorizers reject it with ValueError; treat it as a single document.
    if isinstance(X_train, str):
        X_train = [X_train]
    return clf_1.predict(X_train)


# NOTE(review): `datas` is not defined anywhere in the visible file. It must
# be bound to an iterable of documents before this loop, otherwise this
# raises NameError.
for data in datas:
    # Show each prediction; the result was previously computed and discarded.
    print(predict(data))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment