{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
"### Python的文本挖掘\n", | |
"- 本文主要演练三种文本挖掘方法\n", | |
" - 使用的是sogou的语料库http://www.sogou.com/labs/dl/c.html\n", | |
" - 常规的词袋模型用于分类\n", | |
" - 使用word2vec得到词向量,再对词汇进行聚类,用类编号作为特征再进行分类\n", | |
" - 使用word2vec得到词向量,对文档中的词向量平均化作为文档向量,用文档向量作为特征进行分类" | |
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from os import path\n",
    "import os\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['SogouC.reduced/Reduced/C000008',\n",
       " 'SogouC.reduced/Reduced/C000010',\n",
       " 'SogouC.reduced/Reduced/C000013',\n",
       " 'SogouC.reduced/Reduced/C000014',\n",
       " 'SogouC.reduced/Reduced/C000016',\n",
       " 'SogouC.reduced/Reduced/C000020',\n",
       " 'SogouC.reduced/Reduced/C000022',\n",
       " 'SogouC.reduced/Reduced/C000023',\n",
       " 'SogouC.reduced/Reduced/C000024']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rootdir = 'SogouC.reduced/Reduced'\n",
    "dirs = os.listdir(rootdir)\n",
    "dirs = [path.join(rootdir, f) for f in dirs if f.startswith('C')]\n",
    "dirs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def load_txt(x):\n",
    "    with open(x) as f:\n",
    "        res = [t.decode('gbk','ignore') for t in f]\n",
    "    return ''.join(res)"
   ]
  },
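  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Side note: a minimal sketch of an equivalent loader built on `io.open`, which decodes while reading and works on both Python 2 and 3; `errors='ignore'` mirrors the `decode('gbk','ignore')` above. The name `load_txt_io` is ours, not part of the original code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# sketch: same behaviour as load_txt, assuming GBK-encoded Sogou files\n",
    "import io\n",
    "\n",
    "def load_txt_io(x):\n",
    "    with io.open(x, encoding='gbk', errors='ignore') as f:\n",
    "        return f.read()"
   ]
  },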
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "text_t = {}\n",
    "for i, d in enumerate(dirs):\n",
    "    files = os.listdir(d)\n",
    "    files = [path.join(d, x) for x in files if x.endswith('txt') and not x.startswith('.')]\n",
    "    text_t[i] = [load_txt(f) for f in files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# to dataframe\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "flen = [len(t) for t in text_t.values()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "labels = np.repeat(text_t.keys(), flen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
"# flatter nested list\n", | |
"import itertools\n", | |
"merged = list(itertools.chain.from_iterable(text_t.values()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" <th>txt</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 本报记者陈雪频实习记者唐翔发自上海\\r\\n 一家刚刚成立两年的网络支付公司,它的目标是...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 0</td>\n", | |
" <td> 证券通:百联股份未来5年有能力保持高速增长\\r\\n\\r\\n 深度报告 权威内参...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" label txt\n", | |
"0 0 本报记者陈雪频实习记者唐翔发自上海\\r\\n 一家刚刚成立两年的网络支付公司,它的目标是...\n", | |
"1 0 证券通:百联股份未来5年有能力保持高速增长\\r\\n\\r\\n 深度报告 权威内参...\n", | |
"2 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....\n", | |
"3 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....\n", | |
"4 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www...." | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame({'label': labels, 'txt': merged})\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt\n", | |
"DEBUG:jieba:Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt\n", | |
"dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache\n", | |
"DEBUG:jieba:dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache\n", | |
"loading model cost 3.71529507637 seconds.\n", | |
"DEBUG:jieba:loading model cost 3.71529507637 seconds.\n", | |
"Trie has been built succesfully.\n", | |
"DEBUG:jieba:Trie has been built succesfully.\n" | |
] | |
} | |
], | |
"source": [ | |
"# cut word\n", | |
"import jieba\n", | |
"jieba.enable_parallel(4)\n", | |
"def cutword_1(x):\n", | |
" words = jieba.cut(x)\n", | |
" return ' '.join(words)" | |
] | |
}, | |
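  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the segmenter; the sample phrase is lifted from the first document shown above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# segment one sample phrase to eyeball the tokenization\n",
    "print cutword_1(u'一家刚刚成立两年的网络支付公司')"
   ]
  },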
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df['seg_word'] = df.txt.map(cutword_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from cPickle import dump, load\n",
    "#dump(df, open('df.pickle', 'wb'))\n",
    "df = load(open('df.pickle','rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17903, 10000)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# model\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vect = TfidfVectorizer(ngram_range=(1,1), min_df=2, max_features=10000)\n",
    "xvec = vect.fit_transform(df.seg_word)\n",
    "xvec.shape"
   ]
  },
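  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get a feel for the 10,000-term vocabulary, peek at a few feature names (`get_feature_names` is the scikit-learn API of this era)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# inspect a sample of the retained vocabulary\n",
    "print ' '.join(vect.get_feature_names()[:20])"
   ]
  },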
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "y = df.label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_y, test_y = train_test_split(xvec, y, train_size=0.7, random_state=1)\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "clf = MultinomialNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.fit(train_X, train_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.91      0.88      0.89       576\n",
      "          1       0.86      0.83      0.84       604\n",
      "          2       0.88      0.83      0.86       616\n",
      "          3       0.99      0.97      0.98       580\n",
      "          4       0.87      0.88      0.88       597\n",
      "          5       0.88      0.80      0.83       607\n",
      "          6       0.78      0.89      0.83       599\n",
      "          7       0.74      0.79      0.76       613\n",
      "          8       0.92      0.93      0.92       579\n",
      "\n",
      "avg / total       0.87      0.86      0.87      5371\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "pre = clf.predict(test_X)\n",
    "print metrics.classification_report(test_y, pre)\n",
    "#print metrics.confusion_matrix(test_y, pre)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# word2vec\n",
    "txt = df.seg_word.values\n",
    "txtlist = []\n",
    "for sent in txt:\n",
    "    temp = [w for w in sent.split()]\n",
    "    txtlist.append(temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "num_features = 100\n",
    "min_word_count = 10\n",
    "num_workers = 4\n",
    "context = 20\n",
    "epoch = 20\n",
    "sample = 1e-5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from gensim.models import word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model = word2vec.Word2Vec(txtlist, workers = num_workers,\n",
    "                          sample = sample,\n",
    "                          size = num_features,\n",
    "                          min_count = min_word_count,\n",
    "                          window = context,\n",
    "                          iter = epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(57675, 100)"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.syn0.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "在线 0.809367001057\n",
      "网络 0.792132735252\n",
      "网民 0.789814114571\n",
      "网站 0.766795158386\n",
      "网络广告 0.763081729412\n",
      "门户网站 0.757833242416\n",
      "互联网内容 0.728336572647\n",
      "访问量 0.703088879585\n",
      "商业模式 0.701648652554\n",
      "Web2 0.698530614376\n"
     ]
    }
   ],
   "source": [
    "for w in model.most_similar(u'互联网'):\n",
    "    print w[0], w[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#model.save('sogo_wv')\n",
    "model = word2vec.Word2Vec.load('sogo_wv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# kmeans based on word_vec\n",
    "from sklearn.cluster import KMeans\n",
    "word_vectors = model.syn0\n",
    "num_clusters = word_vectors.shape[0]//20\n",
    "kmeans_clustering = KMeans(n_clusters = num_clusters)\n",
    "idx = kmeans_clustering.fit_predict(word_vectors)"
   ]
  },
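  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With roughly 57k word vectors and close to 2,900 clusters, full KMeans is slow. MiniBatchKMeans is a much faster drop-in with comparable results; the sketch below is left commented out so it does not replace the clustering above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# optional speed-up: MiniBatchKMeans as a drop-in for KMeans\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "# kmeans_clustering = MiniBatchKMeans(n_clusters=num_clusters, batch_size=1000)\n",
    "# idx = kmeans_clustering.fit_predict(word_vectors)"
   ]
  },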
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "word_centroid_map = dict(zip(model.index2word, idx))\n",
    "word_centroid_df = pd.DataFrame(zip(model.index2word, idx))\n",
    "word_centroid_df.columns = ['word','cluster']\n",
    "word_centroid_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
"# 观察前十个群的效果 \n", | |
"for cluster in xrange(10): \n", | |
" print \"\\nCluster %d\" % cluster \n", | |
" words = word_centroid_df.ix[word_centroid_df.cluster==cluster,'word'].values \n", | |
" print ' '.join(words) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# 观察有很多词的大群 \n", | |
"big_cluster = word_centroid_df.groupby('cluster').apply(lambda x: len(x.word)).reset_index() \n", | |
"big_cluster.columns = ['cluster','word_num'] \n", | |
"key_cluster = big_cluster.ix[big_cluster['word_num']>=10,'cluster'].values \n", | |
"\n", | |
"\n", | |
"def create_bag_of_centroids( wordlist, word_centroid_map ): \n", | |
" # 从词到类别编号的映射函数 \n", | |
" # wordlist是文本中的词,word_centroid_map是诩到编号的dict \n", | |
" num_centroids = max( word_centroid_map.values() ) + 1 \n", | |
" bag_of_centroids = np.zeros( num_centroids, dtype=\"float32\" ) \n", | |
" for word in wordlist: \n", | |
" if word in word_centroid_map: \n", | |
" index = word_centroid_map[word] \n", | |
" if index in key_cluster: \n", | |
" bag_of_centroids[index] += 1 \n", | |
" return bag_of_centroids " | |
] | |
}, | |
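  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Example: map one segmented document to its bag-of-centroids vector; the vector has one slot per cluster, and its sum is the number of tokens that landed in a kept cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# the first document's bag-of-centroids representation\n",
    "v = create_bag_of_centroids(txtlist[0], word_centroid_map)\n",
    "print v.shape, v.sum()"
   ]
  },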
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
"# 从原始文本映射成群编号 \n", | |
"train_centroids = np.zeros( (len(txtlist), num_clusters),dtype=\"float32\" ) \n", | |
"for i, review in enumerate(txtlist): \n", | |
" train_centroids[i] = create_bag_of_centroids( review,word_centroid_map ) \n", | |
"# 变为0-1特征 \n", | |
"train_centroids = np.where(train_centroids>0,1,0) \n", | |
"train_centroids_df = pd.DataFrame(train_centroids) \n", | |
"train_centroids_df= train_centroids_df.ix[:,train_centroids.sum(axis=0)!=0] " | |
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17910, 1429)"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_centroids_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_y, test_y = train_test_split(train_centroids_df.values, y, train_size=0.7, random_state=1)\n",
"from sklearn.naive_bayes import MultinomialNB\n", | |
"clf = SGDClassifier()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.88 0.91 0.90 577\n", | |
" 1 0.74 0.86 0.80 603\n", | |
" 2 0.83 0.86 0.84 619\n", | |
" 3 0.85 0.73 0.78 584\n", | |
" 4 0.82 0.75 0.78 570\n", | |
" 5 0.75 0.74 0.75 600\n", | |
" 6 0.82 0.82 0.82 600\n", | |
" 7 0.99 0.94 0.96 615\n", | |
" 8 0.71 0.75 0.73 605\n", | |
"\n", | |
"avg / total 0.82 0.82 0.82 5373\n", | |
"\n", | |
"[[525 1 1 1 9 4 3 0 33]\n", | |
" [ 0 519 9 13 22 18 2 1 19]\n", | |
" [ 3 14 533 2 4 47 10 0 6]\n", | |
" [ 1 52 30 425 13 22 11 0 30]\n", | |
" [ 8 53 6 21 429 5 10 1 37]\n", | |
" [ 26 19 26 18 10 447 31 1 22]\n", | |
" [ 7 8 19 5 9 27 491 1 33]\n", | |
" [ 1 3 11 1 1 9 6 578 5]\n", | |
" [ 23 29 8 14 26 14 37 1 453]]\n" | |
] | |
} | |
], | |
"source": [ | |
"clf.fit(train_X, train_y)\n", | |
"from sklearn import metrics\n", | |
"pre = clf.predict(test_X)\n", | |
"print metrics.classification_report(test_y, pre)\n", | |
"print metrics.confusion_matrix(test_y, pre)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(17910, 100)" | |
] | |
}, | |
"execution_count": 95, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 将词向量平均化为文档向量 \n", | |
"def sentvec(sent,m=num_features,model=model): \n", | |
" res = np.zeros(m) \n", | |
" words = sent.split() \n", | |
" num = 0 \n", | |
" for w in words: \n", | |
" if w in model.index2word: \n", | |
" res += model[w] \n", | |
" num += 1.0 \n", | |
" if num == 0: return np.zeros(m) \n", | |
" else: return res/num \n", | |
" \n", | |
"n = df.shape[0] \n", | |
"sent_matrix = np.zeros([n,num_features],float) \n", | |
"for i ,sent in enumerate(df.seg_word.values): \n", | |
" sent_matrix[i,:] = sentvec(sent) \n", | |
"sent_matrix.shape " | |
] | |
}, | |
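  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: `w in model.index2word` scans a ~57k-element list for every token. A sketch of a faster variant (assuming the gensim 0.x API used here, where `model.vocab` is a dict) with identical output:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# faster sentvec: O(1) dict lookup in model.vocab instead of a list scan\n",
    "def sentvec_fast(sent, m=num_features, model=model):\n",
    "    res = np.zeros(m)\n",
    "    num = 0\n",
    "    for w in sent.split():\n",
    "        if w in model.vocab:\n",
    "            res += model[w]\n",
    "            num += 1.0\n",
    "    return res / num if num else np.zeros(m)"
   ]
  },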
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_y, test_y = train_test_split(sent_matrix, y, train_size=0.7, random_state=1)\n",
    "clf = GradientBoostingClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.93      0.93      0.93       577\n",
      "          1       0.83      0.84      0.84       603\n",
      "          2       0.91      0.85      0.88       619\n",
      "          3       0.85      0.87      0.86       584\n",
      "          4       0.85      0.83      0.84       570\n",
      "          5       0.83      0.80      0.81       600\n",
      "          6       0.88      0.88      0.88       600\n",
      "          7       0.97      0.96      0.97       615\n",
      "          8       0.76      0.83      0.80       605\n",
      "\n",
      "avg / total       0.87      0.87      0.87      5373\n",
      "\n",
      "[[539   1   2   3   5   3   6   1  17]\n",
      " [  0 507   4  20  20   9   4   1  38]\n",
      " [  3  10 529  11   6  35  12   3  10]\n",
      " [  0  25   8 509  11  13   5   1  12]\n",
      " [  6  27   4  17 472   9   5   2  28]\n",
      " [ 15  12  22  15  12 477  19   4  24]\n",
      " [  6   7   8   9   3  13 530   3  21]\n",
      " [  0   1   0   2   5   4   5 592   6]\n",
      " [  9  20   6  10  22  12  19   2 505]]\n"
     ]
    }
   ],
   "source": [
    "clf.fit(train_X, train_y)\n",
    "from sklearn import metrics\n",
    "pre = clf.predict(test_X)\n",
    "print metrics.classification_report(test_y, pre)\n",
    "print metrics.confusion_matrix(test_y, pre)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}