{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
"### Python的文本挖掘\n", | |
"- 本文主要演练三种文本挖掘方法\n", | |
" - 使用的是sogou的语料库http://www.sogou.com/labs/dl/c.html\n", | |
" - 常规的词袋模型用于分类\n", | |
" - 使用word2vec得到词向量,再对词汇进行聚类,用类编号作为特征再进行分类\n", | |
" - 使用word2vec得到词向量,对文档中的词向量平均化作为文档向量,用文档向量作为特征进行分类" | |
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from os import path\n",
    "import os\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['SogouC.reduced/Reduced/C000008',\n",
       " 'SogouC.reduced/Reduced/C000010',\n",
       " 'SogouC.reduced/Reduced/C000013',\n",
       " 'SogouC.reduced/Reduced/C000014',\n",
       " 'SogouC.reduced/Reduced/C000016',\n",
       " 'SogouC.reduced/Reduced/C000020',\n",
       " 'SogouC.reduced/Reduced/C000022',\n",
       " 'SogouC.reduced/Reduced/C000023',\n",
       " 'SogouC.reduced/Reduced/C000024']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rootdir = 'SogouC.reduced/Reduced'\n",
    "dirs = os.listdir(rootdir)\n",
    "dirs = [path.join(rootdir, f) for f in dirs if f.startswith('C')]\n",
    "dirs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def load_txt(x):\n",
    "    with open(x) as f:\n",
    "        res = [t.decode('gbk','ignore') for t in f]\n",
    "    return ''.join(res)"
   ]
  },
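  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Side note: a minimal sketch of an equivalent loader built on `io.open`, which decodes while reading and works on both Python 2 and 3; `errors='ignore'` mirrors the `decode('gbk','ignore')` above. The name `load_txt_io` is ours, not part of the original code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# sketch: same behaviour as load_txt, assuming GBK-encoded Sogou files\n",
    "import io\n",
    "\n",
    "def load_txt_io(x):\n",
    "    with io.open(x, encoding='gbk', errors='ignore') as f:\n",
    "        return f.read()"
   ]
  },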
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "text_t = {}\n",
    "for i, d in enumerate(dirs):\n",
    "    files = os.listdir(d)\n",
    "    files = [path.join(d, x) for x in files if x.endswith('txt') and not x.startswith('.')]\n",
    "    text_t[i] = [load_txt(f) for f in files]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# to dataframe\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "flen = [len(t) for t in text_t.values()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "labels = np.repeat(text_t.keys(), flen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
"# flatter nested list\n", | |
"import itertools\n", | |
"merged = list(itertools.chain.from_iterable(text_t.values()))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>label</th>\n", | |
" <th>txt</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 本报记者陈雪频实习记者唐翔发自上海\\r\\n 一家刚刚成立两年的网络支付公司,它的目标是...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 0</td>\n", | |
" <td> 证券通:百联股份未来5年有能力保持高速增长\\r\\n\\r\\n 深度报告 权威内参...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0</td>\n", | |
" <td> 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" label txt\n", | |
"0 0 本报记者陈雪频实习记者唐翔发自上海\\r\\n 一家刚刚成立两年的网络支付公司,它的目标是...\n", | |
"1 0 证券通:百联股份未来5年有能力保持高速增长\\r\\n\\r\\n 深度报告 权威内参...\n", | |
"2 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....\n", | |
"3 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www....\n", | |
"4 0 5月09日消息快评\\r\\n\\r\\n 深度报告 权威内参 来自“证券通”www...." | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = pd.DataFrame({'label': labels, 'txt': merged})\n", | |
"df.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt\n", | |
"DEBUG:jieba:Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt\n", | |
"dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache\n", | |
"DEBUG:jieba:dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache\n", | |
"loading model cost 3.71529507637 seconds.\n", | |
"DEBUG:jieba:loading model cost 3.71529507637 seconds.\n", | |
"Trie has been built succesfully.\n", | |
"DEBUG:jieba:Trie has been built succesfully.\n" | |
] | |
} | |
], | |
"source": [ | |
"# cut word\n", | |
"import jieba\n", | |
"jieba.enable_parallel(4)\n", | |
"def cutword_1(x):\n", | |
" words = jieba.cut(x)\n", | |
" return ' '.join(words)" | |
] | |
}, | |
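  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check of the segmenter; the sample phrase is lifted from the first document shown above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# segment one sample phrase to eyeball the tokenization\n",
    "print cutword_1(u'一家刚刚成立两年的网络支付公司')"
   ]
  },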
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df['seg_word'] = df.txt.map(cutword_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from cPickle import dump, load\n",
    "#dump(df, open('df.pickle', 'wb'))\n",
    "df = load(open('df.pickle','rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17903, 10000)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# model\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vect = TfidfVectorizer(ngram_range=(1,1), min_df=2, max_features=10000)\n",
    "xvec = vect.fit_transform(df.seg_word)\n",
    "xvec.shape"
   ]
  },
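  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To get a feel for the 10,000-term vocabulary, peek at a few feature names (`get_feature_names` is the scikit-learn API of this era)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# inspect a sample of the retained vocabulary\n",
    "print ' '.join(vect.get_feature_names()[:20])"
   ]
  },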
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "y = df.label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_y, test_y = train_test_split(xvec, y, train_size=0.7, random_state=1)\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "clf = MultinomialNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.fit(train_X, train_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.91      0.88      0.89       576\n",
      "          1       0.86      0.83      0.84       604\n",
      "          2       0.88      0.83      0.86       616\n",
      "          3       0.99      0.97      0.98       580\n",
      "          4       0.87      0.88      0.88       597\n",
      "          5       0.88      0.80      0.83       607\n",
      "          6       0.78      0.89      0.83       599\n",
      "          7       0.74      0.79      0.76       613\n",
      "          8       0.92      0.93      0.92       579\n",
      "\n",
      "avg / total       0.87      0.86      0.87      5371\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics\n",
    "pre = clf.predict(test_X)\n",
    "print metrics.classification_report(test_y, pre)\n",
    "#print metrics.confusion_matrix(test_y, pre)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# word2vec\n",
    "txt = df.seg_word.values\n",
    "txtlist = []\n",
    "for sent in txt:\n",
    "    temp = [w for w in sent.split()]\n",
    "    txtlist.append(temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "num_features = 100\n",
    "min_word_count = 10\n",
    "num_workers = 4\n",
    "context = 20\n",
    "epoch = 20\n",
    "sample = 1e-5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from gensim.models import word2vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model = word2vec.Word2Vec(txtlist, workers = num_workers,\n",
    "                          sample = sample,\n",
    "                          size = num_features,\n",
    "                          min_count = min_word_count,\n",
    "                          window = context,\n",
    "                          iter = epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(57675, 100)"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model.syn0.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "在线 0.809367001057\n",
      "网络 0.792132735252\n",
      "网民 0.789814114571\n",
      "网站 0.766795158386\n",
      "网络广告 0.763081729412\n",
      "门户网站 0.757833242416\n",
      "互联网内容 0.728336572647\n",
      "访问量 0.703088879585\n",
      "商业模式 0.701648652554\n",
      "Web2 0.698530614376\n"
     ]
    }
   ],
   "source": [
    "for w in model.most_similar(u'互联网'):\n",
    "    print w[0], w[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#model.save('sogo_wv')\n",
    "model = word2vec.Word2Vec.load('sogo_wv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# kmeans based on word_vec\n",
    "from sklearn.cluster import KMeans\n",
    "word_vectors = model.syn0\n",
    "num_clusters = word_vectors.shape[0]//20\n",
    "kmeans_clustering = KMeans(n_clusters = num_clusters)\n",
    "idx = kmeans_clustering.fit_predict(word_vectors)"
   ]
  },
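  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "With roughly 57k word vectors and close to 2,900 clusters, full KMeans is slow. MiniBatchKMeans is a much faster drop-in with comparable results; the sketch below is left commented out so it does not replace the clustering above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# optional speed-up: MiniBatchKMeans as a drop-in for KMeans\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "# kmeans_clustering = MiniBatchKMeans(n_clusters=num_clusters, batch_size=1000)\n",
    "# idx = kmeans_clustering.fit_predict(word_vectors)"
   ]
  },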
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "word_centroid_map = dict(zip(model.index2word, idx))\n",
    "word_centroid_df = pd.DataFrame(zip(model.index2word, idx))\n",
    "word_centroid_df.columns = ['word','cluster']\n",
    "word_centroid_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
"# 观察前十个群的效果 \n", | |
"for cluster in xrange(10): \n", | |
" print \"\\nCluster %d\" % cluster \n", | |
" words = word_centroid_df.ix[word_centroid_df.cluster==cluster,'word'].values \n", | |
" print ' '.join(words) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# 观察有很多词的大群 \n", | |
"big_cluster = word_centroid_df.groupby('cluster').apply(lambda x: len(x.word)).reset_index() \n", | |
"big_cluster.columns = ['cluster','word_num'] \n", | |
"key_cluster = big_cluster.ix[big_cluster['word_num']>=10,'cluster'].values \n", | |
"\n", | |
"\n", | |
"def create_bag_of_centroids( wordlist, word_centroid_map ): \n", | |
" # 从词到类别编号的映射函数 \n", | |
" # wordlist是文本中的词,word_centroid_map是诩到编号的dict \n", | |
" num_centroids = max( word_centroid_map.values() ) + 1 \n", | |
" bag_of_centroids = np.zeros( num_centroids, dtype=\"float32\" ) \n", | |
" for word in wordlist: \n", | |
" if word in word_centroid_map: \n", | |
" index = word_centroid_map[word] \n", | |
" if index in key_cluster: \n", | |
" bag_of_centroids[index] += 1 \n", | |
" return bag_of_centroids " | |
] | |
}, | |
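  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Example: map one segmented document to its bag-of-centroids vector; the vector has one slot per cluster, and its sum is the number of tokens that landed in a kept cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# the first document's bag-of-centroids representation\n",
    "v = create_bag_of_centroids(txtlist[0], word_centroid_map)\n",
    "print v.shape, v.sum()"
   ]
  },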
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
"# 从原始文本映射成群编号 \n", | |
"train_centroids = np.zeros( (len(txtlist), num_clusters),dtype=\"float32\" ) \n", | |
"for i, review in enumerate(txtlist): \n", | |
" train_centroids[i] = create_bag_of_centroids( review,word_centroid_map ) \n", | |
"# 变为0-1特征 \n", | |
"train_centroids = np.where(train_centroids>0,1,0) \n", | |
"train_centroids_df = pd.DataFrame(train_centroids) \n", | |
"train_centroids_df= train_centroids_df.ix[:,train_centroids.sum(axis=0)!=0] " | |
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(17910, 1429)"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_centroids_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_y, test_y = train_test_split(train_centroids_df.values, y, train_size=0.7, random_state=1)\n",
"from sklearn.naive_bayes import MultinomialNB\n", | |
"clf = SGDClassifier()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" precision recall f1-score support\n", | |
"\n", | |
" 0 0.88 0.91 0.90 577\n", | |
" 1 0.74 0.86 0.80 603\n", | |
" 2 0.83 0.86 0.84 619\n", | |
" 3 0.85 0.73 0.78 584\n", | |
" 4 0.82 0.75 0.78 570\n", | |
" 5 0.75 0.74 0.75 600\n", | |
" 6 0.82 0.82 0.82 600\n", | |
" 7 0.99 0.94 0.96 615\n", | |
" 8 0.71 0.75 0.73 605\n", | |
"\n", | |
"avg / total 0.82 0.82 0.82 5373\n", | |
"\n", | |
"[[525 1 1 1 9 4 3 0 33]\n", | |
" [ 0 519 9 13 22 18 2 1 19]\n", | |
" [ 3 14 533 2 4 47 10 0 6]\n", | |
" [ 1 52 30 425 13 22 11 0 30]\n", | |
" [ 8 53 6 21 429 5 10 1 37]\n", | |
" [ 26 19 26 18 10 447 31 1 22]\n", | |
" [ 7 8 19 5 9 27 491 1 33]\n", | |
" [ 1 3 11 1 1 9 6 578 5]\n", | |
" [ 23 29 8 14 26 14 37 1 453]]\n" | |
] | |
} | |
], | |
"source": [ | |
"clf.fit(train_X, train_y)\n", | |
"from sklearn import metrics\n", | |
"pre = clf.predict(test_X)\n", | |
"print metrics.classification_report(test_y, pre)\n", | |
"print metrics.confusion_matrix(test_y, pre)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(17910, 100)" | |
] | |
}, | |
"execution_count": 95, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# 将词向量平均化为文档向量 \n", | |
"def sentvec(sent,m=num_features,model=model): \n", | |
" res = np.zeros(m) \n", | |
" words = sent.split() \n", | |
" num = 0 \n", | |
" for w in words: \n", | |
" if w in model.index2word: \n", | |
" res += model[w] \n", | |
" num += 1.0 \n", | |
" if num == 0: return np.zeros(m) \n", | |
" else: return res/num \n", | |
" \n", | |
"n = df.shape[0] \n", | |
"sent_matrix = np.zeros([n,num_features],float) \n", | |
"for i ,sent in enumerate(df.seg_word.values): \n", | |
" sent_matrix[i,:] = sentvec(sent) \n", | |
"sent_matrix.shape " | |
] | |
}, | |
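  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: `w in model.index2word` scans a ~57k-element list for every token. A sketch of a faster variant (assuming the gensim 0.x API used here, where `model.vocab` is a dict) with identical output:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# faster sentvec: O(1) dict lookup in model.vocab instead of a list scan\n",
    "def sentvec_fast(sent, m=num_features, model=model):\n",
    "    res = np.zeros(m)\n",
    "    num = 0\n",
    "    for w in sent.split():\n",
    "        if w in model.vocab:\n",
    "            res += model[w]\n",
    "            num += 1.0\n",
    "    return res / num if num else np.zeros(m)"
   ]
  },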
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.cross_validation import train_test_split\n",
    "train_X, test_X, train_y, test_y = train_test_split(sent_matrix, y, train_size=0.7, random_state=1)\n",
    "clf = GradientBoostingClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.93      0.93      0.93       577\n",
      "          1       0.83      0.84      0.84       603\n",
      "          2       0.91      0.85      0.88       619\n",
      "          3       0.85      0.87      0.86       584\n",
      "          4       0.85      0.83      0.84       570\n",
      "          5       0.83      0.80      0.81       600\n",
      "          6       0.88      0.88      0.88       600\n",
      "          7       0.97      0.96      0.97       615\n",
      "          8       0.76      0.83      0.80       605\n",
      "\n",
      "avg / total       0.87      0.87      0.87      5373\n",
      "\n",
      "[[539   1   2   3   5   3   6   1  17]\n",
      " [  0 507   4  20  20   9   4   1  38]\n",
      " [  3  10 529  11   6  35  12   3  10]\n",
      " [  0  25   8 509  11  13   5   1  12]\n",
      " [  6  27   4  17 472   9   5   2  28]\n",
      " [ 15  12  22  15  12 477  19   4  24]\n",
      " [  6   7   8   9   3  13 530   3  21]\n",
      " [  0   1   0   2   5   4   5 592   6]\n",
      " [  9  20   6  10  22  12  19   2 505]]\n"
     ]
    }
   ],
   "source": [
    "clf.fit(train_X, train_y)\n",
    "from sklearn import metrics\n",
    "pre = clf.predict(test_X)\n",
    "print metrics.classification_report(test_y, pre)\n",
    "print metrics.confusion_matrix(test_y, pre)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}