Slackデータを可視化して会社の2018年を振り返る (Visualizing Slack data to look back on the company's 2018)
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline\n",
"from IPython.display import Image\n",
"\n",
"import os, sys, re, datetime, time, copy\n",
"from pathlib import Path\n",
"\n",
"pj_dir = Path(os.getcwd()).parents[1]\n",
"data_dir = pj_dir/'data'\n",
"img_dir = pj_dir/'images'\n",
"src_dir = pj_dir/'src'\n",
"sys.path.append(str(src_dir))\n",
"\n",
"from matplotlib import pyplot as plt\n",
"import jpholiday\n",
"from tqdm import tqdm_notebook\n",
"from dotenv import load_dotenv\n",
"\n",
"import seaborn as sns\n",
"plt.style.use(\"bmh\")\n",
"import numpy as np\n",
"import pandas as pd\n",
"import dask.dataframe as dd\n",
"import requests\n",
"\n",
"import MeCab\n",
"from sklearn.manifold import TSNE\n",
"from wordcloud import WordCloud\n",
"\n",
"from gensim import models\n",
"from gensim.models.doc2vec import TaggedDocument"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib\n",
"matplotlib.rcParams[\"figure.figsize\"] = (16, 4)\n",
"plt.rcParams[\"font.family\"] = \"IPAexGothic\"\n",
"import logging\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_rows\", 100)\n",
"load_dotenv(pj_dir/'.env')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fetching the Slack data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"token = os.environ.get('SLACK_TOKEN')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"headers = {\n",
"    \"Content-type\": \"application/json\",\n",
"    \"Authorization\": f\"Bearer {token}\"\n",
"}"
]
},
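{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's note (not in the original notebook):* Slack's Web API rate-limits clients and answers with HTTP 429 plus a `Retry-After` header when the limit is hit. The next cell is a minimal sketch of how the fetch calls below could back off instead of failing; the name `slack_get` and the retry count are made up for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def slack_get(endpoint, params, max_retries=3):\n",
"    # Hypothetical helper (editor's sketch): retry on HTTP 429 using the\n",
"    # Retry-After header that Slack returns when a client is rate-limited.\n",
"    for _ in range(max_retries):\n",
"        res = requests.get(endpoint, headers=headers, params=params)\n",
"        if res.status_code == 429:\n",
"            time.sleep(int(res.headers.get('Retry-After', 1)))\n",
"            continue\n",
"        return res.json()\n",
"    return res.json()"
]
},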
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def fetch_messages_by_channel(channel_id):\n",
"    oldest_ts = None\n",
"    one_year_ago = pd.to_datetime('2017-12-31')\n",
"    endpoint = 'https://slack.com/api/channels.history'\n",
"\n",
"    ls_messages = []\n",
"    while True:\n",
"        payload = {\n",
"            'channel': channel_id,\n",
"            'latest': oldest_ts,\n",
"            'count': 1000\n",
"        }\n",
"\n",
"        data = requests.get(endpoint, headers=headers, params=payload).json()\n",
"        messages = data['messages']\n",
"        ls_messages.extend(messages)\n",
"\n",
"        if data['has_more']:\n",
"            time.sleep(1)\n",
"            oldest_ts = messages[-1]['ts']\n",
"            oldest_datetime = pd.to_datetime(oldest_ts, unit='s')\n",
"            sys.stdout.write(f\"\\r{oldest_datetime}\")\n",
"            sys.stdout.flush()\n",
"            if oldest_datetime < one_year_ago:\n",
"                sys.stdout.write(f\"\\rfinish!\" + ' '*50)\n",
"                break\n",
"        else:\n",
"            break\n",
"    df = pd.DataFrame(ls_messages)\n",
"    df['channel_id'] = channel_id\n",
"    return df"
]
},
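{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's note (not in the original notebook):* `channels.history` and `channels.list`, used in this notebook, have since been retired by Slack in favour of the `conversations.*` methods. The cell below is only a rough sketch of the same fetch using cursor-based pagination with `conversations.history`; the `limit` value and the `oldest` timestamp (roughly 2018-01-01 JST) are assumptions, not taken from the original."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def fetch_messages_by_channel_v2(channel_id, oldest='1514732400'):\n",
"    # Sketch only: cursor-based pagination with conversations.history.\n",
"    # 'oldest' is a Unix timestamp (about 2018-01-01 00:00 JST, an assumption).\n",
"    endpoint = 'https://slack.com/api/conversations.history'\n",
"    params = {'channel': channel_id, 'limit': 200, 'oldest': oldest}\n",
"    messages = []\n",
"    while True:\n",
"        data = requests.get(endpoint, headers=headers, params=params).json()\n",
"        messages.extend(data.get('messages', []))\n",
"        cursor = data.get('response_metadata', {}).get('next_cursor')\n",
"        if not cursor:\n",
"            break\n",
"        params['cursor'] = cursor\n",
"        time.sleep(1)\n",
"    df = pd.DataFrame(messages)\n",
"    df['channel_id'] = channel_id\n",
"    return df"
]
},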
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: df_channel is built by the channels.list cell a few cells below; run that cell first.\n",
"ls_df = []\n",
"ls_err_channel_id = []\n",
"for i, row in tqdm_notebook(df_channel.iterrows()):\n",
"    channel_id = row['id']\n",
"    try:\n",
"        df = fetch_messages_by_channel(channel_id)\n",
"    except Exception:\n",
"        print(f\"Error on {row['name']}\")\n",
"        ls_err_channel_id.append(channel_id)\n",
"    else:\n",
"        ls_df.append(df)\n",
"    time.sleep(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df = pd.concat(ls_df)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"endpoint = 'https://slack.com/api/channels.list'\n",
"payload = {}\n",
"\n",
"data = requests.get(endpoint, headers=headers, params=payload).json()\n",
"df_channel = pd.DataFrame(data['channels'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"endpoint = 'https://slack.com/api/users.list'\n",
"payload = {}\n",
"\n",
"data = requests.get(endpoint, headers=headers, params=payload).json()\n",
"df_member = pd.DataFrame(data['members'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Save"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_channel.to_pickle(data_dir/'kaizen_slack/channels.pickle')\n",
"df_member.to_pickle(data_dir/'kaizen_slack/members.pickle')\n",
"df.to_pickle(data_dir/'kaizen_slack/messages.pickle')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Load"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfall = pd.read_pickle(data_dir/'kaizen_slack/all_messages.pickle')\n",
"df_channel = pd.read_pickle(data_dir/'kaizen_slack/channels.pickle')\n",
"df_member = pd.read_pickle(data_dir/'kaizen_slack/members.pickle')\n",
"df = pd.read_pickle(data_dir/'kaizen_slack/messages.pickle')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building ID-to-name mappings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"user_id_name_map = df_member.set_index('id')['name'].to_dict()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"channel_name_id_map = df_channel.set_index('id')['name'].to_dict()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the Slack timestamp to datetime\n",
"df['dt'] = pd.to_datetime(df['ts'], unit='s')\n",
"\n",
"# Keep only messages from the last year\n",
"one_year_ago = pd.to_datetime('2017-12-31')\n",
"df = df.query('@one_year_ago < dt')\n",
"\n",
"# Map user IDs to usernames\n",
"df['username'] = df['user'].map(user_id_name_map)\n",
"\n",
"# Map channel IDs to channel names\n",
"df['channel_name'] = df['channel_id'].map(channel_name_id_map)\n",
"\n",
"# Drop bot messages\n",
"df = df[df['bot_id'].isnull()]\n",
"df = df.query('username != \"cronbot\"').query('username != \"slackbot\"')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Keeping only ordinary messages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"not_message_types = ['channel_join', 'channel_leave', 'channel_topic', 'channel_archive', 'channel_purpose', 'sh_room_created', 'channel_name', 'pinned_item', 'reminder_add', 'app_conversation_join']\n",
"df = df[~df['subtype'].isin(not_message_types)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Users who get @-mentioned"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df['at_user'] = df['text'].str.extract(r'(?<=<@)(.{1,9})(?=>)')\n",
"df['at_username'] = df['at_user'].map(user_id_name_map)"
]
},
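{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's note (not in the original notebook):* `str.extract` above keeps only the first `<@USER_ID>` in each message. If every mention should count, a sketch using `str.findall` plus `Series.explode` (which needs pandas 0.25 or later, newer than the version this notebook targeted) could look like the next cell; it is an alternative, not what the original analysis used."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch (assumes pandas >= 0.25 for Series.explode): count every mention, not just the first one per message.\n",
"all_mentions = (df['text'].str.findall(r'<@([A-Z0-9]{1,11})>')\n",
"                .explode()\n",
"                .dropna()\n",
"                .map(user_id_name_map))\n",
"all_mentions.value_counts().head(5)"
]
},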
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Message counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"s = df['username'].value_counts()[:5]\n",
"\n",
"n = s.shape[0]\n",
"fig = plt.figure(figsize=(16,1*n))\n",
"\n",
"tmp_df = s.to_frame('value').reset_index().rename(columns={'index': 'name'})\n",
"ax = sns.barplot(x='value', y='name', palette=\"autumn\", data=tmp_df)\n",
"max_ = tmp_df['value'].max()\n",
"\n",
"for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
"    text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
"\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
"ax.tick_params(axis='y', labelsize=20)\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')\n",
"ax.set_title('2018年 発言数 TOP5', fontsize=30)\n",
"ax.patch.set_facecolor('white')\n",
"\n",
"ax.patch.set_alpha(0)\n",
"plt.grid(False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Number of times @-mentioned"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s = df['at_username'].value_counts()[:5]\n",
"\n",
"n = s.shape[0]\n",
"fig = plt.figure(figsize=(16,1*n))\n",
"\n",
"tmp_df = s.to_frame('value').reset_index().rename(columns={'index': 'name'})\n",
"ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
"max_ = tmp_df['value'].max()\n",
"\n",
"for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
"    text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
"\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
"ax.tick_params(axis='y', labelsize=20)\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')\n",
"ax.set_title('2018年 @された数 TOP5', fontsize=30)\n",
"ax.patch.set_facecolor('white')\n",
"\n",
"ax.patch.set_alpha(0)\n",
"plt.grid(False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Which reaction was used most?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"count_reaction = defaultdict(int)\n",
"for reactions in tqdm_notebook(df['reactions'].fillna('')):\n",
"    if len(reactions) == 0:\n",
"        continue\n",
"    for reaction in reactions:\n",
"        name = reaction['name']\n",
"        count = len(reaction['users'])\n",
"        count_reaction[name] += count"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s = pd.Series(count_reaction).sort_values(ascending=False)[:5]\n",
"\n",
"n = s.shape[0]\n",
"fig = plt.figure(figsize=(16,1*n))\n",
"\n",
"tmp_df = s.to_frame('value').reset_index().rename(columns={'index': 'name'})\n",
"ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
"max_ = tmp_df['value'].max()\n",
"\n",
"for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
"    text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
"\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
"ax.tick_params(axis='y', labelsize=20)\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')\n",
"ax.set_title('2018年 使われたリアクション TOP5', fontsize=30)\n",
"ax.patch.set_facecolor('white')\n",
"\n",
"ax.patch.set_alpha(0)\n",
"plt.grid(False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Message counts over time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"is_weekday_date = [dt for dt in pd.date_range('2018-1-1', '2018-12-16', freq='1D') if dt.weekday() in [0, 1, 2, 3, 4] and not jpholiday.is_holiday(dt.date())]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax = df.groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].plot(linewidth=2, linestyle='--', ax=ax)\n",
"ax = df.groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].rolling(window=5).mean().plot(linewidth=5, ax=ax)\n",
"ax.set_title('1日あたりの発言数(休日祝日は除く) ※破線が実数、実線が周期5の移動平均', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize='xx-large')\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Number of channels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_channel['created_dt'] = pd.to_datetime(df_channel['created'], unit='s')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s_added = df_channel.groupby(pd.Grouper(key='created_dt', freq='1W')).size()\n",
"s_archived = dfall.query('subtype == \"channel_archive\"').groupby(pd.Grouper(key='datetime', freq='1W')).size()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Building the channel transition data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_transition_channel = pd.concat([s_added, s_archived], axis=1).rename(columns={0: 'added', 1: 'archived'})\n",
"df_transition_channel.fillna(0, inplace=True)\n",
"df_transition_channel['count_channel'] = (df_transition_channel['added'] - df_transition_channel['archived']).cumsum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_transition_channel[one_year_ago < df_transition_channel.index]['added'].sum(), df_transition_channel[one_year_ago < df_transition_channel.index]['archived'].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 8))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"s = df_transition_channel['count_channel']\n",
"ax = s.plot(linewidth=5, linestyle='-', ax=ax, label='総Channel数')\n",
"ax.set_title('左軸: Channel数 右軸: 作成/アーカイブ数', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize=20)\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')\n",
"plt.legend(fontsize=20, loc='lower right')\n",
"\n",
"ax2 = ax.twinx()\n",
"s = df_transition_channel['added'].rolling(window=7).mean()\n",
"s.plot(linewidth=3, linestyle='--', ax=ax2, label='作成数/day', color='C1')\n",
"s = df_transition_channel['archived'].rolling(window=7).mean()\n",
"s.plot(linewidth=3, linestyle='--', ax=ax2, label='アーカイブ数/day', color='C3')\n",
"ax2.tick_params(axis='both', labelsize=15)\n",
"plt.legend(fontsize=20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# A closer look at the time series"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"weekday_str_map = {\n",
"    0: '月', 1: '火', 2: '水', 3: '木', 4: '金', 5: '土', 6: '日'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_daily = df.groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].to_frame('count')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_daily['weekday'] = df_daily.index.weekday.map(weekday_str_map)\n",
"df_daily['day_in_month'] = df_daily.index.day"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax = sns.boxplot(data=df_daily.sort_values('weekday'), x='weekday', y='count')\n",
"ax.set_title('曜日による発言数の分布', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize='x-large')\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax = sns.boxplot(data=df_daily.sort_values('day_in_month'), x='day_in_month', y='count')\n",
"ax.set_title('日付による発言数の分布', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize='x-large')\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"is_weekday = df['dt'].dt.weekday.isin([0, 1, 2, 3, 4]) & ~df['dt'].dt.date.apply(jpholiday.is_holiday)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_hourly = df[is_weekday].groupby(pd.Grouper(key='dt', freq='1h')).size().to_frame('count')\n",
"# Slack timestamps are UTC, so shift the hour to JST (+9)\n",
"df_hourly['hour'] = df_hourly.index.hour + 9\n",
"work_hours = list(range(9, 20))\n",
"\n",
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax = sns.boxplot(data=df_hourly.query('hour in @work_hours').sort_values('hour'), x='hour', y='count')\n",
"ax.set_title('時間帯による発言数の分布', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize='x-large')\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s = df.groupby('channel_name').size().sort_values(ascending=False)[:5]\n",
"\n",
"n = s.shape[0]\n",
"fig = plt.figure(figsize=(16,1*n))\n",
"\n",
"tmp_df = s.to_frame('value').reset_index().rename(columns={'channel_name': 'name'})\n",
"ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
"max_ = tmp_df['value'].max()\n",
"\n",
"for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
"    text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
"\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
"ax.tick_params(axis='y', labelsize=20)\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')\n",
"ax.set_title('2018年 発言が多かったChannel TOP5', fontsize=30)\n",
"ax.patch.set_facecolor('white')\n",
"\n",
"ax.patch.set_alpha(0)\n",
"plt.grid(False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax = df.query('channel_name == \"ad-cs\"').groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].plot(linewidth=2, linestyle='--', ax=ax)\n",
"ax = df.query('channel_name == \"ad-cs\"').groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].rolling(window=5).mean().plot(linewidth=5, ax=ax)\n",
"ax.set_title('ad-csの1日あたりの発言数(休日祝日は除く) ※破線が実数、実線が周期5の移動平均', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize='xx-large')\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax = df.query('channel_name == \"times_ikedayu\"').groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].plot(linewidth=2, linestyle='--', ax=ax)\n",
"ax = df.query('channel_name == \"times_ikedayu\"').groupby(pd.Grouper(key='dt', freq='1D')).size()[is_weekday_date].rolling(window=5).mean().plot(linewidth=5, ax=ax)\n",
"ax.set_title('times_ikedayuの1日あたりの発言数(休日祝日は除く) ※破線が実数、実線が周期5の移動平均', fontsize=20)\n",
"ax.tick_params(axis='both', labelsize='xx-large')\n",
"ax.set_xlabel('')\n",
"ax.set_ylabel('')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# s = df[df['channel_name'].str.contains('times_')].groupby('channel_name').size().sort_values(ascending=False)[:5]\n",
"\n",
"# n = s.shape[0]\n",
"# fig = plt.figure(figsize=(16,1*n))\n",
"\n",
"# tmp_df = s.to_frame('value').reset_index().rename(columns={'channel_name': 'name'})\n",
"# ax = sns.barplot(x='value', y='name', data=tmp_df, palette=\"autumn\")\n",
"# max_ = tmp_df['value'].max()\n",
"\n",
"# for i, (_, row) in enumerate(tmp_df.iterrows()):\n",
"#     text = ax.text(row['value'] + max_*.05, i+0.1, row['value'], color='black', ha=\"center\", fontsize=20)\n",
"\n",
"# [spine.set_visible(False) for spine in ax.spines.values()]\n",
"# ax.tick_params(bottom=False, left=False, labelbottom=False)\n",
"# ax.tick_params(axis='y', labelsize=20)\n",
"# ax.set_xlabel('')\n",
"# ax.set_ylabel('')\n",
"# ax.set_title('2018年 発言が多かったtimes TOP5', fontsize=30)\n",
"# ax.patch.set_facecolor('white')\n",
"\n",
"# ax.patch.set_alpha(0)\n",
"# plt.grid(False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def plot_hist(s, title, bins=30, xlabel='', ylabel=''):\n",
"    fig = plt.figure(figsize=(16, 4))\n",
"    ax = fig.add_subplot(1, 1, 1)\n",
"\n",
"    mean = s.mean().round(2)\n",
"    median = s.median().round(2)\n",
"    std = s.std().round(2)\n",
"\n",
"    sns.distplot(s, ax=ax, bins=bins, kde_kws={\"color\": \"k\", \"lw\": 3})\n",
"    ax.set_title(title, fontsize=20)\n",
"    ax.tick_params(axis='x', which='major', labelsize=20)\n",
"    vals = ax.get_yticks()\n",
"    ax.set_xlabel(xlabel, fontsize=20)\n",
"    ax.set_ylabel(ylabel, fontsize=20)\n",
"    ax.set_yticklabels(['{:,.2%}'.format(x) for x in vals])\n",
"    ax.text(0.99, 0.99, f\"平均値: {mean:.2f} \\n 中央値: {median:.2f} \\n 標準偏差: {std:.2f}\", horizontalalignment='right', verticalalignment='top', transform=ax.transAxes, fontsize=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s = df.groupby('channel_name').size().sort_values(ascending=False)\n",
"s = s[s > 0]\n",
"plot_hist(s, 'チャンネルごとの発言数のヒストグラム', bins=100, xlabel='発言数')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s = df.groupby('channel_name').size().sort_values(ascending=False)\n",
"sum_ = s.sum()\n",
"tmp_df = pd.concat([s, s.cumsum(), 100*s.cumsum()/sum_], axis=1)\n",
"tmp_df.columns = ['number', 'cumsum', 'cumsum_percent']\n",
"\n",
"fig = plt.figure(figsize=(16, 4))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"\n",
"n = tmp_df.shape[0]\n",
"x = np.arange(0, n)\n",
"ax.plot(x, tmp_df['cumsum_percent'], linewidth=5)\n",
"ax.tick_params(axis='both', labelsize=20)\n",
"ax.set_xlabel('Channel数', fontsize=20)\n",
"ax.set_ylabel('発言数の累積%', fontsize=20)\n",
"ax.set_title('Slack Channelと発言数のパレート図', fontsize=30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Natural language processing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df['text_trimed'] = df['text'].str.replace(r'<\\S+>', '').str.replace(r':\\S+:', '').str.replace('\\n', '')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Word clouds"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tmp_df = df.groupby('channel_name').size().sort_values(ascending=False)\n",
"top100_channel = tmp_df[:100].index.tolist()\n",
"top10_channel = tmp_df[:10].index.tolist()\n",
"top20_channel = tmp_df[:20].index.tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# NOTE: split_into_words is defined in the next cell; run that cell before this one.\n",
"grouped = df.query('channel_name in @top100_channel').groupby('channel_name')\n",
"channel_words = {}\n",
"for channel_name, tmp_df in tqdm_notebook(grouped):\n",
"    doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
"    channel_words[channel_name] = split_into_words(doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def split_into_words(doc):\n",
"    mecab = MeCab.Tagger(\"-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd\")\n",
"    lines = mecab.parse(doc).splitlines()\n",
"    words = []\n",
"    for line in tqdm_notebook(lines):\n",
"        chunks = line.split('\\t')\n",
"        if len(chunks) > 3 and (chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))):\n",
"            words.append(chunks[0])\n",
"    return words"
]
},
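{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's note (not in the original notebook):* `split_into_words` above builds a fresh `MeCab.Tagger` on every call and wraps the inner loop in `tqdm_notebook`, which gets slow across 100 channels. A possible variant, assuming the same `-Ochasen` / NEologd dictionary path, creates the tagger once and reuses it. This is a sketch, not the code the original analysis ran."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: build the tagger once and reuse it across channels (same filtering logic as split_into_words above).\n",
"tagger = MeCab.Tagger(\"-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd\")\n",
"\n",
"def split_into_words_fast(doc):\n",
"    words = []\n",
"    for line in tagger.parse(doc).splitlines():\n",
"        chunks = line.split('\\t')\n",
"        if len(chunks) > 3 and (chunks[3].startswith('動詞') or chunks[3].startswith('形容詞') or (chunks[3].startswith('名詞') and not chunks[3].startswith('名詞-数'))):\n",
"            words.append(chunks[0])\n",
"    return words"
]
},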
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"stop_words = [ 'てる', 'いる', 'なる', 'れる', 'する', 'ある', 'こと', 'これ', 'さん', 'して', \\\n",
"    'くれる', 'やる', 'くださる', 'そう', 'せる', 'した', '思う', \\\n",
"    'それ', 'ここ', 'ちゃん', 'くん', '', 'て','に','を','は','の', 'が', 'と', 'た', 'し', 'で', \\\n",
"    'ない', 'も', 'な', 'い', 'か', 'ので', 'よう', '', '思い', 'なっ', 'でき', 'いい', 'もの', 'あり', 'なり', 'ところ',\n",
"    'こちら', '本日', 'おり', 'ください', 'お願い', 'いたし', 'ため', 'いただき', 'gt', 'commented', 'on', '思っ', '行っ',\n",
"    'しまっ', 'やっ', '行き', 'とき', 'できる', '自分', '書い', 'あと'\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"channel_name = 'general'\n",
"\n",
"tmp_df = df.query(f'channel_name == \"{channel_name}\"')\n",
"tmp_doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
"tmp_words = split_into_words(tmp_doc)\n",
"\n",
"fig = plt.figure(figsize=(16, 10))\n",
"fpath = \"/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc\"\n",
"\n",
"wordcloud = WordCloud(background_color=\"white\", width=900, height=500, font_path=fpath, stopwords=stop_words).generate(\" \".join(tmp_words))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax.imshow(wordcloud)\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"\n",
"ax.set_yticklabels([])\n",
"ax.set_xticklabels([])\n",
"ax.grid(False)\n",
"ax.set_title(f'#{channel_name}', fontsize=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"channel_name = 'random'\n",
"\n",
"tmp_df = df.query(f'channel_name == \"{channel_name}\"')\n",
"tmp_doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
"tmp_words = split_into_words(tmp_doc)\n",
"\n",
"fig = plt.figure(figsize=(16, 10))\n",
"fpath = \"/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc\"\n",
"\n",
"wordcloud = WordCloud(background_color=\"white\", width=900, height=500, font_path=fpath, stopwords=stop_words).generate(\" \".join(tmp_words))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax.imshow(wordcloud)\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"\n",
"ax.set_yticklabels([])\n",
"ax.set_xticklabels([])\n",
"ax.grid(False)\n",
"ax.set_title(f'#{channel_name}', fontsize=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"channel_name = 'times_ikedayu'\n",
"\n",
"tmp_df = df.query(f'channel_name == \"{channel_name}\"')\n",
"tmp_doc = ''.join(tmp_df['text_trimed'].values.tolist())\n",
"tmp_words = split_into_words(tmp_doc)\n",
"\n",
"fig = plt.figure(figsize=(16, 10))\n",
"fpath = \"/System/Library/Fonts/ヒラギノ角ゴシック W3.ttc\"\n",
"\n",
"wordcloud = WordCloud(background_color=\"white\", width=900, height=500, font_path=fpath, stopwords=stop_words).generate(\" \".join(tmp_words))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"ax.imshow(wordcloud)\n",
"[spine.set_visible(False) for spine in ax.spines.values()]\n",
"\n",
"ax.set_yticklabels([])\n",
"ax.set_xticklabels([])\n",
"ax.grid(False)\n",
"ax.set_title(f'#{channel_name}', fontsize=20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Doc2Vec"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sentences = []\n",
"for channel_name, words in channel_words.items():\n",
"    td = TaggedDocument(words=words, tags=[channel_name])\n",
"    sentences.append(td)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = models.Doc2Vec(sentences, dm=0, vector_size=300, window=15, alpha=.025, min_alpha=.025, min_count=1, sample=1e-6)\n",
"\n",
"print('\\n訓練開始')\n",
"for epoch in range(20):\n",
"    print('Epoch: {}'.format(epoch + 1))\n",
"    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)\n",
"    model.alpha -= (0.025 - 0.0001) / 19\n",
"    model.min_alpha = model.alpha"
]
},
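{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's note (not in the original notebook):* the loop above retrains the same corpus 20 times while decaying `alpha` by hand, which the gensim documentation discourages for recent versions. The next cell is only a sketch of the equivalent one-shot setup, assuming the gensim 3.x-style parameter names already used in this notebook; hyper-parameters are copied from the cell above and `epochs=20` mirrors the manual loop."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: let gensim handle the learning-rate schedule by passing epochs up front.\n",
"model_alt = models.Doc2Vec(sentences, dm=0, vector_size=300, window=15,\n",
"                           min_count=1, sample=1e-6, epochs=20)\n",
"# Quick check that the trained tags behave like the original model's.\n",
"model_alt.docvecs.most_similar(top20_channel[0], topn=3)"
]
},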
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.save(str(data_dir/'kaizen_slack/d2v.model'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = models.Doc2Vec.load(str(data_dir/'kaizen_slack/d2v.model'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"ls_similar_doc = []\n",
"for channel_name in top20_channel:\n",
"    similar_doc = {'channel_name': channel_name}\n",
"    similar_channels = model.docvecs.most_similar(channel_name, topn=3)\n",
"    for i, (cname, value) in enumerate(similar_channels):\n",
"        text = f\"{cname}({value:.2f})\"\n",
"        similar_doc[f'{i+1}位'] = text\n",
"    ls_similar_doc.append(similar_doc)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_sim = pd.DataFrame(ls_similar_doc)[['channel_name', '1位', '2位', '3位']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"df_sim[df_sim['channel_name'].str.contains('times_')].to_clipboard(sep=';')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# t-SNE"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target_channels = [c for c in top20_channel if 'kz-' not in c]\n",
"X = np.stack([model.docvecs[cname] for cname in target_channels])\n",
"X_embedded = TSNE(n_components=2, n_iter=100000, learning_rate=4).fit_transform(X)"
]
},
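{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Editor's note (not in the original notebook):* the xlim/ylim values hard-coded in the plotting cell below belong to one particular t-SNE run; re-running the embedding will usually produce coordinates on a different scale. A small sketch that derives the limits from the embedding itself (the 5% margin is an arbitrary choice) is shown here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: derive plot limits from the embedding instead of hard-coding them,\n",
"# since t-SNE coordinates change from run to run.\n",
"margin_x = 0.05 * (X_embedded[:, 0].max() - X_embedded[:, 0].min())\n",
"margin_y = 0.05 * (X_embedded[:, 1].max() - X_embedded[:, 1].min())\n",
"xlim = (X_embedded[:, 0].min() - margin_x, X_embedded[:, 0].max() + margin_x)\n",
"ylim = (X_embedded[:, 1].min() - margin_y, X_embedded[:, 1].max() + margin_y)"
]
},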
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fig = plt.figure(figsize=(16, 8))\n",
"ax = fig.add_subplot(1, 1, 1)\n",
"\n",
"ax.scatter(X_embedded.T[0], X_embedded.T[1])\n",
"ax.set_xlim(-0.105, -0.094)\n",
"ax.set_ylim(-0.075, -0.063)\n",
"fontsize = 19\n",
"\n",
"for i, c_name in enumerate(target_channels):\n",
"    if c_name == 'ad-cs':\n",
"        ax.annotate(c_name, (X_embedded[i][0]+0.0001, X_embedded[i][1]+0.0005), fontsize=fontsize)\n",
"    elif c_name == 'support-tech':\n",
"        ax.annotate(c_name, (X_embedded[i][0]+0.0001, X_embedded[i][1]-0.0005), fontsize=fontsize)\n",
"    elif c_name == 'cs-engineering':\n",
"        ax.annotate(c_name, (X_embedded[i][0]+0.0001, X_embedded[i][1]-0.0005), fontsize=fontsize)\n",
"    elif c_name == 'prd-random':\n",
"        ax.annotate(c_name, (X_embedded[i][0]+0.0001, X_embedded[i][1]-0.0005), fontsize=fontsize)\n",
"    elif c_name == 'ad-dev-qa':\n",
"        ax.annotate(c_name, (X_embedded[i][0]-0.001, X_embedded[i][1]+0.0005), fontsize=fontsize)\n",
"    elif c_name == 'prd-team-sre':\n",
"        ax.annotate(c_name, (X_embedded[i][0]+0.0001, X_embedded[i][1]-0.0007), fontsize=fontsize)\n",
"    else:\n",
"        ax.annotate(c_name, (X_embedded[i][0]+0.0001, X_embedded[i][1]+0.0001), fontsize=fontsize)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
} |