Last active
May 4, 2018 00:52
-
-
Save takezoe/c574410f38fdb11d0495a5577c42166a to your computer and use it in GitHub Desktop.
scikit-learnでGitHubのイシューのラベル判定をやってみた
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"from sklearn.feature_extraction.text import CountVectorizer\n", | |
"from sklearn.naive_bayes import BernoulliNB\n", | |
"from sklearn.model_selection import train_test_split" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Load the exported GitHub issues; lines=True reads one JSON record per line.\n", | |
"df = pd.read_json(\n", | |
"    \"https://github.com/takezoe/github-issues-exporter/raw/master/export.json\",\n", | |
"    lines=True,\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>body</th>\n", | |
" <th>isPullRequest</th>\n", | |
" <th>labels</th>\n", | |
" <th>title</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>This PR add show SMTP error message as follows...</td>\n", | |
" <td>True</td>\n", | |
" <td>[improvement]</td>\n", | |
" <td>Show SMTP Error message in testing email settings</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>In 4.24.0, branch protection setting has probl...</td>\n", | |
" <td>True</td>\n", | |
" <td>[bug]</td>\n", | |
" <td>Fix branch protection problem</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>This bug introduced by #1962. It changed [diff...</td>\n", | |
" <td>True</td>\n", | |
" <td>[bug]</td>\n", | |
" <td>Fix editor preview bug</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>9</th>\n", | |
" <td>In #1880 I figured out this problem. #1880 is ...</td>\n", | |
" <td>False</td>\n", | |
" <td>[bug]</td>\n", | |
" <td>Issue/PullRequest hooks doesn't called when is...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>10</th>\n", | |
" <td>This PR fixes #1880.\\r\\n\\r\\nBut, it doesn't ca...</td>\n", | |
" <td>True</td>\n", | |
" <td>[bug]</td>\n", | |
" <td>call issue closed webhook when pushed commit c...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>11</th>\n", | |
" <td>In [git-flow](https://github.com/nvie/gitflow)...</td>\n", | |
" <td>True</td>\n", | |
" <td>[feature]</td>\n", | |
" <td>close and mark as merged PR by pushed commits</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>12</th>\n", | |
" <td>PR message shows \"into user:branch from user :...</td>\n", | |
" <td>True</td>\n", | |
" <td>[bug]</td>\n", | |
" <td>don't separate user:branch in PR message</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>14</th>\n", | |
" <td>### Before submitting a pull-request to GitBuc...</td>\n", | |
" <td>True</td>\n", | |
" <td>[improvement]</td>\n", | |
" <td>show tags on commits page</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>16</th>\n", | |
" <td>Hi.\\r\\n\\r\\nI try to trigger a jenkins pipeline...</td>\n", | |
" <td>False</td>\n", | |
" <td>[question]</td>\n", | |
" <td>Trigger a Jenkins Pipeline after a Push</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>18</th>\n", | |
" <td>GitBucket 4.23.1.\\r\\n\\r\\nCreate a repo, then c...</td>\n", | |
" <td>False</td>\n", | |
" <td>[bug]</td>\n", | |
" <td>Internal server error when you try to download...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>19</th>\n", | |
" <td>### Before submitting a pull-request to GitBuc...</td>\n", | |
" <td>True</td>\n", | |
" <td>[improvement]</td>\n", | |
" <td>Update with propper mobile/tablet scalling</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>27</th>\n", | |
" <td>Discussed in #1265.\\r\\n\\r\\n</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1987</th>\n", | |
" <td></td>\n", | |
" <td>False</td>\n", | |
" <td>[feature, plugin]</td>\n", | |
" <td>Star</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1988</th>\n", | |
" <td></td>\n", | |
" <td>False</td>\n", | |
" <td>[feature, plugin]</td>\n", | |
" <td>Statistics</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1989</th>\n", | |
" <td></td>\n", | |
" <td>False</td>\n", | |
" <td>[feature]</td>\n", | |
" <td>Network graph</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1990</th>\n", | |
" <td></td>\n", | |
" <td>False</td>\n", | |
" <td>[feature]</td>\n", | |
" <td>User (and repository) activity timeline</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1991</th>\n", | |
" <td></td>\n", | |
" <td>False</td>\n", | |
" <td>[feature]</td>\n", | |
" <td>Repository search</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1992</th>\n", | |
" <td></td>\n", | |
" <td>False</td>\n", | |
" <td>[feature]</td>\n", | |
" <td>Fork and pull request</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1382 rows × 4 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" body isPullRequest \\\n", | |
"0 This PR add show SMTP error message as follows... True \n", | |
"3 In 4.24.0, branch protection setting has probl... True \n", | |
"6 This bug introduced by #1962. It changed [diff... True \n", | |
"9 In #1880 I figured out this problem. #1880 is ... False \n", | |
"10 This PR fixes #1880.\\r\\n\\r\\nBut, it doesn't ca... True \n", | |
"11 In [git-flow](https://github.com/nvie/gitflow)... True \n", | |
"12 PR message shows \"into user:branch from user :... True \n", | |
"14 ### Before submitting a pull-request to GitBuc... True \n", | |
"16 Hi.\\r\\n\\r\\nI try to trigger a jenkins pipeline... False \n", | |
"18 GitBucket 4.23.1.\\r\\n\\r\\nCreate a repo, then c... False \n", | |
"19 ### Before submitting a pull-request to GitBuc... True \n", | |
"27 Discussed in #1265.\\r\\n\\r\\n \n", | |
"1987 Star \n", | |
"1988 Statistics \n", | |
"1989 Network graph \n", | |
"1990 User (and repository) activity timeline \n", | |
"1991 Repository search \n", | |
"1992 Fork and pull request \n", | |
"\n", | |
"[1382 rows x 4 columns]" | |
] | |
}, | |
"execution_count": 16, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Keep only the issues that carry at least one of the labels we want to classify.\n", | |
"TARGET_LABELS = (\"bug\", \"question\", \"improvement\", \"feature\")\n", | |
"# The mask is deliberately not called `filter`, so the built-in filter() stays usable.\n", | |
"has_target_label = df[\"labels\"].map(lambda x: any(label in x for label in TARGET_LABELS))\n", | |
"df2 = df[has_target_label]\n", | |
"df2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Collapse each issue's labels into one of three classes: bug, question, request.\n", | |
"# \"bug\" wins over \"question\"; everything else is treated as \"request\".\n", | |
"def to_category(issue_labels):\n", | |
"    for category in (\"bug\", \"question\"):\n", | |
"        if category in issue_labels:\n", | |
"            return category\n", | |
"    return \"request\"\n", | |
"\n", | |
"labels = df2[\"labels\"].map(to_category)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"9844" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Vectorize the title and body joined into a single text per issue.\n", | |
"# Missing bodies are replaced by an empty string first: a bare str() cast would turn\n", | |
"# them into the literal words \"None\"/\"nan\" and add those words to the vocabulary.\n", | |
"text = df2[\"title\"] + \" \" + df2[\"body\"].fillna(\"\").map(str)\n", | |
"count = CountVectorizer()\n", | |
"vector = count.fit_transform(text)\n", | |
"vocabulary = count.vocabulary_\n", | |
"len(vocabulary)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Split into training data (90%) and test data (10%).\n", | |
"# A fixed random_state makes the split, and every score computed from it, repeatable.\n", | |
"train_vector, test_vector, train_label, test_label = train_test_split(\n", | |
"    vector, labels, test_size=0.1, random_state=42\n", | |
")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Train a Bernoulli naive Bayes classifier on the training split.\n", | |
"# fit() returns the estimator itself, so it can be chained onto the constructor.\n", | |
"model = BernoulliNB().fit(train_vector, train_label)\n", | |
"model" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Train accuracy: 0.681\n", | |
"Test accuracy: 0.626\n" | |
] | |
} | |
], | |
"source": [ | |
"# Check the mean accuracy on the training split and on the held-out test split.\n", | |
"train_accuracy = model.score(train_vector, train_label)\n", | |
"test_accuracy = model.score(test_vector, test_label)\n", | |
"print('Train accuracy: {:.3f}'.format(train_accuracy))\n", | |
"print('Test accuracy: {:.3f}'.format(test_accuracy))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"array(['request'], dtype='<U8')" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Try the model on a single unseen issue title.\n", | |
"# Reuse the vectorizer fitted on the issue texts and call transform() only: new text\n", | |
"# must be encoded with the training vocabulary, and `count` must not be rebound or\n", | |
"# refitted on inference input.\n", | |
"test = pd.Series([\"Empty URL markdown link causes java.lang.NullPointerException\"])\n", | |
"vect = count.transform(test)\n", | |
"model.predict(vect)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment