Created
October 27, 2016 13:50
-
-
Save ledmonster/b65560f20e14c67da81f7eb067af995c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd\n", | |
"import re" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df = pd.read_csv('data/tweets_20161027.tsv', sep='\\t')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>@user_name texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext http://url</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>【交換】</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 @user_name texttext #tag #tag\n", | |
"1 1 2 3 0.8 texttext #tag\n", | |
"2 0 2 3 0.4 RT texttext\n", | |
"3 1 2 3 0.8 RT texttext\n", | |
"4 1 2 3 0.8 texttext http://url\n", | |
"5 1 2 3 0.4 texttext\n", | |
"6 1 2 3 0.4 【交換】" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def is_rt(tweet):\n", | |
" matched_texts = rt_pattern.findall(tweet) \n", | |
" is_retweet = len(matched_texts) > 0 \n", | |
" return is_retweet" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"rt_pattern = re.compile(r'^RT\\s(.*)')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"result = df.ix[\n", | |
" (~df['tweet'].apply(is_rt)) & \n", | |
" (df['num4'] < 0.8) \n", | |
"] " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>@user_name texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>【交換】</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 @user_name texttext #tag #tag\n", | |
"5 1 2 3 0.4 texttext\n", | |
"6 1 2 3 0.4 【交換】" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"result" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### @マークを除く" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"re_atmark = re.compile(r'@[A-Za-z0-9_]+')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'NAME texttext #tag #tag'" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"re_atmark.sub('NAME', '@user_name texttext #tag #tag')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'こんにちは さん、 さん'" | |
] | |
}, | |
"execution_count": 12, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"re_atmark.sub('', 'こんにちは @junya さん、@hiro さん')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def remove_atmark(tweet):\n", | |
" return re_atmark.sub('', tweet).strip()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 texttext #tag #tag\n", | |
"1 texttext #tag\n", | |
"2 RT texttext\n", | |
"3 RT texttext\n", | |
"4 texttext http://url\n", | |
"5 texttext\n", | |
"6 【交換】\n", | |
"Name: tweet, dtype: object" | |
] | |
}, | |
"execution_count": 14, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
" df['tweet'].apply(remove_atmark)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df2 = df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df2['tweet'] = df['tweet'].apply(remove_atmark)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext http://url</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>6</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>【交換】</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 texttext #tag #tag\n", | |
"1 1 2 3 0.8 texttext #tag\n", | |
"2 0 2 3 0.4 RT texttext\n", | |
"3 1 2 3 0.8 RT texttext\n", | |
"4 1 2 3 0.8 texttext http://url\n", | |
"5 1 2 3 0.4 texttext\n", | |
"6 1 2 3 0.4 【交換】" | |
] | |
}, | |
"execution_count": 17, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df2" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"True" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'_'=='_'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'abcc'" | |
] | |
}, | |
"execution_count": 19, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"' abcc '.strip()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### 譲渡系アカウントを外す" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 20, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"re_ng_word = re.compile(r'交換')\n", | |
"def has_ng_word(tweet):\n", | |
" return tweet.find('交換')>0\n", | |
" # return bool(re_ng_word.match(tweet))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"text = 'アカウント交換してください'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 22, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"5" | |
] | |
}, | |
"execution_count": 22, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"text.find('交換') " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"re_ng_word = re.compile(r'交換|置換|売買')\n", | |
"def has_ng_word2(tweet):\n", | |
" return bool(re_ng_word.search(tweet))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 24, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'アカウント交換してください'" | |
] | |
}, | |
"execution_count": 24, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"text" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 25, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"re.match?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 26, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 True\n", | |
"1 True\n", | |
"2 True\n", | |
"3 True\n", | |
"4 True\n", | |
"5 True\n", | |
"6 False\n", | |
"Name: tweet, dtype: bool" | |
] | |
}, | |
"execution_count": 26, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"~df2['tweet'].apply(has_ng_word2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df3 = df2[~df2['tweet'].apply(has_ng_word2)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 28, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext http://url</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 texttext #tag #tag\n", | |
"1 1 2 3 0.8 texttext #tag\n", | |
"2 0 2 3 0.4 RT texttext\n", | |
"3 1 2 3 0.8 RT texttext\n", | |
"4 1 2 3 0.8 texttext http://url\n", | |
"5 1 2 3 0.4 texttext" | |
] | |
}, | |
"execution_count": 28, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### ツイートの長さ" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 29, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 False\n", | |
"1 False\n", | |
"2 False\n", | |
"3 False\n", | |
"4 False\n", | |
"5 True\n", | |
"Name: tweet, dtype: bool" | |
] | |
}, | |
"execution_count": 29, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3['tweet'].str.len() < 10" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext http://url</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 texttext #tag #tag\n", | |
"1 1 2 3 0.8 texttext #tag\n", | |
"2 0 2 3 0.4 RT texttext\n", | |
"3 1 2 3 0.8 RT texttext\n", | |
"4 1 2 3 0.8 texttext http://url" | |
] | |
}, | |
"execution_count": 30, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3[(df3['tweet'].str.len() >= 10)]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 31, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext http://url</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 texttext #tag #tag\n", | |
"1 1 2 3 0.8 texttext #tag\n", | |
"2 0 2 3 0.4 RT texttext\n", | |
"3 1 2 3 0.8 RT texttext\n", | |
"4 1 2 3 0.8 texttext http://url" | |
] | |
}, | |
"execution_count": 31, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3[(df3['tweet'].str.len() >= 10) & (df3['tweet'].str.len() >= 10)]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### URL を外す" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 32, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"re_url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'before URL abc'" | |
] | |
}, | |
"execution_count": 33, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"re_url.sub('URL', 'before http://www.google.com/abc abc')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 34, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def remove_url(tweet):\n", | |
" return re_url.sub('', tweet)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 35, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 texttext #tag #tag\n", | |
"1 texttext #tag\n", | |
"2 RT texttext\n", | |
"3 RT texttext\n", | |
"4 texttext \n", | |
"5 texttext\n", | |
"Name: tweet, dtype: object" | |
] | |
}, | |
"execution_count": 35, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3['tweet'].apply(remove_url)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 36, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import copy\n", | |
"df4 = copy.copy(df3)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 37, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"df4['tweet'] = df3['tweet'].apply(remove_url)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 38, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>num1</th>\n", | |
" <th>num2</th>\n", | |
" <th>num3</th>\n", | |
" <th>num4</th>\n", | |
" <th>tweet</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext #tag #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext #tag</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>0</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>RT texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.8</td>\n", | |
" <td>texttext</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>5</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>3</td>\n", | |
" <td>0.4</td>\n", | |
" <td>texttext</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" num1 num2 num3 num4 tweet\n", | |
"0 1 2 3 0.4 texttext #tag #tag\n", | |
"1 1 2 3 0.8 texttext #tag\n", | |
"2 0 2 3 0.4 RT texttext\n", | |
"3 1 2 3 0.8 RT texttext\n", | |
"4 1 2 3 0.8 texttext \n", | |
"5 1 2 3 0.4 texttext" | |
] | |
}, | |
"execution_count": 38, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df4" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### タグを除く" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"re_tag = re.compile(r'<[^>]+>')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'aaa link ddd'" | |
] | |
}, | |
"execution_count": 40, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"re_tag.sub('', 'aaa <a href=\"fff\">link</a> ddd')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def remove_tag(tweet):\n", | |
" return re_tag.sub('', tweet)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 texttext #tag #tag\n", | |
"1 texttext #tag\n", | |
"2 RT texttext\n", | |
"3 RT texttext\n", | |
"4 texttext \n", | |
"5 texttext\n", | |
"Name: tweet, dtype: object" | |
] | |
}, | |
"execution_count": 42, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3['tweet'].apply(remove_tag)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 62, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"tweet = 'hello <a href=\">aaa\"> foo'\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 63, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'hello aaa\"> foo'" | |
] | |
}, | |
"execution_count": 63, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"remove_tag(tweet)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 64, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import lxml" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 65, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import lxml.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 66, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"'hello foo'" | |
] | |
}, | |
"execution_count": 66, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"lxml.html.fromstring(tweet).text_content()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 67, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def remove_tag2(tweet):\n", | |
" return lxml.html.fromstring(tweet).text_content()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 68, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 texttext #tag #tag\n", | |
"1 texttext #tag\n", | |
"2 RT texttext\n", | |
"3 RT texttext\n", | |
"4 texttext \n", | |
"5 texttext\n", | |
"Name: tweet, dtype: object" | |
] | |
}, | |
"execution_count": 68, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df3['tweet'].apply(remove_tag2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment