Skip to content

Instantly share code, notes, and snippets.

@ledmonster
Created October 27, 2016 13:50
Show Gist options
  • Save ledmonster/b65560f20e14c67da81f7eb067af995c to your computer and use it in GitHub Desktop.
Save ledmonster/b65560f20e14c67da81f7eb067af995c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df = pd.read_csv('data/tweets_20161027.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>@user_name texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext http://url</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>【交換】</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 @user_name texttext #tag #tag\n",
"1 1 2 3 0.8 texttext #tag\n",
"2 0 2 3 0.4 RT texttext\n",
"3 1 2 3 0.8 RT texttext\n",
"4 1 2 3 0.8 texttext http://url\n",
"5 1 2 3 0.4 texttext\n",
"6 1 2 3 0.4 【交換】"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def is_rt(tweet):\n",
" matched_texts = rt_pattern.findall(tweet) \n",
" is_retweet = len(matched_texts) > 0 \n",
" return is_retweet"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"rt_pattern = re.compile(r'^RT\\s(.*)')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"result = df.ix[\n",
" (~df['tweet'].apply(is_rt)) & \n",
" (df['num4'] < 0.8) \n",
"] "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>@user_name texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>【交換】</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 @user_name texttext #tag #tag\n",
"5 1 2 3 0.4 texttext\n",
"6 1 2 3 0.4 【交換】"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### @マークを除く"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"re_atmark = re.compile(r'@[A-Za-z0-9_]+')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'NAME texttext #tag #tag'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re_atmark.sub('NAME', '@user_name texttext #tag #tag')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'こんにちは さん、 さん'"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re_atmark.sub('', 'こんにちは @junya さん、@hiro さん')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def remove_atmark(tweet):\n",
" return re_atmark.sub('', tweet).strip()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 texttext #tag #tag\n",
"1 texttext #tag\n",
"2 RT texttext\n",
"3 RT texttext\n",
"4 texttext http://url\n",
"5 texttext\n",
"6 【交換】\n",
"Name: tweet, dtype: object"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
" df['tweet'].apply(remove_atmark)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df2 = df"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df2['tweet'] = df['tweet'].apply(remove_atmark)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext http://url</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>【交換】</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 texttext #tag #tag\n",
"1 1 2 3 0.8 texttext #tag\n",
"2 0 2 3 0.4 RT texttext\n",
"3 1 2 3 0.8 RT texttext\n",
"4 1 2 3 0.8 texttext http://url\n",
"5 1 2 3 0.4 texttext\n",
"6 1 2 3 0.4 【交換】"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'_'=='_'"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'abcc'"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"' abcc '.strip()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 譲渡系アカウントを外す"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"re_ng_word = re.compile(r'交換')\n",
"def has_ng_word(tweet):\n",
" return tweet.find('交換')>0\n",
" # return bool(re_ng_word.match(tweet))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"text = 'アカウント交換してください'"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text.find('交換') "
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"re_ng_word = re.compile(r'交換|置換|売買')\n",
"def has_ng_word2(tweet):\n",
" return bool(re_ng_word.search(tweet))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'アカウント交換してください'"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"re.match?"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
"5 True\n",
"6 False\n",
"Name: tweet, dtype: bool"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"~df2['tweet'].apply(has_ng_word2)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df3 = df2[~df2['tweet'].apply(has_ng_word2)]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext http://url</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 texttext #tag #tag\n",
"1 1 2 3 0.8 texttext #tag\n",
"2 0 2 3 0.4 RT texttext\n",
"3 1 2 3 0.8 RT texttext\n",
"4 1 2 3 0.8 texttext http://url\n",
"5 1 2 3 0.4 texttext"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### ツイートの長さ"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 False\n",
"1 False\n",
"2 False\n",
"3 False\n",
"4 False\n",
"5 True\n",
"Name: tweet, dtype: bool"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3['tweet'].str.len() < 10"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext http://url</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 texttext #tag #tag\n",
"1 1 2 3 0.8 texttext #tag\n",
"2 0 2 3 0.4 RT texttext\n",
"3 1 2 3 0.8 RT texttext\n",
"4 1 2 3 0.8 texttext http://url"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3[(df3['tweet'].str.len() >= 10)]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext http://url</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 texttext #tag #tag\n",
"1 1 2 3 0.8 texttext #tag\n",
"2 0 2 3 0.4 RT texttext\n",
"3 1 2 3 0.8 RT texttext\n",
"4 1 2 3 0.8 texttext http://url"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3[(df3['tweet'].str.len() >= 10) & (df3['tweet'].str.len() >= 10)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### URL を外す"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"re_url = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'before URL abc'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re_url.sub('URL', 'before http://www.google.com/abc abc')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def remove_url(tweet):\n",
" return re_url.sub('', tweet)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 texttext #tag #tag\n",
"1 texttext #tag\n",
"2 RT texttext\n",
"3 RT texttext\n",
"4 texttext \n",
"5 texttext\n",
"Name: tweet, dtype: object"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3['tweet'].apply(remove_url)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import copy\n",
"df4 = copy.copy(df3)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df4['tweet'] = df3['tweet'].apply(remove_url)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>num1</th>\n",
" <th>num2</th>\n",
" <th>num3</th>\n",
" <th>num4</th>\n",
" <th>tweet</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext #tag #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext #tag</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>RT texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.8</td>\n",
" <td>texttext</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0.4</td>\n",
" <td>texttext</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" num1 num2 num3 num4 tweet\n",
"0 1 2 3 0.4 texttext #tag #tag\n",
"1 1 2 3 0.8 texttext #tag\n",
"2 0 2 3 0.4 RT texttext\n",
"3 1 2 3 0.8 RT texttext\n",
"4 1 2 3 0.8 texttext \n",
"5 1 2 3 0.4 texttext"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df4"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### タグを除く"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"re_tag = re.compile(r'<[^>]+>')"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'aaa link ddd'"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"re_tag.sub('', 'aaa <a href=\"fff\">link</a> ddd')"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def remove_tag(tweet):\n",
" return re_tag.sub('', tweet)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 texttext #tag #tag\n",
"1 texttext #tag\n",
"2 RT texttext\n",
"3 RT texttext\n",
"4 texttext \n",
"5 texttext\n",
"Name: tweet, dtype: object"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3['tweet'].apply(remove_tag)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tweet = 'hello <a href=\">aaa\"> foo'\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'hello aaa\"> foo'"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"remove_tag(tweet)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import lxml"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import lxml.html"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'hello foo'"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lxml.html.fromstring(tweet).text_content()"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def remove_tag2(tweet):\n",
" return lxml.html.fromstring(tweet).text_content()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0 texttext #tag #tag\n",
"1 texttext #tag\n",
"2 RT texttext\n",
"3 RT texttext\n",
"4 texttext \n",
"5 texttext\n",
"Name: tweet, dtype: object"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df3['tweet'].apply(remove_tag2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment