-
-
Save kitsuyui/74574813abc4ee050f5e0a78f04a3393 to your computer and use it in GitHub Desktop.
Python で画像や URL やリプライを含まない日本語のツイートを集める ref: http://qiita.com/kitsuyui/items/b30296f13f71b287f14f
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
# Twitter API credentials that tweetcorpus.py reads from the environment.
# Replace the placeholder values, then load this file with `source ./.env`.
APP_KEY='XXXXXXXXXXXXX'
APP_SECRET='XXXXXXXXXXXXXXXXXXXX'
OAUTH_TOKEN='XXXXX-XXXXXXXXXX'
OAUTH_TOKEN_SECRET='XXXXXXXXXX'
export APP_KEY APP_SECRET OAUTH_TOKEN OAUTH_TOKEN_SECRET
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python tweetcorpus.py -n 10
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ while true; do python -u tweetcorpus.py -n 500 | tee /dev/tty | gzip -cn >> tweet.gz ; sleep 1 ; done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
source ./.env
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ pip3 install twython==3.4.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import html | |
import os | |
import sys | |
from twython import TwythonStreamer | |
class CorpusStreamer(TwythonStreamer):
    """Streamer that writes entity-free, non-retweet tweet texts to a file.

    Each accepted tweet is written as exactly one output line. Once
    ``max_corpus_tweets`` tweets have been written, the streamer calls
    ``disconnect()`` on itself.
    """

    def __init__(self, *args,
                 max_corpus_tweets=100,
                 write_file=sys.stdout):
        """Set up the streamer.

        Args:
            *args: Positional arguments forwarded unchanged to
                ``TwythonStreamer.__init__``.
            max_corpus_tweets: Number of tweets to write before
                disconnecting.
            write_file: Writable text file object that receives the tweets.
        """
        super().__init__(*args)
        self.corpus_tweets = 0  # number of tweets written so far
        self.max_corpus_tweets = max_corpus_tweets
        self.write_file = write_file

    def exit_when_corpus_tweets_exceeded(self):
        """Disconnect once the number of written tweets reaches the limit."""
        if self.corpus_tweets >= self.max_corpus_tweets:
            self.disconnect()

    def write(self, text):
        """Write ``text`` as a single line and count it as one corpus tweet."""
        # Embedded newlines become carriage returns so that every tweet
        # occupies exactly one line of the output.
        corpus_text = text.replace('\n', '\r')
        self.write_file.write(corpus_text + '\n')
        self.corpus_tweets += 1

    def on_success(self, tweet):
        """Handle one decoded stream message.

        Args:
            tweet: Mapping decoded from the stream. Only messages that have
                a ``'text'`` key are treated as tweets.
        """
        if 'text' not in tweet:
            # Not a tweet (e.g. a notification message): skip it.
            return
        if 'retweeted_status' in tweet:
            # Skip retweets.
            return
        # A missing or null 'entities' value is treated as "no entities"
        # instead of raising and aborting the whole stream.
        entities = tweet.get('entities') or {}
        if any(entities.values()):
            # Skip tweets carrying information that natural-language
            # processing alone cannot handle, such as entities.url,
            # entities.media or entities.symbol.
            return
        # Tweet text arrives HTML-escaped; restore the literal characters.
        text = html.unescape(tweet['text'])
        self.write(text)
        self.exit_when_corpus_tweets_exceeded()
def main():
    """Collect tweets from the sample stream according to the CLI options.

    Command-line options:
        -n / --number-of-corpus-tweets: number of tweets to write
            (default 100).
        -o / --outfile: output file, opened for writing as UTF-8
            (default: standard output).
        -l / --language: language passed to the sample stream
            (default 'ja').

    The credentials are read from the environment variables APP_KEY,
    APP_SECRET, OAUTH_TOKEN and OAUTH_TOKEN_SECRET. If any of them is
    missing, the program exits through ``parser.error`` with a message
    listing every missing variable.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--number-of-corpus-tweets',
                        type=int, default=100)
    parser.add_argument('-o', '--outfile',
                        type=argparse.FileType('w', encoding='UTF-8'),
                        default=sys.stdout)
    parser.add_argument('-l', '--language', type=str, default='ja')

    # Credentials are checked before parse_args() so that a run without
    # credentials never opens (and thereby truncates) the output file.
    credential_names = ('APP_KEY', 'APP_SECRET',
                        'OAUTH_TOKEN', 'OAUTH_TOKEN_SECRET')
    missing = [name for name in credential_names if name not in os.environ]
    if missing:
        parser.error('missing environment variable(s): ' + ', '.join(missing))
    app_key, app_secret, oauth_token, oauth_token_secret = (
        os.environ[name] for name in credential_names
    )

    args = parser.parse_args()
    try:
        stream = CorpusStreamer(app_key, app_secret,
                                oauth_token, oauth_token_secret,
                                max_corpus_tweets=args.number_of_corpus_tweets,
                                write_file=args.outfile)
        stream.statuses.sample(language=args.language)
    finally:
        # Close a file opened by argparse; leave the process-wide stdout open.
        if args.outfile is not sys.stdout:
            args.outfile.close()
# Start collecting only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment