tobynet · December 11, 2015 06:48
diff --git a/cakesgetter.py b/cakesgetter.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 """
 cakesgetter.py
 ツイッター検索からcakes無料時間の記事を探してダウンロードしておくスクリプト
 """

 import os
 import re
 import urllib
 import urllib2
 import cookielib
 import json
 import logging

 # Query sent to the Twitter search endpoint: tweets containing a cakes.mu/r/ link.
 # NOTE(review): the "-RT" term presumably excludes retweets — confirm.
 SEARCH_TEXT = "//cakes.mu/r/ -RT"

 # Pattern for http(s) URLs; _find_urls() uses it to pull links out of tweet text.
 RE_URL = r'https?://[-_.!~*\'()a-zA-Z0-9;/?:@&=+$,%#]+'
 # Pattern for URLs under cakes.mu/r/; the redirect handler downloads URLs that match it.
 RE_CAKES_URL = r'^https?:\/\/cakes\.mu\/r\/.+$'

 # Log everything (DEBUG and above) to cakesgetter.log.
 logging.basicConfig(
        filename='cakesgetter.log',
        level=logging.DEBUG,
        #level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s')

 # Final URLs already written to disk during this run; download() consults it
 # so the same article is not written twice.
 download_cache_list = []


 class HeadRequest(urllib2.Request):
    """Request subclass whose HTTP method is always HEAD."""
    def get_method(self):
        # Overrides Request.get_method() so the request is sent as HEAD.
        return 'HEAD'


 class MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that downloads cakes.mu/r/ links when they answer with a 302."""

    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a 302 response received for ``req``.

        When the requested URL matches RE_CAKES_URL it is handed to
        download() and None is returned; for any other URL the request
        object itself is returned unchanged.
        """
        requested_url = req.get_full_url()
        logging.debug("full url is %(url)s", {"url": requested_url})
        if not re.match(RE_CAKES_URL, requested_url):
            return req
        download(requested_url)
        return None


 def load_urls(urls):
    """Send a HEAD request to every URL in ``urls``.

    The requests go through MyHTTPRedirectHandler, so downloading of cakes
    articles happens as a side effect of handling the responses.  Failures
    on individual URLs are logged and never abort the loop.

    :param urls: iterable of URL strings to probe.
    """
    opener = urllib2.build_opener(MyHTTPRedirectHandler())
    for url in urls:
        logging.debug("loading %(url)s", {"url": url})
        try:
            opener.open(HeadRequest(url))
        except urllib2.HTTPError as e:
            # HTTP errors are expected here, e.g. a 302 that the redirect
            # handler declined to follow; record them instead of hiding them.
            logging.debug("HTTP error %(code)s for %(url)s", {
                "code": e.code, "url": url})
        except urllib2.URLError as e:
            # Network-level failure (DNS, refused connection, ...): skip this
            # URL and keep processing the remaining ones.
            logging.warning("could not open %(url)s: %(reason)s", {
                "url": url, "reason": e.reason})


 def download(url):
    """Fetch ``url`` and save the page as ``<last path segment>.html``.

    The file is written to the current working directory.  Root-relative
    and protocol-relative ``href``/``src`` values are rewritten to absolute
    https URLs so the saved page can be browsed locally.  Nothing is written
    when the target file already exists or the final URL was already saved
    during this run.

    :param url: URL to fetch; the final URL after redirects names the file.
    """
    # Just download it for now.
    # (Is there no handy HTTP client wrapper library like Ruby's open-uri?)
    #
    # TODO: the "should we write this file?" check runs after the HTML has
    #       been downloaded; ideally it would run before the GET so the
    #       request can be skipped.  (Handling the redirect looks tedious,
    #       so maybe this is fine as it is.)
    logging.debug("downloading url %(url)s", {"url": url})
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    response = opener.open(url)
    try:
        html = response.read()
        # Final URL, after any redirects.
        url = response.geturl()
    finally:
        response.close()

    # Rewrite site-relative links to absolute ones for easier local viewing.
    html = _absolutize_links(html)

    # File name to write to; trailing slashes are dropped so the name is
    # never just ".html".
    path = url.rstrip("/").split("/").pop() + '.html'

    # Write the file unless it has already been downloaded.
    if not os.path.exists(path) and url not in download_cache_list:
        download_cache_list.append(url)
        with open(path, 'wb') as f:
            f.write(html)
        logging.info("wrote to '%(path)s' from %(url)s", {
            "path": path, "url": url})
    else:
        logging.debug("URL exists or cached %(url)s", {"url": url})


 def _absolutize_links(html):
    """Return ``html`` with site-relative href/src values made absolute.

    Only values starting with "/" are touched; absolute URLs, fragments and
    other relative values are left as they are.
    """
    # Protocol-relative ("//host/...") first, so the second pass only sees
    # genuinely root-relative paths.
    html = re.sub(r'(href|src)="//', r'\1="https://', html)
    return re.sub(r'(href|src)="/', r'\1="https://cakes.mu/', html)


 def search_from_twitter():
    """Query the Twitter search endpoint for SEARCH_TEXT and collect URLs.

    :returns: list of URL strings found in the text of the returned tweets;
        empty when the response carries no results.
    """
    query = urllib.urlencode(dict(q=SEARCH_TEXT, rpp=200))
    # NOTE(review): passing ``query`` as the second argument makes urllib2
    # send it as the request body (POST) — confirm this is intended.
    response = urllib2.urlopen('http://search.twitter.com/search.json', query)
    try:
        logging.debug("searching links from %(url)s", {"url": response.url})
        statuses = json.loads(response.read())
    finally:
        response.close()

    # A payload without a 'results' list is treated as "no tweets" instead
    # of raising TypeError when iterated.
    results = statuses.get('results') or []

    urls = []
    for status in results:
        tweet = status.get('text')
        if tweet:  # skip entries that carry no text
            urls.extend(_find_urls(tweet))
    logging.debug(
        "found %(results)d tweets and %(urls)d urls from twitter search", {
            "results": len(results),
            "urls": len(urls)})
    return urls


 def _find_urls(text):
    """Return every substring of ``text`` that matches RE_URL, as a list."""
    url_pattern = re.compile(RE_URL)
    return url_pattern.findall(text)


 def main():
    """Run one pass: search Twitter for links, then probe each of them."""
    logging.info("starting...")
    load_urls(search_from_twitter())
    logging.info("done!!")


 # Only run when executed as a script, not when imported.
 if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""
	cakesgetter.py
	ツイッター検索からcakes無料時間の記事を探してダウンロードしておくスクリプト
	"""

	import os
	import re
	import urllib
	import urllib2
	import cookielib
	import json
	import logging

	# Query sent to the Twitter search endpoint: tweets containing a cakes.mu/r/ link.
	# NOTE(review): the "-RT" term presumably excludes retweets — confirm.
	SEARCH_TEXT = "//cakes.mu/r/ -RT"

	# Pattern for http(s) URLs; _find_urls() uses it to pull links out of tweet text.
	RE_URL = r'https?://[-_.!~*\'()a-zA-Z0-9;/?:@&=+$,%#]+'
	# Pattern for URLs under cakes.mu/r/; the redirect handler downloads URLs that match it.
	RE_CAKES_URL = r'^https?:\/\/cakes\.mu\/r\/.+$'

	# Log everything (DEBUG and above) to cakesgetter.log.
	logging.basicConfig(
	filename='cakesgetter.log',
	level=logging.DEBUG,
	#level=logging.INFO,
	format='%(asctime)s - %(levelname)s - %(message)s')

	# Final URLs already written to disk during this run; download() consults it
	# so the same article is not written twice.
	download_cache_list = []


	class HeadRequest(urllib2.Request):
	    """Request subclass whose HTTP method is always HEAD."""

	    def get_method(self):
	        # Overrides Request.get_method() so the request is sent as HEAD.
	        return 'HEAD'


	class MyHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
	    """Redirect handler that downloads cakes.mu/r/ links when they answer with a 302."""

	    def http_error_302(self, req, fp, code, msg, headers):
	        """Handle a 302 response received for ``req``.

	        When the requested URL matches RE_CAKES_URL it is handed to
	        download() and None is returned; for any other URL the request
	        object itself is returned unchanged.
	        """
	        requested_url = req.get_full_url()
	        logging.debug("full url is %(url)s", {"url": requested_url})
	        if not re.match(RE_CAKES_URL, requested_url):
	            return req
	        download(requested_url)
	        return None


	def load_urls(urls):
	    """Send a HEAD request to every URL in ``urls``.

	    The requests go through MyHTTPRedirectHandler, so downloading of cakes
	    articles happens as a side effect of handling the responses.  Failures
	    on individual URLs are logged and never abort the loop.

	    :param urls: iterable of URL strings to probe.
	    """
	    opener = urllib2.build_opener(MyHTTPRedirectHandler())
	    for url in urls:
	        logging.debug("loading %(url)s", {"url": url})
	        try:
	            opener.open(HeadRequest(url))
	        except urllib2.HTTPError as e:
	            # HTTP errors are expected here, e.g. a 302 that the redirect
	            # handler declined to follow; record them instead of hiding them.
	            logging.debug("HTTP error %(code)s for %(url)s", {
	                "code": e.code, "url": url})
	        except urllib2.URLError as e:
	            # Network-level failure (DNS, refused connection, ...): skip this
	            # URL and keep processing the remaining ones.
	            logging.warning("could not open %(url)s: %(reason)s", {
	                "url": url, "reason": e.reason})


	def download(url):
	    """Fetch ``url`` and save the page as ``<last path segment>.html``.

	    The file is written to the current working directory.  Root-relative
	    and protocol-relative ``href``/``src`` values are rewritten to absolute
	    https URLs so the saved page can be browsed locally.  Nothing is written
	    when the target file already exists or the final URL was already saved
	    during this run.

	    :param url: URL to fetch; the final URL after redirects names the file.
	    """
	    # Just download it for now.
	    # (Is there no handy HTTP client wrapper library like Ruby's open-uri?)
	    #
	    # TODO: the "should we write this file?" check runs after the HTML has
	    #       been downloaded; ideally it would run before the GET so the
	    #       request can be skipped.  (Handling the redirect looks tedious,
	    #       so maybe this is fine as it is.)
	    logging.debug("downloading url %(url)s", {"url": url})
	    cookie = cookielib.CookieJar()
	    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
	    response = opener.open(url)
	    try:
	        html = response.read()
	        # Final URL, after any redirects.
	        url = response.geturl()
	    finally:
	        response.close()

	    # Rewrite site-relative links to absolute ones for easier local viewing.
	    html = _absolutize_links(html)

	    # File name to write to; trailing slashes are dropped so the name is
	    # never just ".html".
	    path = url.rstrip("/").split("/").pop() + '.html'

	    # Write the file unless it has already been downloaded.
	    if not os.path.exists(path) and url not in download_cache_list:
	        download_cache_list.append(url)
	        with open(path, 'wb') as f:
	            f.write(html)
	        logging.info("wrote to '%(path)s' from %(url)s", {
	            "path": path, "url": url})
	    else:
	        logging.debug("URL exists or cached %(url)s", {"url": url})


	def _absolutize_links(html):
	    """Return ``html`` with site-relative href/src values made absolute.

	    Only values starting with "/" are touched; absolute URLs, fragments and
	    other relative values are left as they are.
	    """
	    # Protocol-relative ("//host/...") first, so the second pass only sees
	    # genuinely root-relative paths.
	    html = re.sub(r'(href|src)="//', r'\1="https://', html)
	    return re.sub(r'(href|src)="/', r'\1="https://cakes.mu/', html)


	def search_from_twitter():
	    """Query the Twitter search endpoint for SEARCH_TEXT and collect URLs.

	    :returns: list of URL strings found in the text of the returned tweets;
	        empty when the response carries no results.
	    """
	    query = urllib.urlencode(dict(q=SEARCH_TEXT, rpp=200))
	    # NOTE(review): passing ``query`` as the second argument makes urllib2
	    # send it as the request body (POST) — confirm this is intended.
	    response = urllib2.urlopen('http://search.twitter.com/search.json', query)
	    try:
	        logging.debug("searching links from %(url)s", {"url": response.url})
	        statuses = json.loads(response.read())
	    finally:
	        response.close()

	    # A payload without a 'results' list is treated as "no tweets" instead
	    # of raising TypeError when iterated.
	    results = statuses.get('results') or []

	    urls = []
	    for status in results:
	        tweet = status.get('text')
	        if tweet:  # skip entries that carry no text
	            urls.extend(_find_urls(tweet))
	    logging.debug(
	        "found %(results)d tweets and %(urls)d urls from twitter search", {
	            "results": len(results),
	            "urls": len(urls)})
	    return urls


	def _find_urls(text):
	    """Return every substring of ``text`` that matches RE_URL, as a list."""
	    return re.findall(RE_URL, text)


	def main():
	    """Run one pass: search Twitter for links, then probe each of them."""
	    logging.info("starting...")
	    urls = search_from_twitter()
	    load_urls(urls)
	    logging.info("done!!")


	# Only run when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()