Created
February 8, 2013 07:11
-
-
Save HuangFJ/4737224 to your computer and use it in GitHub Desktop.
采集微博的动画gif图片
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from distutils.core import setup | |
import py2exe | |
# Files handed to setup() as data_files, listed without a target directory.
data_files = ["config.ini", "test.db"]

# Modules named explicitly in the py2exe "includes" option.
includes = ["mechanize", "simplejson", "PIL.Image"]

# Settings for the py2exe command, keyed by command name as setup() expects.
py2exe_options = {
    "compressed": 1,
    "optimize": 2,
    "includes": includes,
    "bundle_files": 1,
}
options = {"py2exe": py2exe_options}
setup( | |
version = "0.1.0", | |
options = options, | |
data_files = data_files, | |
zipfile=None, | |
console=[{"script": "test.py"}], |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding = utf-8 | |
import mechanize | |
import os, fnmatch | |
from cStringIO import StringIO | |
import sqlite3 | |
import cookielib | |
import simplejson | |
import re | |
import ConfigParser | |
from threading import Thread | |
from Queue import Queue | |
import urllib2 | |
import socket | |
from PIL import Image | |
import traceback | |
import time | |
# Default timeout, in seconds, installed for every socket created afterwards.
timeout = 5 * 60
socket.setdefaulttimeout(timeout)
def locate(pattern, root=os.curdir):
    """Yield the absolute path of every file under *root* matching *pattern*.

    The tree below ``root`` is walked recursively and each file name (not the
    full path) is compared against the shell-style wildcard ``pattern``.
    """
    top = os.path.abspath(root)
    for dirpath, _subdirs, names in os.walk(top):
        for name in names:
            if fnmatch.fnmatch(name, pattern):
                yield os.path.join(dirpath, name)
def read_cookie(browser, site):
    """Load a browser's stored cookies for the given hosts into a cookie jar.

    The cookie database of Chrome or Firefox is looked up below the Windows
    profile directories named by the LOCALAPPDATA / APPDATA environment
    variables. Cookies whose host ends with one of the requested suffixes are
    converted to Netscape cookie-file lines and loaded into a
    ``cookielib.MozillaCookieJar``.

    Args:
        browser: ``'chrome'`` or ``'firefox'``.
        site: Comma-separated host suffixes, e.g. ``'weibo.com,sina.com.cn'``.
            Whitespace around each entry is ignored.

    Returns:
        A ``cookielib.MozillaCookieJar`` holding the matching cookies.

    Raises:
        Exception: If no cookie database was found for ``browser``; this is
            also what happens for any browser name other than the two above.
    """
    hosts = [item.strip() for item in site.split(',')]

    # When LOCALAPPDATA is not defined, derive it from the user profile and
    # publish it in the environment, as the original code did.
    if 'LOCALAPPDATA' not in os.environ:
        os.environ['LOCALAPPDATA'] = os.path.join(
            os.environ['USERPROFILE'], 'Local Settings', 'Application Data')

    cookie_path = None
    if browser == 'chrome':
        # e.g. C:\Users\Jon\AppData\Local\Google\Chrome\User Data\Default\Cookies
        chrome_cookie = os.path.join(
            os.environ['LOCALAPPDATA'],
            'Google', 'Chrome', 'User Data', 'Default', 'Cookies')
        if os.path.isfile(chrome_cookie):
            cookie_path = chrome_cookie
    elif browser == 'firefox':
        # e.g. C:\Users\Jon\AppData\Roaming\Mozilla\Firefox\Profiles\5t3akq0c.default\cookies.sqlite
        profiles = os.path.join(
            os.environ['APPDATA'], 'Mozilla', 'Firefox', 'Profiles')
        # Only the first match is used.
        for found in locate('cookies.sqlite', profiles):
            cookie_path = found
            break
    if cookie_path is None:
        raise Exception(browser.title() + ' is not installed.')

    # A byte-string path that is not valid UTF-8 is assumed to be GBK and is
    # re-encoded as UTF-8 before being handed to sqlite.
    if isinstance(cookie_path, bytes):
        try:
            cookie_path.decode('utf8')
        except UnicodeError:
            cookie_path = cookie_path.decode('gbk').encode('utf8')

    if browser == 'firefox':
        columns = "host, path, isSecure, expiry, name, value"
        table, host_column = "moz_cookies", "host"
    else:
        # Must be 'chrome': any other name was rejected above.
        columns = "host_key, path, secure, expires_utc, name, value"
        table, host_column = "cookies", "host_key"

    # One "<host column> like ?" clause per requested suffix. The suffixes are
    # bound as parameters so quotes in the configuration cannot alter the query.
    where = " or ".join(["%s like ?" % host_column] * len(hosts))
    sql = "select %s from %s where %s" % (columns, table, where)

    con = sqlite3.connect(cookie_path)
    try:
        con.text_factory = str
        rows = con.execute(sql, ['%' + host for host in hosts]).fetchall()
    finally:
        con.close()

    flag = ("FALSE", "TRUE")
    s = StringIO()
    s.write(
        "# Netscape HTTP Cookie File\n"
        "# http://www.netscape.com/newsref/std/cookie_spec.html\n"
        "# This is a generated file! Do not edit.\n"
    )
    for host, path, secure, expires, name, value in rows:
        try:
            # Fields: domain, domain-specified flag, path, secure flag,
            # expiry, name, value.
            s.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                host, flag[host.startswith('.')], path,
                flag[bool(secure)], expires, name, value))
        except UnicodeError:
            # Skip cookies that cannot be written to the text buffer.
            continue
    s.seek(0)

    # MozillaCookieJar's public load() takes a file name, so the in-memory
    # file is passed straight to its internal loader, keeping session
    # ("discard") and expired cookies.
    cookie_jar = cookielib.MozillaCookieJar()
    cookie_jar._really_load(s, '', True, True)
    return cookie_jar
def download_anigif(src):
    """Download ``src`` into ``images/`` if it is a multi-frame image.

    The target file is named after the last path component of ``src``. Nothing
    is downloaded when that file already exists, and nothing is written when
    the image has only a single frame.

    Args:
        src: URL of the image to fetch.

    Returns:
        None.
    """
    path = os.path.join('images', os.path.basename(src))
    if os.path.isfile(path):
        return

    # Close the HTTP response explicitly instead of leaving it to the garbage
    # collector.
    response = urllib2.urlopen(src)
    try:
        data = response.read()
    finally:
        response.close()

    string_io = StringIO(data)
    try:
        image = Image.open(string_io)
        try:
            # Seeking to the second frame fails with EOFError for a
            # single-frame image, which is not worth keeping.
            image.seek(1)
        except EOFError:
            return
    finally:
        string_io.close()

    with open(path, 'wb') as fp:
        fp.write(data)
    print(src)
class Worker(Thread):
    """Daemon thread that executes tasks taken from a shared queue.

    Each queue item is a ``(func, args, kargs)`` tuple. The thread starts
    itself on construction and loops forever.
    """

    def __init__(self, tasks):
        """Bind the worker to the ``tasks`` queue and start it immediately."""
        Thread.__init__(self)
        self.tasks = tasks
        self.daemon = True
        self.start()

    def run(self):
        """Run queued tasks forever, reporting failures without dying."""
        while True:
            func, args, kargs = self.tasks.get()
            try:
                func(*args, **kargs)
            except Exception:
                # A failing task must not take the worker down; report it and
                # move on to the next one.
                traceback.print_exc()
            finally:
                # Always acknowledge the item so Queue.join() cannot hang,
                # even if reporting the error fails.
                self.tasks.task_done()
class ThreadPool:
    """Fixed set of Worker threads fed from one bounded task queue."""

    def __init__(self, num_threads):
        """Create a queue of capacity ``num_threads`` and as many workers."""
        self.tasks = Queue(num_threads)
        for _unused in range(num_threads):
            Worker(self.tasks)

    def add_task(self, func, *args, **kargs):
        """Queue ``func(*args, **kargs)`` for execution by a worker."""
        task = (func, args, kargs)
        self.tasks.put(task)

    def wait_completion(self):
        """Block until every queued task has been marked as done."""
        self.tasks.join()
if __name__ == '__main__':
    # Create the download directory if needed. Unlike a blanket
    # "except: pass", a real failure (e.g. permissions) is still reported.
    if not os.path.isdir('images'):
        os.mkdir('images')

    config = ConfigParser.ConfigParser()
    config.read('config.ini')

    # Reuse the cookies of the configured browser for the configured hosts.
    cookiejar = read_cookie(config.get('weibo', 'browser'),
                            config.get('weibo', 'host'))
    br = mechanize.Browser()
    br.set_cookiejar(cookiejar)

    pool = ThreadPool(20)
    # The 'images' table is expected to exist in test.db already.
    con = sqlite3.connect('test.db')
    cur = con.cursor()

    uid_str = config.get('weibo', 'uid')
    uid_list = [uid.strip() for uid in uid_str.split(',')]
    uid_idx = 0
    uid_count = len(uid_list)
    page = 1
    gif_pattern = re.compile(r'http://[^"]+sinaimg\.cn[^"]+\.gif')

    try:
        # Endless polling loop: walk the pages of one uid, then move on to the
        # next uid, wrapping around to the first after the last.
        while True:
            uid = uid_list[uid_idx]
            try:
                result = br.open(
                    'http://www.weibo.com/aj/mblog/mbloglist?count=15&page=%s&uid=%s'
                    % (page, uid), timeout=60).read()
            except socket.timeout:
                # Retry the same page after a short pause.
                time.sleep(10)
                continue
            page += 1

            try:
                result = simplejson.loads(result)
            except ValueError:
                # The response was not JSON; the original reports this as
                # not being signed in.
                raise Exception('You do not sign in.')

            # Responses with any other code are ignored and the next page is
            # requested.
            if result['code'] == '100000':
                if result['data'].find('mid=') == -1:
                    # No 'mid=' in the payload: restart at page 1 of the next
                    # uid.
                    page = 1
                    uid_idx = (uid_idx + 1) % uid_count
                else:
                    for src in gif_pattern.findall(result['data']):
                        src = src.replace('thumbnail', 'large')
                        cur.execute('SELECT src FROM images WHERE src=?', (src,))
                        if cur.fetchone() is None:
                            # Unseen URL: queue the download and record it.
                            pool.add_task(download_anigif, src)
                            cur.execute('INSERT INTO images (src) VALUES (?)', (src,))
                            con.commit()
    except Exception:
        # NOTE(review): the original called wait_completion() after the loop,
        # where it could never run (indentation was lost in the source, so
        # confirm). Queued URLs are already recorded in the database, so let
        # their downloads finish before the error ends the script.
        pool.wait_completion()
        raise
    finally:
        con.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment