Last active
August 29, 2015 14:08
-
-
Save udonmai/019829532db98f6866bb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# coding=utf-8 | |
#import os | |
import sys | |
#import string | |
#import urllib | |
import urllib2 | |
import re | |
import json | |
from bs4 import BeautifulSoup | |
import gevent | |
from gevent import monkey, queue, event, pool | |
monkey.patch_all() | |
def req(url):
    """Download *url* with a browser-like User-Agent and parse the body.

    Args:
        url: Absolute URL to fetch.

    Returns:
        A ``BeautifulSoup`` document built from the response body.

    Raises:
        Whatever ``urllib2.urlopen`` raises is propagated unchanged; callers
        in this file catch ``urllib2.HTTPError``.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Close explicitly: this function is called from many concurrent
        # greenlets, and relying on garbage collection to release the
        # connection leaks sockets while they are all in flight.
        response.close()
    # NOTE(review): no parser is named here, so bs4 chooses one from those
    # installed -- results may differ between machines; confirm which parser
    # the scraper was developed against before pinning one.
    return BeautifulSoup(html)
def eightbooks_processor(page_content):
    """Collect the books listed after the "finished reading" heading.

    The heading is the ``<h3>`` whose text matches the Japanese phrase in the
    regex below ("books finished reading"). Every sibling that follows it and
    whose second CSS class is ``book_box_inline_3r`` is treated as one book.
    For each book the detail page and, when linked, the Amazon page are
    fetched to look up the ISBNs.

    Args:
        page_content: Parsed user page (as returned by ``req``).

    Returns:
        A dict keyed by a running integer (0, 1, 2, ...). Each value is a
        dict with the keys ``imageurl``, ``book_name``, ``url``, ``isbn`` and
        ``author_name``. Empty when the heading is not present on the page.
    """
    book_collector = {}
    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
    if not heading:
        print('***********************')
        return book_collector

    num = 0
    for book in heading.next_siblings:
        # Text nodes (newlines, indentation, stray text) carry no tag name.
        # Skip all of them, not only the exact string '\n', because they
        # cannot be indexed by attribute name.
        tag_name = getattr(book, 'name', None)
        if tag_name is None or tag_name == 'br':
            continue

        # A tag without a class attribute, or with a single class, cannot be
        # a book box; skip it instead of raising KeyError/IndexError.
        classes = book.get('class') or []
        if not classes or classes[0] == 'more':
            continue
        if len(classes) < 2 or classes[1] != 'book_box_inline_3r':
            continue

        # Progress marker: one '+' per book already collected.
        print('+' * num)

        entry = {}
        image_box = book.find(attrs={'class': 'book_box_book_image'})
        entry['imageurl'] = image_box.a.img['src']

        title_box = book.find(attrs={'class': 'book_box_book_title'})
        entry['book_name'] = title_box.a.contents
        entry['url'] = title_box.a['href']

        # The book's own page holds the Amazon link, and the Amazon page
        # holds the ISBNs.
        bookurl = 'http://bookmeter.com' + entry['url']
        amazon_url = book_amazon_url_processor(bookurl)
        entry['isbn'] = amazon_processor(amazon_url)

        author_box = book.find(attrs={'class': 'book_box_book_author'})
        if author_box:
            entry['author_name'] = author_box.a.contents
        else:
            entry['author_name'] = ''

        book_collector[num] = entry
        num += 1
    return book_collector
def book_amazon_url_processor(bookurl):
    """Return the Amazon link shown on a book's detail page.

    Args:
        bookurl: Absolute URL of the book's detail page.

    Returns:
        The ``href`` of the first ``<a>`` inside the element with class
        ``book_detail_amazon_right``. Returns ``''`` when the page cannot be
        fetched (HTTP error) or carries no such link.
    """
    try:
        page_content = req(bookurl)
    except urllib2.HTTPError as e:
        # Same handling as amazon_processor: one unavailable book page must
        # not abort the scrape of the whole user.
        print('We failed with error code - %s.' % e.code)
        print('This link (' + bookurl + ')is not available.')
        return ''

    amazon_box = page_content.find(attrs={'class': 'book_detail_amazon_right'})
    if not amazon_box or not amazon_box.a:
        return ''
    # .get() rather than ['href'] so an anchor without href yields ''.
    return amazon_box.a.get('href') or ''
def _extract_isbn(page_content, label):
    """Return the first token following the ``<b>`` matching *label*, or ''."""
    label_tag = page_content.find('b', text=re.compile(label))
    if not label_tag:
        return ''
    # The value is expected in the text node right after the <b> label.
    # Anything that is not string-like (no sibling, or another tag) has no
    # usable split() and yields ''.
    split = getattr(label_tag.next_sibling, 'split', None)
    if not callable(split):
        return ''
    tokens = split()
    return tokens[0] if tokens else ''


def amazon_processor(amazon_url):
    """Look up the ISBN-10 and ISBN-13 shown on an Amazon product page.

    Args:
        amazon_url: URL of the product page; ``''`` (or ``None``) when the
            book has no Amazon link.

    Returns:
        A dict that always has the keys ``'10'`` and ``'13'``. A value is
        ``''`` when there is no URL, the page cannot be fetched (HTTP error),
        or the corresponding label is not present on the page.
    """
    isbn = {'10': '', '13': ''}
    if not amazon_url:
        return isbn

    try:
        page_content = req(amazon_url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('This link (' + amazon_url + ')is not available.')
        return isbn

    isbn['10'] = _extract_isbn(page_content, 'ISBN-10:')
    isbn['13'] = _extract_isbn(page_content, 'ISBN-13:')
    return isbn
def basicworker(user, user_id, outfile_path='data_gevent_15_46.json'):
    """Scrape one user's page and store the result in the shared dict.

    ``user[user_id]`` is set to ``{}`` up front, so a user whose page cannot
    be fetched still appears in the result with an empty entry. On success
    the entry is replaced by the scraped books and the whole ``user`` dict is
    written to *outfile_path* as JSON.

    Args:
        user: Dict shared by all workers, keyed by user id; mutated in place.
        user_id: Numeric id appended to the base profile URL.
        outfile_path: File that receives the JSON snapshot. Defaults to the
            name that was previously hard-coded.
    """
    # Original url
    ori_url = 'http://bookmeter.com/u/'
    user[user_id] = {}
    url = ori_url + str(user_id)
    try:
        page_content = req(url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('user ' + str(user_id) + ' is not existed.')
        print('- - - - - - - - - - -')
    else:
        print('- - - - - - - - - - -')
        print(url)
        print('- - - - - - - - - - -')
        user[user_id] = eightbooks_processor(page_content)
        print(str(user_id) + ' => 成功!')
        # The file is rewritten from scratch after every successful user, so
        # it always holds the latest snapshot of everything collected so far.
        with open(outfile_path, 'w') as outfile:
            json.dump(user, outfile)
if __name__ == '__main__':
    # Shared result map filled in by the workers: user id -> scraped books.
    # (An earlier note in this script put the total number of users at 53200.)
    user = {}

    # User ids are processed in batches of 1000. Starting at batch 16 and
    # covering 31 batches gives ids 15001 through 46000 inclusive.
    first_batch = 16
    start_id = (first_batch - 1) * 1000 + 1
    stop_id = (first_batch + 30) * 1000 + 1

    # Use a distinct name for the Pool instance so the imported gevent
    # ``pool`` module is not rebound, and a distinct loop variable so the
    # batch number is not overwritten while iterating.
    worker_pool = pool.Pool(300)
    for user_id in xrange(start_id, stop_id):
        worker_pool.spawn(basicworker, user, user_id)
    worker_pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment