Last active
August 29, 2015 14:08
-
-
Save udonmai/fba1629821115b97d544 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# coding=utf-8
import contextlib
import json
import os
import re
#import sys
#import string
#import urllib
import urllib2

from bs4 import BeautifulSoup
def req(url):
    """Download *url* and return the response body parsed by BeautifulSoup.

    The request carries a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the raw response body.

    Raises:
        urllib2.HTTPError: Raised by ``urllib2.urlopen`` and propagated
            unchanged; the callers in this file catch it.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    # urllib2 response objects are not context managers on Python 2, so
    # closing() is used to release the connection even when read() raises.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        html = response.read()
    # No parser is named, so bs4 chooses among the parsers it finds
    # installed (same as the original call).
    return BeautifulSoup(html)
def eightbooks_processor(page_content):
    """Collect the books that follow the "finished reading" heading.

    The heading is the first ``h3`` whose text matches the Japanese pattern
    used below. Every sibling after it whose second CSS class is
    ``book_box_inline_3r`` is turned into one entry. For each such book the
    book's own page and its Amazon page are fetched through
    ``book_amazon_url_processor`` and ``amazon_processor``.

    Args:
        page_content: Parsed user page; must offer ``find`` the way a
            BeautifulSoup document does.

    Returns:
        A dict keyed by a running integer starting at 0. Each value is a
        dict with the keys ``imageurl``, ``book_name``, ``url``, ``isbn``
        and ``author_name``. The dict is empty when the heading is missing.
    """
    book_collector = {}
    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
    if not heading:
        print('***********************')
        return book_collector

    num = 0
    for book in heading.next_siblings:
        # Siblings also contain bare text nodes (not only '\n'); those have
        # no attribute access, so anything without .get() is skipped.
        if not callable(getattr(book, 'get', None)):
            continue
        if book.name == 'br':
            continue
        # .get() avoids a KeyError on tags that carry no class attribute.
        classes = book.get('class') or []
        if not classes or classes[0] == 'more':
            continue
        print(classes[0])
        # A tag with a single class cannot be a book box.
        if len(classes) < 2 or classes[1] != 'book_box_inline_3r':
            continue

        print('&&&&&&&&&&&&&&&&&&&&&')
        entry = {}
        doc_book_image = book.find(attrs={'class': 'book_box_book_image'})
        entry['imageurl'] = doc_book_image.a.img['src']
        doc_book_title = book.find(attrs={'class': 'book_box_book_title'})
        entry['book_name'] = doc_book_title.a.contents
        entry['url'] = doc_book_title.a['href']

        # The ISBNs come from the Amazon page linked on the book's page.
        bookurl = 'http://bookmeter.com' + entry['url']
        amazon_url = book_amazon_url_processor(bookurl)
        entry['isbn'] = amazon_processor(amazon_url)

        doc_book_author = book.find(attrs={'class': 'book_box_book_author'})
        if doc_book_author:
            entry['author_name'] = doc_book_author.a.contents
        else:
            entry['author_name'] = ''

        book_collector[num] = entry
        num += 1
    return book_collector
def book_amazon_url_processor(bookurl):
    """Return the Amazon link found on a book page, or '' when there is none.

    The page at *bookurl* is fetched with ``req`` and searched for the
    element whose class is ``book_detail_amazon_right``; the ``href`` of
    that element's first ``a`` descendant is returned.

    Args:
        bookurl: Address of the book page to inspect.

    Returns:
        The ``href`` value, or an empty string if the element is absent.
    """
    detail_page = req(bookurl)
    amazon_box = detail_page.find(attrs={'class': 'book_detail_amazon_right'})
    return amazon_box.a['href'] if amazon_box else ''
def amazon_processor(amazon_url):
    """Read the ISBN-10 and ISBN-13 values from an Amazon product page.

    Args:
        amazon_url: Address of the Amazon page, or '' when the book has no
            Amazon link.

    Returns:
        A new dict that always has the keys ``'10'`` and ``'13'``. A value
        is '' when the URL is empty, the page request fails with an HTTP
        error, or the matching label is not present or cannot be parsed.
    """
    # Both keys are filled up front so every return path has the same shape.
    isbn = {'10': '', '13': ''}
    if amazon_url == '':
        return isbn

    try:
        page_content = req(amazon_url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('This link (' + amazon_url + ')is not available.')
        return isbn

    for key in ('10', '13'):
        label = page_content.find('b', text=re.compile('ISBN-' + key + ':'))
        if not label:
            continue
        try:
            # The number is the second space-separated token of the node
            # that follows the <b> label.
            isbn[key] = label.next_sibling.split(' ')[1]
        except (AttributeError, IndexError, TypeError):
            # Unexpected markup after the label: report it and keep '' so a
            # single odd page does not abort the whole crawl.
            print('Could not parse ISBN-' + key + ' from ' + amazon_url)
    return isbn
if __name__ == '__main__':
    # Starting counter (labelled "Total users" in the original notes). The
    # page requested on each pass is number num + 1, while results and log
    # messages use num itself.
    # Alternative starting values left in the original notes:
    # 277 ("2nd") and 966 ("3nd").
    num = 99999
    # Collected results, keyed by the counter value.
    user = {}
    # Prefix of every user page address.
    ori_url = 'http://bookmeter.com/u/'

    # There is no exit condition: the loop runs until it is interrupted or
    # an exception other than HTTPError escapes.
    while True:
        # An empty entry is recorded first so failed fetches still appear.
        user[num] = {}
        url = '{0}{1}'.format(ori_url, num + 1)
        try:
            page_content = req(url)
        except urllib2.HTTPError as e:
            print('We failed with error code - %s.' % e.code)
            print('user %s is not existed.' % num)
            print('- - - - - - - - - - -')
        else:
            print('- - - - - - - - - - -')
            print(url)
            user[num] = eightbooks_processor(page_content)
            print(user)
        # The whole result dict is rewritten on every pass.
        # NOTE(review): the source lost its indentation, so whether this
        # dump sat inside the else branch or at loop level is an
        # assumption - confirm against the original file.
        with open('data_100000.json', 'w') as outfile:
            json.dump(user, outfile)
        num += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment