udonmai · August 29, 2015 14:08
diff --git a/gistfile1.py b/gistfile1.py
 #!/usr/bin/python
 # coding=utf-8

 import os
 #import sys
 #import string
 #import urllib
 import urllib2
 import re
 import json
 from bs4 import BeautifulSoup


 def req(url):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the raw response body.

    Raises:
        urllib2.HTTPError: Propagated from ``urllib2.urlopen``; callers in
            this file catch it to detect pages that are not available.
    """
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
    headers = {'User-Agent': user_agent}

    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        html = response.read()
    finally:
        # Release the connection right away instead of leaving it to
        # garbage collection; the crawler issues requests in an endless loop.
        response.close()

    # No parser is named here, so bs4 uses whichever one it finds installed.
    return BeautifulSoup(html)


 def eightbooks_processor(page_content):
    """Collect the books listed after the "books already read" <h3> heading.

    Args:
        page_content: Parsed user page (as returned by ``req``) to scan.

    Returns:
        A dict keyed by an integer counter. Each value is a dict with the
        keys ``imageurl``, ``book_name``, ``url``, ``isbn`` and
        ``author_name``. The dict is empty when the heading is not found.
    """
    book_collector = {}
    num = 0

    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
    if not heading:
        print('***********************')
        return book_collector

    for book in heading.next_siblings:
        # Siblings include text nodes (any whitespace run, not only '\n');
        # those have no tag name and no attributes, so skip them with <br>.
        tag_name = getattr(book, 'name', None)
        if tag_name is None or tag_name == 'br':
            continue

        # .get() avoids a KeyError on tags that carry no class attribute.
        classes = book.get('class') or []
        if classes[:1] == ['more']:
            continue
        if classes:
            print(classes[0])

        # Guard the length so a single-class tag cannot raise IndexError.
        if len(classes) > 1 and classes[1] == 'book_box_inline_3r':
            entry = {}

            print('&&&&&&&&&&&&&&&&&&&&&')
            doc_book_image = book.find(attrs={'class': 'book_box_book_image'})
            entry['imageurl'] = doc_book_image.a.img['src']

            doc_book_title = book.find(attrs={'class': 'book_box_book_title'})
            entry['book_name'] = doc_book_title.a.contents
            entry['url'] = doc_book_title.a['href']

            # The ISBNs are read from the Amazon page linked on the detail page.
            bookurl = 'http://bookmeter.com' + entry['url']
            amazon_url = book_amazon_url_processor(bookurl)
            entry['isbn'] = amazon_processor(amazon_url)

            doc_book_author = book.find(attrs={'class': 'book_box_book_author'})
            if doc_book_author:
                entry['author_name'] = doc_book_author.a.contents
            else:
                entry['author_name'] = ''

            book_collector[num] = entry

        # The counter advances for every tag that was not skipped, book or
        # not, so the keys of the result are not necessarily consecutive.
        num += 1

    return book_collector


 def book_amazon_url_processor(bookurl):
    """Return the Amazon link found on a book detail page.

    Args:
        bookurl: Absolute URL of the book detail page to download.

    Returns:
        The ``href`` of the first link inside the element with class
        ``book_detail_amazon_right``, or ``''`` when the page cannot be
        fetched or holds no such link.
    """
    try:
        page_content = req(bookurl)
    except urllib2.HTTPError as e:
        # Same policy as amazon_processor: report the failure and fall back
        # to "no link" instead of aborting the whole crawl on one bad page.
        print('We failed with error code - %s.' % e.code)
        print('This link (' + bookurl + ')is not available.')
        return ''

    amazon_box = page_content.find(attrs={'class': 'book_detail_amazon_right'})
    if amazon_box is None:
        return ''

    # The box can exist without an <a> inside it, or the <a> without an href.
    link = amazon_box.a
    if link is None:
        return ''
    return link.get('href', '')


 def amazon_processor(amazon_url):
    """Scrape the ISBN-10 and ISBN-13 of a book from its Amazon page.

    Args:
        amazon_url: URL of the Amazon product page, or ``''`` if unknown.

    Returns:
        A dict with the keys ``'10'`` and ``'13'``. A value is ``''`` when
        the URL is empty, the page cannot be fetched, or the matching ISBN
        label is not found on the page.
    """
    # Start from the "nothing found" shape so every return path has both keys.
    isbn = {'10': '', '13': ''}
    if amazon_url == '':
        return isbn

    try:
        page_content = req(amazon_url)
    except urllib2.HTTPError as e:
        print('We failed with error code - %s.' % e.code)
        print('This link (' + amazon_url + ')is not available.')
        return isbn

    for key in ('10', '13'):
        label = page_content.find('b', text=re.compile('ISBN-' + key + ':'))
        if label is None:
            continue
        try:
            # Whitespace-agnostic split: the value is the first token of the
            # text that follows the <b> label, however it is padded.
            tokens = label.next_sibling.split()
        except (AttributeError, TypeError):
            # The label is not followed by a plain text node.
            tokens = []
        if tokens:
            isbn[key] = tokens[0]

    return isbn


 if __name__ == '__main__':
    # Crawl user pages one after another in an endless loop; the collected
    # dict is written to the JSON file again after every user.

    # Total users
    num = 99999

    # 2nd
    #num = 277

    # 3rd
    #num = 966

    # User
    user = {}

    # Original url
    ori_url = 'http://bookmeter.com/u/'

    # Main cycle
    while True:
        user[num] = {}
        # NOTE(review): the page requested is num + 1 while the result is
        # stored and reported under num -- confirm this offset is intended.
        url = ori_url + str(num + 1)

        try:
            page_content = req(url)

        except urllib2.HTTPError as e:
            print('We failed with error code - %s.' % e.code)
            print('user ' + str(num) + ' is not existed.')
            print('- - - - - - - - - - -')
            # Nothing was downloaded, so user[num] keeps its empty dict.

        else:
            print('- - - - - - - - - - -')
            print(url)
            # Parse only a page fetched in this iteration; after a failed
            # request page_content is undefined or left over from the
            # previous user.
            user[num] = eightbooks_processor(page_content)

        print(user)

        with open('data_100000.json', 'w') as outfile:
            json.dump(user, outfile)

        num += 1
	#!/usr/bin/python
	# coding=utf-8

	import os
	#import sys
	#import string
	#import urllib
	import urllib2
	import re
	import json
	from bs4 import BeautifulSoup


	def req(url):
	    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

	    A desktop-browser User-Agent header is sent with the request.

	    Args:
	        url: Address of the page to download.

	    Returns:
	        A ``BeautifulSoup`` object built from the raw response body.

	    Raises:
	        urllib2.HTTPError: Propagated from ``urllib2.urlopen``; callers in
	            this file catch it to detect pages that are not available.
	    """
	    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"
	    headers = {'User-Agent': user_agent}

	    request = urllib2.Request(url, headers=headers)
	    response = urllib2.urlopen(request)
	    try:
	        html = response.read()
	    finally:
	        # Release the connection right away instead of leaving it to
	        # garbage collection; the crawler issues requests in an endless loop.
	        response.close()

	    # No parser is named here, so bs4 uses whichever one it finds installed.
	    return BeautifulSoup(html)


	def eightbooks_processor(page_content):
	    """Collect the books listed after the "books already read" <h3> heading.

	    Args:
	        page_content: Parsed user page (as returned by ``req``) to scan.

	    Returns:
	        A dict keyed by an integer counter. Each value is a dict with the
	        keys ``imageurl``, ``book_name``, ``url``, ``isbn`` and
	        ``author_name``. The dict is empty when the heading is not found.
	    """
	    book_collector = {}
	    num = 0

	    heading = page_content.find('h3', text=re.compile(u'読み終わった本'))
	    if not heading:
	        print('***********************')
	        return book_collector

	    for book in heading.next_siblings:
	        # Siblings include text nodes (any whitespace run, not only '\n');
	        # those have no tag name and no attributes, so skip them with <br>.
	        tag_name = getattr(book, 'name', None)
	        if tag_name is None or tag_name == 'br':
	            continue

	        # .get() avoids a KeyError on tags that carry no class attribute.
	        classes = book.get('class') or []
	        if classes[:1] == ['more']:
	            continue
	        if classes:
	            print(classes[0])

	        # Guard the length so a single-class tag cannot raise IndexError.
	        if len(classes) > 1 and classes[1] == 'book_box_inline_3r':
	            entry = {}

	            print('&&&&&&&&&&&&&&&&&&&&&')
	            doc_book_image = book.find(attrs={'class': 'book_box_book_image'})
	            entry['imageurl'] = doc_book_image.a.img['src']

	            doc_book_title = book.find(attrs={'class': 'book_box_book_title'})
	            entry['book_name'] = doc_book_title.a.contents
	            entry['url'] = doc_book_title.a['href']

	            # The ISBNs are read from the Amazon page linked on the detail page.
	            bookurl = 'http://bookmeter.com' + entry['url']
	            amazon_url = book_amazon_url_processor(bookurl)
	            entry['isbn'] = amazon_processor(amazon_url)

	            doc_book_author = book.find(attrs={'class': 'book_box_book_author'})
	            if doc_book_author:
	                entry['author_name'] = doc_book_author.a.contents
	            else:
	                entry['author_name'] = ''

	            book_collector[num] = entry

	        # The counter advances for every tag that was not skipped, book or
	        # not, so the keys of the result are not necessarily consecutive.
	        num += 1

	    return book_collector


	def book_amazon_url_processor(bookurl):
	    """Return the Amazon link found on a book detail page.

	    Args:
	        bookurl: Absolute URL of the book detail page to download.

	    Returns:
	        The ``href`` of the first link inside the element with class
	        ``book_detail_amazon_right``, or ``''`` when the page cannot be
	        fetched or holds no such link.
	    """
	    try:
	        page_content = req(bookurl)
	    except urllib2.HTTPError as e:
	        # Same policy as amazon_processor: report the failure and fall back
	        # to "no link" instead of aborting the whole crawl on one bad page.
	        print('We failed with error code - %s.' % e.code)
	        print('This link (' + bookurl + ')is not available.')
	        return ''

	    amazon_box = page_content.find(attrs={'class': 'book_detail_amazon_right'})
	    if amazon_box is None:
	        return ''

	    # The box can exist without an <a> inside it, or the <a> without an href.
	    link = amazon_box.a
	    if link is None:
	        return ''
	    return link.get('href', '')


	def amazon_processor(amazon_url):
	    """Scrape the ISBN-10 and ISBN-13 of a book from its Amazon page.

	    Args:
	        amazon_url: URL of the Amazon product page, or ``''`` if unknown.

	    Returns:
	        A dict with the keys ``'10'`` and ``'13'``. A value is ``''`` when
	        the URL is empty, the page cannot be fetched, or the matching ISBN
	        label is not found on the page.
	    """
	    # Start from the "nothing found" shape so every return path has both keys.
	    isbn = {'10': '', '13': ''}
	    if amazon_url == '':
	        return isbn

	    try:
	        page_content = req(amazon_url)
	    except urllib2.HTTPError as e:
	        print('We failed with error code - %s.' % e.code)
	        print('This link (' + amazon_url + ')is not available.')
	        return isbn

	    for key in ('10', '13'):
	        label = page_content.find('b', text=re.compile('ISBN-' + key + ':'))
	        if label is None:
	            continue
	        try:
	            # Whitespace-agnostic split: the value is the first token of the
	            # text that follows the <b> label, however it is padded.
	            tokens = label.next_sibling.split()
	        except (AttributeError, TypeError):
	            # The label is not followed by a plain text node.
	            tokens = []
	        if tokens:
	            isbn[key] = tokens[0]

	    return isbn


	if __name__ == '__main__':
	    # Crawl user pages one after another in an endless loop; the collected
	    # dict is written to the JSON file again after every user.

	    # Total users
	    num = 99999

	    # 2nd
	    #num = 277

	    # 3rd
	    #num = 966

	    # User
	    user = {}

	    # Original url
	    ori_url = 'http://bookmeter.com/u/'

	    # Main cycle
	    while True:
	        user[num] = {}
	        # NOTE(review): the page requested is num + 1 while the result is
	        # stored and reported under num -- confirm this offset is intended.
	        url = ori_url + str(num + 1)

	        try:
	            page_content = req(url)

	        except urllib2.HTTPError as e:
	            print('We failed with error code - %s.' % e.code)
	            print('user ' + str(num) + ' is not existed.')
	            print('- - - - - - - - - - -')
	            # Nothing was downloaded, so user[num] keeps its empty dict.

	        else:
	            print('- - - - - - - - - - -')
	            print(url)
	            # Parse only a page fetched in this iteration; after a failed
	            # request page_content is undefined or left over from the
	            # previous user.
	            user[num] = eightbooks_processor(page_content)

	        print(user)

	        with open('data_100000.json', 'w') as outfile:
	            json.dump(user, outfile)

	        num += 1