flashton2003 · March 26, 2016 18:20
diff --git a/amazon_get_book_info.py b/amazon_get_book_info.py
 import sys
 import re
 import amazonproduct
 import pprint
 import pickle
 import lxml.objectify
 import numpy as np
 from lxml import etree
 from datetime import datetime
 import socket
 import urllib2


 def item_search(api):
 	"""Run a fixed sample book search and print what comes back.

 	Debugging aid: searches the 'Books' index for the title
 	"Benjamin Franklin" by Walter Isaacson, prints the number of results,
 	then for each result prints its author and title followed by the
 	attribute names available on its ``ItemAttributes``.

 	api -- object exposing ``item_search(index, **criteria)``.
 	"""
 	found = api.item_search('Books', Title="Benjamin Franklin", Author='Walter Isaacson')
 	print(len(found))
 	for entry in found:
 		try:
 			attributes = entry.ItemAttributes
 			print('%s: "%s"' % (attributes.Author, attributes.Title))
 			pprint.pprint(dir(attributes))
 		except AttributeError:
 			# Results lacking the expected attributes are skipped silently.
 			continue

 def get_page_numbers(api, asin):
 	"""Look up the page count reported for one ASIN.

 	api -- object exposing ``item_lookup(asin, ResponseGroup=...)``.
 	asin -- identifier of the edition to look up.

 	Returns the first NumberOfPages value found as an int, or None when the
 	lookup times out, fails with a URL error, or no page count is present.
 	"""
 	try:
 		result = api.item_lookup(asin, ResponseGroup='Large')
 	except (socket.timeout, urllib2.URLError):
 		# Best effort: a failed network lookup simply means "no data".
 		return None
 	digits_re = re.compile(r'(\d+)')
 	for book in result:
 		# Each item is serialised to pretty-printed XML and scanned line by line.
 		result_string = etree.tostring(book, pretty_print=True)
 		for line in result_string.split('\n'):
 			if 'NumberOfPages' not in line:
 				continue
 			match = digits_re.search(line)
 			if match is not None:
 				return int(match.group(0))
 			# A NumberOfPages line without digits is ignored; keep scanning.
 	return None


 def get_all_page_numbers(res_list, api):
 	"""Attach page-count data to every book dict in res_list (in place).

 	For each book, every ASIN in book['asin_list'] is looked up with
 	get_page_numbers; the successful results are stored in
 	book['page_numbers_list'] and their median in
 	book['median_page_numbers']. Both values are printed as progress output.

 	res_list -- list of dicts, each carrying an 'asin_list' key.
 	api -- passed through unchanged to get_page_numbers.
 	"""
 	for book in res_list:
 		page_counts = book['page_numbers_list'] = []
 		for asin in book['asin_list']:
 			number = get_page_numbers(api, asin)
 			# Failed lookups come back as None and are left out.
 			if number is not None:
 				page_counts.append(number)
 		print(page_counts)
 		if page_counts:
 			book['median_page_numbers'] = np.median(np.array(page_counts))
 		else:
 			# np.median of an empty array yields NaN plus a RuntimeWarning;
 			# store the NaN directly so books without data stay quiet.
 			book['median_page_numbers'] = np.nan
 		print(book['median_page_numbers'])

 def parse_book_orders(inhandle):
 	"""Parse an order-history text dump into a list of book records.

 	inhandle -- path of the text file to read (it is passed to open()).

 	A record starts at a line beginning with 'Order details' that contains a
 	date such as '26 March 2016'; the next line holds the title and the line
 	after that the author. Returns a list of dicts, in file order, with keys
 	'date' (datetime), 'title' (text before the first ' (') and 'author'
 	('Surname, First' is flipped to 'First Surname').

 	Header lines without a recognisable date, or too close to the end of the
 	file to be followed by title and author lines, are skipped. A date that
 	matches the pattern but cannot be parsed still raises ValueError.
 	"""
 	date_re = re.compile(r'([0-9]+ [A-Z][a-z]+ [0-9]+)')
 	with open(inhandle) as fi:
 		lines = [raw.strip() for raw in fi]
 	res_list = []
 	for x, line in enumerate(lines):
 		if not line.startswith('Order details'):
 			continue
 		this_date = date_re.search(line)
 		if this_date is None or x + 2 >= len(lines):
 			# Malformed or truncated record: nothing reliable to extract.
 			continue
 		author_parts = lines[x + 2].split(', ')
 		if len(author_parts) > 1:
 			author = author_parts[1] + ' ' + author_parts[0]
 		else:
 			author = lines[x + 2]
 		res_list.append({
 			'date': datetime.strptime(this_date.group(0), '%d %B %Y'),
 			'title': lines[x + 1].split(' (')[0],
 			'author': author,
 		})
 	return res_list

 def get_aisn(res_list, api):
 	"""Search for each book and record the ASINs of the hits.

 	res_list -- list of dicts with 'title' and 'author' keys; each gains an
 	'asin_list' key (list of ASIN strings, possibly empty).
 	api -- object exposing ``item_search(index, **criteria)``.

 	Returns the same res_list object, modified in place.
 	"""
 	asin_re = re.compile(r'<ASIN>([A-Z0-9]\w+)<')
 	for book in res_list:
 		book['asin_list'] = []
 		try:
 			results = api.item_search('Books', Title=book['title'], Author=book['author'])
 			print('%s %s' % (book['title'], len(results)))
 		except UnicodeDecodeError:
 			print('problem with %s' % book['title'])
 			# Without results there is nothing to scan; moving on keeps this
 			# book from picking up another book's search results.
 			continue
 		for x in results:
 			# Each hit is serialised to pretty-printed XML and scanned line by line.
 			result_string = etree.tostring(x, pretty_print=True)
 			for line in result_string.split('\n'):
 				match = asin_re.search(line)
 				if match is not None:
 					book['asin_list'].append(match.group(1))
 	return res_list

 def print_data_for_r(res_list):
 	"""Print one tab-separated row per book.

 	Each row holds title, median page count, order date and -- when a later
 	order exists -- the date of the next later order as the end date. Books
 	from the most recent order date get no end-date column.

 	res_list -- list of dicts with 'title', 'median_page_numbers' and 'date'
 	(datetime) keys.
 	"""
 	import bisect
 	# Distinct order dates, ascending. Duplicates are dropped so that books
 	# ordered on the same day all end at the next *later* date instead of
 	# at their own order date.
 	order_dates = sorted(set(book['date'].date() for book in res_list))
 	for book in res_list:
 		start_date = book['date'].date()
 		row = [book['title'], book['median_page_numbers'], start_date]
 		next_pos = bisect.bisect_right(order_dates, start_date)
 		if next_pos < len(order_dates):
 			row.append(order_dates[next_pos])
 		print('\t'.join(map(str, row)))



 # item_lookup(api)
 def main(orders_path='/path/to/kindle_books', pickle_path='res_list.pick'):
 	"""Build the per-book page-count table and print it.

 	Parses the order dump, looks up ASINs and page counts through the
 	amazonproduct API, pickles the enriched records and prints the table.

 	orders_path -- text file read by parse_book_orders.
 	pickle_path -- file the enriched res_list is pickled to.
 	"""
 	api = amazonproduct.API()
 	res_list = parse_book_orders(orders_path)
 	res_list = get_aisn(res_list, api)
 	get_all_page_numbers(res_list, api)
 	# Cache the results so the printing step can be re-run without repeating
 	# the API lookups (reload with pickle.load on the same file, mode 'rb').
 	with open(pickle_path, 'wb') as cache:
 		pickle.dump(res_list, cache)
 	print_data_for_r(res_list)




 # Run the pipeline only when executed as a script, not when imported.
 if __name__ == '__main__':
 	main()
	import sys
	import re
	import amazonproduct
	import pprint
	import pickle
	import lxml.objectify
	import numpy as np
	from lxml import etree
	from datetime import datetime
	import socket
	import urllib2


	def item_search(api):
		"""Run a fixed sample book search and print what comes back.

		Debugging aid: searches the 'Books' index for the title
		"Benjamin Franklin" by Walter Isaacson, prints the number of results,
		then for each result prints its author and title followed by the
		attribute names available on its ``ItemAttributes``.

		api -- object exposing ``item_search(index, **criteria)``.
		"""
		found = api.item_search('Books', Title="Benjamin Franklin", Author='Walter Isaacson')
		print(len(found))
		for entry in found:
			try:
				attributes = entry.ItemAttributes
				print('%s: "%s"' % (attributes.Author, attributes.Title))
				pprint.pprint(dir(attributes))
			except AttributeError:
				# Results lacking the expected attributes are skipped silently.
				continue

	def get_page_numbers(api, asin):
		"""Look up the page count reported for one ASIN.

		api -- object exposing ``item_lookup(asin, ResponseGroup=...)``.
		asin -- identifier of the edition to look up.

		Returns the first NumberOfPages value found as an int, or None when the
		lookup times out, fails with a URL error, or no page count is present.
		"""
		try:
			result = api.item_lookup(asin, ResponseGroup='Large')
		except (socket.timeout, urllib2.URLError):
			# Best effort: a failed network lookup simply means "no data".
			return None
		digits_re = re.compile(r'(\d+)')
		for book in result:
			# Each item is serialised to pretty-printed XML and scanned line by line.
			result_string = etree.tostring(book, pretty_print=True)
			for line in result_string.split('\n'):
				if 'NumberOfPages' not in line:
					continue
				match = digits_re.search(line)
				if match is not None:
					return int(match.group(0))
				# A NumberOfPages line without digits is ignored; keep scanning.
		return None


	def get_all_page_numbers(res_list, api):
		"""Attach page-count data to every book dict in res_list (in place).

		For each book, every ASIN in book['asin_list'] is looked up with
		get_page_numbers; the successful results are stored in
		book['page_numbers_list'] and their median in
		book['median_page_numbers']. Both values are printed as progress output.

		res_list -- list of dicts, each carrying an 'asin_list' key.
		api -- passed through unchanged to get_page_numbers.
		"""
		for book in res_list:
			page_counts = book['page_numbers_list'] = []
			for asin in book['asin_list']:
				number = get_page_numbers(api, asin)
				# Failed lookups come back as None and are left out.
				if number is not None:
					page_counts.append(number)
			print(page_counts)
			if page_counts:
				book['median_page_numbers'] = np.median(np.array(page_counts))
			else:
				# np.median of an empty array yields NaN plus a RuntimeWarning;
				# store the NaN directly so books without data stay quiet.
				book['median_page_numbers'] = np.nan
			print(book['median_page_numbers'])

	def parse_book_orders(inhandle):
		"""Parse an order-history text dump into a list of book records.

		inhandle -- path of the text file to read (it is passed to open()).

		A record starts at a line beginning with 'Order details' that contains a
		date such as '26 March 2016'; the next line holds the title and the line
		after that the author. Returns a list of dicts, in file order, with keys
		'date' (datetime), 'title' (text before the first ' (') and 'author'
		('Surname, First' is flipped to 'First Surname').

		Header lines without a recognisable date, or too close to the end of the
		file to be followed by title and author lines, are skipped. A date that
		matches the pattern but cannot be parsed still raises ValueError.
		"""
		date_re = re.compile(r'([0-9]+ [A-Z][a-z]+ [0-9]+)')
		with open(inhandle) as fi:
			lines = [raw.strip() for raw in fi]
		res_list = []
		for x, line in enumerate(lines):
			if not line.startswith('Order details'):
				continue
			this_date = date_re.search(line)
			if this_date is None or x + 2 >= len(lines):
				# Malformed or truncated record: nothing reliable to extract.
				continue
			author_parts = lines[x + 2].split(', ')
			if len(author_parts) > 1:
				author = author_parts[1] + ' ' + author_parts[0]
			else:
				author = lines[x + 2]
			res_list.append({
				'date': datetime.strptime(this_date.group(0), '%d %B %Y'),
				'title': lines[x + 1].split(' (')[0],
				'author': author,
			})
		return res_list

	def get_aisn(res_list, api):
		"""Search for each book and record the ASINs of the hits.

		res_list -- list of dicts with 'title' and 'author' keys; each gains an
		'asin_list' key (list of ASIN strings, possibly empty).
		api -- object exposing ``item_search(index, **criteria)``.

		Returns the same res_list object, modified in place.
		"""
		asin_re = re.compile(r'<ASIN>([A-Z0-9]\w+)<')
		for book in res_list:
			book['asin_list'] = []
			try:
				results = api.item_search('Books', Title=book['title'], Author=book['author'])
				print('%s %s' % (book['title'], len(results)))
			except UnicodeDecodeError:
				print('problem with %s' % book['title'])
				# Without results there is nothing to scan; moving on keeps this
				# book from picking up another book's search results.
				continue
			for x in results:
				# Each hit is serialised to pretty-printed XML and scanned line by line.
				result_string = etree.tostring(x, pretty_print=True)
				for line in result_string.split('\n'):
					match = asin_re.search(line)
					if match is not None:
						book['asin_list'].append(match.group(1))
		return res_list

	def print_data_for_r(res_list):
		"""Print one tab-separated row per book.

		Each row holds title, median page count, order date and -- when a later
		order exists -- the date of the next later order as the end date. Books
		from the most recent order date get no end-date column.

		res_list -- list of dicts with 'title', 'median_page_numbers' and 'date'
		(datetime) keys.
		"""
		import bisect
		# Distinct order dates, ascending. Duplicates are dropped so that books
		# ordered on the same day all end at the next *later* date instead of
		# at their own order date.
		order_dates = sorted(set(book['date'].date() for book in res_list))
		for book in res_list:
			start_date = book['date'].date()
			row = [book['title'], book['median_page_numbers'], start_date]
			next_pos = bisect.bisect_right(order_dates, start_date)
			if next_pos < len(order_dates):
				row.append(order_dates[next_pos])
			print('\t'.join(map(str, row)))



	# item_lookup(api)
	def main(orders_path='/path/to/kindle_books', pickle_path='res_list.pick'):
		"""Build the per-book page-count table and print it.

		Parses the order dump, looks up ASINs and page counts through the
		amazonproduct API, pickles the enriched records and prints the table.

		orders_path -- text file read by parse_book_orders.
		pickle_path -- file the enriched res_list is pickled to.
		"""
		api = amazonproduct.API()
		res_list = parse_book_orders(orders_path)
		res_list = get_aisn(res_list, api)
		get_all_page_numbers(res_list, api)
		# Cache the results so the printing step can be re-run without repeating
		# the API lookups (reload with pickle.load on the same file, mode 'rb').
		with open(pickle_path, 'wb') as cache:
			pickle.dump(res_list, cache)
		print_data_for_r(res_list)




	# Run the pipeline only when executed as a script, not when imported.
	if __name__ == '__main__':
		main()
No results found