Skip to content

Instantly share code, notes, and snippets.

@flashton2003
Created March 26, 2016 18:20
Show Gist options
  • Select an option

  • Save flashton2003/7fe57697a28257d8c650 to your computer and use it in GitHub Desktop.

Select an option

Save flashton2003/7fe57697a28257d8c650 to your computer and use it in GitHub Desktop.
import sys
import re
import amazonproduct
import pprint
import pickle
import lxml.objectify
import numpy as np
from lxml import etree
from datetime import datetime
import socket
import urllib2
def item_search(api):
    """Run a fixed sample book search and dump what comes back.

    Debugging helper: queries the 'Books' index for a hard-coded
    title/author pair, prints how many results were returned, then for
    each result prints its author and title followed by the attribute
    names available on its ``ItemAttributes`` node.

    A result that lacks one of the accessed attributes is skipped
    silently.
    """
    found = api.item_search(
        'Books', Title="Benjamin Franklin", Author='Walter Isaacson')
    print(len(found))
    for entry in found:
        try:
            attrs = entry.ItemAttributes
            print('%s: "%s"' % (attrs.Author, attrs.Title))
            pprint.pprint(dir(attrs))
        except AttributeError:
            continue
def get_page_numbers(api, asin):
    """Look up one ASIN and return its page count, if it reports one.

    Performs an ``item_lookup`` with the 'Large' response group,
    serialises each returned item to XML text and scans that text line
    by line for a ``NumberOfPages`` entry.

    Args:
        api: Object exposing ``item_lookup(asin, ResponseGroup=...)``.
        asin: Identifier passed straight to the lookup.

    Returns:
        The first run of digits on the first ``NumberOfPages`` line that
        contains digits, as an int. ``None`` when the lookup raises
        ``socket.timeout`` or ``urllib2.URLError``, or when no such line
        is found.
    """
    try:
        result = api.item_lookup(asin, ResponseGroup='Large')
    except (socket.timeout, urllib2.URLError):
        return None
    for book in result:
        xml_text = etree.tostring(book, pretty_print=True)
        for line in xml_text.split('\n'):
            if 'NumberOfPages' not in line:
                continue
            match = re.search(r'\d+', line)
            # A NumberOfPages line without digits used to crash on
            # match.group(); keep scanning instead.
            if match is not None:
                return int(match.group(0))
    return None
def get_all_page_numbers(res_list, api):
    """Attach page counts and their median to every book record.

    For each dict in ``res_list`` this looks up every ASIN in
    ``book['asin_list']`` via ``get_page_numbers`` and stores:

    * ``book['page_numbers_list']`` - the page counts that were found
      (lookups that returned ``None`` are left out);
    * ``book['median_page_numbers']`` - the median of that list, or NaN
      when no page count was found.

    Both values are printed as they are computed. The records are
    modified in place; nothing is returned.

    Args:
        res_list: List of dicts, each with an 'asin_list' key.
        api: Passed through unchanged to ``get_page_numbers``.
    """
    for book in res_list:
        pages = []
        book['page_numbers_list'] = pages
        for asin in book['asin_list']:
            number = get_page_numbers(api, asin)
            if number is not None:
                pages.append(number)
        print(pages)
        # np.median([]) also ends up as NaN but emits a RuntimeWarning on
        # the way; state the "no data" result directly instead.
        book['median_page_numbers'] = np.median(pages) if pages else np.nan
        print(book['median_page_numbers'])
def parse_book_orders(inhandle):
    """Parse a saved order listing into one record per book.

    The file is scanned for lines starting with 'Order details'. For each
    such header the purchase date is taken from the header line itself
    (``<day> <Month> <year>``, parsed with '%d %B %Y'), the title from
    the next line (everything before the first ' (') and the author from
    the line after that. An author written as ``Surname, Firstname`` is
    turned into ``Firstname Surname``.

    Args:
        inhandle: Path of the text file to read. Despite the name it is
            passed to ``open`` and is not a file object.

    Returns:
        A list of dicts with keys 'date' (``datetime``), 'title' and
        'author', in file order. Headers that carry no date, or that are
        not followed by both a title line and an author line, are
        reported on stderr and skipped.

    Raises:
        ValueError: If the date-like text on a header line is not a valid
            '%d %B %Y' date (raised by ``datetime.strptime``).
    """
    date_re = re.compile('([0-9]+ [A-Z][a-z]+ [0-9]+)')
    with open(inhandle) as fi:
        lines = [raw.strip() for raw in fi]

    res_list = []
    for x, line in enumerate(lines):
        if not line.startswith('Order details'):
            continue
        this_date = date_re.search(line)
        # A header without a date, or one cut off at the end of the file,
        # previously crashed with AttributeError / IndexError.
        if this_date is None or x + 2 >= len(lines):
            sys.stderr.write(
                'skipping incomplete order at line %d\n' % (x + 1))
            continue

        name_parts = lines[x + 2].split(', ')
        if len(name_parts) > 1:
            # "Surname, Firstname" -> "Firstname Surname"
            author = name_parts[1] + ' ' + name_parts[0]
        else:
            author = lines[x + 2]

        res_list.append({
            'date': datetime.strptime(this_date.group(0), '%d %B %Y'),
            'title': lines[x + 1].split(' (')[0],
            'author': author,
        })
    return res_list
def get_aisn(res_list, api):
    """Search for each book and record the ASINs of all matches.

    For every record in ``res_list`` a 'Books' search is run with the
    record's 'title' and 'author'. Each result is serialised to XML text
    and every line containing ``<ASIN>`` contributes the text between
    ``>`` and ``<`` to ``book['asin_list']``.

    Args:
        res_list: List of dicts with 'title' and 'author' keys.
        api: Object exposing ``item_search(index, Title=..., Author=...)``.

    Returns:
        The same ``res_list`` object. Every record gains an 'asin_list'
        key, which is empty when the search raised ``UnicodeDecodeError``
        or no ASIN line was found.
    """
    asin_re = re.compile(r'>([A-Z0-9]\w+)<')
    for book in res_list:
        book['asin_list'] = []
        try:
            results = api.item_search(
                'Books', Title=book['title'], Author=book['author'])
            print('%s %s' % (book['title'], len(results)))
        except UnicodeDecodeError:
            print('problem with %s' % book['title'])
            # Do not fall through: `results` is either unbound here or
            # still holds the previous book's search results.
            continue
        for item in results:
            xml_text = etree.tostring(item, pretty_print=True)
            for line in xml_text.split('\n'):
                if '<ASIN>' not in line:
                    continue
                match = asin_re.search(line)
                if match is not None:
                    book['asin_list'].append(match.group(1))
    return res_list
def print_data_for_r(res_list):
    """Write one tab-separated row per book to stdout.

    Each row holds the title, the median page count and the purchase
    date. All purchase dates are sorted; the entry that follows the
    first occurrence of the book's own date in that sorted list is added
    as a fourth column. When nothing follows it, the row has only the
    first three columns.
    """
    ordered = sorted(entry['date'].date() for entry in res_list)
    for book in res_list:
        bought = book['date'].date()
        successor = ordered.index(bought) + 1
        fields = [book['title'], book['median_page_numbers'], bought]
        if successor < len(ordered):
            fields.append(ordered[successor])
        print('\t'.join(str(field) for field in fields))
# item_lookup(api)
def main():
    """Build the per-book page-count table and print it.

    Steps: parse the saved order listing, search for candidate ASINs for
    each book, collect page counts and their median, pickle the enriched
    records to 'res_list.pick', then print the tab-separated rows.
    """
    api = amazonproduct.API()
    res_list = parse_book_orders('/path/to/kindle_books')
    res_list = get_aisn(res_list, api)
    get_all_page_numbers(res_list, api)
    # Use a with-block so the pickle file is flushed and closed; the
    # handle was previously left open.
    with open('res_list.pick', 'wb') as cache:
        pickle.dump(res_list, cache)
    # A later run can replace the three steps above by loading this
    # pickle (opened in 'rb' mode) instead.
    print_data_for_r(res_list)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment