Created
March 26, 2016 18:20
-
-
Save flashton2003/7fe57697a28257d8c650 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import re | |
| import amazonproduct | |
| import pprint | |
| import pickle | |
| import lxml.objectify | |
| import numpy as np | |
| from lxml import etree | |
| from datetime import datetime | |
| import socket | |
| import urllib2 | |
def item_search(api, title="Benjamin Franklin", author='Walter Isaacson'):
    """Run a Books search and print what each hit exposes.

    Prints the number of results, then for every result that has both an
    author and a title prints ``author: "title"`` followed by a pretty-printed
    ``dir()`` of its ``ItemAttributes`` (handy for exploring which fields the
    API returned).

    Args:
        api: object exposing ``item_search(index, **criteria)``.
        title: value passed as the ``Title`` search criterion.
        author: value passed as the ``Author`` search criterion.
    """
    items = api.item_search('Books', Title=title, Author=author)
    print(len(items))
    for book in items:
        try:
            attributes = book.ItemAttributes
            found_author = attributes.Author
            found_title = attributes.Title
        except AttributeError:
            # Hits lacking an author or title are deliberately skipped.
            continue
        print('%s: "%s"' % (found_author, found_title))
        pprint.pprint(dir(attributes))
def get_page_numbers(api, asin):
    """Return the page count reported for *asin*, or None when unavailable.

    The item is looked up with ``ResponseGroup='Large'``; each returned
    element is serialised to pretty-printed XML and scanned line by line for
    the first ``NumberOfPages`` line that contains digits.

    Args:
        api: object exposing ``item_lookup(asin, ResponseGroup=...)``.
        asin: identifier of the item to look up.

    Returns:
        The page count as an int, or None if the lookup timed out / hit a
        URL error, or if no page count was present in the response.
    """
    try:
        result = api.item_lookup(asin, ResponseGroup='Large')
    except (socket.timeout, urllib2.URLError):
        # A failed lookup is reported as "no data" so the caller can carry on
        # with the remaining ASINs.
        return None
    for book in result:
        result_string = etree.tostring(book, pretty_print=True)
        for line in result_string.split('\n'):
            if 'NumberOfPages' not in line:
                continue
            match = re.search(r'(\d+)', line)
            # An empty element (no digits on the line) must not crash the
            # lookup; keep scanning instead.
            if match is not None:
                return int(match.group(0))
    return None
def get_all_page_numbers(res_list, api):
    """Attach page counts and their median to every book in *res_list*.

    For each book dict, every ASIN in ``book['asin_list']`` is looked up via
    ``get_page_numbers``; the counts that were found are stored in
    ``book['page_numbers_list']`` and their median in
    ``book['median_page_numbers']``. Both values are printed as progress
    output. The dicts are modified in place; nothing is returned.

    Args:
        res_list: list of book dicts, each with an ``'asin_list'`` key.
        api: API object forwarded to ``get_page_numbers``.
    """
    for book in res_list:
        book['page_numbers_list'] = []
        for asin in book['asin_list']:
            number = get_page_numbers(api, asin)
            if number is not None:
                book['page_numbers_list'].append(number)
        print(book['page_numbers_list'])
        if book['page_numbers_list']:
            book['median_page_numbers'] = np.median(np.array(book['page_numbers_list']))
        else:
            # np.median of an empty array yields NaN but also emits a
            # RuntimeWarning; produce the same NaN without the noise.
            book['median_page_numbers'] = float('nan')
        print(book['median_page_numbers'])
def parse_book_orders(inhandle):
    """Parse an order listing into a list of book records.

    The file is scanned for lines starting with ``Order details``. Such a
    line must contain a date like ``26 March 2016``; the following line is
    taken as the title (anything from the first `` (`` onwards is dropped)
    and the line after that as the author. An author written as
    ``Surname, Firstname`` is rewritten to ``Firstname Surname``.

    Header lines without a recognisable date, or too close to the end of the
    file to have both a title and an author line, are skipped.

    Args:
        inhandle: path of the text file to read (despite the name, this is
            passed to ``open`` and is not an open handle).

    Returns:
        A list of dicts with keys ``'date'`` (datetime), ``'title'`` and
        ``'author'``, in file order.

    Raises:
        ValueError: if the matched date text is not a valid
            ``%d %B %Y`` date (for example an unknown month name).
    """
    date_re = re.compile(r'([0-9]+ [A-Z][a-z]+ [0-9]+)')
    with open(inhandle) as fi:
        lines = [raw.strip() for raw in fi]

    res_list = []
    for x, line in enumerate(lines):
        if not line.startswith('Order details'):
            continue
        this_date = date_re.search(line)
        # A usable record needs a date on the header plus two following lines.
        if this_date is None or x + 2 >= len(lines):
            continue
        author_line = lines[x + 2]
        name_parts = author_line.split(', ')
        if len(name_parts) > 1:
            author = name_parts[1] + ' ' + name_parts[0]
        else:
            author = author_line
        res_list.append({
            'date': datetime.strptime(this_date.group(0), '%d %B %Y'),
            'title': lines[x + 1].split(' (')[0],
            'author': author,
        })
    return res_list
def get_aisn(res_list, api):
    """Attach the ASINs found for each book to its dict.

    Each book is searched in the ``Books`` index by its ``'title'`` and
    ``'author'``. Every search result is serialised to pretty-printed XML and
    the value of each ``<ASIN>`` line is collected into
    ``book['asin_list']``. A progress line (title and result count) is
    printed per book.

    If the search raises ``UnicodeDecodeError`` the problem is reported and
    the book gets an empty ``'asin_list'``, so results from a previous book
    are never reused.

    Args:
        res_list: list of book dicts with ``'title'`` and ``'author'`` keys.
        api: object exposing ``item_search(index, **criteria)``.

    Returns:
        The same ``res_list`` object, with ``'asin_list'`` added to each book.
    """
    # The value sits between '>' and '<'; it must start with an upper-case
    # letter or digit and be at least two characters long.
    asin_re = re.compile(r'>([A-Z0-9]\w+)<')
    for book in res_list:
        book['asin_list'] = []
        try:
            results = api.item_search('Books', Title=book['title'], Author=book['author'])
            print('%s %s' % (book['title'], len(results)))
        except UnicodeDecodeError:
            print('problem with %s' % book['title'])
            continue
        for item in results:
            result_string = etree.tostring(item, pretty_print=True)
            for line in result_string.split('\n'):
                if '<ASIN>' not in line:
                    continue
                match = asin_re.search(line)
                if match is not None:
                    book['asin_list'].append(match.group(1))
    return res_list
def print_data_for_r(res_list):
    """Print one tab-separated row per book.

    Columns are title, median page count, the book's date and, when one
    exists, the entry that follows the first occurrence of that date in the
    sorted list of all book dates. The last column is omitted when there is
    no following entry.
    """
    ordered_dates = sorted(entry['date'].date() for entry in res_list)
    total = len(ordered_dates)
    for book in res_list:
        following = ordered_dates.index(book['date'].date()) + 1
        columns = [book['title'], book['median_page_numbers'], book['date'].date()]
        if following < total:
            columns.append(ordered_dates[following])
        print('\t'.join(map(str, columns)))
| # item_lookup(api) | |
def main(orders_path='/path/to/kindle_books', pickle_path='res_list.pick'):
    """Run the whole pipeline: parse orders, query the API, save and print.

    Steps: parse the order listing, look up ASINs for each book, collect page
    counts and medians, pickle the resulting list, then print the
    tab-separated table.

    Args:
        orders_path: path of the order listing passed to
            ``parse_book_orders``.
        pickle_path: file the result list is pickled to.
    """
    api = amazonproduct.API()
    res_list = parse_book_orders(orders_path)
    res_list = get_aisn(res_list, api)
    get_all_page_numbers(res_list, api)
    # Use a context manager so the pickle file is flushed and closed even if
    # dumping fails part-way.
    with open(pickle_path, 'wb') as handle:
        pickle.dump(res_list, handle)
    # NOTE: to re-run only the printing step without hitting the API, replace
    # the lookups above with a pickle.load of the file written here (opened
    # in 'rb' mode).
    print_data_for_r(res_list)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment