Skip to content

Instantly share code, notes, and snippets.

@oiva
Last active August 29, 2015 14:15
Show Gist options
  • Save oiva/400a383e8f271c84d4a5 to your computer and use it in GitHub Desktop.
Save oiva/400a383e8f271c84d4a5 to your computer and use it in GitHub Desktop.
Back to Work Reading List
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import codecs
from datetime import datetime
import feedparser
import sys
import re
# Replace stdout with a UTF-8 stream writer so that unicode text printed by
# this script is encoded as UTF-8 on output.
# NOTE(review): this is a Python 2 idiom (the script uses print statements);
# confirm the target interpreter before porting.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
def parse_books(filename='./b2w.xml'):
    """Collect book links from the show notes of every episode in the feed.

    The feed is parsed with feedparser and the HTML of each entry is searched
    for Amazon links plus non-Amazon links whose text starts with "Book: " or
    "AudioBook: ". Links whose text contains one of the non-book categories
    in ``skip`` are dropped.

    Args:
        filename: feed location handed to ``feedparser.parse``. Defaults to
            the local copy ``./b2w.xml``.

    Returns:
        A list of ``(link, title, episode_link, episode_title)`` tuples in
        feed order, without exact duplicates.
    """
    # url = 'http://feeds.5by5.tv/b2w'
    # Raw strings keep the regex escapes (\/ and \.) out of Python's own
    # string-escape processing; the resulting patterns are unchanged.
    pattern = r'<a .*? href="(http:\/\/www\.amazon\.com[^"]*).*?>(.*?)</a>'
    pattern2 = (r'<a .*? href="((?!http:\/\/www\.amazon).*?)" .*?>((?:Audio)?'
                r'Book: .*?)</a>')
    skip = ['Health &amp; Personal Care', 'Toys &amp; Games', 'MP3 Downloads',
            'Computers &amp; Accessories', 'Musical Instruments', 'Moleskine',
            'Everything Else', 'Music', 'Electronics', 'Movies &amp; TV',
            'Sports &amp; Outdoors', 'Grocery &amp; Gourmet Food', ': Baby',
            'Amazon Instant Video', 'Kitchen &amp; Dining', 'Floor Lamp',
            'Wishlist', 'The Aviator', 'Home Improvement', 'Video Games',
            'Fingernail Clipper', 'Edimax N150 Wireless', 'Automotive',
            'ASUS Dual-Band Wireless', 'Camera &amp; Photo', 'Beauty',
            'Office Products', 'Crafts & Sewing', 'Patio, Lawn &amp; Garden',
            'Home &amp; Kitchen', 'Brass No Soliciting Sign', 'Light Bulb',
            'Ultra Pro Resealable Current Size Comic Bags', 'Model Rocket Kit',
            'Arts, Crafts &amp; Sewing']
    regex = re.compile(pattern, re.IGNORECASE)
    regex2 = re.compile(pattern2, re.IGNORECASE)

    books = []
    seen = set()  # O(1) duplicate check; `books` keeps the feed order
    feed = feedparser.parse(filename)
    for episode in feed.entries:
        # Entries without a content element have nothing to scan; skip them
        # instead of failing on episode.content[0].
        contents = episode.get('content')
        if not contents:
            continue
        notes = contents[0].value
        links = regex.findall(notes) + regex2.findall(notes)
        for link in links:
            # skip items that are not really books
            if any(category in link[1] for category in skip):
                continue
            # include episode info
            entry = link + (episode.link, episode.title)
            # no duplicates
            if entry not in seen:
                seen.add(entry)
                books.append(entry)
    return books
def produce_list(books):
    """Render the collected links into ./index.html from ./index.tmpl.

    Each entry's title is cleaned of Amazon boilerplate, classified as a
    comic or a book by keyword, split into author and title with
    ``get_author`` and turned into one table row. The rows and counters are
    then substituted into the template placeholders ``{tablebody}``,
    ``{bookcount}``, ``{comicbody}``, ``{comiccount}``, ``{date}`` and
    ``{gtd}``.

    Args:
        books: iterable of ``(link, title, episode_link, episode_title)``
            tuples as returned by ``parse_books``.

    Returns:
        ``(bookcount, comiccount, authorcount)``, or ``None`` when the
        template is empty (nothing is written in that case).
    """
    filename = './index.tmpl'
    filtered = [';Book: ', 'BOOK: ', ': Books', ':Books', '[Amazon]',
                'Amazon.com: Boo', 'Amazon: ', 'Amazon.com: ', ': Amazon.com',
                ' at Amazon.com', ':Amazon', '(Amazon.com)', '(Amazon)',
                ': Explore similar items', ' - Amazon.com', 'Kindle Store',
                ' (HIGHLY recommended by Merlin)']
    comics = ['Marvel Famous Firsts', 'Marvel Now', 'Thor:', 'X-Men',
              'Hawkeye', 'American Vampire', 'The Walking Dead', 'Watchmen',
              'X-Force', 'Daredevil', 'Spider-Man', 'Scarlet', 'She-Hulk',
              'Fantastic Four', 'Wolverine', 'Fiona Staples', 'Civil War',
              'Brian Michael Bendis', 'Marvels', 'Batman', 'Deadpool',
              'Avengers', 'Animal Man', 'Transmetropolitan', 'Volume 1',
              'Y: The Last Man', 'Zita the Spacegirl', 'Invincible:',
              '5 Ronin', 'Runaways', 'The Immortal Iron Fist', 'Superman',
              'The Wonderful Wizard of Oz', 'World War Hulk',
              'Incredible Hulk', 'Infinity Gauntlet', 'Punk Rock Jesus']
    row_template = ('\t\t\t<tr>\n\t\t\t\t<td><a href="%s">%s</a></td>\n'
                    '<td>%s</td>'
                    '<td data-value="%s"><a href="%s">%s</a></td>\n'
                    '\t\t\t</tr>\n')

    # Rows are collected in lists and joined once at the end.
    book_rows = []
    comic_rows = []
    authorcount = gtd = 0
    for (link, title, episodeLink, episodeTitle) in books:
        # filter out "Amazon.com" and similar things from the title
        for word in filtered:
            title = title.replace(word, '')
        title = re.sub(r'^(Audiobook|Book):', '', title)
        title = title.replace('&#x27;', '\'')

        # one stupid link in ep 72
        if title == 'Amazon':
            title = ('The Now Habit: A Strategic Program for Overcoming '
                     'Procrastination and Enjoying Guilt-Free Play: '
                     'Neil Fiore')

        # Classify before the author is split off: some keywords in
        # `comics` are author names that only appear in the full title.
        iscomic = any(comic in title for comic in comics)

        # try to guess author from title
        (author, title) = get_author(title)

        # parse episode number for sorting; partition() keeps the whole
        # title when it has no ':' (slicing with find() == -1 would drop the
        # last character)
        episode = episodeTitle.partition(':')[0]

        row = row_template % (link, title, author, episode, episodeLink,
                              episodeTitle)
        if iscomic:
            comic_rows.append(row)
        else:
            book_rows.append(row)
        if author != '':
            authorcount += 1
        if 'Getting Things Done' in title:
            gtd += 1

    bookcount = len(book_rows)
    comiccount = len(comic_rows)

    with codecs.open(filename, 'r', 'utf-8') as template:
        html = template.read()
    if not html:
        print('reading template from %s failed' % filename)
        return

    html = html.replace('{tablebody}', ''.join(book_rows))
    html = html.replace('{bookcount}', str(bookcount))
    html = html.replace('{comicbody}', ''.join(comic_rows))
    html = html.replace('{comiccount}', str(comiccount))
    html = html.replace('{date}', "{:%Y-%m-%d}".format(datetime.now()))
    html = html.replace('{gtd}', str(gtd))

    with codecs.open(filename.replace('tmpl', 'html'), 'w', 'utf-8') as output:
        output.write(html)
    return (bookcount, comiccount, authorcount)
def get_author(title):
    """Split a link title into a guessed author and the remaining title.

    The last segment of the title (split on ' - ', ':' or ' by ') is taken
    as an author candidate and accepted only if it looks like a short list
    of names; otherwise the title is left whole and the author is empty.

    Args:
        title: link text, e.g. ``'The Now Habit: Neil Fiore'``.

    Returns:
        ``(author, title)``. ``author`` is ``''`` when no author could be
        guessed.
    """
    # remove (9123912392925) from title
    title = re.sub(r'\(?[0-9]{6,}\)?', '', title)

    # split title into title and author. Usually separated by ':'.
    colons = title.count(':')
    if colons > 0 and colons == title.count(' - '):
        parts = title.split(' - ')
    elif colons > 0:
        # remove empty parts; a list comprehension keeps `parts` a real list
        # on Python 3 too, where filter() returns an iterator
        parts = [piece for piece in title.split(':') if piece.strip()]
    elif ' by ' in title:
        parts = title.split(' by ')
    else:
        parts = title.split(' - ')

    author = ''
    # At least two segments are needed for one of them to be an author.
    # This also covers titles made only of separators (e.g. ':'), which
    # leave no segments at all.
    if len(parts) >= 2:
        # take last part of title and try to guess if it is either a list of
        # authors or just part of the title
        candidate = parts.pop().strip(': ')
        candidate = candidate.replace(' and ', ', ').replace(',,', ',')
        # remove middle name initials for easier heuristics about name
        simpleauthor = re.sub(r'\s[A-Z]\.', '', candidate)
        simpleauthor = simpleauthor.replace(' MSPT', '').strip()
        authorparts = len(simpleauthor.split(' '))
        commas = simpleauthor.count(',')
        # example: Andrew Hunt, David Thomas -> 4 names <= (1 comma + 1) * 2
        # Also rejected: more than three words without commas.
        too_long = authorparts > 3 and (
            authorparts > (commas + 1) * 2 + 1 or commas == 0)
        if candidate != '' and not too_long:
            author = candidate
            # strip each piece so re-joining does not produce 'A:  B'
            title = ': '.join(piece.strip() for piece in parts)

    # cleanup
    title = title.strip(' :')
    author = re.sub(r',([^\s])', r', \1', author)
    return (author, title)
# Script driver: parse the feed, write index.html and report the totals.
books = parse_books()
count = produce_list(books)
# produce_list() returns None when the template could not be read (it has
# already printed a message); formatting None with %d would raise TypeError.
if count is not None:
    print("found %d books and %d comics. %d authors found." % count)
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- The generator writes this file as UTF-8; declare it so the browser does not guess. -->
  <meta charset="utf-8">
  <title>Back to Work Reading List</title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/sortable/0.6.0/css/sortable-theme-bootstrap.css">
  <style type="text/css" media="screen">
    footer {
      border-top: 1px solid #e0e0e0;
      margin-top: 30px;
      padding-top: 10px;
    }
    .table-hover>tbody>tr:hover {
      background-color: #f0f6ff;
    }
  </style>
</head>
<body>
  <div class="container-fluid">
    <div class="page-header row">
      <div class="col-xs-12 col-md-6">
        <h1>Back to Work Reading List</h1>
        <p><a href="http://5by5.tv/b2w">Back to Work</a> is a podcast about <a href="#comics">comics</a>
          and <a href="#books">productivity books</a>.
          This is a list of the show's recommended reading, as gathered from show notes.
        </p>
        <p>Some errors and omissions are possible because the list is in fact compiled from RSS by a robot.
          Most of the links go to amazon.com and those are 5by5's affiliate links.
        </p>
      </div>
    </div>
    <div class="books">
      <!-- Placeholders in curly braces are filled in by produce_list(). -->
      <h2 id="books">Books</h2>
      <p>A total of {bookcount} books have been recommended so far. {gtd} of them are
        <a href="http://www.amazon.com/exec/obidos/ASIN/0142000280/5by5-20">Getting Things Done</a>,
        ©DavidCo 2001.</p>
      <table class="table table-hover table-condensed sortable-theme-bootstrap" data-sortable>
        <thead>
          <tr>
            <th>Title</th>
            <th>Author</th>
            <th>Episode</th>
          </tr>
        </thead>
        <tbody>
{tablebody}
        </tbody>
      </table>
    </div>
    <div class="comics">
      <h2 id="comics">Comics</h2>
      <p>A total of {comiccount} comics have been recommended.</p>
      <table class="table table-hover table-condensed sortable-theme-bootstrap" data-sortable>
        <thead>
          <tr>
            <th>Title</th>
            <th>Author</th>
            <th>Episode</th>
          </tr>
        </thead>
        <tbody>
{comicbody}
        </tbody>
      </table>
    </div>
    <footer class="footer">
      <p>List created by <a href="https://twitter.com/oiva">Oiva</a>. Updated on {date}.
        Lovingly created with <a href="https://gist.github.com/oiva/400a383e8f271c84d4a5">Python</a>.</p>
    </footer>
  </div>
  <!-- Loaded over https like the stylesheets, so it is not blocked as mixed content. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/sortable/0.6.0/js/sortable.min.js"></script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment