Skip to content

Instantly share code, notes, and snippets.

@mwicat
Created August 2, 2013 09:46
Show Gist options
  • Select an option

  • Save mwicat/6138727 to your computer and use it in GitHub Desktop.

Select an option

Save mwicat/6138727 to your computer and use it in GitHub Desktop.
import json
from urllib import urlopen
from xml.sax.saxutils import escape

from lxml.html import fromstring
def get_tree(page):
    """Fetch one page of the tutorial listing and parse it as HTML.

    Args:
        page: Page number, interpolated into the listing URL with ``%d``.

    Returns:
        The result of ``lxml.html.fromstring`` applied to the page body.
    """
    url = 'http://audio.tutsplus.com/category/tutorials/production/page/%d' % page
    response = urlopen(url)
    try:
        body = response.read()
    finally:
        # Close the response even if the read fails, so the connection
        # is not left open for each of the pages fetched.
        response.close()
    return fromstring(body)
def extract_art(head):
    """Pull the link target and link text out of a heading element.

    Args:
        head: Element whose first ``a`` child is the link to read.

    Returns:
        A ``(href, text)`` tuple taken from that anchor.
    """
    anchor = head.find('a')
    return anchor.get('href'), anchor.text
def get_arts(tree):
    """Collect a (link, text) pair for every post-title heading in *tree*.

    Args:
        tree: Parsed document that supports ``cssselect``.

    Returns:
        A list with one ``extract_art`` result per ``h1.post_title`` match.
    """
    headings = tree.cssselect('h1.post_title')
    return list(map(extract_art, headings))
# Disabled scrape of listing pages 1-47 that writes arts.json, the file
# loaded below. Kept for reference.
# arts = []
# for i in range(1, 48):
#     print i
#     arts.extend(get_arts(get_tree(i)))
# json.dump(arts, open('arts.json', 'w'))

# Load the saved [link, text] pairs; the with-block closes the file once
# it has been parsed.
with open('arts.json') as arts_file:
    arts = json.load(arts_file)
def shorten(text):
    """Strip boilerplate phrases from a title.

    Every occurrence of ``'Quick Tip:'`` and then of ``'How to'`` is
    removed, after which surrounding whitespace is trimmed.

    Args:
        text: Title string to clean up.

    Returns:
        The title without those phrases and without leading/trailing
        whitespace.
    """
    for phrase in ('Quick Tip:', 'How to'):
        text = text.replace(phrase, '')
    return text.strip()
# Rebuild arts as [shortened text, link] pairs. Sorting the pairs orders
# the output by shortened text first, then by link.
arts = sorted([shorten(text), link] for link, text in arts)

# Emit the entries as an HTML ordered list. The text and links originate
# from scraped pages, so both are escaped before being placed in markup;
# the link additionally has double quotes escaped because it sits inside
# a quoted attribute. Single-argument print(...) is equivalent to the
# Python 2 print statement.
print('<ol>')
for text, link in arts:
    item = '<li><a href="%s">%s</a></li>' % (
        # A missing href is rendered as empty rather than crashing escape().
        escape(link or '', {'"': '&quot;'}),
        escape(text),
    )
    print(item.encode('utf-8'))
print('</ol>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment