Skip to content

Instantly share code, notes, and snippets.

@darvin
Created May 18, 2011 11:19
Show Gist options
  • Save darvin/978393 to your computer and use it in GitHub Desktop.
Save darvin/978393 to your computer and use it in GitHub Desktop.
Script to download and pretty-format books from safaribooksonline.com
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage: ./safaribooksdownloader <url> "Name Of Book"
"""
import sys
import os, time, random
from pyquery import PyQuery as pq
import codecs
import mimetypes
from magic import Magic
mag = Magic(mime=True)
start = sys.argv[1]
title = sys.argv[2]
if os.path.exists("%s.html"%title) or os.path.exists("%s_files"%(title,)):
print "%s already downloaded here"%title
sys.exit()
if title is None:
title = "Some Book from SafariBooksOnline"
print "retrieving book from start url '%s', named <%s>"% (start, title)
result = pq('<html><head><title>%s</title></head><body><h1>%s</h1><div id="all_book_content"></div></body></html>'%(title,title))
temp_file_name = "%s.temp_part.html"%title
wget = 'wget --quiet "http://my.safaribooksonline.com/_ajax_htmlview?__sugus=929164532&action=7&__version=6.0.2&__className=bookhtmlcontent&reader=html&xmlid=%s" -O "%s" --load-cookies cookies.txt'
link=start
page=0
from lxml.html.soupparser import fromstring
from lxml.etree import tostring
break_after_join = False
while True:
page+=1
os.system(wget %(link, temp_file_name))
#sleep = random.randint(40,50)
sleep = random.randint(3,7)
print "%s is retrieved, sleep for %d seconds"%(link, sleep)
f = open(temp_file_name)
d = pq(fromstring(f))
f.close()
#link_new = d('a:contains("Next Section")').attr("href")
link_new = d('#Reader_NextXmlId').val()
#if title is None and d(".book_title") is not None:
# title = d(".book_title").text()
if link==link_new:
break_after_join = True
if len(link_new)==0:
if len(d('div.recaptcha_widget')) >0:
os.system("play /usr/share/sounds/gnome/default/alerts/glass.ogg")
raw_input( "PLEASE, ENTER CAPTHCA ON SITE!")
continue
else:
print "FINISHED!"
break
else:
link = link_new
#result.append(d("div.book_content").children())
result('#all_book_content').append(d("div.htmlcontent").children())
if break_after_join:
break
try:
time.sleep(sleep)
except KeyboardInterrupt:
break
os.remove(temp_file_name)
f = codecs.open("%s.html"%title, "w", "utf-8")
os.mkdir("%s_files"%(title,))
imgnum = 0
failed_imgs = []
imgs = {}
for elem in result('*[src]'):
src = result(elem).attr('src')
if src not in imgs:
if src.startswith("http:"):
link = src
else:
link = "http://my.safaribooksonline.com/%s"%src
filename = "%s_files/img%d"%(title, imgnum)
os.system('wget "%s" -O "%s" --load-cookies cookies.txt'%(link, filename))
sleep = random.randint(1,3)
print "%s is retrieved, sleep for %d seconds"%(src, sleep)
ext = mimetypes.guess_extension(mag.from_file(filename))
if ext==".jpe":
ext = ".jpg"
if ext is None:
print "%s 's extension cannot be detected, please input it:"%filename
failed_imgs.append(src)
ext = raw_input()
filename_final = filename + ext
os.rename(filename, filename_final)
imgs[src] = filename_final
imgnum+=1
time.sleep(sleep)
result(elem).attr('src', imgs[src])
print "failed images", failed_imgs
for elem in result('*[href]'):
href = result(elem).attr('href')
result(elem).attr('href', "#"+href.split('#')[-1])
res = ''.join([tostring(e, method="html") for e in result])
f.write(res)
f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment