Created
May 18, 2011 11:19
-
-
Save darvin/978393 to your computer and use it in GitHub Desktop.
Script to prettyformat books from safaribooksonline.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
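#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Usage: ./safaribooksdownloader <url> "Name Of Book"

The first argument is substituted into the ``xmlid=`` query parameter of the
reader request; the book name is used for the output ``<name>.html`` file and
the ``<name>_files`` image directory. wget reads the session cookies from
``cookies.txt`` in the current directory.
"""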
import codecs
import mimetypes
import os
import random
import shlex
import subprocess
import sys
import time

from magic import Magic
from pyquery import PyQuery as pq
# MIME-type detector; used further down to choose an extension for each
# downloaded image.
mag = Magic(mime=True)

# Command line: the start id is mandatory, the book name is optional.
if len(sys.argv) < 2:
    sys.exit('Usage: ./safaribooksdownloader <url> "Name Of Book"')
start = sys.argv[1]

# sys.argv[2] is never None -- a missing argument raises IndexError -- so the
# fallback title has to be selected by argument count, and before the title
# is used for the "already downloaded" check below.
if len(sys.argv) > 2 and sys.argv[2]:
    title = sys.argv[2]
else:
    title = "Some Book from SafariBooksOnline"

# Refuse to overwrite the output of a previous run.
if os.path.exists("%s.html" % title) or os.path.exists("%s_files" % (title,)):
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement used elsewhere in this script.
    print("%s already downloaded here" % title)
    sys.exit()

print("retrieving book from start url '%s', named <%s>" % (start, title))

# Skeleton of the output document; every fetched section is appended into
# the #all_book_content div.
result = pq('<html><head><title>%s</title></head><body><h1>%s</h1><div id="all_book_content"></div></body></html>' % (title, title))

# Scratch file that each section is downloaded into before being parsed.
temp_file_name = "%s.temp_part.html" % title

# Command template: first %s is the section xmlid, second %s the output file.
wget = 'wget --quiet "http://my.safaribooksonline.com/_ajax_htmlview?__sugus=929164532&action=7&__version=6.0.2&__className=bookhtmlcontent&reader=html&xmlid=%s" -O "%s" --load-cookies cookies.txt'

link = start
page = 0
from lxml.html.soupparser import fromstring | |
from lxml.etree import tostring | |
# Fetch the book section by section, following the "next section" id that
# each page carries, and collect the section bodies into `result`.

# Set when a page names itself as its own successor: its content is still
# appended, then the loop stops.
break_after_join = False
while True:
    page += 1

    # `wget` is a shell-style command template. The next-section id comes
    # from the downloaded page, so the command is split into an argv list and
    # run without a shell: metacharacters in the id cannot execute commands.
    subprocess.call(shlex.split(wget % (link, temp_file_name)))

    # Randomised pause (seconds) applied at the end of the iteration.
    sleep = random.randint(3, 7)
    print("%s is retrieved, sleep for %d seconds" % (link, sleep))

    # `with` guarantees the scratch file is closed even if parsing fails.
    with open(temp_file_name) as page_file:
        d = pq(fromstring(page_file))

    link_new = d('#Reader_NextXmlId').val()
    if link == link_new:
        break_after_join = True

    # The value may be missing (None) as well as empty; test truthiness
    # rather than len(), which raises TypeError on None.
    if not link_new:
        if len(d('div.recaptcha_widget')) > 0:
            # Captcha page: alert the user, wait for them to solve it on the
            # site, then retry the same section.
            os.system("play /usr/share/sounds/gnome/default/alerts/glass.ogg")
            raw_input("PLEASE, ENTER CAPTHCA ON SITE!")
            continue
        else:
            # No next id and no captcha: stop without appending this page.
            print("FINISHED!")
            break
    else:
        link = link_new

    result('#all_book_content').append(d("div.htmlcontent").children())
    if break_after_join:
        break

    # Ctrl-C during the pause ends the download early but still lets the
    # script go on with what has been collected so far.
    try:
        time.sleep(sleep)
    except KeyboardInterrupt:
        break

os.remove(temp_file_name)
# Download every resource referenced through a src attribute into
# "<title>_files/" and point the src at the local copy.

# Output document handle; it is written and closed at the end of the script.
f = codecs.open("%s.html" % title, "w", "utf-8")
os.mkdir("%s_files" % (title,))

imgnum = 0          # running index used to name the local files
failed_imgs = []    # sources whose type could not be detected automatically
imgs = {}           # original src -> local file name (each src fetched once)

for elem in result('*[src]'):
    src = result(elem).attr('src')
    if src not in imgs:
        # Absolute URLs (http or https) are used as-is; anything else is
        # treated as a path on the book site.
        if src.startswith(("http:", "https:")):
            img_url = src
        else:
            img_url = "http://my.safaribooksonline.com/%s" % src

        filename = "%s_files/img%d" % (title, imgnum)
        # src comes from downloaded markup, so pass an argv list instead of
        # interpolating it into a shell command line.
        subprocess.call(
            ["wget", img_url, "-O", filename, "--load-cookies", "cookies.txt"]
        )

        sleep = random.randint(1, 3)
        print("%s is retrieved, sleep for %d seconds" % (src, sleep))

        # Choose the extension from the detected MIME type of the file.
        ext = mimetypes.guess_extension(mag.from_file(filename))
        if ext == ".jpe":
            ext = ".jpg"
        if ext is None:
            print("%s 's extension cannot be detected, please input it:" % filename)
            failed_imgs.append(src)
            ext = raw_input().strip()
            # Accept both "png" and ".png" from the user.
            if ext and not ext.startswith("."):
                ext = "." + ext

        filename_final = filename + ext
        os.rename(filename, filename_final)
        imgs[src] = filename_final
        imgnum += 1
        time.sleep(sleep)

    result(elem).attr('src', imgs[src])

print("failed images %s" % (failed_imgs,))
# Turn cross-section links into same-page anchors, then serialise the
# assembled document into the already-open output file `f`.
for elem in result('*[href]'):
    href = result(elem).attr('href')
    # Only links that carry a fragment can be reduced to "#fragment".
    # Rewriting a fragment-less href would produce "#<whole url>" and break
    # ordinary external links, so those are left untouched.
    if '#' in href:
        result(elem).attr('href', "#" + href.split('#')[-1])

# Close the output file even if serialisation or the write fails.
try:
    res = ''.join([tostring(e, method="html") for e in result])
    f.write(res)
finally:
    f.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment