Skip to content

Instantly share code, notes, and snippets.

@dsnopek
Created July 31, 2011 12:13
Show Gist options
  • Save dsnopek/1116742 to your computer and use it in GitHub Desktop.
Save dsnopek/1116742 to your computer and use it in GitHub Desktop.
Script that pulls books from Project Gutenberg
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#Tool for getting books in given language(s) from Project Gutenberg site
import urllib, re, os
from shutil import copy
class harv:
    """Harvest zipped HTML books in selected languages from a Gutenberg-style site.

    The harvester walks the "/browse/authors/<letter>" index pages of the
    site rooted at ``adr``, extracts every book entry whose language label
    is listed in ``lang`` and stores the book's "<nr>-h.zip" archive in the
    ``./books`` directory (which must already exist) as
    "<author> - <title>.zip".
    """

    def __init__(self, adr, lang):
        """Remember the site root URL (``adr``) and wanted languages (``lang``)."""
        self.adress = adr
        self.languages = lang

    @staticmethod
    def _url_functions():
        """Return ``(urlopen, urlretrieve, urlcleanup)`` for this interpreter.

        The functions live in ``urllib.request`` on Python 3; the fallback
        is their Python 2 location.
        """
        try:
            from urllib.request import urlopen, urlretrieve, urlcleanup
        except ImportError:
            from urllib import urlopen, urlretrieve, urlcleanup
        return urlopen, urlretrieve, urlcleanup

    @staticmethod
    def _safe_file_name(author, title):
        """Build "<author> - <title>.zip" with path separators neutralised.

        Author and title come from scraped HTML, so a separator inside them
        must not be allowed to redirect the file outside the target
        directory (or into a directory that does not exist).
        """
        name = author + ' - ' + title + '.zip'
        for separator in (os.sep, os.altsep):
            if separator:
                name = name.replace(separator, '-')
        return name

    def harvest(self):
        """Process the author index page of every letter from a to z."""
        for letter in "abcdefghijklmnopqrstuvwxyz":
            self.process_page("/browse/authors/" + letter)

    def process_page(self, adr):
        """Download the page at site-relative path ``adr`` and process each author.

        The page is split on "<h2>"; the part before the first "<h2>" is
        discarded and every following part is handed to ``process_author``.
        """
        urlopen = self._url_functions()[0]
        response = urlopen(self.adress + adr)
        try:
            # Decode so the str regexes below work on Python 3, where
            # read() returns bytes; undecodable bytes are replaced.
            html = response.read().decode("utf-8", "replace")
        finally:
            response.close()
        for section in re.split("<h2>", html)[1:]:
            self.process_author(section)

    def process_author(self, raw):
        """Process every book line found in one author section ``raw``.

        The author name is the text of the first ``<a name=...>`` anchor;
        sections without such an anchor are skipped. Each line containing
        ``class="pgdbetext"`` is passed to ``process_book``.
        """
        author = re.findall(r'<a name[^>]+>(.*)<\/a>', raw)
        if not author:
            return
        author = author[0]
        for book_line in re.findall(r'.*class="pgdbetext".*', raw):
            self.process_book(book_line, author)

    def _parse_book(self, raw):
        """Extract ``(title, language, book_nr)`` from one book line.

        Returns ``None`` when the line does not contain a book link in the
        expected ``<a href="...">title</a> (language) (`` form, or when no
        book number can be taken from the link target.
        """
        found = re.findall(r'<a href="([^"]+)">(.*)<\/a> \(([^\)]+)\) \(.*', raw)
        if not found:
            return None
        href, title, language = found[0]
        # The book number is whatever follows the last "/" of the link.
        book_nr = re.findall(r'/.*/(.*)', href)
        if not book_nr:
            return None
        # Drop markup and line breaks from the title; ":" is replaced
        # because the title ends up in a file name.
        title = re.sub(r'<.*>', '', title)
        title = re.sub(r'[\r\n]', '', title)
        title = re.sub(r':', ',', title)
        return title, language, book_nr[0]

    def process_book(self, raw, author):
        """Download and store the book described by line ``raw`` if wanted.

        Lines that cannot be parsed and books whose language is not in
        ``self.languages`` are skipped. A failed download is reported
        through ``add_entity_to_base`` (with ``content`` set to ``None``)
        instead of aborting the harvest.
        """
        book = self._parse_book(raw)
        if book is None:
            return
        title, language, book_nr = book
        if not any(language == wanted for wanted in self.languages):
            return
        _, urlretrieve, urlcleanup = self._url_functions()
        url = self.adress + "/files/" + book_nr + "/" + book_nr + "-h.zip"
        try:
            content = urlretrieve(url)
        except (IOError, OSError):
            content = None
        try:
            self.add_entity_to_base(author, title, content, language)
        finally:
            # urlretrieve stores the download in a temporary file; release it.
            urlcleanup()

    def add_entity_to_base(self, author, title, content, language):
        """Copy a downloaded book into ``./books`` and report the outcome.

        ``content`` is the ``(filename, headers)`` result of ``urlretrieve``
        or a false value when the download failed. Any ``IOError`` raised
        by the copy is reported with the "too long name" message.
        """
        if not content:
            print("Couldn't get  %s  by  %s  :(" % (title, author))
            return
        target = os.path.join('.', 'books', self._safe_file_name(author, title))
        try:
            copy(content[0], target)
        except IOError:
            print("%s  by  %s  is too long name for file :(" % (title, author))
        else:
            print("%s  by  %s  in  %s  processed!" % (title, author, language))
# Script entry: make sure the output directory exists, then harvest every
# English book from Project Gutenberg.
try:
    os.mkdir("books")
except OSError:
    # Best effort: the directory usually exists already. Only OSError is
    # swallowed so that Ctrl-C and interpreter exit are not masked.
    pass
a = harv("http://gutenberg.org", ["English"])
a.harvest()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment