Created
July 31, 2011 12:13
-
-
Save dsnopek/1116742 to your computer and use it in GitHub Desktop.
Script that pulls books from Project Gutenberg
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
books |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Tool for getting books in the given language(s) from the Project Gutenberg site
import os
import re
import urllib
from shutil import copy

try:  # Python 3 moved these helpers into urllib.request
    from urllib.request import urlcleanup, urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlcleanup, urlopen, urlretrieve
class harv:
    """Download books in selected languages from a Project Gutenberg site.

    The author index pages ``/browse/authors/a`` to ``/browse/authors/z`` are
    fetched relative to ``adress``, split into one section per author and
    scanned for book entries.  Each book whose language is listed in
    ``languages`` is downloaded and copied into ``./books``.
    """

    def __init__(self, adr, lang):
        """Remember the site root and the wanted languages.

        adr  -- base URL of the site, without a trailing slash.
        lang -- collection of language names exactly as they appear in the
                listing (for example ``["English"]``).
        """
        self.adress = adr
        self.languages = lang

    def harvest(self):
        """Process the author index page of every letter from a to z."""
        for letter in "abcdefghijklmnopqrstuvwxyz":
            self.process_page("/browse/authors/" + letter)

    def process_page(self, adr):
        """Fetch one index page and hand every author section to process_author.

        adr -- path of the page, appended to the site root.
        """
        response = urlopen(self.adress + adr)
        try:
            html = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()
        if not isinstance(html, str):
            # Python 3 returns bytes; the regular expressions below need text.
            html = html.decode("utf-8", "replace")
        # Everything before the first <h2> is page header, not an author.
        for section in re.split("<h2>", html)[1:]:
            self.process_author(section)

    def process_author(self, raw):
        """Extract the author name from a section and process its book lines.

        raw -- markup of one author section (text following an ``<h2>``).
        Sections without an ``<a name=...>`` anchor are ignored.
        """
        names = re.findall(r'<a name[^>]+>(.*)<\/a>', raw)
        if not names:
            return
        author = names[0]
        for line in re.findall(r'.*class="pgdbetext".*', raw):
            self.process_book(line, author)

    def process_book(self, raw, author):
        """Download the book described by one listing line if its language is wanted.

        raw    -- a single line of markup containing ``class="pgdbetext"``.
        author -- author name the book is stored under.

        Lines that do not have the expected ``<a href=...>title</a> (language) (``
        shape, or whose link yields no book number, are skipped silently.
        """
        found = re.findall(r'<a href="([^"]+)">(.*)<\/a> \(([^\)]+)\) \(.*', raw)
        if not found:
            return
        link, title, language = found[0]
        if language not in self.languages:
            return
        # The book number is whatever follows the last slash of the link.
        book_nr = re.findall(r'/.*/(.*)', link)
        if not book_nr or not book_nr[0]:
            return
        number = book_nr[0]
        # Strip nested markup and line breaks; ':' is replaced because the
        # title becomes part of a file name.
        title = re.sub(r'<.*>', '', title)
        title = re.sub(r'[\r\n]', '', title)
        title = re.sub(r':', ',', title)
        url = self.adress + "/files/" + number + "/" + number + "-h.zip"
        try:
            try:
                content = urlretrieve(url)
            except IOError:
                # A failed download must not abort the whole harvest; it is
                # reported by add_entity_to_base instead.
                content = None
            self.add_entity_to_base(author, title, content, language)
        finally:
            # urlretrieve stores the download in a temporary file; remove it
            # once it has been copied (or the download has failed).
            urlcleanup()

    def add_entity_to_base(self, author, title, content, language):
        """Copy a downloaded book into ``./books`` and report the outcome.

        author, title -- used to build the file name ``<author> - <title>.zip``.
        content       -- result of ``urlretrieve`` (its first item is the path
                         of the downloaded file), or a false value when the
                         download failed.
        language      -- only used in the progress message.

        The ``books`` directory must already exist.
        """
        if not content:
            print("Couldn't get  %s  by  %s  :(" % (title, author))
            return
        # Path separators inside a name would be treated as directories.
        file_name = re.sub(r'[\\/]', '-', author + ' - ' + title + '.zip')
        try:
            copy(content[0], os.path.join('.', 'books', file_name))
        except IOError:
            print("%s  by  %s  is too long name for file :(" % (title, author))
        else:
            print("%s  by  %s  in  %s  processed!" % (title, author, language))
# Script entry: make sure the output directory exists, then harvest all
# English books from gutenberg.org.
try:
    os.mkdir("books")
except OSError:
    # An already existing directory is fine; anything else (for example
    # missing permissions) would make every later copy fail, so report it now.
    if not os.path.isdir("books"):
        raise
a = harv("http://gutenberg.org", ["English"])
a.harvest()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment