Created September 10, 2017 16:57
A script to load all Spanish books from Project Gutenberg
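The script queries the gutenbergapi.org search endpoint for every text whose language is "es", fetches each book's metadata and body, strips the Project Gutenberg header and footer from plain-text bodies (or extracts the paragraph text of the HTML edition when one is listed), and writes each book as a UTF-8 .txt file under corpus/.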
import os
import sys
import argparse
import logging
import requests
import unicodedata
import re
import codecs
import lxml.html

# Base URL of the Gutenberg API used for the search, metadata and body requests below.
API_URL = "https://gutenbergapi.org"

def slugify(value):
    """
    Normalizes string, converts to lowercase, removes non-alpha characters,
    and converts spaces to hyphens.
    """
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore')
    value = unicode(re.sub(r'[^\w\s-]', '', value).strip().lower())
    value = unicode(re.sub(r'[-\s]+', '-', value))
    return value

def loadArgParser():
    parser = argparse.ArgumentParser(description='A script to load all Spanish books from Project Gutenberg')
    parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true")
    return parser.parse_args()

def booksIdByLanguage(lang):
    # Search endpoint, e.g. /search/language eq es (requests percent-encodes the spaces).
    url = API_URL + "/search/language eq " + lang
    response = requests.get(url)
    booksId = set()
    for t in response.json()["texts"]:
        booksId.add(t["text_id"])
    return booksId

def metadataByBookId(bookId):
    url = API_URL + "/texts/%d" % bookId
    metadata = {}
    response = requests.get(url)
    book = response.json()["metadata"]
    # File name: the book id followed by the slugified title line(s).
    filename = "%d" % bookId
    for t in book["title"]:
        if t is not None:
            filename += "-" + slugify(t)
    metadata["filename"] = filename
    # Remember the HTML edition, if one is listed among the format URIs.
    for t in book["formaturi"]:
        if t.endswith(".htm"):
            metadata["link_htm"] = t
    return metadata

def bodyByBookId(bookId):
    url = API_URL + "/texts/%d/body" % bookId
    response = requests.get(url)
    if "body" not in response.json():
        return ""
    return response.json()["body"]

def bookTxt(bookId):
    txt = bodyByBookId(bookId)
    if txt is None or txt == "":
        return ""
    # Drop the Project Gutenberg header: keep everything after the "*** START OF" line.
    start = txt.find("*** START OF")
    if start != -1:
        newline = txt.find('\n', start)
        if newline != -1:
            start = newline + 1
    else:
        start = 0
    # Drop the Project Gutenberg footer, if present.
    end = txt.rfind("*** END OF")
    if end == -1:
        end = len(txt)
    return txt[start:end]

def bookHtm(link):
    # lxml.html.parse accepts a URL and fetches it; keep only the text of the <p> elements.
    htmltree = lxml.html.parse(link)
    p_tags = htmltree.xpath('//p')
    p_content = [p.text_content() for p in p_tags]
    return "\n".join(p_content)

def main():
    args = loadArgParser()
    if args.verbose:
        logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.DEBUG)
    lang = "es"
    corpus_dir = "corpus/"
    if not os.path.exists(corpus_dir):
        os.mkdir(corpus_dir)
    bookIds = booksIdByLanguage(lang)
    processed = 0
    allBooks = len(bookIds)
    booksNotAdded = 0
    for bId in bookIds:
        meta = metadataByBookId(bId)
        processed += 1
        print "%d/%d: %s" % (processed, allBooks, meta["filename"])
        # Prefer the HTML edition when one exists; otherwise fall back to the plain-text body.
        if "link_htm" in meta:
            parsed = bookHtm(meta["link_htm"])
        else:
            parsed = bookTxt(bId)
        if parsed is None or parsed == "":
            logging.warning("Book not added: no body content.")
            booksNotAdded += 1
        else:
            filePath = os.path.join(corpus_dir, meta["filename"] + ".txt")
            with codecs.open(filePath, mode='wt', encoding='utf-8') as f:
                f.write(parsed)
    print "Books added to corpus: %d" % (allBooks - booksNotAdded)
    print "Books NOT added to corpus: %d" % booksNotAdded

if __name__ == "__main__":
    main()
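A minimal usage sketch, assuming the gist is saved locally as load_gutenberg_es.py (the file name is an assumption) and that Python 2 plus the requests and lxml packages are installed: run "python load_gutenberg_es.py -v" to download the corpus with verbose logging; each book is written to corpus/ as a UTF-8 text file named after the book id and its slugified title.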