Last active
March 4, 2018 18:14
-
-
Save onlurking/a553b3241c1e956cbf3a6ef90367174a to your computer and use it in GitHub Desktop.
Crawl Elder Scrolls books and save them as Markdown (WIP)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from splinter import Browser | |
from bs4 import BeautifulSoup | |
import tomd | |
from html_sanitizer import Sanitizer | |
# pip install tomd splinter html-sanitizer beautifulsoup4 html5lib
# (html5lib is the parser name passed to BeautifulSoup below)
# User-agent string sent with every request made through the shared browser.
MOBILE_USER_AGENT = "Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en)"

# Single Chrome-driven browser instance reused by all crawl functions below.
browser = Browser('chrome', user_agent=MOBILE_USER_AGENT)
def get_books_list(url):
    """Return absolute URLs of the book pages linked from a listing page.

    Loads *url* in the module-level ``browser``, parses the rendered HTML
    with BeautifulSoup and inspects every anchor located inside an
    ``<li class="views-row">`` element. Anchors whose ``href`` contains
    ``/content`` are kept.

    Args:
        url: Address of the listing page to crawl.

    Returns:
        A list of absolute URLs, in document order.
    """
    from urllib.parse import urljoin

    base = "https://www.imperial-library.info"
    browser.visit(url)
    soup = BeautifulSoup(browser.html, "html5lib")

    links = []
    for row in soup.find_all('li', {"class": "views-row"}):
        for anchor in row.find_all('a'):
            # Anchors without an href attribute would raise KeyError on
            # subscript access; treat them as non-matching instead.
            href = anchor.get('href') or ''
            if "/content" in href:
                # urljoin gives base + href for site-relative paths and
                # leaves already-absolute hrefs intact.
                links.append(urljoin(base, href))
    return links
def get_book(url):
    """Load a single book page and return its data as a dict.

    The page is opened in the module-level ``browser`` and parsed with
    BeautifulSoup. Only the first ``div`` matching the
    ``views-row views-row-1 views-row-odd views-row-first`` class is used
    as the book body; it is sanitized and handed to ``tomd.convert``.

    Args:
        url: Address of the book page.

    Returns:
        A dict with the keys ``title`` (text of the ``h1.page-title``),
        ``author`` (last line of the first ``field-item odd`` div,
        stripped), ``tags`` (non-empty lines of the ``links inline`` list)
        and ``content`` (the converted body).

    Raises:
        IndexError: If no body div is found on the page.
        AttributeError: If the title, author or tag element is missing.
    """
    browser.visit(url)
    page = BeautifulSoup(browser.html, "html5lib")

    # Only the first matching row is taken as the book text.
    first_row = page.find_all(
        "div",
        {"class": "views-row views-row-1 views-row-odd views-row-first"})[0]
    clean_html = Sanitizer().sanitize(first_row.prettify())

    title = page.find('h1', {"class": "page-title"}).text

    # The author name is whatever sits on the last line of the field text.
    author_field = page.find('div', {"class": "field-item odd"}).text
    author = author_field.split('\n')[-1].strip()

    tag_lines = page.find('ul', {"class": "links inline"}).text.split('\n')
    tags = [line for line in tag_lines if line]

    return {
        "title": title,
        "author": author,
        "tags": tags,
        "content": tomd.convert(clean_html),
    }
def write_book(book):
    """Write a book's content to ``"<title> - <author>.md"``.

    The file is created in the current working directory and encoded as
    UTF-8 regardless of the platform default, so non-ASCII text in the
    book is written without encoding errors.

    Args:
        book: Mapping with the keys ``title``, ``author`` and ``content``
            (all strings), as returned by ``get_book``.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the file cannot be created or written.
    """
    import os

    filename = "{} - {}.md".format(book['title'], book['author'])
    # A path separator inside the title or author would be interpreted as a
    # directory component and make open() fail; replace it with a dash.
    for separator in (os.sep, os.altsep):
        if separator:
            filename = filename.replace(separator, "-")

    # The context manager closes the handle even if write() raises.
    with open(filename, "w", encoding="utf-8") as out:
        out.write(book['content'])
# Driver (WIP): crawl the category index, then fetch and save only the first
# listed book.
try:
    links = get_books_list(
        "https://www.imperial-library.info/books/all/by-category")
    if not links:
        # Fail with a readable message instead of an IndexError on links[0].
        raise SystemExit("No book links found on the index page.")
    test = get_book(links[0])
    write_book(test)
finally:
    # Shut the automated Chrome instance down even when the crawl fails, so
    # no browser/driver process is left running.
    browser.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@terremoth, tu me fez descobrir um bug xP
Tá completo aqui, mas o crawler só pegou o v1.