Last active
August 26, 2016 20:14
-
-
Save hex128/be10f25645b4524207d6 to your computer and use it in GitHub Desktop.
Google Play Books Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python2 | |
# -*- coding: utf-8 -*- | |
from json import dumps | |
from sys import stdout, exit | |
from codecs import getwriter | |
from signal import signal, SIGINT | |
from urllib import urlopen | |
from bs4 import BeautifulSoup | |
def parse(html):
    """Extract book metadata from the HTML of a Google Play Books page.

    The page is parsed with BeautifulSoup's "lxml" parser and the values are
    read from elements carrying ``itemprop`` attributes.

    Args:
        html: Markup of the book details page.

    Returns:
        A dict with the keys ``url``, ``name``, ``author`` (a dict with
        ``name`` and ``url``), ``price``, ``preview``, ``rating``,
        ``rating-count``, ``description`` (the markup of the first
        ``details-section-body`` div), ``pages``, ``language``, ``isbn``,
        ``genres`` (a list of dicts with ``name`` and ``url``) and
        ``protection``. Values are the strings found in the page; a value
        is ``None`` when the corresponding element or attribute is absent.
    """
    soup = BeautifulSoup(html, "lxml")

    def meta_content(scope, prop):
        # "content" of the first <meta itemprop=prop> below scope, else None.
        if scope is None:
            return None
        tag = scope.find("meta", {"itemprop": prop})
        return tag.get("content") if tag is not None else None

    def stripped_text(tag_name, prop):
        # Stripped text of the first <tag_name itemprop=prop>, else None.
        tag = soup.find(tag_name, {"itemprop": prop})
        return tag.text.strip() if tag is not None else None

    author = soup.find("div", {"itemprop": "author"})
    offer = soup.find("span", {"itemprop": "offers"})
    description = soup.find("div", {"class": "details-section-body"})

    return {
        "url": meta_content(soup, "url"),
        "name": stripped_text("div", "name"),
        "author": {
            "name": meta_content(author, "name"),
            "url": meta_content(author, "url"),
        },
        "price": meta_content(offer, "price"),
        "preview": meta_content(offer, "previewUrl"),
        "rating": meta_content(soup, "ratingValue"),
        "rating-count": meta_content(soup, "ratingCount"),
        # unicode(None) would yield the literal text "None", so guard first.
        "description": unicode(description) if description is not None else None,
        "pages": stripped_text("a", "num_pages"),
        "language": stripped_text("div", "language"),
        "isbn": stripped_text("div", "isbn"),
        "genres": [
            {"name": genre.text.strip(), "url": genre.get("href")}
            for genre in soup.find_all("a", {"itemprop": "genre"})
        ],
        "protection": stripped_text("div", "contentProtection"),
    }
def main(url="https://play.google.com/store/books/details/Douglas_Adams_The_Ultimate_Hitchhiker_s_Guide_to_t?id=mO-62VxpLe0C&hl=en"):
    """Download a book page, parse it and print the result as JSON.

    Args:
        url: Address of the book details page to fetch. Defaults to the
            page that was previously hard-coded here.

    The parsed data is written to stdout as UTF-8 encoded, key-sorted JSON
    indented by two spaces, followed by a newline.
    """
    from contextlib import closing

    # closing() guarantees the connection is released even if read() fails.
    with closing(urlopen(url)) as response:
        html = response.read()

    data = parse(html)
    # Wrap stdout so non-ASCII text (ensure_ascii=False) is emitted as UTF-8.
    sout = getwriter("utf8")(stdout)
    sout.write(dumps(data, ensure_ascii=False, sort_keys=True, indent=2, separators=(',', ': ')) + "\n")
if __name__ == "__main__":
    # On Ctrl-C (SIGINT) leave with exit status 0 rather than a traceback.
    signal(SIGINT, lambda signum, stack_frame: exit(0))
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"author": { | |
"name": "Douglas Adams", | |
"url": "/store/books/author?id=Douglas+Adams" | |
}, | |
"description": "<div class=\"details-section-body expandable\"> <div class=\"full-text multicol\" data-multicol-fixed-height=\"true\" data-multicol-short-layout=\"true\" data-multicol-text=\"true\"> <b>At last in paperback in one complete volume, here are the five classic novels from Douglas Adams’s beloved Hitchiker series.<br/></b><br/><b><i>The Hitchhiker’s Guide to the Galaxy<br/></i></b>Seconds before the Earth is demolished for a galactic freeway, Arthur Dent is saved by Ford Prefect, a researcher for the revised <i>Guide</i>. Together they stick out their thumbs to the stars and begin a wild journey through time and space.<br/><br/><b><i>The Restaurant at the End of the Universe<br/></i></b>Facing annihilation at the hands of warmongers is a curious time to crave tea. It could only happen to the cosmically displaced Arthur Dent and his comrades as they hurtle across the galaxy in a desperate search for a place to eat.<br/><br/><b><i>Life, the Universe and Everything<br/></i></b>The unhappy inhabitants of planet Krikkit are sick of looking at the night sky– so they plan to destroy it. The universe, that is. Now only five individuals can avert Armageddon: mild-mannered Arthur Dent and his stalwart crew.<br/><br/><b><i>So Long, and Thanks for All the Fish<br/></i></b>Back on Earth, Arthur Dent is ready to believe that the past eight years were all just a figment of his stressed-out imagination. But a gift-wrapped fishbowl with a cryptic inscription conspires to thrust him back to reality. So to speak.<br/><br/><b><i>Mostly Harmless<br/></i></b>Just when Arthur Dent makes the terrible mistake of starting to enjoy life, all hell breaks loose. Can he save the Earth from total obliteration? Can he save the <i>Guide</i> from a hostile alien takeover? Can he save his daughter from herself?<br/><br/><br/><i>From the Trade Paperback edition.</i> </div> </div>", | |
"genres": [ | |
{ | |
"name": "Science Fiction & Fantasy", | |
"url": "/store/books/category/coll_1604" | |
}, | |
{ | |
"name": "Space Opera", | |
"url": "/store/books/category/subj_Science_Fiction___Fantasy.Space_Opera/collection/featured" | |
}, | |
{ | |
"name": "Fiction / Humorous", | |
"url": "/store/search?q=subject:%22Fiction+/+Humorous%22&c=books" | |
}, | |
{ | |
"name": "Fiction / Science Fiction / Action & Adventure", | |
"url": "/store/search?q=subject:%22Fiction+/+Science+Fiction+/+Action+%26+Adventure%22&c=books" | |
}, | |
{ | |
"name": "Fiction / Science Fiction / Space Opera", | |
"url": "/store/search?q=subject:%22Fiction+/+Science+Fiction+/+Space+Opera%22&c=books" | |
} | |
], | |
"isbn": "9780307498465", | |
"language": "English", | |
"name": "The Ultimate Hitchhiker's Guide to the Galaxy", | |
"pages": "832", | |
"preview": "https://play.google.com/books/reader?id=mO-62VxpLe0C&printsec=frontcover&output=reader", | |
"price": "UAH155.72", | |
"protection": "This content is DRM protected.", | |
"rating": "4.4338765144348145", | |
"rating-count": "2639", | |
"url": "https://play.google.com/store/books/details?id=mO-62VxpLe0C" | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment