Skip to content

Instantly share code, notes, and snippets.

@hex128
Last active August 26, 2016 20:12
Show Gist options
  • Save hex128/0dc2d2019d63ce596e85 to your computer and use it in GitHub Desktop.
Save hex128/0dc2d2019d63ce596e85 to your computer and use it in GitHub Desktop.
Google Play Movie Parser
#!/usr/bin/python2
# -*- coding: utf-8 -*-
from json import dumps
from sys import stdout, exit
from codecs import getwriter
from signal import signal, SIGINT
from urllib import urlopen
from bs4 import BeautifulSoup
def _person(node):
    """Build the ``{"name", "url"}`` record for one credited-person element.

    ``node`` must contain a ``<span itemprop="name">`` (its stripped text
    becomes ``"name"``) and an ``<a itemprop="url">`` (its ``href`` becomes
    ``"url"``).
    """
    return {
        "name": node.find("span", {"itemprop": "name"}).text.strip(),
        "url": node.find("a", {"itemprop": "url"})["href"],
    }


def _people(container, role):
    """Return one record per ``<span itemprop=role>`` found inside ``container``."""
    return [_person(node) for node in container.find_all("span", {"itemprop": role})]


def parse(html):
    """Extract movie metadata from the HTML of a Google Play movie page.

    The page is parsed with BeautifulSoup (``lxml`` backend) and the values
    are read from the elements' ``itemprop`` / ``class`` attributes.

    Args:
        html: Markup of the movie details page.

    Returns:
        A dict with the keys ``url``, ``name``, ``published``, ``genre``,
        ``offers`` (list of ``{"description", "price"}``), ``rating``,
        ``rating-count``, ``trailer`` (video URL, or ``None`` when the page
        has no trailer element), ``description`` (the raw markup of the
        ``details-section-body`` div), ``actors``, ``producers``, ``authors``
        (lists of ``{"name", "url"}``) and ``director`` (``{"name", "url"}``,
        or ``None`` when the page has no director element).

    Raises:
        TypeError, AttributeError: if one of the other looked-up elements
            (url, name, date, genre, rating, credits container) is missing,
            because the ``None`` returned by ``find`` is used directly.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {
        "url": soup.find("meta", {"itemprop": "url"})["content"],
        "name": soup.find("div", {"itemprop": "name"}).text.strip(),
        "published": soup.find("div", {"itemprop": "datePublished"}).text.strip(),
        "genre": soup.find("span", {"itemprop": "genre"}).text.strip(),
        "offers": [],
    }

    # Look the price button up once; offers stay empty when it is absent.
    price_button = soup.find("button", {"class": "price"})
    if price_button is not None:
        result["offers"] = [
            {
                "description": offer.find("meta", {"itemprop": "description"})["content"],
                "price": offer.find("meta", {"itemprop": "price"})["content"],
            }
            for offer in price_button.find_all("span", {"itemprop": "offers"})
        ]

    result["rating"] = soup.find("meta", {"itemprop": "ratingValue"})["content"]
    result["rating-count"] = soup.find("meta", {"itemprop": "ratingCount"})["content"]

    trailer = soup.find("span", {"class": "details-trailer"})
    if trailer is not None:
        result["trailer"] = trailer.find(
            "span", {"class": "preview-overlay-container"})["data-video-url"]
    else:
        result["trailer"] = None

    # The description is kept as markup (not text), exactly as found on the page.
    result["description"] = unicode(soup.find("div", {"class": "details-section-body"}))

    details = soup.find("div", {"class": "cc-contents"})
    result["actors"] = _people(details, "actor")
    result["producers"] = _people(details, "producer")

    director = details.find("span", {"itemprop": "director"})
    result["director"] = _person(director) if director is not None else None

    # Previously this queried itemprop="actor", so "authors" was just a copy
    # of "actors"; the writers are looked up under the "author" property.
    result["authors"] = _people(details, "author")
    return result
_DEFAULT_URL = "https://play.google.com/store/movies/details?id=P_lCpSna7mY&hl=en"


def main(url=_DEFAULT_URL):
    """Download a movie page, parse it and print the result as JSON on stdout.

    Args:
        url: Address of the page to fetch. Defaults to the movie page this
            script was originally hard-wired to.

    The output is written through a UTF-8 encoding writer, with sorted keys
    and a two-space indent, followed by a newline.
    """
    sout = getwriter("utf8")(stdout)

    response = urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    data = parse(html)
    sout.write(dumps(data, ensure_ascii=False, sort_keys=True, indent=2, separators=(',', ': ')) + "\n")
if __name__ == "__main__":
    def signal_handler(signum, frame):
        """Leave the process with exit status 0 when the signal arrives."""
        exit(0)

    # Ctrl-C should end the script cleanly rather than with a
    # KeyboardInterrupt traceback.
    signal(SIGINT, signal_handler)
    main()
{
"actors": [
{
"name": "Benedict Cumberbatch",
"url": "/store/search?q=Benedict+Cumberbatch&c=movies"
},
{
"name": "Keira Knightley",
"url": "/store/search?q=Keira+Knightley&c=movies"
},
{
"name": "Matthew Goode",
"url": "/store/search?q=Matthew+Goode&c=movies"
},
{
"name": "Rory Kinnear",
"url": "/store/search?q=Rory+Kinnear&c=movies"
},
{
"name": "Allen Leech",
"url": "/store/search?q=Allen+Leech&c=movies"
},
{
"name": "Matthew Beard",
"url": "/store/search?q=Matthew+Beard&c=movies"
},
{
"name": "Charles Dance",
"url": "/store/search?q=Charles+Dance&c=movies"
},
{
"name": "Mark Strong",
"url": "/store/search?q=Mark+Strong&c=movies"
}
],
"authors": [
{
"name": "Benedict Cumberbatch",
"url": "/store/search?q=Benedict+Cumberbatch&c=movies"
},
{
"name": "Keira Knightley",
"url": "/store/search?q=Keira+Knightley&c=movies"
},
{
"name": "Matthew Goode",
"url": "/store/search?q=Matthew+Goode&c=movies"
},
{
"name": "Rory Kinnear",
"url": "/store/search?q=Rory+Kinnear&c=movies"
},
{
"name": "Allen Leech",
"url": "/store/search?q=Allen+Leech&c=movies"
},
{
"name": "Matthew Beard",
"url": "/store/search?q=Matthew+Beard&c=movies"
},
{
"name": "Charles Dance",
"url": "/store/search?q=Charles+Dance&c=movies"
},
{
"name": "Mark Strong",
"url": "/store/search?q=Mark+Strong&c=movies"
}
],
"description": "<div class=\"details-section-body expandable\"> <div class=\"full-text multicol\" data-multicol-fixed-height=\"true\" data-multicol-text=\"true\"> <span class=\"details-trailer\"> <span class=\"video-image-wrapper\"> <img class=\"video-image\" src=\"https://lh3.googleusercontent.com/LIbDUFQL85Yn6IOp-l4H93chch6q58HnjdCmbYcd510x4thv4oehJAbETMzj43iVYETc=w315\"/> </span> <span class=\"preview-overlay-container\" data-docid=\"movie-P_lCpSna7mY\" data-video-url=\"https://www.youtube.com/embed/Agd89L0CvO8?ps=play&amp;vq=large&amp;rel=0&amp;autohide=1&amp;showinfo=0&amp;autoplay=1\"> <span class=\"play-action-container\" data-video-url=\"https://www.youtube.com/embed/Agd89L0CvO8?ps=play&amp;vq=large&amp;rel=0&amp;autohide=1&amp;showinfo=0&amp;autoplay=1\"> <span class=\"play-action\"></span> </span> </span> </span> Academy Award®-Winner for Best Adapted Screenplay. Academy Award®-nominee Benedict Cumberbatch (TV's SHERLOCK, STAR TREK INTO DARKNESS) shines as real-life war hero and pioneer of modern-day computing, Alan Turing. THE IMITATION GAME follows Turing as he leads a motley crew of scholars, linguists, chess champions, and intelligence officers in cracking the so-called unbreakable codes of Germany's World War II Enigma machine, potentially saving millions of lives by helping to shorten the war. Also depicted is Turing's tragic fall from grace when he was convicted of homosexuality - a crime in post-war Britain. Co-starring Academy Award®-nominee Keira Knightley of BEGIN AGAIN and PIRATES OF THE CARIBBEAN FRANCHISE. </div> </div>",
"director": {
"name": "Morten Tyldum",
"url": "/store/search?q=Morten+Tyldum&c=movies"
},
"genre": "Drama",
"name": "The Imitation Game",
"offers": [],
"producers": [
{
"name": "Nora Grossman",
"url": "/store/search?q=Nora+Grossman&c=movies"
},
{
"name": "Ido Ostrowsky",
"url": "/store/search?q=Ido+Ostrowsky&c=movies"
},
{
"name": "Teddy Schwarzman",
"url": "/store/search?q=Teddy+Schwarzman&c=movies"
},
{
"name": "Peter Heslop",
"url": "/store/search?q=Peter+Heslop&c=movies"
},
{
"name": "Graham Moore",
"url": "/store/search?q=Graham+Moore&c=movies"
}
],
"published": "February 2015",
"rating": "4.300000190734863",
"rating-count": "170",
"trailer": "https://www.youtube.com/embed/Agd89L0CvO8?ps=play&vq=large&rel=0&autohide=1&showinfo=0&autoplay=1",
"url": "https://play.google.com/store/movies/details?id=P_lCpSna7mY"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment