Created
August 21, 2020 11:25
-
-
Save jpclipffel/07ca4532b61804dc26cc75d7a61b7612 to your computer and use it in GitHub Desktop.
A simple App Store scraper example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys

import requests
from lxml import etree
def get_app_info(app_id, app_name, country='lu', lang='fr-fr', timeout=10):
    '''Fetches an application page and returns a set of contextual information.

    :param str app_id: Application ID
    :param str app_name: Application name, as used in the page URL
    :param str country: Country code (e.g. 'lu' for Luxembourg)
    :param str lang: Application page language (e.g. 'fr-fr' for French)
    :param float timeout: Seconds to wait for the server before giving up
    :return: Dictionary with the keys ``name`` and ``description``
    :rtype: dict
    :raises requests.RequestException: On timeout, connection failure or
        HTTP error status
    :raises IndexError: If an expected ``<meta>`` tag is not in the page
    :raises KeyError: If a ``<meta>`` tag has no ``content`` attribute
    '''
    # Build the application page URL; the language goes in the `l` query
    # parameter, which `requests` appends (and escapes) for us.
    url = f'https://apps.apple.com/{country}/app/{app_name}/id{app_id}'

    # Download the application page (HTML). Without a timeout `requests`
    # may wait forever, and without `raise_for_status()` an error page would
    # be parsed as if it were the application page.
    src = requests.get(url, params={'l': lang}, timeout=timeout)
    src.raise_for_status()

    # Parse the received document with the HTML-dedicated parser
    obj = etree.fromstring(src.text, parser=etree.HTMLParser())

    def meta_content(prop):
        # XPath returns a list: take the first matching <meta> tag, read its
        # `content` attribute and drop every U+200E (left-to-right mark).
        nodes = obj.xpath(f'//meta[@property="{prop}"]')
        return nodes[0].attrib['content'].replace('\u200e', '')

    return {
        'name': meta_content('og:title'),
        # Only the first line of the description is kept
        'description': meta_content('og:description').split('\n')[0],
        # Add other fields here
    }
def get_app_reviews(app_id, country='lu', timeout=10):
    '''Fetches an application reviews and returns a cleaned-up reviews list.

    :param str app_id: Application ID
    :param str country: Country code (e.g. 'lu' for Luxembourg)
    :param float timeout: Seconds to wait for the server before giving up
    :return: One dictionary per review with the keys ``id``, ``user``,
        ``stars`` (int), ``thought`` and ``comment``; empty list when the
        feed contains no entry
    :rtype: list
    :raises requests.RequestException: On timeout, connection failure or
        HTTP error status
    :raises ValueError: If the response body is not valid JSON
    '''
    # Build the reviews feed URL from the application ID and country code
    url = (
        f'https://itunes.apple.com/{country}/rss/customerreviews/'
        f'id={app_id}/sortBy=mostRecent/json'
    )

    # Download the reviews feed; fail fast on timeouts and HTTP errors
    # instead of trying to decode an error page as JSON.
    src = requests.get(url, timeout=timeout)
    src.raise_for_status()

    # Parse the received document as JSON
    obj = src.json()

    # A feed without 'entry' means "no reviews", not an error.
    entries = obj.get('feed', {}).get('entry', [])
    # NOTE(review): a feed holding a single review may expose 'entry' as one
    # object rather than a one-element list -- handled defensively here,
    # confirm against a live feed.
    if isinstance(entries, dict):
        entries = [entries]

    # Keep only the fields of interest from each review
    return [
        {
            'id': review['id']['label'],
            'user': review['author']['name']['label'],
            'stars': int(review['im:rating']['label']),
            'thought': review['title']['label'],
            'comment': review['content']['label'],
            # Add other fields here
        }
        for review in entries
    ]
def get_app(app_id, app_name, country='lu', lang='fr-fr'):
    '''Collects the reviews and the page information of one application.

    :param str app_id: Application ID
    :param str app_name: Application name
    :param str country: Country code (e.g. 'lu' for Luxembourg)
    :param str lang: Application page language (e.g. 'fr-fr' for French)
    :return: Dictionary holding the reviews list under ``reviews`` plus
        every key returned by `get_app_info()`
    :rtype: dict
    '''
    # Reviews are fetched first, then the page information is merged on top
    # (so a key from the page information wins on conflict).
    app = {"reviews": get_app_reviews(app_id, country)}
    app.update(get_app_info(app_id, app_name, country, lang))
    return app
def report_reviews(app, path):
    '''Generates an application reviews report as CSV.

    The report is written as UTF-8, one row per review, rows terminated by
    a single line feed. Values containing a comma, a quote or a line break
    are quoted by the `csv` module so they cannot corrupt the row.

    :param dict app: Application information, as returned by `get_app()`
        (the keys ``name`` and ``reviews`` are read)
    :param str path: Report path; the file is created or overwritten
    '''
    # Fields to show in the CSV report
    fields = ["stars", "user", "thought", "id"]

    # newline='' hands line-ending control to the csv writer; UTF-8 keeps
    # non-ASCII review text writable whatever the platform default is.
    with open(path, 'w', newline='', encoding='utf-8') as fd:
        writer = csv.writer(fd, lineterminator='\n')
        # CSV report header (column names)
        writer.writerow(['app_name', *fields])
        # One row per review; a field missing from a review becomes ''
        writer.writerows(
            [app['name'], *(str(review.get(f, '')) for f in fields)]
            for review in app['reviews']
        )
def main():
    '''Command-line entry point: writes the reviews report of one application.

    Reads three positional arguments from `sys.argv`: the application ID,
    the application name and the path of the CSV report to write. When fewer
    are given, prints the usage on the standard error stream and exits with
    status 1.
    '''
    if len(sys.argv) < 4:
        # Usage errors go to stderr so they never mix with regular output
        print(
            f'Usage: {sys.argv[0]} <application id> <application name> <CSV file>',
            file=sys.stderr,
        )
        sys.exit(1)

    # Extract command line arguments (any extra argument is ignored)
    app_id, app_name, report_path = sys.argv[1:4]

    # Get application info and reviews
    app_info = get_app(app_id, app_name)

    # Generate application reviews report
    report_reviews(app_info, report_path)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment