Skip to content

Instantly share code, notes, and snippets.

@jpclipffel
Created August 21, 2020 11:25
Show Gist options
  • Save jpclipffel/07ca4532b61804dc26cc75d7a61b7612 to your computer and use it in GitHub Desktop.
Save jpclipffel/07ca4532b61804dc26cc75d7a61b7612 to your computer and use it in GitHub Desktop.
A simple App Store scraper example
import csv
import sys

import requests
from lxml import etree
def get_app_info(app_id, app_name, country='lu', lang='fr-fr', timeout=10):
    '''Fetches an application page and returns a set of contextual information.

    :param str app_id: Application ID
    :param str app_name: Application name
    :param str country: Country code (e.g. 'lu' for Luxembourg)
    :param str lang: Application page language (e.g. 'fr-fr' for French)
    :param float timeout: Seconds to wait for the server before giving up
    :return: Dictionary with the keys 'name' and 'description'
    :rtype: dict
    :raises requests.RequestException: On network failure, timeout or HTTP error status
    :raises IndexError: If the page has no matching ``og:*`` meta tag
    :raises KeyError: If a matching meta tag has no ``content`` attribute
    '''
    # Generate application page URL from the application ID (app_id), name (app_name),
    # country code (country) and language (lang)
    url = f'https://apps.apple.com/{country}/app/{app_name}/id{app_id}?l={lang}'
    # Download the application page document (HTML page).
    # An explicit timeout is required: without one, `requests` may wait forever.
    src = requests.get(url, timeout=timeout)
    # Fail early on 4xx / 5xx answers instead of trying to parse an error page
    src.raise_for_status()
    # Parse the received document as HTML using the html-dedicated parser 'HTMLParser()'
    obj = etree.fromstring(src.text, parser=etree.HTMLParser())

    def meta_content(prop):
        # obj.xpath('...')         Search HTML blocks by XPath
        # [0]                      XPath returns a list; select the first item
        # .attrib['content']       Select the tag's 'content' attribute
        # .replace('\u200e', '')   Remove every LEFT-TO-RIGHT MARK (U+200E) character
        node = obj.xpath(f'//meta[@property="{prop}"]')[0]
        return node.attrib['content'].replace('\u200e', '')

    # Extract and return information from the HTML document
    return {
        'name': meta_content('og:title'),
        # Only the first line of the description is kept
        'description': meta_content('og:description').split('\n')[0],
        # Add other fields here
    }
def get_app_reviews(app_id, country='lu', timeout=10):
    '''Fetches an application reviews and returns a cleaned-up reviews list.

    :param str app_id: Application ID
    :param str country: Country code (e.g. 'lu' for Luxembourg)
    :param float timeout: Seconds to wait for the server before giving up
    :return: One dictionary per review with the keys 'id', 'user', 'stars',
        'thought' and 'comment'; an empty list when the feed holds no entry
    :rtype: list
    :raises requests.RequestException: On network failure, timeout or HTTP error status
    :raises ValueError: If the response body is not valid JSON
    '''
    # Generate the application reviews feed URL from the application ID (app_id)
    # and country code (country)
    url = f'https://itunes.apple.com/{country}/rss/customerreviews/id={app_id}/sortBy=mostRecent/json'
    # Download the application comments document (RSS feed).
    # An explicit timeout is required: without one, `requests` may wait forever.
    src = requests.get(url, timeout=timeout)
    # Fail early on 4xx / 5xx answers instead of decoding an error body
    src.raise_for_status()
    # Parse the received document as a JSON string
    obj = src.json()
    # The 'entry' key may be missing (presumably when the application has no
    # review yet - TODO confirm against the live feed); treat it as "no reviews"
    # instead of failing with a KeyError.
    entries = obj.get('feed', {}).get('entry', [])
    # Defensive: tolerate a single entry object delivered in place of a list,
    # which would otherwise be iterated key by key.
    if isinstance(entries, dict):
        entries = [entries]
    # Generate and return the reviews list
    return [
        {
            'id': review['id']['label'],
            'user': review['author']['name']['label'],
            'stars': int(review['im:rating']['label']),
            'thought': review['title']['label'],
            'comment': review['content']['label'],
            # Add other fields here
        }
        for review in entries
    ]
def get_app(app_id, app_name, country='lu', lang='fr-fr'):
    '''Collects an application's reviews and page information in one dictionary.

    :param str app_id: Application ID
    :param str app_name: Application name
    :param str country: Country code (e.g. 'lu' for Luxembourg)
    :param str lang: Application page language (e.g. 'fr-fr' for French)
    :return: The keys returned by `get_app_info()` plus a 'reviews' key holding
        the list returned by `get_app_reviews()`
    :rtype: dict
    '''
    # Reviews are fetched first, then the page information is merged on top
    app = {"reviews": get_app_reviews(app_id, country)}
    app.update(get_app_info(app_id, app_name, country, lang))
    return app
def report_reviews(app, path):
    '''Generates an application reviews report as CSV.

    The file is written as UTF-8 with one header row followed by one row per
    review. Values are quoted by the `csv` module when needed, so commas,
    quotes and line breaks in names or review titles do not corrupt the report.

    :param dict app: Application information, as returned by `get_app()`
    :param str path: Report path (the file is created or overwritten)
    '''
    # Fields to show in the CSV report
    fields = ["stars", "user", "thought", "id"]
    # Open or create report file.
    # newline=''       lets the csv module control line endings itself
    # encoding='utf-8' reviews may hold any Unicode text, whatever the locale
    with open(path, 'w', encoding='utf-8', newline='') as fd:
        # lineterminator='\n' keeps the line ending the report always used
        writer = csv.writer(fd, lineterminator='\n')
        # Write CSV report header (columns name)
        writer.writerow(['app_name', *fields])
        # Write each review; a field missing from a review yields an empty cell
        writer.writerows(
            [app['name'], *(str(review.get(f, '')) for f in fields)]
            for review in app['reviews']
        )
def main():
    '''Command line entry point: fetches an application and writes its reviews report.

    Expects three command line arguments: the application ID, the application
    name and the path of the CSV file to write. With fewer arguments, prints a
    usage line and exits with status 1.
    '''
    # Guard clause: refuse to run without the three mandatory arguments
    if len(sys.argv) < 4:
        print(f'Usage: {sys.argv[0]} <application id> <application name> <CSV file>')
        sys.exit(1)

    # Extract command line arguments (anything past the third one is ignored)
    app_id, app_name, report_path = sys.argv[1:4]
    # Get application info
    # (e.g. app_id='323229106', app_name='waze-navigation-live-traffic')
    app_info = get_app(app_id, app_name)
    # Generate application reviews report
    report_reviews(app_info, report_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment