Skip to content

Instantly share code, notes, and snippets.

@kgn
Created June 9, 2015 23:02
Show Gist options
  • Save kgn/e96e7ae71a38447ac614 to your computer and use it in GitHub Desktop.
Save kgn/e96e7ae71a38447ac614 to your computer and use it in GitHub Desktop.
App Reviews - Python script to retrieve App Store reviews and save them to a CSV file
#!/usr/bin/env python
try:
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
from urllib2 import urlopen
import json
import time
def getJson(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The body is read as bytes and decoded as UTF-8 before parsing. Calling
    ``str()`` on a bytes object gives the ``"b'...'"`` repr on Python 3,
    which is not valid JSON.

    :param url: The URL to request.
    :returns: The decoded JSON document.
    :raises ValueError: If the body is not valid UTF-8 encoded JSON.
    """
    response = urlopen(url)
    try:
        raw = response.read()
    finally:
        # Close explicitly so the connection is released whichever urlopen
        # import (urllib.request or urllib2) is in use.
        response.close()
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    return json.loads(raw)
def getReviews(appID, page=1):
    """Print the customer reviews of ``appID`` to stdout as CSV rows.

    Pages of the iTunes customer-reviews RSS feed are fetched one after
    another, starting at ``page``. Each review is printed as one fully
    quoted CSV row with the columns review_id, title, author, author_url,
    version, rating, review and vote_count. Fetching stops at the first
    page that carries no entries.

    :param appID: The App Store id of the app.
    :param page: The page number to start from.
    """
    while True:
        url = 'https://itunes.apple.com/rss/customerreviews/id=%s/page=%d/sortby=mostrecent/json' % (appID, page)
        feed = getJson(url).get('feed') or {}
        entries = feed.get('entry')
        if not entries:
            # An empty page ends the run; requesting page after page with
            # no stop condition would never terminate.
            return
        for entry in entries:
            # Entries carrying 'im:name' are not printed (presumably the
            # app's own metadata entry rather than a review).
            if entry.get('im:name'):
                continue
            review_id = entry.get('id').get('label')
            title = entry.get('title').get('label')
            author = entry.get('author').get('name').get('label')
            author_url = entry.get('author').get('uri').get('label')
            version = entry.get('im:version').get('label')
            rating = entry.get('im:rating').get('label')
            review = entry.get('content').get('label')
            vote_count = entry.get('im:voteCount').get('label')
            csvData = [review_id, title, author, author_url, version, rating, review, vote_count]
            # Double embedded quotes in every field, not only title and
            # review, so e.g. an author name containing '"' stays valid CSV.
            print('"' + '","'.join(field.replace('"', '""') for field in csvData) + '"')
        page += 1
csvTitles = ['review_id', 'title', 'author', 'author_url', 'version', 'rating', 'review', 'vote_count']
# Header row first; getReviews() then prints one CSV row per review.
print(','.join(csvTitles))
# Replace the placeholder string with the App Store id of your app.
getReviews('<app store id>')
@rigogsilva
Copy link

Here is my version of the code in case you need it. Instead of producing CSV data, it returns a list of dictionaries containing the reviews:

import pprint
import time
import typing

import requests


def is_error_response(http_response, seconds_to_sleep: float = 1) -> bool:
    """
    Tells whether an HTTP response should be treated as a failure.

    A 200 (success) response is not an error. A 503 (system unavailable)
    response is not reported as an error either, but the call first pauses
    for ``seconds_to_sleep`` so the unavailable service is not overwhelmed.
    Every other status code counts as a failure. Intended to be called on
    the result of requests.post() or requests.get().

    :param http_response:
        The response object returned from requests.post or requests.get;
        only its ``status_code`` is read.
    :param seconds_to_sleep:
        How long to sleep, in seconds, when the status_code is 503.
    """
    status = http_response.status_code

    # Anything other than 503 is decided purely by "was it a 200?".
    if status != 503:
        return status != 200

    time.sleep(seconds_to_sleep)
    return False


def get_json(url, timeout: float = 30) -> typing.Optional[dict]:
    """
    Returns the decoded JSON body of ``url``. Returns None if no JSON found.

    None is returned when is_error_response() flags the response, or when
    the body cannot be decoded as JSON.

    :param url:
        The URL to get the JSON from.
    :param timeout:
        Seconds to wait for the server before the request is abandoned.
        Without a timeout, requests.get() may block indefinitely.
    :raises requests.exceptions.RequestException:
        On connection problems, including an exceeded ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    if is_error_response(response):
        return None
    try:
        return response.json()
    except ValueError:
        # Response.json() signals an undecodable body with a ValueError
        # subclass; report it as "no JSON" as the docstring promises.
        return None


def get_reviews(app_id, page=1) -> typing.List[dict]:
    """
    Returns a list of dictionaries with each dictionary being one review.

    Pages are fetched one after another, starting at ``page``. The loop
    ends, returning everything collected so far, as soon as get_json()
    yields nothing or a page carries no entries.

    :param app_id:
        The app_id you are searching.
    :param page:
        The page id to start the loop at.
    """
    # Start empty: seeding the list with {} would add a bogus "review" and
    # make len() of the result off by one.
    reviews: typing.List[dict] = []

    while True:
        url = (f'https://itunes.apple.com/rss/customerreviews/id={app_id}/'
               f'page={page}/sortby=mostrecent/json')
        payload = get_json(url)

        if not payload:
            return reviews

        entries = (payload.get('feed') or {}).get('entry')

        if not entries:
            # Nothing on this page: stop instead of recursing and then
            # iterating over a missing entry list.
            return reviews

        reviews.extend(
            {
                'review_id': entry.get('id').get('label'),
                'title': entry.get('title').get('label'),
                'author': entry.get('author').get('name').get('label'),
                'author_url': entry.get('author').get('uri').get('label'),
                'version': entry.get('im:version').get('label'),
                'rating': entry.get('im:rating').get('label'),
                'review': entry.get('content').get('label'),
                'vote_count': entry.get('im:voteCount').get('label'),
            }
            for entry in entries
            # Entries carrying 'im:name' are left out (presumably the app's
            # own metadata entry rather than a review).
            if not entry.get('im:name')
        )

        page += 1


# Example run: collect the reviews for an app id, then print how many were
# returned followed by the reviews themselves.
# NOTE(review): 'appid' looks like a placeholder; substitute a real app id.
reviews = get_reviews('appid')
print(len(reviews))
pprint.pprint(reviews)

@soumitra9
Copy link

Hi, I am getting the same data irrespective of the page number. Please help.

@Asterios17
Copy link

Special thanks to rigogsilva

Your script worked like a charm.
However, I am also getting only a limited number of reviews (500 in particular), the same ones over and over again.
Should I do something different?

Thanks

@apkdiamond1
Copy link

Can you tell us how to store it in a WordPress website instead of a CSV file?

@Sumesh96
Copy link

Sumesh96 commented Nov 2, 2020

Hi rigogsilva,

This is not working for me. Can you help me solve it?

@JustStas
Copy link

JustStas commented Jan 10, 2021

Updated rigogsilva's code with the new URL:


import pprint
import time
import typing

import requests


def is_error_response(http_response, seconds_to_sleep: float = 1) -> bool:
    """
    Reports whether a response from requests.post() or requests.get()
    failed.

    Status 200 (success) and status 503 (system unavailable) both give
    False; any other status gives True. On a 503 the function sleeps before
    returning, so that the unavailable service is not overwhelmed.

    :param http_response:
        The response object returned from requests.post or requests.get;
        only its ``status_code`` is read.
    :param seconds_to_sleep:
        The sleep time, in seconds, applied when the status_code is 503.
    """
    code = http_response.status_code

    if code == 503:
        time.sleep(seconds_to_sleep)

    return code not in (200, 503)


def get_json(url, timeout: float = 30) -> typing.Optional[dict]:
    """
    Returns the decoded JSON body of ``url``. Returns None if no JSON found.

    None is returned when is_error_response() flags the response, or when
    the body cannot be decoded as JSON.

    :param url:
        The URL to get the JSON from.
    :param timeout:
        Seconds to wait for the server before the request is abandoned.
        Without a timeout, requests.get() may block indefinitely.
    :raises requests.exceptions.RequestException:
        On connection problems, including an exceeded ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    if is_error_response(response):
        return None
    try:
        return response.json()
    except ValueError:
        # Response.json() signals an undecodable body with a ValueError
        # subclass; report it as "no JSON" as the docstring promises.
        return None


def get_reviews(app_id, page=1, country='ru') -> typing.List[dict]:
    """
    Returns a list of dictionaries with each dictionary being one review.

    Pages are fetched one after another, starting at ``page``. The loop
    ends, returning everything collected so far, as soon as get_json()
    yields nothing, a page carries no entries, or an entry lacks one of the
    expected fields.

    :param app_id:
        The app_id you are searching.
    :param page:
        The page id to start the loop at.
    :param country:
        The storefront code placed in the URL path. Defaults to 'ru', the
        value that used to be hard-coded.
    """
    print(f'STARTED {page}')
    # Start empty: seeding the list with {} would add a bogus "review" and
    # make len() of the result off by one.
    reviews: typing.List[dict] = []

    while True:
        url = (f'https://itunes.apple.com/{country}/rss/customerreviews/'
               f'page={page}/id={app_id}/sortBy=mostRecent/json')
        payload = get_json(url)

        if not payload:
            return reviews

        entries = (payload.get('feed') or {}).get('entry')

        if not entries:
            # Nothing on this page: stop instead of recursing and then
            # iterating over a missing entry list.
            return reviews

        try:
            page_reviews = [
                {
                    'review_id': entry.get('id').get('label'),
                    'title': entry.get('title').get('label'),
                    'author': entry.get('author').get('name').get('label'),
                    'author_url': entry.get('author').get('uri').get('label'),
                    'version': entry.get('im:version').get('label'),
                    'rating': entry.get('im:rating').get('label'),
                    'review': entry.get('content').get('label'),
                    'vote_count': entry.get('im:voteCount').get('label'),
                    'page': page,
                }
                for entry in entries
                # Entries carrying 'im:name' are left out (presumably the
                # app's own metadata entry rather than a review).
                if not entry.get('im:name')
            ]
        except (AttributeError, TypeError):
            # A missing field makes a chained .get() hit None. Keep what
            # was collected, but let unrelated errors surface instead of
            # hiding them behind a blanket except.
            return reviews

        reviews.extend(page_reviews)
        page += 1


# Example run: collect the reviews for an app id, then print how many were
# returned followed by the reviews themselves.
# NOTE(review): '1234567' looks like a sample id; substitute a real app id.
reviews = get_reviews('1234567')
print(len(reviews))
pprint.pprint(reviews)

@sapna88
Copy link

sapna88 commented Sep 13, 2021

Can someone help? It's not going beyond 500 reviews. How do I get all the reviews?

@kgn
Copy link
Author

kgn commented Sep 25, 2021

500 is the maximum Apple stores; you'll need to pull them periodically over time if you want to build up your own dataset of reviews.

@dvlmcr69
Copy link

Hi I'm getting this output:
STARTED 1
STARTED 2
STARTED 3
STARTED 4
STARTED 5
STARTED 6
STARTED 7
STARTED 8
STARTED 9
STARTED 10
STARTED 11
1
[{}]

What's wrong? can someone help?

@RavenKyu
Copy link

@dvlmcr69 you should change the country field.

 url = (f'https://itunes.apple.com/------> ru <-------/rss/customerreviews/page={page}/id={app_id}/sortBy=mostRecent/json')

such as us, ko, ru, ca

@ClebsonLu
Copy link

Does this script have the same purpose as the app_store_scraper library?
I see that the returned data is different.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment