Last active
April 10, 2018 22:00
-
-
Save spicyramen/826e8bce134f2b4c5e18023ff9d01425 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Extract important information from AppAnnie via API.""" | |
import csv
import json
import os
import re
import sys
import urllib
from collections import namedtuple
from datetime import datetime, timedelta

import pandas as pd
import requests
from absl import app
from absl import flags
from absl import logging
from bs4 import BeautifulSoup as BS
from retrying import retry
# API token, read once at import time: a missing APPANNIE_API_KEY environment
# variable raises KeyError as soon as the module is loaded.
API_KEY = os.environ['APPANNIE_API_KEY']

# Store identifiers used in API URLs (GOOGLE_PLAY, IOS) and the store labels
# compared against the `store` column of input datasets (*_STORE).
GOOGLE_PLAY = 'google-play'
GOOGLE_PLAY_STORE = 'Google Play'
IOS = 'ios'
IOS_STORE = 'ios'

# "Accept-Encoding": "gzip, deflate"
HEADERS = {'Authorization': 'Bearer %s' % API_KEY, 'Accept-Encoding': 'gzip'}

_APPS = 'apps'
# Derived from the identifiers above so the literals live in one place.
_STORES = (IOS, GOOGLE_PLAY)

# Keys read from API JSON payloads.
_RANK = 'rank'
_PRODUCT = 'product'
_PRODUCT_CODE = 'product_code'
_PRODUCT_NAME = 'product_name'
_PRODUCT_ID = 'product_id'
_PARENT_COMPANY_NAME = 'parent_company_name'
_PRIVACY_POLICY_URL = 'privacy_policy_url'
_COMPANY_NAME = 'company_name'
_DESCRIPTION = 'description'
_COMPANY_URL = 'company_url'
_SUPPORT_URL = 'support_url'

_DEVICE = {IOS: 'iphone', GOOGLE_PLAY: 'android'}
COUNTRY_LIST = ('US',)

# ios | mac | appletv | google-play | amazon-appstore | windows-phone |
# windows-store
# Example URL: https://api.appannie.com/v1.2/apps/ios/app/553834731/details
APP_DETAILS_URL = 'https://api.appannie.com/%s/apps/%s/app/%s/details'
TOP_APPS_URL = 'https://api.appannie.com/%s/intelligence/%s/%s/ranking'
COUNTRIES_URL = 'https://api.appannie.com/%s/meta/countries'
API_VERSION = 'v1.2'
_GRANULARITY = 'weekly'

FILENAME = 'data/top_ios_downloads.csv'
COMPANIES = 'data/capitalg_android.csv'

# Wait this long for outgoing HTTP connections to be established.
_CONNECT_TIMEOUT_SECONDS = 90
# Wait this long to read from an HTTP socket.
_READ_TIMEOUT_SECONDS = 120
# Settings handed to the `retrying.retry` decorators on the fetch functions:
# a duration in milliseconds and a maximum number of attempts.
_INITIAL_RETRY_INTERVAL_MS = 3000
_RETRY_TIMES = 2

FLAGS = flags.FLAGS
flags.DEFINE_string('filename', '', 'Dataset')
flags.DEFINE_integer('max_apps', 1000, 'Max number of apps to search',
                     lower_bound=1, upper_bound=1000)
flags.DEFINE_string('store', IOS, 'ios or google-play')
flags.DEFINE_string('device', 'iphone', 'Device: android, ipad, iphone')

# Patterns applied to app descriptions, in this order: e-mail addresses, then
# http(s) URLs. _ExtractContactInformation unpacks exactly two results.
CONTACT_REGEXES = [re.compile(r'[\w\.-]+@[\w\.-]+'),
                   re.compile(
                       r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),'
                       r']|(?:%[0-9a-fA-F][0-9a-fA-F]))+')]

# Lightweight record for one entry of a top-apps ranking response.
Application = namedtuple('Application', 'rank name product_id url')
class MobileApplication(object):
    """An app record whose details endpoint can be derived from its fields.

    Attributes are plain data assigned by the caller; `url` is computed from
    `store` and `app_id` each time it is read.
    """

    def __init__(self, app_id):
        self.app_id = app_id
        self._store = None
        self.app_name_unified = None
        self.parent_company_name = None
        self.publisher_name_raw = None
        self.publisher_name = None
        self.company_name = None
        self.company_url = None
        self.support_url = None
        self.urls = []
        self.total_downloads = 0

    @property
    def url(self):
        """App details API URL for this app (read-only, never cached)."""
        return APP_DETAILS_URL % (API_VERSION, self.store, self.app_id)

    @property
    def store(self):
        """Store identifier used in API URLs; None until assigned."""
        return self._store

    @store.setter
    def store(self, store):
        self._store = store

    def __str__(self):
        return '%s %s %s %d' % (
            self.app_id, self.app_name_unified, self.parent_company_name,
            self.total_downloads)

    def __unicode__(self):
        return u'%s %s %s %d' % (
            self.app_id, self.app_name_unified, self.parent_company_name,
            self.total_downloads)

    @property
    def details(self):
        """One-line dump of the record's fields for logging/debugging.

        Note that the trailing `url:` label is filled with the `urls` list,
        not with the computed `url` property.
        """
        return 'app_id: %s store: %s app_name_unified: %s ' \
               'parent_company_name: %s publisher_name_raw: %s ' \
               'publisher_name: %s company_name: %s company_url: %s ' \
               'support_url: %s url: %s' % (
                   self.app_id, self._store, self.app_name_unified,
                   self.parent_company_name, self.publisher_name_raw,
                   self.publisher_name, self.company_name, self.company_url,
                   self.support_url, self.urls)
def GetCountries():
    """Fetch the country codes exposed by the countries endpoint.

    Returns
    -------
    list
        Country codes extracted by `_HandleCountriesResponse`. An empty list
        is returned when the request ends in an HTTP error status or the
        response holds no countries, so callers can always treat the result
        as a list.
    """
    countries_url = COUNTRIES_URL % API_VERSION
    logging.info('Get Countries: %s', countries_url)
    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        try:
            response = session.get(countries_url,
                                   timeout=(_CONNECT_TIMEOUT_SECONDS,
                                            _READ_TIMEOUT_SECONDS),
                                   headers=HEADERS,
                                   allow_redirects=False,
                                   verify=True)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            status_code = err.response.status_code
            if status_code == 404:
                logging.error('Page not found %s', countries_url)
            else:
                # Previously swallowed without a trace.
                logging.error('HTTP %s while fetching %s', status_code,
                              countries_url)
            return []
        return _HandleCountriesResponse(response) or []
def GetCategories(store):
    """Fetch the app categories available for a store.

    Parameters
    ----------
    store : str
        One of the identifiers in `_STORES`.

    Returns
    -------
    tuple
        `(categories, category_labels)` as produced by
        `_HandleCategoriesResponse`. `([], [])` is returned when the request
        ends in an HTTP error status, so callers can always unpack the result.

    Raises
    ------
    ValueError
        If `store` is empty or not listed in `_STORES`.
    """
    if store not in _STORES:
        raise ValueError('Invalid market, should be ios or google-play')
    categories_url = ('https://api.appannie.com/%s/meta/apps/%s/categories'
                      % (API_VERSION, store))
    logging.info('Get Categories: %s', categories_url)
    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        try:
            response = session.get(categories_url,
                                   timeout=(_CONNECT_TIMEOUT_SECONDS,
                                            _READ_TIMEOUT_SECONDS),
                                   headers=HEADERS,
                                   allow_redirects=False,
                                   verify=True)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            status_code = err.response.status_code
            if status_code == 404:
                logging.error('Page not found %s', categories_url)
            else:
                # Previously swallowed without a trace.
                logging.error('HTTP %s while fetching %s', status_code,
                              categories_url)
            return [], []
        return _HandleCategoriesResponse(response)
# `wait_fixed` pauses between attempts. The constant was previously passed as
# `stop_max_delay`, which caps the total elapsed time instead, so any failure
# slower than that cap was never retried.
@retry(stop_max_attempt_number=_RETRY_TIMES,
       wait_fixed=_INITIAL_RETRY_INTERVAL_MS)
def GetAppAnnieDetails(mobile_app):
    """Fetch the details record for a single app.

    HTTP error statuses are logged and reported as None rather than raised,
    so they are not retried; other exceptions propagate to the retry
    decorator.

    Parameters
    ----------
    mobile_app
        Object exposing `product_id` (used for logging) and `url` (the
        details endpoint to query), e.g. an `Application` namedtuple.

    Returns
    -------
    list or None
        The row built by `_HandleAppDetailsResponse`, or None when the
        request ends in an HTTP error status.
    """
    logging.info('Looking up: %s at: %s', mobile_app.product_id,
                 mobile_app.url)
    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        try:
            response = session.get(mobile_app.url,
                                   timeout=(_CONNECT_TIMEOUT_SECONDS,
                                            _READ_TIMEOUT_SECONDS),
                                   headers=HEADERS,
                                   allow_redirects=False,
                                   verify=True)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            logging.exception(err)
            status_code = err.response.status_code
            if status_code == 404:
                logging.error('Page not found %s', mobile_app.url)
            elif status_code == 400:
                # Used to be reported with the 404 message.
                logging.error('Bad request %s', mobile_app.url)
            return None
        return _HandleAppDetailsResponse(response)
# `wait_fixed` pauses between attempts. The constant was previously passed as
# `stop_max_delay`, which caps the total elapsed time instead, so any failure
# slower than that cap was never retried.
@retry(stop_max_attempt_number=_RETRY_TIMES,
       wait_fixed=_INITIAL_RETRY_INTERVAL_MS)
def GetTopApps(vertical, store, country, categories, start_date, end_date,
               ranks, feeds, granularity, device):
    """Get Top Apps for AppAnnie.

    Example request:
        https://api.appannie.com/v1.2/intelligence/apps/google-play/ranking?
        device=android&countries=US&start_date=2018-03-01&
        end_date=2018-03-01&feeds=free&categories=OVERALL

    Parameters
    ----------
    vertical
        URL path segment after `intelligence/` (e.g. 'apps').
    store
        URL path segment naming the store (e.g. 'google-play').
    country
        Value of the `countries` query parameter.
    categories
        Value of the `categories` query parameter.
    start_date, end_date
        Objects with `strftime`; sent formatted as YYYY-MM-DD.
    ranks
        Sent as the `ranks` query parameter only when it is an int.
    feeds
        Sent as the `feeds` query parameter only when truthy.
    granularity
        Value of the `granularity` query parameter.
    device
        Value of the `device` query parameter.

    Returns
    -------
    list or None
        The records built by `_HandleTopAppsResponse`, or None when the
        request ends in an HTTP error status.
    """
    # `urllib.urlencode` only exists on Python 2; resolve the right one here
    # so the function works on both major versions.
    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2

    url_params = {
        'countries': country,
        'categories': categories,
        'start_date': start_date.strftime('%Y-%m-%d'),
        'end_date': end_date.strftime('%Y-%m-%d'),
        'granularity': granularity,
        'device': device,
    }
    if feeds:
        url_params['feeds'] = feeds
    # Handle Ranks.
    if isinstance(ranks, int):
        url_params['ranks'] = ranks
    base_url = TOP_APPS_URL % (API_VERSION, vertical, store)
    url = '%s?%s' % (base_url, urlencode(url_params))
    logging.info('Looking up: %s', url)
    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        try:
            response = session.get(url,
                                   timeout=(_CONNECT_TIMEOUT_SECONDS,
                                            _READ_TIMEOUT_SECONDS),
                                   headers=HEADERS,
                                   allow_redirects=False,
                                   verify=True)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            logging.exception(err)
            status_code = err.response.status_code
            if status_code == 404:
                logging.error('Page not found %s', url)
            elif status_code == 400:
                logging.error('No Apps in this category %s', url)
            return None
        return _HandleTopAppsResponse(response)
def _ToString(value): | |
"""Returns a string type based on value variable type. | |
Since we handle multiple languages we need to return a string type to write | |
in file human readable character. | |
Args: | |
value: (None, str or unicode) | |
Returns: | |
A str or None if no input. | |
""" | |
if not value: | |
# logging.warning('No string, empty value') | |
return None | |
if isinstance(value, unicode): | |
return value.encode('utf-8') | |
else: | |
return str(value).encode('utf-8') | |
def _FetchPageContent(response): | |
"""Use FetchProxy to fetch the content of a URL. | |
Args: | |
response: (requests.models.Response), content we fetched in get request. | |
Returns: | |
(response) Image data in bytes as str type. | |
Raises: | |
ValueError: Invalid HTTP response. | |
""" | |
if not response: | |
logging.exception('HTTP Response is None') | |
return | |
return response.content | |
def _HandleCountriesResponse(response):
    """Extract country codes from a countries endpoint response.

    Parameters
    ----------
    response
        HTTP response whose body is a JSON object; the codes are read from
        the `country_code` field of each entry under `country_list`.

    Returns
    -------
    list
        The country codes, or an empty list when `country_list` is missing
        or empty (previously None was returned implicitly in that case).

    Raises
    ------
    ValueError
        If the response has no content.
    """
    content = _FetchPageContent(response)
    if not content:
        raise ValueError('HTTP Response is None')
    country_list = json.loads(content).get('country_list')
    if not country_list:
        return []
    logging.info('Found: %d countries.', len(country_list))
    return [country['country_code'] for country in country_list]
def _HandleCategoriesResponse(response):
    """Extract categories and their labels from a categories response.

    Parameters
    ----------
    response
        HTTP response whose body is a JSON object with `categories` and
        `category_labels` entries.

    Returns
    -------
    tuple
        `(categories, category_labels)`. A missing or null entry is returned
        as an empty list instead of crashing on `len(None)`.

    Raises
    ------
    ValueError
        If the response has no content.
    """
    content = _FetchPageContent(response)
    if not content:
        raise ValueError('HTTP Response is None')
    content_json = json.loads(content)
    category_list = content_json.get('categories') or []
    category_labels_list = content_json.get('category_labels') or []
    logging.info('Found: %d categories.', len(category_list))
    logging.info('Found: %d category labels.', len(category_labels_list))
    return category_list, category_labels_list
def _HandleTopAppsResponse(response):
    """Turn a ranking response into `Application` records.

    Parameters
    ----------
    response
        HTTP response whose JSON body lists the ranked apps under `list`.

    Returns
    -------
    list
        One `Application(rank, name, product_id, url)` per entry, in response
        order. The details URL is built for the store given by `FLAGS.store`.

    Raises
    ------
    ValueError
        If the response has no content.
    """
    body = _FetchPageContent(response)
    if not body:
        raise ValueError('HTTP Response is None')
    top_apps = []
    # A missing or empty `list` simply yields no records.
    for entry in json.loads(body).get('list') or ():
        product_id = str(entry.get(_PRODUCT_ID))
        record = Application(
            entry.get(_RANK),
            entry.get(_PRODUCT_NAME),
            product_id,
            APP_DETAILS_URL % (API_VERSION, FLAGS.store, product_id))
        logging.info(record)
        top_apps.append(record)
    logging.info('Top Apps found: %s.' % len(top_apps))
    return top_apps
def _HandleAppDetailsResponse(response):
    """This function handles HTTP response body in JSON format.

    Args:
      response: HTTP response whose JSON body holds the app under `product`.

    Returns:
      A list with, in order: product code, unified product name, company
      name, parent company name, publisher name, main category, company URL,
      privacy policy URL, e-mails found in the description, URLs found in the
      description, support URL, cleaned description, size, languages and the
      `has_iap` value. None when the body has no `product` entry.

    Raises:
      ValueError: Invalid HTTP response.
    """
    content = _FetchPageContent(response)
    if not content:
        raise ValueError('HTTP Response is None')
    product = json.loads(content).get(_PRODUCT)
    if not product:
        logging.warning('No product in app details response')
        return None
    # Extract App details.
    app_id = product.get(_PRODUCT_CODE)
    company_name = _ToString(product.get(_COMPANY_NAME))
    parent_company_name = _ToString(product.get(_PARENT_COMPANY_NAME))
    main_category = _ToString(product.get('main_category'))
    company_url = _ToString(product.get(_COMPANY_URL))
    privacy_policy_url = _ToString(product.get(_PRIVACY_POLICY_URL))
    # _ToString returns None for a missing description; fall back to '' so the
    # cleanup below cannot fail. Line breaks and commas are stripped to keep
    # the description within a single CSV cell.
    description = _ToString(product.get(_DESCRIPTION)) or ''
    description = description.replace('\r', '').replace('\n', '').replace(
        ',', '')
    email_info, url_info = _ExtractContactInformation(description)
    support_url = _ToString(product.get(_SUPPORT_URL))
    size = _ToString(product.get('size'))
    languages = _ToString(product.get('languages'))
    publisher_name = _ToString(product.get('publisher_name'))
    unified_product_name = _ToString(product.get('unified_product_name'))
    has_iap = product.get('has_iap')
    return [app_id, unified_product_name, company_name,
            parent_company_name, publisher_name, main_category,
            company_url, privacy_policy_url, email_info, url_info,
            support_url, description, size, languages,
            has_iap]
def _ExtractContactInformation(description):
    """Pull e-mail addresses and URLs out of an app description.

    Parameters
    ----------
    description
        Description text, possibly containing HTML markup.

    Returns
    -------
    tuple
        `(email_info, url_info)`: each a comma-joined string of the matches
        of the corresponding `CONTACT_REGEXES` pattern, or `(None, None)`
        when the description is empty.
    """
    if not description:
        logging.error('No description found')
        return None, None
    # Clean HTML code.
    plain_text = BS(description.replace('<br>', ' '), 'html.parser').get_text()
    email_info, url_info = [','.join(pattern.findall(plain_text))
                            for pattern in CONTACT_REGEXES]
    return email_info, url_info
def ProcessDataSet(apps):
    """Build `MobileApplication` objects from the rows of a dataset.

    Parameters
    ----------
    apps
        Object with `iterrows()` (e.g. a pandas DataFrame) whose rows provide
        `app_id`, `store`, `app_name_unified`, `parent_company_name`,
        `publisher_name_raw`, `publisher_name`, `company_name` and
        `total_downloads`.

    Returns
    -------
    list
        One `MobileApplication` per row, in row order. The store is left
        unset (None) for rows whose `store` label is not recognised.
    """
    store_by_label = {GOOGLE_PLAY_STORE: GOOGLE_PLAY, IOS_STORE: IOS}
    copied_fields = ('app_name_unified', 'parent_company_name',
                     'publisher_name_raw', 'publisher_name', 'company_name',
                     'total_downloads')
    all_apps = []
    for _, row in apps.iterrows():
        mobile_app = MobileApplication(row['app_id'])
        label = row['store']
        if label in store_by_label:
            mobile_app.store = store_by_label[label]
        for field in copied_fields:
            setattr(mobile_app, field, row[field])
        all_apps.append(mobile_app)
    logging.info('Processed %d Applications' % len(all_apps))
    return all_apps
def GetHistoricalData(start_date, end_date, granularity):
    """List the period start dates between two dates.

    Presumably meant to produce the dates for ranking requests such as
    https://api.appannie.com/v1.2/intelligence/apps/google-play/ranking?
    device=android&countries=US&start_date=2018-03-01&end_date=2018-03-01&
    feeds=free&categories=OVERALL (TODO confirm with the caller).

    Parameters
    ----------
    start_date : str
        First period start, formatted YYYY-MM-DD.
    end_date : str
        Exclusive upper bound, formatted YYYY-MM-DD; a period starting on
        this date is not included.
    granularity : str
        'weekly' steps by 7 days, 'month' or 'monthly' by 30 days; any other
        value steps by a single day.

    Returns
    -------
    list
        Period start dates as YYYY-MM-DD strings, in ascending order; empty
        when `start_date` is not before `end_date`.
    """
    # 'monthly' is accepted next to the original 'month' so that the spelling
    # matching 'weekly' no longer falls through to daily steps.
    period_days = {'weekly': 7, 'month': 30, 'monthly': 30}.get(granularity, 1)
    current = datetime.strptime(start_date, '%Y-%m-%d')
    stop = datetime.strptime(end_date, '%Y-%m-%d')
    date_periods = []
    while current < stop:
        date_periods.append(current.strftime('%Y-%m-%d'))
        current += timedelta(days=period_days)
    return date_periods
def LoadDataSet(filename):
    """Load a CSV dataset into a pandas DataFrame.

    :param filename: path of the CSV file, or any other source accepted by
        `pandas.read_csv` (e.g. an open file-like object). The value is passed
        through unchanged; formatting it into a string first would turn
        file-like objects into their repr.
    :return: the parsed `pandas.DataFrame`.
    """
    return pd.read_csv(filename)
def _GetDate(date_period): | |
"""Returns a Date object. | |
Args: | |
date_period: (str): A Date in string format 2018-01-12. | |
Returns: | |
A datetime object. | |
""" | |
year, month, day = date_period.split('-') | |
return datetime(int(year), int(month), int(day)) | |
def SaveDataSet(results, filename):
    """Write results stored in list of lists into a CSV file.

    An existing file is overwritten. The destination directory is created
    when it does not exist yet.

    Args:
      results: (list) list of lists, one inner list per CSV row.
      filename: (str) Destination file.

    Raises:
      ValueError: Result list is empty.
      IOError: Unable to create or write filename (OSError on Python 3).
    """
    if not results:
        raise ValueError('Result list is empty')
    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    # The csv module needs a binary file on Python 2 and a text file opened
    # with newline='' on Python 3; a plain 'w+' text file can produce doubled
    # line endings on platforms that translate newlines.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'w', newline='', encoding='utf-8')
    else:
        csvfile = open(filename, 'wb')
    with csvfile:
        csv.writer(csvfile).writerows(results)
    logging.info('Apps stored: %d.', len(results))
def ProcessTopApps(top_apps, market, country, time_period):
    """Gets TopApps details using AppAnnie API.

    Args:
      top_apps: (list) List of Top Apps namedtuple.
      market: (str) ios or google-play.
      country: (str) Country of origin. ISO code.
      time_period: (str) Historical time period.

    Returns:
      A list of lists: `[market, country, time_period]` followed by the app
      details, one per app whose details could be fetched. Empty when there
      are no top apps.
    """
    app_details_results = []
    if not top_apps:
        logging.error('No Top Apps')
        return app_details_results
    for top_app in top_apps:
        app_details = GetAppAnnieDetails(top_app)
        if not app_details:
            # The lookup yields None on HTTP errors; concatenating None to a
            # list would abort the whole run, so skip this app instead.
            logging.warning('No details found for app: %s', top_app)
            continue
        app_details_results.append(
            [market, country, time_period] + app_details)
    return app_details_results
def main(_):
    """Collect top apps and their details, and store them in a CSV file.

    For every country returned by the API (plus 'WW'), every date period and
    every category, the top apps are requested and the details of each app
    are looked up. The accumulated rows are written to
    data/overall_games_educational_iphone_<date>.csv after each country, so a
    failure late in the run does not lose the rows gathered so far.

    Parameters
    ----------
    _
        Positional arguments handed over by `app.run`; unused.

    Raises
    ------
    ValueError
        If the `store` flag is not one of `_STORES`.
    """
    # date_periods = GetHistoricalData('2018-03-18', '2018-03-24', 'weekly')
    # logging.info(date_periods)
    # The category list is fixed below, so the categories endpoint is no
    # longer queried (its result used to be fetched and then discarded); the
    # store check it performed is kept.
    if FLAGS.store not in _STORES:
        raise ValueError('Invalid market, should be ios or google-play')
    # Tolerate a failed countries lookup: still query the worldwide ranking.
    country_list = (GetCountries() or []) + ['WW']
    category_list = ['Overall > Games > Educational']
    date_periods = ['2018-03-24']
    output_filename = 'data/%s_%s.csv' % (
        'overall_games_educational_iphone', date_periods[-1])
    total_apps_count = 0
    all_apps = []
    for country in country_list:
        for date_period in date_periods:
            for category in category_list:
                logging.info('Getting Top Apps for: %s category: %s' % (
                    date_period, category))
                # Collect list of Applications.
                top_apps = GetTopApps(vertical=_APPS,
                                      store=FLAGS.store,
                                      country=country,
                                      categories=category,
                                      start_date=_GetDate(date_period),
                                      end_date=_GetDate(date_period),
                                      ranks=None,
                                      feeds=None,
                                      granularity=_GRANULARITY,
                                      device=FLAGS.device)
                # Collect Application details.
                if not top_apps:
                    continue
                logging.info('Top Apps found: %d in %s', len(top_apps),
                             country)
                top_apps_details = ProcessTopApps(top_apps, FLAGS.store,
                                                  country, date_period)
                total_apps_count += len(top_apps)
                if top_apps_details:
                    all_apps.extend(top_apps_details)
                else:
                    logging.warning('No Top Apps details found')
        logging.info('Total apps found so far: %d', len(all_apps))
        # SaveDataSet rejects an empty list; only write once there are rows.
        if all_apps:
            SaveDataSet(all_apps, output_filename)
    logging.info('Total number of apps: %d' % total_apps_count)


if __name__ == '__main__':
    app.run(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment