-
-
Save TruffleClock/5676475 to your computer and use it in GitHub Desktop.
Scrape app reviews from iTunes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 *
 * An updated version of kentbye's gist to reflect changes in iTunes and fix some errors
 *
 * Scrape app reviews from iTunes.
 *
 * Set the iOS app id and the number of pages to scrape, and it creates a {$app_id}-reviews.csv file
 * Increase set_time_limit(N) if your query runs out of time before completion
 * Add/remove countries from the $countries array as needed
 *
 * Original author:
 * @author Kent Bye <[email protected]>
 * Modified and extended from Sean Murphy's gist at https://gist.github.com/1878352
 * Modified and updated from kentbye's gist at https://gist.github.com/kentbye/3740357
 */
// Set the id for the app. (In iTunes, right-click the app icon and copy the link;
// it should look something like this: http://itunes.apple.com/us/app/netflix/id363590051?mt=8)
$app_id = '363590051'; // Netflix app

// Extend the execution time if necessary when requesting many pages of reviews:
// uncomment the call below and change the default 30 (seconds) to a value that suits your query.
//set_time_limit(30);

// Manually set the number of review pages for the app. Select "All Versions" of reviews in iTunes and see what the last number is.
// The default of one page grabs the latest 10 results.
// WARNING: Do not fetch all reviews from all countries if your app has many pages of reviews (100~200+), as Apple will block your IP for a while.
$total_number_of_review_pages = 1;

// Initialize the results array
$results = array();
// Storefronts to scrape: by default, all reviews from all countries.
// Each entry is decoded into an object with a "storefront" value (sent to Apple in the
// X-Apple-Store-Front request header) and a "name" (written to the Country column of the CSV).
// Remove unnecessary countries from the list if needed.
// The list is kept one entry per line: whitespace between JSON values is legal, whereas a
// line break inside a quoted name is not and makes json_decode() return NULL.
$countries = json_decode('[
{"storefront":"143455-6,12","name":"Canada"},
{"storefront":"143441-1,12","name":"United States"},
{"storefront":"143565,12","name":"Belarus"},
{"storefront":"143446-2,12","name":"Belgium"},
{"storefront":"143526,12","name":"Bulgaria"},
{"storefront":"143494,12","name":"Croatia"},
{"storefront":"143557-2,12","name":"Cyprus"},
{"storefront":"143489,12","name":"Czech Republic"},
{"storefront":"143458-2,12","name":"Denmark"},
{"storefront":"143443,12","name":"Deutschland"},
{"storefront":"143454-8,12","name":"España"},
{"storefront":"143518,12","name":"Estonia"},
{"storefront":"143447-2,12","name":"Finland"},
{"storefront":"143442,12","name":"France"},
{"storefront":"143448,12","name":"Greece"},
{"storefront":"143482,12","name":"Hungary"},
{"storefront":"143558,12","name":"Iceland"},
{"storefront":"143449,12","name":"Ireland"},
{"storefront":"143450,12","name":"Italia"},
{"storefront":"143519,12","name":"Latvia"},
{"storefront":"143520,12","name":"Lithuania"},
{"storefront":"143451-2,12","name":"Luxembourg"},
{"storefront":"143530,12","name":"Macedonia"},
{"storefront":"143521,12","name":"Malta"},
{"storefront":"143523,12","name":"Moldova"},
{"storefront":"143452,12","name":"Nederland"},
{"storefront":"143457-2,12","name":"Norway"},
{"storefront":"143445,12","name":"Österreich"},
{"storefront":"143478,12","name":"Poland"},
{"storefront":"143453,12","name":"Portugal"},
{"storefront":"143487,12","name":"Romania"},
{"storefront":"143496,12","name":"Slovakia"},
{"storefront":"143499,12","name":"Slovenia"},
{"storefront":"143456,12","name":"Sverige"},
{"storefront":"143459-2,12","name":"Switzerland"},
{"storefront":"143480,12","name":"Turkey"},
{"storefront":"143444,12","name":"United Kingdom"},
{"storefront":"143469,12","name":"Россия"},
{"storefront":"143563,12","name":"Algeria"},
{"storefront":"143564,12","name":"Angola"},
{"storefront":"143524,12","name":"Armenia"},
{"storefront":"143568,12","name":"Azerbaijan"},
{"storefront":"143559,12","name":"Bahrain"},
{"storefront":"143525,12","name":"Botswana"},
{"storefront":"143516,12","name":"Egypt"},
{"storefront":"143573,12","name":"Ghana"},
{"storefront":"143467,12","name":"India"},
{"storefront":"143491,12","name":"Israel"},
{"storefront":"143528,12","name":"Jordan"},
{"storefront":"143529,12","name":"Kenya"},
{"storefront":"143493,12","name":"Kuwait"},
{"storefront":"143497,12","name":"Lebanon"},
{"storefront":"143531,12","name":"Madagascar"},
{"storefront":"143532,12","name":"Mali"},
{"storefront":"143533,12","name":"Mauritius"},
{"storefront":"143534,12","name":"Niger"},
{"storefront":"143561,12","name":"Nigeria"},
{"storefront":"143562,12","name":"Oman"},
{"storefront":"143498,12","name":"Qatar"},
{"storefront":"143479,12","name":"Saudi Arabia"},
{"storefront":"143535,12","name":"Senegal"},
{"storefront":"143472,12","name":"South Africa"},
{"storefront":"143572,12","name":"Tanzania"},
{"storefront":"143536,12","name":"Tunisia"},
{"storefront":"143481,12","name":"UAE"},
{"storefront":"143537,12","name":"Uganda"},
{"storefront":"143571,12","name":"Yemen"},
{"storefront":"143460,12","name":"Australia"},
{"storefront":"143560,12","name":"Brunei Darussalam"},
{"storefront":"143465-2,12","name":"China"},
{"storefront":"143463,12","name":"Hong Kong"},
{"storefront":"143476,12","name":"Indonesia"},
{"storefront":"143462-1,12","name":"Japan"},
{"storefront":"143517,12","name":"Kazakhstan"},
{"storefront":"143515,12","name":"Macau"},
{"storefront":"143473,12","name":"Malaysia"},
{"storefront":"143461,12","name":"New Zealand"},
{"storefront":"143477,12","name":"Pakistan"},
{"storefront":"143474,12","name":"Philippines"},
{"storefront":"143464,12","name":"Singapore"},
{"storefront":"143486,12","name":"Sri Lanka"},
{"storefront":"143470,12","name":"Taiwan"},
{"storefront":"143475,12","name":"Thailand"},
{"storefront":"143566,12","name":"Uzbekistan"},
{"storefront":"143471,12","name":"Vietnam"},
{"storefront":"143466,12","name":"대한민국"},
{"storefront":"143538,12","name":"Anguilla"},
{"storefront":"143540,12","name":"Antigua and Barbuda"},
{"storefront":"143505-2,12","name":"Argentina"},
{"storefront":"143539,12","name":"Bahamas"},
{"storefront":"143541,12","name":"Barbados"},
{"storefront":"143555-2,12","name":"Belize"},
{"storefront":"143542,12","name":"Bermuda"},
{"storefront":"143556-2,12","name":"Bolivia"},
{"storefront":"143503,12","name":"Brasil"},
{"storefront":"143543,12","name":"British Virgin Islands"},
{"storefront":"143544,12","name":"Cayman Islands"},
{"storefront":"143483-2,12","name":"Chile"},
{"storefront":"143501-2,12","name":"Colombia"},
{"storefront":"143495-2,12","name":"Costa Rica"},
{"storefront":"143545,12","name":"Dominica"},
{"storefront":"143508-2,12","name":"Dominican Republic"},
{"storefront":"143509-2,12","name":"Ecuador"},
{"storefront":"143506-2,12","name":"El Salvador"},
{"storefront":"143546,12","name":"Grenada"},
{"storefront":"143504-2,12","name":"Guatemala"},
{"storefront":"143553,12","name":"Guyana"},
{"storefront":"143510-2,12","name":"Honduras"},
{"storefront":"143511,12","name":"Jamaica"},
{"storefront":"143468,12","name":"México"},
{"storefront":"143547,12","name":"Montserrat"},
{"storefront":"143512-2,12","name":"Nicaragua"},
{"storefront":"143485-2,12","name":"Panama"},
{"storefront":"143513-2,12","name":"Paraguay"},
{"storefront":"143507-2,12","name":"Peru"},
{"storefront":"143548,12","name":"St. Kitts and Nevis"},
{"storefront":"143549,12","name":"St. Lucia"},
{"storefront":"143550,12","name":"St. Vincent & The Grenadines"},
{"storefront":"143554-2,12","name":"Suriname"},
{"storefront":"143551,12","name":"Trinidad and Tobago"},
{"storefront":"143552,12","name":"Turks & Caicos"},
{"storefront":"143514-2,12","name":"Uruguay"},
{"storefront":"143502-2,12","name":"Venezuela"}
]');

// US reviews only
//$countries = json_decode('[{"storefront":"143441-1,12","name":"United States"}]');

// json_decode() returns NULL on malformed JSON; stop here instead of silently scraping nothing.
if (!is_array($countries)) {
    error_log('Could not decode the country list: ' . json_last_error_msg());
    exit(1);
}
// Write the results to a CSV file named after the $app_id (created in the current working directory)
$fp = fopen($app_id . '-reviews.csv', 'w');
// fopen() returns false when the file cannot be created; every later fputcsv() call would then fail.
if ($fp === false) {
    error_log("Unable to open {$app_id}-reviews.csv for writing");
    exit(1);
}

// Add in column names to the CSV file. The order matches the numeric column indexes
// (0 = Date ... 11 = Country) used when the review rows are assembled.
$column_names = array('Date', 'Version', 'Rating', 'Review Title', 'Review', 'Helpful Percent', 'Helpful Votes', 'Total Votes', 'Username', 'User Page', 'Review ID', 'Country');
fputcsv($fp, $column_names);
// Main scrape: for every requested page and every country, download the review page
// from iTunes, pull the individual fields out of the HTML with XPath, and append one
// CSV row per review. Column indexes match $column_names:
//   0 Date, 1 Version, 2 Rating, 3 Review Title, 4 Review, 5 Helpful Percent,
//   6 Helpful Votes, 7 Total Votes, 8 Username, 9 User Page, 10 Review ID, 11 Country

// Without a decoded country list there is nothing to iterate over.
if (!isset($countries) || !is_array($countries)) {
    error_log('The country list could not be decoded; nothing to scrape.');
    $countries = array();
}

// Template used to pad every row to the full column count so a missing field
// leaves an empty cell instead of shifting the remaining columns to the left.
$empty_row = array_fill(0, 12, '');

// Start on the first page of most recent results, and continue to the manually set number of pages to crawl
for ($page = 1; $page <= $total_number_of_review_pages; $page++) {
    // Loop through each of the countries
    foreach ($countries as $country) {
        // Start every request with an empty result set; otherwise rows collected for a
        // previous country/page would be written to the CSV again on this iteration.
        $results = array();

        $ch = curl_init();
        // Grab app reviews sorted by most recent and specify the specific page
        // updated for HTTPS, old one gave 301
        curl_setopt($ch, CURLOPT_URL, "https://itunes.apple.com/WebObjects/MZStore.woa/wa/customerReviews?displayable-kind=11&id={$app_id}&page={$page}&sort=4");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): certificate verification is switched off here, which exposes the
        // request to man-in-the-middle tampering. Kept as-is for backward compatibility;
        // remove this line once a CA bundle is configured for cURL.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
        // Set the user agent to iTunes in order to get the full results
        curl_setopt($ch, CURLOPT_HTTPHEADER, array(
            'User-Agent: iTunes/11.0.2 (Macintosh; Intel Mac OS X 10.8.3) AppleWebKit/536.28.10',
            "X-Apple-Store-Front: {$country->storefront}",
            'X-Apple-Tz: -18000',
            'Accept-Language: en-us, en;q=0.50',
        ));
        $body = curl_exec($ch);
        $curl_error = curl_error($ch);
        curl_close($ch);

        // curl_exec() returns false on failure, and DOMDocument::loadHTML() rejects an
        // empty document, so skip this country/page instead of aborting the whole run.
        if (!is_string($body) || trim($body) === '') {
            error_log("Skipping {$country->name}, page {$page}: " . ($curl_error !== '' ? $curl_error : 'empty response'));
            continue;
        }

        // DEBUGGING: Output the full html from iTunes to confirm that the data is correct and matches what you expect from iTunes
        // Write the output to an *.html file and confirm its appearance in a web browser.
        //echo $body;

        // Collect libxml parse warnings internally instead of printing them for the
        // (frequently non-conforming) markup, then restore the previous setting.
        $previous_libxml_setting = libxml_use_internal_errors(true);
        $dom = new DOMDocument();
        $dom->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_libxml_setting);

        // Set the XPath selectors for the titles, ratings, username & link, Version + Date, Actual review, and how many people found it helpful
        $xpath = new DOMXPath($dom);
        $review_titles = $xpath->query('//html/body/div[@class="customer-reviews"]/div[5]/div[@class="paginated-container"]/div/div/h5/span');
        $ratings = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/h5/div/@aria-label");
        $user_links = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/span/a/@href");
        $users_versions_dates = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/span");
        $review_bodies = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/p[1]");
        $helpfulness_ratings = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/p[2]/span[1]");
        $review_id_urls = $xpath->query("/html/body/div[2]/div[5]/div[3]/div/div/div/@report-a-concern-fragment-url");

        // Convert the node lists into a results array. Each list is walked separately and
        // the N-th node of every list is treated as belonging to the N-th review.

        // Review title
        $i = 1;
        foreach ($review_titles as $review_title) {
            $results[$i][3] = $review_title->nodeValue;
            $i++;
        }

        // Star rating
        $i = 1;
        foreach ($ratings as $rating) {
            // Just keep the first character, which is the star rating
            // (substr() is safe on an empty label, unlike the [0] offset).
            $results[$i][2] = (string) substr($rating->nodeValue, 0, 1);
            $i++;
        }

        // User, version number and date
        $i = 1;
        foreach ($users_versions_dates as $user_version_date) {
            // The three values sit on fixed lines of the node text; leave a field blank
            // when the markup has fewer lines than expected.
            $parts = explode("\n", $user_version_date->nodeValue);
            $results[$i][8] = isset($parts[4]) ? trim($parts[4]) : '';   // User
            $results[$i][1] = isset($parts[8]) ? trim($parts[8]) : '';   // Version Number
            $results[$i][0] = isset($parts[11]) ? trim($parts[11]) : ''; // Date
            $i++;
        }

        // The text of the actual review
        $i = 1;
        foreach ($review_bodies as $review_body) {
            // Remove white space and character return at beginning
            $results[$i][4] = trim($review_body->nodeValue);
            $i++;
        }

        // How many people found the review to be helpful or not helpful
        $i = 1;
        foreach ($helpfulness_ratings as $helpfulness_rating) {
            $helpfulness_text = $helpfulness_rating->nodeValue;
            // If it starts with "Was this review helpful?" then don't include since their review hasn't been evaluated by other users yet.
            // NOTE: This initial letter of 'W' works for English, but for Spanish, it'd be "E" for Esta
            // TODO: This logic could be improved.
            if ($helpfulness_text === '' || $helpfulness_text[0] == "W") {
                $results[$i][5] = "";
                $results[$i][6] = "";
                $results[$i][7] = "";
            } else {
                // First line starts with the helpful count, second line with the total count.
                $helpfulness_lines = explode("\n", $helpfulness_text);
                $helpful = explode(" ", $helpfulness_lines[0]);
                $total_votes = explode(" ", isset($helpfulness_lines[1]) ? trim($helpfulness_lines[1]) : '');
                $helpful_count = intval($helpful[0]);
                $total_count = intval($total_votes[0]);
                // Stored as a fraction between 0 and 1; guard against division by zero.
                $results[$i][5] = ($total_count == 0) ? 0 : $helpful_count / $total_count;
                $results[$i][6] = $helpful[0];     // Number of Helpful Votes
                $results[$i][7] = $total_votes[0]; // Number of Total Votes
            }
            $i++;
        }

        // Link to the user's page of reviews
        $i = 1;
        foreach ($user_links as $user_link) {
            $results[$i][9] = $user_link->nodeValue;
            $i++;
        }

        // Review ID (the part of the report-a-concern URL after the first "=")
        $i = 1;
        foreach ($review_id_urls as $review_id_url) {
            $review_id = explode("=", $review_id_url->nodeValue);
            $results[$i][10] = isset($review_id[1]) ? $review_id[1] : '';
            $i++;
        }

        // DEBUGGING: Check to see that the information in the array is displayed properly
        //print_r($results);

        // Write each of the rows to the CSV file
        foreach ($results as $fields) {
            // Add country name in the last column (for every row, even one without a review id)
            $fields[11] = $country->name;
            // Fill in any column that was not found, then sort by column index so
            // the cells line up with the header row.
            $fields += $empty_row;
            ksort($fields);
            fputcsv($fp, $fields);
        }
    }
}
fclose($fp);
// GitHub page footer captured together with the gist (not part of the script):
// "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment"