Created
September 19, 2015 03:38
-
-
Save adambyram/9f4e6eea2d983de83a16 to your computer and use it in GitHub Desktop.
Review Downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---- Settings ----

// AppExchange listing page whose reviews will be downloaded.
var url = "https://appexchange.salesforce.com/listingDetail?listingId=a0N30000004fnkUEAQ";
// Highest page of reviews to load (the original author's example: 1000 reviews means 100 pages).
var maximumPage = 1;
// Milliseconds to wait between checks for a page of reviews. Too small a value leads to errors
// because the data has not fully loaded; 15 seconds seems to always work, 5 or 10 may be enough.
var pageLoadDelay = 15000;

// ---- Runtime state ----

// PhantomJS page object used to load the listing and run code inside it.
var page = require('webpage').create();
// Set to true once the review data request for the current page has been received.
var loaded = false;
// 1-based number of the review page currently being collected.
var currentPage = 1;
// Every review gathered so far, across all pages.
var collectedReviews = [];
function scrapeAfterLoad() { | |
if(loaded) { | |
console.log('Collecting Reviews for Page ' + currentPage); | |
var results = page.evaluate(function () { | |
var reviewCollection = []; | |
var reviews = document.getElementsByClassName('feed-container')[0].getElementsByClassName('feed-item'); | |
var reviewCount = reviews.length; | |
for (var reviewIndex = 0; reviewIndex < reviewCount; reviewIndex++) { | |
var reviewItem = reviews[reviewIndex]; | |
// Grab the review text - if there is a "more" link, follow that and use that text instead | |
var reviewText = reviewItem.getElementsByClassName('feed-item-text-short-desc')[0].textContent; | |
var reviewMoreText = reviewItem.getElementsByClassName('feed-item-text-more-text'); | |
if(reviewMoreText.length > 0) { | |
reviewText = reviewMoreText[0].textContent; | |
} | |
// Grab the rating from the CSS class "50" = 5 stars, "40" = 4 stars, etc. | |
var rating = reviewItem.getElementsByClassName('feed-item-rating')[0].getElementsByClassName('rating-stars')[0].className.split(' ')[1].split('-')[2]; | |
// Grab the date - this is a relative date (so the more recent reviews may say "Today" or "Yesterday" etc) | |
var date = reviewItem.getElementsByClassName('feed-footer-link-secondary')[0].textContent; | |
reviewCollection.push({'rating': rating, 'date': date, 'text': reviewText}); | |
} | |
return reviewCollection; | |
}); | |
collectedReviews = collectedReviews.concat(results); | |
// Checks to see if the "next page" link is there - if it is, we can load more pages | |
var performAnotherLoad = page.evaluate(function() { | |
return document.getElementById('listingDetailPage:AppExchangeLayout:listingDetailForm:listingDetailReviewsTab:reviewsTabComponent:nextRvwCmdLink') != null; | |
}); | |
if(performAnotherLoad && currentPage < maximumPage) { | |
nextReviewLoad(); | |
} else { | |
// Write out the reviews to a json file | |
var fs = require('fs'); | |
try { | |
fs.write("reviews.json", JSON.stringify(collectedReviews), 'w'); | |
} catch(e) { | |
console.log(e); | |
} | |
console.log('Done.'); | |
phantom.exit(); | |
} | |
} else { | |
setTimeout(scrapeAfterLoad, pageLoadDelay); | |
} | |
} | |
/**
 * Dispatches a synthetic left-button click on the page element with the given id.
 *
 * The click itself runs inside the web page via page.evaluate; the id is passed as an
 * evaluate argument because the in-page function cannot see variables from this script.
 *
 * @param {string} elementId - id attribute of the element to click.
 * @returns {boolean} true if the element existed and the click was dispatched.
 */
function clickElementById(elementId) {
    var clicked = page.evaluate(function (id) {
        var target = document.getElementById(id);
        if (!target) {
            return false;
        }
        var ev = document.createEvent("MouseEvent");
        ev.initMouseEvent(
            "click",
            true /* bubble */, true /* cancelable */,
            window, null,
            0, 0, 0, 0, /* coordinates */
            false, false, false, false, /* modifier keys */
            0 /*left*/, null
        );
        target.dispatchEvent(ev);
        return true;
    }, elementId);

    if (clicked !== true) {
        console.log('Could not find element to click: ' + elementId);
        return false;
    }
    return true;
}

// To start loading reviews, we have to click the reviews tab to force the page to start loading reviews
function startReviewLoad() {
    console.log('Waiting for Page ' + currentPage + ' Reviews');
    // Cleared before the click so that only data requested by this click counts as loaded.
    loaded = false;
    clickElementById('tab_content_reviews');
    scrapeAfterLoad();
}

// This clicks the "next page" link to force the load of another round of reviews
function nextReviewLoad() {
    currentPage++;
    // Cleared before the click so that only data requested by this click counts as loaded.
    loaded = false;
    console.log('Waiting for Page ' + currentPage + ' Reviews');
    clickElementById('listingDetailPage:AppExchangeLayout:listingDetailForm:listingDetailReviewsTab:reviewsTabComponent:nextRvwCmdLink');
    scrapeAfterLoad();
}
// Entry point: open the listing page and, once it has loaded, start collecting reviews.
page.open(url, function (status) {
    if (status !== 'success') {
        console.log('Unable to access network');
        // Exit with a non-zero status so a calling shell or script can tell the run failed.
        phantom.exit(1);
        return;
    }

    // We have to wait on the listingDetail url to load since that has the data we need.
    // The handler only raises the flag; the load functions reset it before each click.
    page.onResourceReceived = function (response) {
        if (response.url === "https://appexchange.salesforce.com/listingDetail") {
            loaded = true;
        }
    };

    // Kick off the whole process
    startReviewLoad();
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment