Skip to content

Instantly share code, notes, and snippets.

@skeggse
Created December 12, 2013 09:21
Show Gist options
  • Save skeggse/7925278 to your computer and use it in GitHub Desktop.
Simple (and pretty bad) scraper for Google Play Store reviews.
var scan = (function(window, document) {
/**
 * Scans through all the review pages and collects every review found.
 *
 * Reads the currently visible page, clicks the "next" button, waits for the
 * pages container to move and then settle, and repeats until a page yields no
 * reviews or the "next" button becomes hidden.
 *
 * TODO: automatically switch to date-ordered reviews, and page backwards to
 * beginning.
 *
 * @param {function(?Error, Object=, Array=)} callback Called with an Error as
 *     the only argument when reading a page fails, or with
 *     (null, map, unknown) on success, where map holds the reviews keyed by
 *     id and unknown holds the reviews that had no id.
 */
function scan(callback) {
  var nextBtn = $('.details-section.reviews .expand-next');
  var map = {}, unknown = [];

  // Reports success following the documented (error, map, unknown) contract.
  function done() {
    return callback(null, map, unknown);
  }

  // Collects the visible page's reviews; true when the page contained any.
  function read() {
    return list(map, unknown) > 0;
  }

  // The inline "left" style of the pages container, used to detect paging.
  function getOffset() {
    return $('.details-section.reviews .expand-pages-container').style.left;
  }

  // Advances to the next page and calls fn once the new page has settled.
  function next(fn) {
    var count = 0, value = getOffset();

    // Phase one: wait for the offset to change from value.
    function waitOne() {
      // Click on the first poll, then once every 100 polls (about every five
      // seconds at the 50 ms interval) in case the page has not moved yet.
      if (count++ % 100 === 0) {
        nextBtn.click();
      }
      // A hidden button is treated as the end of the reviews.
      if (nextBtn.style.display === 'none') {
        return done();
      }
      if (getOffset() !== value) {
        console.log('diverged');
        console.log('wait for converge');
        return setTimeout(waitTwo, 50);
      }
      setTimeout(waitOne, 50);
    }

    // Phase two: wait for the offset to be identical on two consecutive
    // polls while exactly one page is fully visible.
    function waitTwo() {
      if (getOffset() === value && hasPage()) {
        console.log('converged');
        return fn();
      }
      value = getOffset();
      setTimeout(waitTwo, 50);
    }

    console.log('wait for diverge');
    // waitOne performs the initial click itself on its first poll, so no
    // additional click is issued here.
    waitOne();
  }

  (function tick() {
    var found;
    try {
      found = read();
    } catch (err) {
      return callback(err);
    }
    if (found) {
      next(tick);
    } else {
      done();
    }
  }());
}
/**
 * Reads every review on the currently visible page.
 *
 * With a map, each review that has an id is stored in map under that id,
 * each review without an id is appended to unknown, and the number of review
 * elements on the page is returned. Without a map, an array holding one
 * Review per review element is returned instead.
 *
 * @param {Object.<string, Object>=} map The map of review ids to reviews.
 * @param {Array.<Object>} unknown An array for reviews lacking ids; only used
 *     when map is given.
 * @return {Array.<Object>|number} The array of reviews, or the number of
 *     review elements found when map is given.
 */
function list(map, unknown) {
  var nodes = getPage().querySelectorAll('.single-review');
  var index = nodes.length;
  var parsed;

  if (!map) {
    // Copy into a real array, then swap each element for its Review.
    parsed = slice.call(nodes);
    while (index-- > 0) {
      parsed[index] = new Review(parsed[index]);
    }
    return parsed;
  }

  // Walk from the last review to the first.
  while (index-- > 0) {
    parsed = new Review(nodes[index]);
    if (parsed.id) {
      map[parsed.id] = parsed;
    } else {
      unknown.push(parsed);
    }
  }
  return nodes.length;
}
/**
 * Returns the single fully visible page.
 *
 * @return {HTMLElement} The active page.
 * @throws {Error} When the number of fully visible pages is not exactly one.
 */
function getPage() {
  var visible = filterPages(getPages());
  if (visible.length === 1) {
    return visible[0];
  }
  throw new Error('expected one visible page, found ' + visible.length);
}
/**
 * Keeps only the pages whose inline opacity is numerically equal to 1.
 *
 * @param {NodeList|Array.<HTMLElement>} pages The pages to filter; any
 *     array-like works because the borrowed filter is applied via call().
 * @return {Array.<HTMLElement>} The visible pages.
 */
function filterPages(pages) {
  // style.opacity is a string, so it is converted before comparing.
  function isFullyOpaque(page) {
    return Number(page.style.opacity) === 1;
  }
  return filter.call(pages, isFullyOpaque);
}
/**
 * Reports whether exactly one loaded page is currently fully visible.
 *
 * @return {boolean} Whether there is one visible page.
 */
function hasPage() {
  var visible = filterPages(getPages());
  return visible.length === 1;
}
/**
 * Looks up every page element inside the reviews section.
 *
 * @return {NodeList} The loaded pages.
 */
function getPages() {
  var selector = '.details-section.reviews .expand-page';
  return $.a(selector);
}
/**
 * Review constructor: extracts the data of one review from its root element.
 *
 * Every field is read through a selector relative to the root element. A
 * field whose element (or an intermediate property) is missing reads as ''
 * before conversion, so date falls back to null and rating to 0.
 *
 * @param {HTMLElement} element The root element of the review.
 * @throws {TypeError} When no element is given.
 * @constructor
 */
function Review(element) {
  // Fail with a clear message instead of pausing in the debugger and then
  // crashing on the first querySelector call.
  if (!element) {
    throw new TypeError('Review requires the root element of a review');
  }

  // ids
  this.id = query('[data-reviewid]', 'dataset', 'reviewid');
  this.uid = query('[data-userid]', 'dataset', 'userid');

  // meta
  this.author = query('.review-info .author-name > a[href]', 'textContent');
  this.date = dateOf(query('.review-info .review-date', 'textContent'), null);
  this.rating = rate(query('.review-info-star-rating .current-rating', 'style',
      'width'));

  // content
  this.title = query('.review-body .review-title', 'textContent');
  // The body text is read from the node that directly follows the title.
  this.body = trim(query('.review-body .review-title', 'nextSibling',
      'textContent'));

  /**
   * Helper function for cleanly finding a value from a selector.
   *
   * @param {string} selector The element to select for.
   * @param {...string} var_args The properties to read from the element,
   *     hierarchically.
   * @return {*} The value at the end of the property chain, or '' as soon as
   *     the element or an intermediate value is missing.
   */
  function query(selector) {
    var result = element.querySelector(selector);
    var i;
    for (i = 1; i < arguments.length; i++) {
      if (!result) {
        return '';
      }
      result = result[arguments[i]];
    }
    return result;
  }
}
// utilities!
// Array methods borrowed so they can be applied with .call() to array-like
// values such as the results of querySelectorAll.
var slice = Array.prototype.slice;
var filter = Array.prototype.filter;
/**
 * Converts a date value into a Date, falling back when the value is falsy.
 *
 * @param {*} date The value handed to the Date constructor.
 * @param {*} def The value returned when date is falsy.
 * @return {Date|*} A new Date built from date, or def.
 */
function dateOf(date, def) {
  if (date) {
    return new Date(date);
  }
  return def;
}
/**
 * Converts a width value such as '80%' into a rating by parsing its leading
 * integer and multiplying it by 0.05.
 *
 * @param {string} value The width to convert.
 * @return {number} The rating, or 0 when value is falsy.
 */
function rate(value) {
  if (!value) {
    return 0;
  }
  return parseInt(value, 10) * 0.05;
}
/**
 * Trims surrounding whitespace, passing falsy values through untouched.
 *
 * @param {?string|undefined} value The text to trim.
 * @return {?string|undefined} The trimmed text, or value itself when falsy.
 */
function trim(value) {
  if (!value) {
    return value;
  }
  return value.trim();
}
/**
 * Extracts the first run of decimal digits from a string as a number.
 *
 * @param {string} string The text to search.
 * @param {*=} def The value returned when the text contains no digits.
 * @return {number|*} The first digit run as a number, or def.
 */
function getNumber(string, def) {
  var match = /[0-9]+/.exec(string);
  if (match === null) {
    return def;
  }
  return Number(match[0]);
}
/**
 * Finds the first element in the document matching a selector.
 *
 * @param {string} selector The CSS selector to look up.
 * @return {?Element} The first match, as returned by querySelector.
 */
function $(selector) {
  return document.querySelector(selector);
}

/**
 * Finds every element in the document matching a selector.
 *
 * @param {string} selector The CSS selector to look up.
 * @return {NodeList} All matches, as returned by querySelectorAll.
 */
$.a = function(selector) {
  return document.querySelectorAll(selector);
};
// Attach list() to scan so it stays reachable through the function that is
// returned from this wrapper.
scan.list = list;
return scan;
}(window, document));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment