Created
December 12, 2013 09:21
-
-
Save skeggse/7925278 to your computer and use it in GitHub Desktop.
Simple (and pretty bad) scraper for Google Play Store reviews.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var scan = (function(window, document) { | |
/**
 * Scans through all the review pages and collects every review found.
 *
 * Reviews that carry an id are stored in a map keyed by that id; reviews
 * without one are appended to a separate array.
 *
 * TODO: automatically switch to date-ordered reviews, and page backwards to
 * beginning.
 *
 * @param {function(?Error, Object=, Array=)} callback Called with an error as
 *     its only argument when reading a page throws, otherwise with null, the
 *     id-keyed map and the array of id-less reviews.
 */
function scan(callback) {
  var nextBtn = $('.details-section.reviews .expand-next');
  var map = {}, unknown = [];

  // Single success exit, so every success path reports (null, map, unknown)
  // and matches the error-first shape used by the failure path.
  function finish() {
    return callback(null, map, unknown);
  }

  // Harvests the visible page; true when it held at least one review.
  function read() {
    return list(map, unknown) > 0;
  }

  // Asks for the next page, then calls fn once the pager offset has settled.
  function next(fn) {
    var count = 0, value = getOffset();

    // wait for the offset to change from value
    function waitOne() {
      // click on the first poll and again on every 100th poll after it
      if (count++ % 100 === 0) {
        nextBtn.click();
      }
      // a hidden next button ends the scan
      if (nextBtn.style.display === 'none') {
        return finish();
      }
      if (getOffset() !== value) {
        console.log('diverged');
        console.log('wait for converge');
        return setTimeout(waitTwo, 50);
      }
      setTimeout(waitOne, 50);
    }

    // wait for the offset to remain the same across two consecutive polls
    function waitTwo() {
      if (getOffset() === value && hasPage()) {
        console.log('converged');
        return fn();
      }
      value = getOffset();
      setTimeout(waitTwo, 50);
    }

    console.log('wait for diverge');
    waitOne();
    // NOTE(review): waitOne() already clicked on its first poll, so this is a
    // second synchronous click; kept as in the original -- confirm it is
    // intentional.
    nextBtn.click();
  }

  // Current inline `left` style of the pages container.
  function getOffset() {
    return $('.details-section.reviews .expand-pages-container').style.left;
  }

  (function tick() {
    var found;
    try {
      found = read();
    } catch (err) {
      return callback(err);
    }
    found ? next(tick) : finish();
  }());
}
/**
 * Reads the reviews on the currently visible page.
 *
 * With a map, every review is filed into it by id (reviews without an id go
 * into `unknown`) and the number of reviews on the page is returned. Without a
 * map, a fresh array of Review objects is returned instead.
 *
 * @param {Object.<string, Object>=} map The map of review ids to reviews.
 * @param {Array.<Object>} unknown An array for reviews lacking ids.
 * @return {Array.<Object>|number} The array of reviews or the number of
 *     reviews found.
 */
function list(map, unknown) {
  var nodes = getPage().querySelectorAll('.single-review');
  var index = nodes.length;
  var entry, result;

  if (!map) {
    result = slice.call(nodes);
    // walk from the last review to the first
    while (index-- > 0) {
      result[index] = new Review(result[index]);
    }
    return result;
  }

  // walk from the last review to the first
  while (index-- > 0) {
    entry = new Review(nodes[index]);
    if (entry.id) {
      map[entry.id] = entry;
    } else {
      unknown.push(entry);
    }
  }
  return nodes.length;
}
/**
 * Returns the single fully visible page.
 *
 * @return {HTMLElement} The active page.
 * @throws {Error} When the number of visible pages is anything other than one.
 */
function getPage() {
  var visible = filterPages(getPages());
  if (visible.length === 1) {
    return visible[0];
  }
  throw new Error('expected one visible page, found ' + visible.length);
}
/**
 * Filters pages for those that are completely visible, i.e. whose inline
 * opacity is numerically equal to 1.
 *
 * @param {NodeList|Array.<HTMLElement>} pages The pages to filter; any
 *     array-like of elements with a `style` property works.
 * @return {Array.<HTMLElement>} The visible pages.
 */
function filterPages(pages) {
  return filter.call(pages, function(page) {
    // style.opacity is a string; convert explicitly rather than relying on
    // loose-equality coercion.
    return Number(page.style.opacity) === 1;
  });
}
/**
 * Tells whether exactly one page is currently fully visible.
 *
 * @return {boolean} Whether there is one visible page.
 */
function hasPage() {
  var visible = filterPages(getPages());
  return visible.length === 1;
}
/**
 * Collects every element matching the review page selector.
 *
 * @return {NodeList} The loaded pages.
 */
function getPages() {
  var selector = '.details-section.reviews .expand-page';
  return $.a(selector);
}
/**
 * Review constructor; extracts and holds the data of one review element.
 *
 * Fields that cannot be found in the element fall back to an empty string
 * (`date` falls back to null and `rating` to 0 through their helpers).
 *
 * @param {HTMLElement} element The root element of the review.
 * @throws {TypeError} When no element is given.
 * @constructor
 */
function Review(element) {
  // Fail with a clear message instead of dropping into the debugger and then
  // crashing on the first property access.
  if (!element) {
    throw new TypeError('Review requires a root element');
  }

  // ids
  this.id = query('[data-reviewid]', 'dataset', 'reviewid');
  this.uid = query('[data-userid]', 'dataset', 'userid');

  // meta
  this.author = query('.review-info .author-name > a[href]', 'textContent');
  this.date = dateOf(query('.review-info .review-date', 'textContent'), null);
  this.rating = rate(query('.review-info-star-rating .current-rating', 'style',
      'width'));

  // content
  this.title = query('.review-body .review-title', 'textContent');
  // the body text is read from the node that follows the title element
  this.body = trim(query('.review-body .review-title', 'nextSibling',
      'textContent'));

  /**
   * Helper function for cleanly finding a value from a selector.
   *
   * Returns '' as soon as the element, or any intermediate value along the
   * property chain, is falsy.
   *
   * @param {string} selector The element to select for.
   * @param {...string} var_args The properties to read from the element,
   *     hierarchically.
   */
  function query(selector) {
    var i = 1, result = element.querySelector(selector);
    for (; i < arguments.length; i++) {
      if (!result)
        return '';
      result = result[arguments[i]];
    }
    return result;
  }
}
// utilities!
// Borrowed array methods, so they can be applied to array-likes via .call().
var slice = Array.prototype.slice,
    filter = Array.prototype.filter;
/**
 * Converts a date string into a Date, or yields the fallback when the input
 * is falsy.
 *
 * @param {string} date The text to hand to the Date constructor.
 * @param {*} def The value to return when `date` is falsy.
 * @return {Date|*} The constructed Date, or `def`.
 */
function dateOf(date, def) {
  if (!date) {
    return def;
  }
  return new Date(date);
}
/**
 * Turns a width-style string into a rating by parsing its leading integer
 * and scaling it by 0.05 (so a leading 100 yields 5).
 *
 * @param {string} value The text to parse; falsy input yields 0.
 * @return {number} The scaled rating.
 */
function rate(value) {
  if (!value) {
    return 0;
  }
  return parseInt(value, 10) * 0.05;
}
/**
 * Trims surrounding whitespace from a string, passing falsy values through
 * unchanged.
 *
 * @param {string} value The text to trim.
 * @return {string|*} The trimmed text, or the original falsy value.
 */
function trim(value) {
  if (!value) {
    return value;
  }
  return value.trim();
}
/**
 * Extracts the first non-negative integer found in a piece of text.
 *
 * A number written with comma thousands separators ("1,234,567") is read as
 * a whole; a comma that is not followed by exactly three digits ends the
 * number, so "3,5" still yields 3.
 *
 * @param {string} string The text to search.
 * @param {*=} def The value to return when the text contains no digits.
 * @return {number|*} The parsed number, or `def`.
 */
function getNumber(string, def) {
  // First alternative: 1-3 digits followed by well-formed ",ddd" groups.
  // Second alternative: a plain run of digits.
  var match = /[0-9]{1,3}(?:,[0-9]{3}(?![0-9]))+|[0-9]+/.exec(string);
  return match ? +match[0].replace(/,/g, '') : def;
}
/**
 * Shorthand for document.querySelector.
 *
 * @param {string} s The CSS selector.
 * @return {?Element} Whatever document.querySelector returns for `s`.
 */
function $(s) {
  var match = document.querySelector(s);
  return match;
}

/**
 * Shorthand for document.querySelectorAll.
 *
 * @param {string} s The CSS selector.
 * @return {NodeList} Whatever document.querySelectorAll returns for `s`.
 */
$.a = function(s) {
  var matches = document.querySelectorAll(s);
  return matches;
};
scan.list = list; | |
return scan; | |
}(window, document)); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment