Skip to content

Instantly share code, notes, and snippets.

@Raidus
Created August 8, 2017 04:24
Show Gist options
  • Save Raidus/c27af1fc0f152f944adad35378d84291 to your computer and use it in GitHub Desktop.
Save Raidus/c27af1fc0f152f944adad35378d84291 to your computer and use it in GitHub Desktop.
// Abort pagination when the current page fulfills a certain condition
const rp = require('request-promise');
const Xray = require('x-ray');
// Shared x-ray instance, configured with a custom `correctURI` filter that
// the selector strings below can reference.
const x = Xray({
  filters: {
    // Prefixes string values with the Amazon.de origin; any non-string
    // value (e.g. a missing attribute) is passed through unchanged.
    correctURI(value) {
      if (typeof value !== 'string') {
        return value;
      }
      return `https://www.amazon.de${value}`;
    }
  }
});
// Upper bound on the number of result pages requested per search.
const MAX_PAGES = 3;
/**
 * Downloads the page at `url` and extracts its search results and the link
 * to the following result page.
 *
 * @param {string} url - Address of the result page to download.
 * @returns {Promise<Object>} Resolves with an object holding `pagnResult`
 *   (the `data-asin` attribute of every `.s-result-item.celwidget` element,
 *   as `{ asin }` entries) and `pagnNextLink` (the `href` of `#pagnNextLink`
 *   passed through the `correctURI` filter). Rejects when the request fails
 *   or when x-ray reports an error.
 */
const getPage = url =>
  rp({ url }).then(
    html =>
      // x-ray delivers its outcome through a node-style callback, so it is
      // adapted to a promise here.
      new Promise((resolve, reject) => {
        const items = x(html, '.s-result-item.celwidget', [
          {
            asin: '@data-asin'
          }
        ]);
        const extract = x(html, {
          pagnResult: items,
          pagnNextLink: '#pagnNextLink@href | correctURI'
        });
        extract((error, page) => {
          if (error) {
            reject(error);
          } else {
            resolve(page);
          }
        });
      })
  );
/**
 * Tells the pagination loop whether it should stop after the current page.
 *
 * Placeholder implementation: it always answers "keep going".
 *
 * @returns {boolean} `true` to stop paginating; currently always `false`.
 */
function abort() {
  // Replace with a meaningful stop condition.
  const shouldStop = false;
  return shouldStop;
}
/**
 * Scrapes up to MAX_PAGES consecutive Amazon.de search result pages for a
 * search term and returns everything that was extracted as one flat array.
 *
 * Pagination ends early when abort() returns true, or as soon as no URL for
 * the following page is known: either the current page had no next-page
 * link, or the current page could not be fetched/parsed. Errors are logged
 * and never propagate to the caller.
 *
 * @param {string} book - Search term; URI-encoded before it is placed in the
 *   query string.
 * @returns {Promise<Array>} The `pagnResult` entries of every successfully
 *   scraped page, concatenated in page order.
 */
async function scrapeNpages(book) {
  const results = [];
  let nextUrl = `https://www.amazon.de/s?&field-keywords=${encodeURIComponent(book)}`;

  for (let page = 0; page < MAX_PAGES && nextUrl; ++page) {
    const url = nextUrl;
    // Only a successfully parsed page can supply the following link. Clearing
    // it up front ensures a failed request ends the loop instead of issuing
    // further requests without a URL.
    nextUrl = undefined;
    try {
      const result = await getPage(url);
      nextUrl = result.pagnNextLink;
      results.push(result.pagnResult);
      if (abort()) break;
    } catch (err) {
      console.log(err);
    }
  }
  // Flatten the per-page arrays one level.
  return [].concat(...results);
}
// Entry point: scrape the results for a sample search term and print them.
// The trailing handler keeps an unexpected failure from surfacing as an
// unhandled promise rejection and marks the process as failed instead.
scrapeNpages('harry potter')
  .then(res => console.log(res))
  .catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment