Skip to content

Instantly share code, notes, and snippets.

@arn-ob
Last active December 22, 2019 12:59
Show Gist options
  • Select an option

  • Save arn-ob/d5baee9c12a54ff6e7ba533b34f46e65 to your computer and use it in GitHub Desktop.

Select an option

Save arn-ob/d5baee9c12a54ff6e7ba533b34f46e65 to your computer and use it in GitHub Desktop.
Scrape all the URLs from the site
const request = require('request');
const cheerio = require('cheerio');
const ld = require('lodash')
// Root of the site being crawled; every discovered href is appended to it.
const base_url = 'https://khaasfood.com';

// Upper bound on the index into `urls` that the crawler will follow.
const threshold = 2000;

// Every distinct href discovered so far (reassigned by scrap on each page).
let urls = [];

// Index into `urls` of the link requested most recently; -1 means that only
// the start page has been requested.
let count = -1;

// Not referenced by any other code visible in this file.
let loopStuckTime = 0;
let loopStuckValue = 0;
// Absolute URLs that have already been requested, so that different spellings
// of the same page (relative, absolute, or with a #fragment) are fetched once.
const visitedTargets = new Set();

/**
 * Turns a raw href into an absolute, fetchable URL on the crawled site.
 *
 * @param {string} href - href attribute value (relative or absolute).
 * @returns {string|null} Absolute URL without its fragment, or null when the
 *   href is malformed, is not http(s) (mailto:, tel:, javascript:, ...) or
 *   points at a different host than `base_url`.
 */
function resolveCrawlTarget(href) {
  let target;
  let siteRoot;
  try {
    siteRoot = new URL(base_url);
    // Relative hrefs are resolved against the site root because the page
    // they were found on is not recorded alongside them.
    target = new URL(href, siteRoot);
  } catch (err) {
    return null;
  }
  const isHttp = target.protocol === 'http:' || target.protocol === 'https:';
  if (!isHttp || target.host !== siteRoot.host) {
    return null;
  }
  // A fragment never changes which document the server returns.
  target.hash = '';
  return target.href;
}

/**
 * Extracts the usable href values of every <a> element in a document.
 *
 * @param {string} html - Markup of the downloaded page.
 * @returns {string[]} hrefs in document order, without missing or empty
 *   values and without the bare '/' home link.
 */
function collectLinks(html) {
  const $ = cheerio.load(html);
  const found = [];
  $('a').each((index, anchor) => {
    const href = $(anchor).attr('href');
    if (href !== undefined && href !== '' && href !== '/') {
      found.push(href);
    }
  });
  return found;
}

/**
 * Advances `count` to the next queued href that can be crawled and requests
 * it. Prints the final report when the queue is exhausted or `count` has
 * reached `threshold`.
 */
function crawlNext() {
  while (count < threshold) {
    count++;
    if (count >= urls.length) {
      // Nothing left to visit: stop instead of requesting "<base>undefined".
      break;
    }
    const target = resolveCrawlTarget(urls[count]);
    if (target !== null && !visitedTargets.has(target)) {
      visitedTargets.add(target);
      scrap(target);
      return;
    }
  }
  console.log('Count', count);
  console.log('urls', urls);
}

/**
 * Downloads one page, merges the hrefs of its <a> elements into the shared
 * `urls` list, then moves on to the next queued link. A failed request or a
 * non-200 response is logged and the crawl continues with the next link.
 *
 * @param {string} url - Absolute URL of the page to download.
 */
function scrap(url) {
  const normalized = resolveCrawlTarget(url);
  if (normalized !== null) {
    visitedTargets.add(normalized);
  }

  const options = {
    url: url,
    headers: {
      'User-Agent': 'request',
      'Content-Type': 'text/html; charset=utf-8'
    }
  };

  request(options, (error, result, html) => {
    if (error || result.statusCode !== 200) {
      const reason = error ? error.message || error : `HTTP ${result.statusCode}`;
      console.log('Error', url, reason);
    } else {
      // A Set keeps first-seen order while dropping duplicates.
      urls = [...new Set([...urls, ...collectLinks(html)])];
      console.log(urls);
    }
    crawlNext();
  });
}
// Entry point: start the crawl by requesting the site root. Every further
// request is issued from inside scrap's response callback.
scrap(base_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment