Last active
December 22, 2019 12:59
-
-
Save arn-ob/d5baee9c12a54ff6e7ba533b34f46e65 to your computer and use it in GitHub Desktop.
Scrape all the URLs from the site
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Third-party dependencies: HTTP client, HTML parser and collection helpers.
const request = require('request');
const cheerio = require('cheerio');
const ld = require('lodash');

// Root of the site being crawled; queued links are requested relative to it.
const base_url = 'https://khaasfood.com';

// Every distinct link discovered so far. It doubles as the crawl queue:
// entries are visited in index order.
let urls = [];

// Index into `urls` of the entry most recently requested (-1 = none yet).
let count = -1;

// The crawl stops once `count` reaches this value.
const threshold = 2000;

// NOTE(review): these two are not referenced anywhere in the visible script;
// presumably left over from a stall detector — confirm before removing.
let loopStuckTime = 0;
let loopStuckValue = 0;
/**
 * Crawl one page, then continue with the next queued link.
 *
 * Downloads `url`, reads the `href` of every `<a>` element and merges the
 * same-site ones into the shared `urls` queue as root-relative paths (first
 * occurrence wins, so already-visited indexes never shift). Whether or not the
 * page could be fetched, the next unvisited queue entry is requested
 * afterwards. The crawl ends when the queue is exhausted or `count` reaches
 * `threshold`; the final count and queue are then logged.
 *
 * @param {string} url - Absolute URL of the page to fetch.
 * @returns {void} Results accumulate in the module-level `urls` and `count`.
 */
function scrap(url) {
  const options = {
    url,
    headers: {
      'User-Agent': 'request',
      'Content-Type': 'text/html; charset=utf-8',
    },
  };

  request(options, (error, result, html) => {
    if (!error && result.statusCode === 200) {
      const page = cheerio.load(html);
      const links = [];
      page('a').each((index, anchor) => {
        const link = normalizeLink(page(anchor).attr('href'), url);
        if (link !== null) {
          links.push(link);
        }
      });
      urls = ld.uniq(urls.concat(links));
      console.log(urls);
    } else {
      // Say which page failed and why, instead of a bare "Error".
      const reason = error ? error.message : `HTTP ${result.statusCode}`;
      console.log('Error', url, reason);
    }
    crawlNext();
  });
}

/**
 * Turn a raw `href` into a root-relative path on the crawled site.
 *
 * @param {string|undefined} href - Attribute value as found in the page.
 * @param {string} pageUrl - URL of the page the link was found on; relative
 *   hrefs are resolved against it.
 * @returns {string|null} Path plus query string, or `null` when the link is
 *   missing, malformed, points to another origin (this also covers `mailto:`,
 *   `tel:` and `javascript:` links) or is just the site root.
 */
function normalizeLink(href, pageUrl) {
  if (typeof href !== 'string' || href === '') {
    return null;
  }

  let resolved;
  try {
    resolved = new URL(href, pageUrl);
  } catch (err) {
    // A malformed href is expected on real pages; it is simply not queued.
    return null;
  }

  if (resolved.origin !== new URL(base_url).origin) {
    return null;
  }

  // The fragment is dropped on purpose: it addresses the same document.
  const path = `${resolved.pathname}${resolved.search}`;
  return path === '/' ? null : path;
}

/**
 * Request the next unvisited entry of `urls`, or log the final result when the
 * queue is exhausted or the `threshold` budget has been used up.
 *
 * @returns {void}
 */
function crawlNext() {
  const hasBudget = count < threshold;
  const hasPending = count + 1 < urls.length;

  if (hasBudget && hasPending) {
    count += 1;
    scrap(new URL(urls[count], base_url).href);
    return;
  }

  console.log('Count', count);
  console.log('urls', urls);
}
// Start the crawl at the site root.
scrap(base_url);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment