wiki.js for the "Build a CLI to crawl a web page with web-crawljs" article
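wiki.js expects a ../config/db module that exports the database host and name. A minimal sketch of that file, assuming a local MongoDB instance (the values below are placeholders, not from the article; wiki.js only reads dbHost and dbName):

//config/db.js (hypothetical values; adjust to your environment)
module.exports = {
    dbHost: 'localhost:27017', //host and port of the MongoDB server
    dbName: 'crawler'          //database the extracted pages are saved into
};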
/**
 * Created by kayslay on 6/3/17.
 */
const mongoose = require('mongoose');
const dbConfig = require('../config/db');
//mongoose configs
const Schema = mongoose.Schema;
//creating a schema for the extracted data
const wikiSchema = new Schema({
    title: String,
    body: String,
    references: [String]
});
//connect to MongoDB
mongoose.connect(`mongodb://${dbConfig.dbHost}/${dbConfig.dbName}`);
//create the model
const wikiModel = mongoose.model('Wiki', wikiSchema);
//crawl config
module.exports = {
    //the selectors for the parts of the page we want to extract
    //here we select the title, the div with the id mw-content-text,
    //and links with the class names external and text
    fetchSelector: {title: "title", body: "div#mw-content-text", references: "a.external.text"},
    //what to extract from each selector
    //for the title and body we want the text
    //for the references we want the href attribute of the links
    fetchSelectBy: {title: "text", body: "text", references: ['attr', 'href']},
    //the same rules apply to nextSelector and nextSelectBy,
    //but these collect the links of the pages to crawl next
    nextSelector: {links: 'a[href^="/wiki"]'},
    nextSelectBy: {links: ['attr', 'href']},
    //swaps in an empty next selector for urls that match .svg,
    //so no further links are collected from those pages
    dynamicSchemas: {
        nextSelector: [{url: /\.svg/, schema: {links: ""}}]
    },
    //formats the url before it is visited
    formatUrl: function (url) {
        if (/\.svg/.test(url) || /[A-Z]\w+:\w+?/.test(url)) {
            //return an already-visited url so the crawler skips the link
            //when the url ends with `.svg` or looks like `Wikipedia:About`
            return 'https://en.wikipedia.org/wiki/Web_crawler/';
        }
        return url;
    },
    //what to do with the data extracted from the page:
    //save it to the mongodb database
    fetchFn: (err, data, url) => {
        if (err) {
            return console.error(err.message);
        }
        //each selected field comes back as an array, hence title[0] and body[0]
        let {title, body, references} = data;
        let wikiData = {title: title[0], body: body[0], references};
        wikiModel.create(wikiData, function (err, wiki) {
            if (err) {
                //bail out if the save failed, so we never read wiki.title off undefined
                return console.error(err.message);
            }
            console.log(`page with the title ${wiki.title} has been saved to the database`);
        });
    },
    //called at the end of the whole crawl
    finalFn: function () {
        console.log('finished crawling wiki');
    },
    depth: 3, //how deep the crawl should go
    limitNextLinks: 10, //cap the links taken from each page at 10; helps when you don't want to follow every link
    urls: ['https://en.wikipedia.org/wiki/Web_crawler/'] //the default urls to crawl if none are specified
};
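This file only exports the crawl configuration; the article's CLI script is what actually starts the crawl. A minimal sketch of how the config might be consumed, assuming web-crawljs exposes a factory that takes the config object and a CrawlAllUrl method to kick things off (the exact entry point is an assumption here; check the web-crawljs README):

//crawl.js (hypothetical driver; the factory and method names are assumptions)
const Crawler = require('web-crawljs');
const config = require('./config/wiki'); //the file above

const crawler = Crawler(config); //build a crawler from the config
crawler.CrawlAllUrl();           //start crawling from config.urls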