Node Crawler to find all domain links on a site and run a function on them

Linked to from http://lukecod.es/2012/11/18/random-problem-of-the-night/

What

This is a Node.js crawler that walks an entire site (using the crawl module) to find all of the site's internal links. It then tests each unique internal link for the presence of an optional path string and parses the link's query string into an object. All values sharing the same query-string key are pushed to an array under that key.
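
As a sketch of that merging step (using hypothetical query objects, not data from a real crawl), values for a repeated key accumulate like this:

var _ = require('underscore'),
    results = {};

// Two hypothetical query objects parsed from two different links
_.each([{ a: 'x', b: 'c' }, { a: 'y' }], function(query) {
    _.each(query, function(value, key) {
        // Push the lowercased value onto the key's array, de-duping as we go
        results[key] = _.uniq((results[key] || []).concat(value.toLowerCase()));
    });
});

console.log(results); // => { a: [ 'x', 'y' ], b: [ 'c' ] }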

Usage

  • npm install
  • node app.js http://site-to-crawl.com /only/return/links/containing/this/path

Example Output

{
  'a': [
    'x',
    'y',
    'z'
  ],
  'b': [
    'c',
    'd'
  ],
  'z': [
    1
  ]
}
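
For reference, output like the example above could come from hypothetical crawled links such as:

  • http://site-to-crawl.com/page?a=x&b=c
  • http://site-to-crawl.com/page?a=y&b=d
  • http://site-to-crawl.com/page?a=z&z=1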

How to get all possible clinical trial query parameters from biooncology.com

node app.js http://www.biooncology.com /clinical-trials

Latest Output (11/18/12)

{
    "tumor": [
        "breast cancer",
        "cll",
        "dlbcl",
        "fnhl",
        "colorectal cancer",
        "gastric cancer",
        "glioblastoma",
        "lung cancer",
        "melanoma",
        "ovarian cancer",
        "multiple myeloma",
        "pancreatic cancer",
        "other tumor types",
        "renal cell carcinoma",
        "colon cancer",
        "liver cancer"
    ],
    "drug": [
        "pi3k inhibitor (gdc-0941)",
        "pi3k/mtor inhibitor (gdc-0980)",
        "obinutuzumab (ga101)",
        "onartuzumab (metmab)",
        "mek inhibitor (gdc-0973)",
        "akt inhibitor (gdc-0068)",
        "anti-egfl7",
        "dulanermin"
    ]
}

.gitignore

node_modules/
*.log

app.js

/*global console process require */
var crawler = require('./node_modules/crawl/lib/crawler'),
    _ = require('underscore'),
    url = require('url'),
    qs = require('qs'),
    ent = require('ent'),
    startUrl = process.argv[2],
    urlPath = process.argv[3] || '',
    parsedStartUrl = url.parse(startUrl);

crawler.crawl(startUrl, { headers: false, body: false }, function(err, pages) {
    if (err) {
        console.log("An error occurred: " + err);
        process.exit(1);
    }

    var // An array of unique urls within the site, falsy values removed
        allLinks = _.uniq(_.compact(_.flatten(_.pluck(pages, 'links')))),
        // Internal links: check if a link contains our original host or is relative;
        // if it is relative, prepend 'protocol//host'
        internalLinks = _.map(allLinks, function(link) {
            link = link.split('#')[0];
            if (link.indexOf(parsedStartUrl.host) > -1) {
                return link;
            } else {
                return link.charAt(0) === '/' ? parsedStartUrl.protocol + '//' + parsedStartUrl.host + link : '';
            }
        }),
        // Remove falsy values and de-dupe again to account for relative links that are now absolute
        uniqueLinks = _.uniq(_.compact(internalLinks)),
        alreadyViewedQueryStrings = [],
        results = {};

    _.each(uniqueLinks, function(link) {
        // The query string (minus the ?)
        var queryString = (url.parse(link).search || '').slice(1);

        /* Continue if:
         * our link contains our passed-in path
         * our link has a query string
         * we haven't seen this query string before
         */
        if (link.indexOf(urlPath) > -1 && queryString && !_.contains(alreadyViewedQueryStrings, queryString)) {
            alreadyViewedQueryStrings.push(queryString);

            // Decode HTML entities, turn +'s into spaces, pass to decodeURIComponent, then parse to an object with qs
            queryString = qs.parse(decodeURIComponent(ent.decode(queryString).replace(/\+/g, ' ')));

            /* For each key:value pair of the query string we either create a new array with the value
             * if we haven't seen that key before, or push to the existing array if we have.
             * Also, always make the value lowercase.
             */
            _.each(queryString, function(value, key) {
                if (typeof results[key] === 'undefined') {
                    results[key] = [value.toLowerCase()];
                } else {
                    results[key].push(value.toLowerCase());
                }
            });

            // Make each key only contain unique values in its array
            _.each(results, function(value, key) {
                results[key] = _.uniq(results[key]);
            });
        }
    });

    console.log(JSON.stringify(results, null, 4));
});
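
The decoding chain in app.js (entity decode, then +-to-space, then decodeURIComponent, then qs.parse) can be checked in isolation. A minimal sketch with a made-up query string:

var qs = require('qs'),
    ent = require('ent');

// Hypothetical raw query string as it might appear in a crawled href
var raw = 'tumor=breast+cancer&amp;drug=dulanermin';

// Same chain as app.js: HTML entities -> spaces -> percent-decoding -> object
var decoded = decodeURIComponent(ent.decode(raw).replace(/\+/g, ' '));
console.log(qs.parse(decoded)); // => { tumor: 'breast cancer', drug: 'dulanermin' }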

package.json

{
    "name": "crawling-link-matcher",
    "version": "0.1.0",
    "dependencies": {
        "crawl": "0.1.0",
        "qs": "0.5.2",
        "underscore": "1.4.2",
        "ent": "0.0.4"
    }
}