Forked from daithiw44/NodeJS web scraper for Anagrams
Created
October 28, 2012 03:07
-
-
Save gotomypc/3967286 to your computer and use it in GitHub Desktop.
NodeJS web scraper anagram example, written as a test some time ago against an early Node version; it seems to still work.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Web scraper that scrapes a web page (without permission, I may add).
//This is only a simple Node test; it is not production-hardened (little to no error handling).
//Returns JSON or XML.
//Format: localhost:'PORT'/scrape/'WORD'/'FORMAT'
//Example cURL calls:
//JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
//XML - curl -X GET http://localhost:3000/scrape/fundamental/xml
var express = require('express'), | |
request = require('request'), | |
jsdom = require('jsdom'), | |
builder = require('xmlbuilder'), | |
sys = require('sys'); | |
//Create the server instance that the route below is registered on.
var app = express.createServer();

//Development environment: nothing to configure (add whatever you want here).
app.configure('development', function() {});

//Production environment: install Express' error handler with an empty options object.
app.configure('production', function() {
  var errorMiddleware = express.errorHandler({
  });
  app.use(errorMiddleware);
});
//Walks the siblings that follow `node` and returns the trimmed text of every
//text node (longer than one character) found along the way.
//  node  - DOM node whose following siblings are inspected; may be missing.
//  count - number of anagrams the page reports.
//Returns an array of strings (empty when `node` is missing or `count` is NaN/0).
function collectAnagrams(node, count) {
  var words = [];
  //Twice the count: presumably each word is followed by one separator element - TODO confirm.
  var limit = count * 2;
  //Guard against a missing start node instead of throwing on `.nextSibling`.
  var el = node ? node.nextSibling : null;
  var j;
  //`&& el` stops the walk when the sibling chain ends before `limit` is reached.
  for (j = 0; j < limit && el; j++) {
    //nodeType 3 is a text node.
    if (el.nodeType === 3 && el.nodeValue.length > 1) {
      words.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
    }
    if (el.tagName === 'bottomlinks') { break; }
    el = el.nextSibling;
  }
  return words;
}

//Builds the XML response: an <anagrams> root with one <anagram> child per word.
//Returns {output: <serialized XML>, ctype: 'text/xml'}.
function getXMLResults(node, count) {
  var words = collectAnagrams(node, count);
  var root = builder.begin('anagrams');
  var item, k;
  for (k = 0; k < words.length; k++) {
    item = root.ele('anagram');
    item.txt(words[k]);
  }
  root.up();
  return {'output' : builder.toString(), 'ctype': 'text/xml'};
}

//Builds the JSON response: {"anagrams": [...]}.
//Returns {output: <JSON string>, ctype: 'application/json'}.
function getJSONResults(node, count) {
  var anagObj = {'anagrams': collectAnagrams(node, count)};
  return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
}

//GET /scrape/:id/:format
//Looks up anagrams of :id on wordsmith.org and replies with them as XML when
//:format is 'xml', otherwise as JSON. Replies 502 when the upstream request fails.
app.get('/scrape/:id/:format', function(req, res) {
  //Get the word
  var word = req.params.id, formatType = req.params.format;
  //Encode the word so characters such as '&', '#' or spaces cannot alter the query string.
  var uri = 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n';
  //Hit the website
  request({uri: uri}, function(error, response, body) {
    var b, doc, handleResults, handleResultsOutput, i, text;
    //Always answer the client; previously a failed upstream call left the request hanging.
    if (error || response.statusCode !== 200) {
      res.writeHead(502, {'Content-Type': 'text/plain'});
      res.end('Unable to retrieve anagrams');
      return;
    }
    doc = jsdom.jsdom(body, null, {
      features: {
        FetchExternalResources: ['script'],
        ProcessExternalResources: false
      }
    });
    //determine the formatType;
    handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;
    //I know where the words are in the DOM.
    b = doc.getElementsByTagName('b');
    for (i = 0; i < b.length; i++) {
      text = b[i].innerHTML;
      if (text.search(/found/i) !== -1) {
        //NOTE(review): the walk always starts at the third <b> (b[2]), not at the
        //element that matched; kept as-is - confirm against the current page markup.
        handleResultsOutput = handleResults(b[2], parseInt(text, 10));
      }
    }
    //No "found" marker on the page: reply with an empty result in the requested format.
    if (!handleResultsOutput) {
      handleResultsOutput = handleResults(null, 0);
    }
    //Send it out.
    res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
    res.end(handleResultsOutput.output);
  });
});
//Resolve the port once so the log line reports the port that is actually bound
//(previously it always printed 3000, even when PORT was set).
var port = process.env.PORT || 3000;
app.listen(port);
console.log('server started port: ' + port);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment