-
-
Save gotomypc/3967472 to your computer and use it in GitHub Desktop.
Nodejs Anagram Website Scraper with express and Domains, update to previous example gist: 1335009
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Updated example that uses 'basic' Domains with Express.
// Website scraper that scrapes a site (without permission, I may add).
// I had seen some screen-scraping examples that use jQuery; this example is jQuery-less.
var express = require('express'), | |
request = require('request'), | |
jsdom = require('jsdom'), | |
builder = require('xmlbuilder').create(), | |
sys = require('util'), | |
createDomain = require('domain').create, | |
app = express.createServer(); | |
/**
 * Per-request error boundary built on Node's `domain` module.
 *
 * Every request runs inside its own domain. Any error raised while the
 * request is being handled (including from asynchronous callbacks bound to
 * the domain) is reported to the client as a small JSON or XML document.
 * The HTTP status stays 200 on purpose: the error is treated as handled.
 */
app.use(function(req, res, next) {
    var domain = createDomain();

    // Bind the request/response emitters so their 'error' events are routed
    // to this domain as well.
    domain.add(req);
    domain.add(res);

    domain.on('error', function(err) {
        // req.params may not be populated if the failure did not come from
        // a matched route, so read it defensively.
        var format = req.params && req.params.format,
            message = (err && err.message) || String(err),
            ctype, formattedMsg;

        if (format === 'json') {
            ctype = 'application/json';
            // JSON.stringify escapes quotes/backslashes in the message.
            formattedMsg = JSON.stringify({ error: message });
        } else {
            ctype = 'text/xml';
            // Escape markup characters so the message cannot break the XML.
            formattedMsg = '<anagrams><error>' +
                message.replace(/&/g, '&amp;')
                       .replace(/</g, '&lt;')
                       .replace(/>/g, '&gt;') +
                '</error></anagrams>';
        }

        if (res.headersSent) {
            // A response was already started; writing a new head would throw.
            res.end();
        } else {
            writeOut(res, ctype, formattedMsg);
        }

        // dispose() only exists on older Node versions.
        if (typeof domain.dispose === 'function') {
            domain.dispose();
        }
    });

    // run() enters the domain, calls next(), and exits again, so the domain
    // does not stay on the stack after this middleware returns.
    domain.run(next);
});
/**
 * Write a complete HTTP 200 response.
 *
 * @param {Object} res    Response object; only writeHead() and end() are used.
 * @param {string} ctype  Content-Type to send. The short format names 'json'
 *                        and 'xml' are translated to 'application/json' and
 *                        'text/xml'; any other value is sent unchanged.
 * @param {string} output Response body.
 */
function writeOut(res, ctype, output) {
    // 'json' / 'xml' (the ':format' route values) are not valid Content-Type
    // values, so accept them as aliases for the real MIME types.
    var mimeTypes = {
            'json': 'application/json',
            'xml': 'text/xml'
        },
        contentType = Object.prototype.hasOwnProperty.call(mimeTypes, ctype) ?
            mimeTypes[ctype] : ctype;

    res.writeHead(200, {
        'Content-Type': contentType
    });
    res.end(output);
}
/**
 * Collect anagram text from the siblings following `node` and render them
 * as an XML document with the module-level `builder`.
 *
 * Walks at most `count * 2` siblings, stopping early when the sibling chain
 * ends or a `bottomlinks` element is reached. Each text node (nodeType 3)
 * whose trimmed value is non-empty becomes one <anagram> element.
 *
 * @param {Object} node  DOM node whose following siblings hold the results.
 * @param {number} count Number of anagrams reported by the page.
 * @returns {{output: string, ctype: string}} Serialized XML and its MIME type.
 */
function getXMLResults(node, count) {
    var el = node.nextSibling,
        // Twice the count: presumably each result text node is followed by
        // one separator element - TODO confirm against the scraped markup.
        limit = count * 2,
        j = 0,
        root, text;

    root = builder.begin('anagrams', {
        'version': '1.0',
        'encoding': 'UTF-8',
        'standalone': true
    });

    // `el &&` guards against running off the end of the sibling chain.
    while (el && j < limit) {
        if (el.nodeType === 3) {
            text = el.nodeValue.replace(/^\s+|\s+$/g, '');
            // Skip whitespace-only nodes instead of emitting empty entries.
            if (text.length > 0) {
                root.ele('anagram').txt(text);
            }
        }
        // Tag names may be reported in upper case, so compare case-insensitively.
        if (el.tagName && el.tagName.toLowerCase() === 'bottomlinks') {
            break;
        }
        j++;
        el = el.nextSibling;
    }

    //Uncomment below to throw an Error
    //throw new Error('This is an XML Error');
    return {
        'output': root.doc().toString(),
        'ctype': 'text/xml'
    };
}
/**
 * Collect anagram text from the siblings following `node` and render them
 * as a JSON document of the form {"anagrams": [...]}.
 *
 * Walks at most `count * 2` siblings, stopping early when the sibling chain
 * ends or a `bottomlinks` element is reached. Each text node (nodeType 3)
 * whose trimmed value is non-empty becomes one array entry.
 *
 * @param {Object} node  DOM node whose following siblings hold the results.
 * @param {number} count Number of anagrams reported by the page.
 * @returns {{output: string, ctype: string}} Serialized JSON and its MIME type.
 */
function getJSONResults(node, count) {
    var el = node.nextSibling,
        // Twice the count: presumably each result text node is followed by
        // one separator element - TODO confirm against the scraped markup.
        limit = count * 2,
        j = 0,
        elArray = [],
        text;

    // `el &&` guards against running off the end of the sibling chain.
    while (el && j < limit) {
        if (el.nodeType === 3) {
            text = el.nodeValue.replace(/^\s+|\s+$/g, '');
            // Skip whitespace-only nodes instead of emitting empty entries.
            if (text.length > 0) {
                elArray.push(text);
            }
        }
        // Tag names may be reported in upper case, so compare case-insensitively.
        if (el.tagName && el.tagName.toLowerCase() === 'bottomlinks') {
            break;
        }
        j++;
        el = el.nextSibling;
    }

    //Uncomment below to throw an Error
    //throw new Error('This is an JSON Error');
    return {
        'output': JSON.stringify({ 'anagrams': elArray }),
        'ctype': 'application/json'
    };
}
/**
 * GET /scrape/:id/:format
 *
 * Fetches the anagram results page for the word in `:id`, extracts the
 * anagrams from the returned HTML, and responds with XML when `:format` is
 * 'xml' and JSON otherwise. Failures are thrown as Errors from the request
 * callback.
 */
app.get('/scrape/:id/:format', function(req, res) {
    //Get the word
    var word = req.params.id,
        formatType = req.params.format;

    //Hit the website
    request({
        // Encode the word so spaces, '&', '#', etc. cannot corrupt the query string.
        uri: 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n'
    }, function(error, response, body) {
        var b, doc, handleResults, handleResultsOutput, text, i;

        if (error || response.statusCode !== 200) {
            //Real Error so throw.
            throw new Error('Server responded with error, try again later');
        }

        doc = jsdom.jsdom(body, null, {
            features: {
                // FetchExternalResources : ['script'],
                ProcessExternalResources: false
            }
        });

        //determine the formatType;
        handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;

        // The <b> element whose text contains "found" carries the result
        // count (parsed from the start of its text).
        b = doc.getElementsByTagName('b');
        for (i = 0; i < b.length; i++) {
            text = b[i].innerHTML;
            // NOTE(review): results are always read from the siblings of the
            // third <b> element (b[2]), not b[i] - confirm this matches the
            // page layout. The guard avoids a TypeError if it is missing.
            if (text.search(/found/i) !== -1 && b[2]) {
                handleResultsOutput = handleResults(b[2], parseInt(text, 10));
            }
        }

        if (!handleResultsOutput) {
            // No "found" marker in the page: fail with a clear message
            // instead of a TypeError on `.output`.
            throw new Error('No anagram results found');
        }

        // Send the MIME type chosen by the result handler, not the raw
        // ':format' route value.
        writeOut(res, handleResultsOutput.ctype, handleResultsOutput.output);
    });
});
// Start the HTTP server. The port defaults to 3000 and can be overridden
// with the PORT environment variable. The start-up message is logged from
// the listen callback so it only appears once the socket is actually bound.
var port = process.env.PORT || 3000;
app.listen(port, function() {
    console.log('server started');
});
// URL format:   http://localhost:3000/scrape/<word to find anagrams for>/<format: json or xml>
// Example call: http://localhost:3000/scrape/example/json
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment