gotomypc · October 28, 2012 03:07
diff --git a/NodeJS web scraper for Anagrams b/NodeJS web scraper for Anagrams
 //Web Scraper that scrapes a web page (without permission I may add). 
 //Simple test using node is all this is, no error handling etc.
 //Returns JSON OR XML
 //Format : localhost:'PORT'/scrape/'WORD'/'FORMAT'
 //Example CURL
 //JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
 //XML - curl -X GET http://localhost:3000/scrape/fundamental/xml

 var express = require('express'),
 	request = require('request'),
 	jsdom = require('jsdom'),
 	builder = require('xmlbuilder'),
 	sys = require('sys');

 // Create the Express server instance used by the route and listen calls below.
 var app = express.createServer();

 // Development environment: placeholder, nothing is configured here yet.
 app.configure('development', function() {});

 // Production environment: register Express's errorHandler middleware
 // (constructed with an empty options object).
 app.configure('production', function() {
 	var productionErrorHandler = express.errorHandler({});
 	app.use(productionErrorHandler);
 });

 // GET /scrape/:id/:format
 // Fetches the wordsmith.org anagram page for :id and replies with the anagrams
 // as XML when :format is 'xml', otherwise as JSON. Replies 502 when the
 // upstream request fails or does not return HTTP 200.
 app.get('/scrape/:id/:format', function(req, res) {
 	//Get the word
 	var word = req.params.id, formatType = req.params.format;

 	// Walks the siblings following `node` and returns the trimmed text of every
 	// text node longer than one character. Visits at most count * 2 siblings
 	// (presumably one <br> follows each anagram - confirm against the page
 	// markup) and stops early at a 'bottomlinks' element or when the siblings
 	// run out. A missing node or a NaN count yields an empty list.
 	function collectAnagrams(node, count) {
 		var anagrams = [];
 		var el = node ? node.nextSibling : null;
 		var limit = count * 2;
 		for (var j = 0; j < limit && el; j++) {
 			if (el.nodeType === 3 && el.nodeValue.length > 1) {
 				anagrams.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
 			}
 			// tagName casing depends on the DOM implementation, so compare
 			// case-insensitively.
 			if (el.tagName && el.tagName.toLowerCase() === 'bottomlinks') { break; }
 			el = el.nextSibling;
 		}
 		return anagrams;
 	}

 	//Handle the XML results (if we have any)
 	function getXMLResults(node, count) {
 		var root = builder.begin('anagrams');
 		collectAnagrams(node, count).forEach(function(anagram) {
 			var item = root.ele('anagram');
 			item.txt(anagram);
 		});
 		root.up();
 		return {'output' : builder.toString(), 'ctype': 'text/xml'};
 	}

 	//JSON Results
 	function getJSONResults(node, count) {
 		var anagObj = {'anagrams': collectAnagrams(node, count)};
 		return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
 	}

 	// Encode the word so characters such as '&' or '#' cannot alter the
 	// upstream query string.
 	var uri = 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n';

 	//Hit the website
 	request({uri: uri}, function(error, response, body) {
 		// Always answer the client, even when the upstream lookup failed.
 		if (error || response.statusCode !== 200) {
 			res.writeHead(502, {'Content-Type': 'text/plain'});
 			res.end('Upstream anagram lookup failed');
 			return;
 		}

 		var doc = jsdom.jsdom(body, null, {
 			features: {
 				FetchExternalResources: ['script'],
 				ProcessExternalResources: false
 			}
 		});
 		//determine the formatType;
 		var handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;
 		// Local to this request so concurrent requests cannot see each
 		// other's output.
 		var handleResultsOutput;

 		// The <b> whose text contains "found" carries the result count.
 		// NOTE(review): the anchor node is hard-coded to the third <b>
 		// regardless of which <b> matched - confirm against the page markup.
 		var b = doc.getElementsByTagName('b');
 		for (var i = 0; i < b.length; i++) {
 			var text = b[i].innerHTML;
 			if (text.search(/found/i) !== -1) {
 				handleResultsOutput = handleResults(b[2], parseInt(text, 10));
 			}
 		}

 		// No "found" marker on the page: reply with an empty result set in
 		// the requested format instead of crashing.
 		if (!handleResultsOutput) {
 			handleResultsOutput = handleResults(null, 0);
 		}

 		//Send it out.
 		res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
 		res.end(handleResultsOutput.output);
 	});

 });

 // Listen on the PORT environment variable when set, otherwise 3000, and log
 // the port actually used (the log previously always said 3000).
 var port = process.env.PORT || 3000;
 app.listen(port);
 console.log('server started port: ' + port);
	//Web Scraper that scrapes a web page (without permission I may add).
	//Simple test using node is all this is, no error handling etc.
	//Returns JSON OR XML
	//Format : localhost:'PORT'/scrape/'WORD'/'FORMAT'
	//Example CURL
	//JSON - curl -X GET http://localhost:3000/scrape/fundamental/json
	//XML - curl -X GET http://localhost:3000/scrape/fundamental/xml

	var express = require('express'),
	request = require('request'),
	jsdom = require('jsdom'),
	builder = require('xmlbuilder'),
	sys = require('sys');

	// Create the Express server instance used by the route and listen calls below.
	var app = express.createServer();

	// Development environment: placeholder, nothing is configured here yet.
	// (The callback body is a comment; without the asterisks it parsed as a
	// stray regex literal.)
	app.configure('development', function() {/*whatever you want*/});

	// Production environment: register Express's errorHandler middleware
	// (constructed with an empty options object).
	app.configure('production', function() {
		app.use(express.errorHandler({
		}));
	});

	// GET /scrape/:id/:format
	// Fetches the wordsmith.org anagram page for :id and replies with the anagrams
	// as XML when :format is 'xml', otherwise as JSON. Replies 502 when the
	// upstream request fails or does not return HTTP 200.
	app.get('/scrape/:id/:format', function(req, res) {
		//Get the word
		var word = req.params.id, formatType = req.params.format;

		// Walks the siblings following `node` and returns the trimmed text of every
		// text node longer than one character. Visits at most count * 2 siblings
		// (presumably one <br> follows each anagram - confirm against the page
		// markup) and stops early at a 'bottomlinks' element or when the siblings
		// run out. A missing node or a NaN count yields an empty list.
		function collectAnagrams(node, count) {
			var anagrams = [];
			var el = node ? node.nextSibling : null;
			var limit = count * 2;
			for (var j = 0; j < limit && el; j++) {
				if (el.nodeType === 3 && el.nodeValue.length > 1) {
					// Strip leading/trailing whitespace; the alternation must be an
					// unescaped '|' or nothing is ever trimmed.
					anagrams.push(el.nodeValue.replace(/^\s+|\s+$/g, ''));
				}
				// tagName casing depends on the DOM implementation, so compare
				// case-insensitively.
				if (el.tagName && el.tagName.toLowerCase() === 'bottomlinks') { break; }
				el = el.nextSibling;
			}
			return anagrams;
		}

		//Handle the XML results (if we have any)
		function getXMLResults(node, count) {
			var root = builder.begin('anagrams');
			collectAnagrams(node, count).forEach(function(anagram) {
				var item = root.ele('anagram');
				item.txt(anagram);
			});
			root.up();
			return {'output' : builder.toString(), 'ctype': 'text/xml'};
		}

		//JSON Results
		function getJSONResults(node, count) {
			var anagObj = {'anagrams': collectAnagrams(node, count)};
			return {'output' : JSON.stringify(anagObj), 'ctype' : 'application/json'};
		}

		// Encode the word so characters such as '&' or '#' cannot alter the
		// upstream query string.
		var uri = 'http://wordsmith.org/anagram/anagram.cgi?anagram=' + encodeURIComponent(word) + '&t=1000&a=n';

		//Hit the website
		request({uri: uri}, function(error, response, body) {
			// Always answer the client, even when the upstream lookup failed.
			if (error || response.statusCode !== 200) {
				res.writeHead(502, {'Content-Type': 'text/plain'});
				res.end('Upstream anagram lookup failed');
				return;
			}

			var doc = jsdom.jsdom(body, null, {
				features: {
					FetchExternalResources: ['script'],
					ProcessExternalResources: false
				}
			});
			//determine the formatType;
			var handleResults = (formatType === 'xml') ? getXMLResults : getJSONResults;
			// Local to this request so concurrent requests cannot see each
			// other's output.
			var handleResultsOutput;

			// The <b> whose text contains "found" carries the result count.
			// NOTE(review): the anchor node is hard-coded to the third <b>
			// regardless of which <b> matched - confirm against the page markup.
			var b = doc.getElementsByTagName('b');
			for (var i = 0; i < b.length; i++) {
				var text = b[i].innerHTML;
				if (text.search(/found/i) !== -1) {
					handleResultsOutput = handleResults(b[2], parseInt(text, 10));
				}
			}

			// No "found" marker on the page: reply with an empty result set in
			// the requested format instead of crashing.
			if (!handleResultsOutput) {
				handleResultsOutput = handleResults(null, 0);
			}

			//Send it out.
			res.writeHead(200, {'Content-Type': handleResultsOutput.ctype});
			res.end(handleResultsOutput.output);
		});

	});

	// Listen on the PORT environment variable when set, otherwise 3000, and log
	// the port actually used. (The escaped `\|\|` was a syntax error; the
	// operator is a plain logical OR.)
	var port = process.env.PORT || 3000;
	app.listen(port);
	console.log('server started port: ' + port);