Skip to content

Instantly share code, notes, and snippets.

@8bitDesigner
Last active January 2, 2016 06:49
Show Gist options
  • Save 8bitDesigner/8265831 to your computer and use it in GitHub Desktop.
Save 8bitDesigner/8265831 to your computer and use it in GitHub Desktop.
<?xml version='1.0' encoding='utf-8'?>
<% /*
  Underscore template for the Atom feed.

  Expected data: `title` and `url` for the feed itself, plus `entries`, an
  array of objects with title, author, published (a Date), id, html and a
  permalink stored under `url` or `link`.

  Scraped text is written with the escaping tag so that characters such as
  "&" or "<" cannot produce malformed XML. Only the comment HTML is emitted
  raw, because it lives inside a CDATA section.
*/ %>
<feed xmlns='http://www.w3.org/2005/Atom'>
  <title><%- title %></title>
  <id><%- url %></id>
  <link rel='alternate' type='text/html' href='<%- url %>' />
  <updated><%= (new Date).toISOString() %></updated>
<% entries.forEach(function(entry) { %>
  <entry>
    <title><%- entry.title %></title>
    <% /* Accept either key: the scraper stores the permalink as `link`. */ %>
    <link rel='alternate' type='text/html' href='<%- entry.url || entry.link %>' />
    <id><%- entry.id %></id>
    <published><%= entry.published.toISOString() %></published>
    <% /* Comments carry a single timestamp, so it doubles as the update time. */ %>
    <updated><%= entry.published.toISOString() %></updated>
    <author>
      <name><%- entry.author %></name>
    </author>
    <content type='html' xml:lang='en'>
      <% /* Split any CDATA terminator inside the HTML so it cannot end the section early. */ %>
      <![CDATA[ <%= (entry.html || '').split(']]>').join(']]]]><![CDATA[>') %> ]]>
    </content>
  </entry>
<% }) %>
</feed>
var jsdom = require('jsdom')
, http = require('http')
, async = require('async')
, _ = require('underscore')
, port = process.env.PORT || 5000
, feedXml = require('fs').readFileSync('./feed.xml').toString()
// Forum topic whose comments are republished as an Atom feed.
var url = 'http://help.octgn.net/support/discussions/topics/46055'

// Every request triggers a fresh scrape: first discover the topic's pages,
// then scrape each page, and finally render all comments through the
// feed template.
http.createServer(function(req, res) {
  // Reply with a 500 and a readable message. The error may be an Error object
  // or an array of errors, and res.end() only accepts a string or Buffer, so
  // it is converted explicitly.
  function bail(err) {
    res.writeHead(500, { 'Content-Type': 'text/plain' })
    res.end(String(err))
  }

  fetchPages(url, function(err, pages) {
    if (err) { return bail(err) }

    async.map(pages, fetchFeed, function(err, feeds) {
      if (err) { return bail(err) }

      var body
      try {
        // Rendering executes template code (e.g. Date#toISOString), which can
        // throw on unexpected scraped data; report that as a 500 rather than
        // letting the exception escape the callback.
        body = _.template(feedXml, {
          title: 'Mac test client',
          url: url,
          // One array of entries per page -> a single flat list of entries.
          entries: _(feeds).flatten(1)
        })
      } catch (renderError) {
        return bail(renderError)
      }

      res.writeHead(200, { 'Content-Type': 'application/atom+xml' });
      res.end(body, 'utf8')
    })
  })
}).listen(port)

console.log('Listening on port '+port)
/**
 * Load one page of the forum topic and convert every `.user-comment` element
 * on it into a feed entry.
 *
 * @param {string} url - Address of the topic page to scrape.
 * @param {function} cb - Node-style callback. Called as `cb(errors)` when the
 *   page could not be loaded, otherwise as `cb(null, entries)` where each entry
 *   has `title`, `author`, `published` (Date), `link`, `url`, `id` and `html`.
 */
function fetchFeed(url, cb) {
  jsdom.env({
    url: url,
    scripts: ["http://code.jquery.com/jquery.js"],
    done: function (errors, window) {
      // Check for failure before touching `window`: when loading failed there
      // may be no window at all, and reading `window.jQuery` would throw.
      if (errors) { return cb(errors) }
      var $ = window.jQuery

      var entries = $('.user-comment').toArray().map(function(node) {
        var comment = $(node)
        // Fall back to empty strings so a comment without an id attribute or
        // without a description node does not crash the whole scrape.
        var anchor = comment.attr('id') || ''
        var body = comment.find('.p-content .p-desc').html() || ''
        var permalink = url + '#' + anchor

        return {
          title: comment.find('.p-content').text().trim().substring(0, 50) + '...',
          author: comment.find('.user-name').text(),
          published: new Date(comment.find('.timeago').data('timeago')),
          link: permalink,
          // Same permalink under the key the feed template reads.
          url: permalink,
          // The element id looks like "<prefix>-<number>"; keep the last part.
          id: anchor.split('-').pop(),
          html: body.trim()
        }
      })

      return cb(null, entries)
    }
  })
}
/**
 * Load the first page of the forum topic and work out the addresses of all
 * of its pages from the numbered links inside `.pagination`.
 *
 * @param {string} url - Address of the topic (its first page).
 * @param {function} cb - Node-style callback. Called as `cb(errors)` when the
 *   page could not be loaded, otherwise as `cb(null, urls)` where `urls` starts
 *   with `url` itself followed by one `url + '/page/N'` per numbered link.
 */
function fetchPages(url, cb) {
  jsdom.env({
    url: url,
    scripts: ["http://code.jquery.com/jquery.js"],
    done: function (errors, window) {
      // Check for failure before touching `window`: when loading failed there
      // may be no window at all, and reading `window.jQuery` would throw.
      if (errors) { return cb(errors) }
      var $ = window.jQuery

      var urls = [url]
      var seen = {}

      $('.pagination').find('a').toArray().forEach(function(node) {
        var anchor = $(node)

        // Only real links count; anchors without an href are skipped.
        if (!anchor.attr('href')) { return }

        // Keep numbered links only ("Next", "Previous" and the like are
        // dropped). Building the address from the parsed number keeps stray
        // whitespace in the link text out of the URL.
        var page = parseInt(anchor.text(), 10)
        if (isNaN(page)) { return }

        // The same page number can be linked more than once; fetching it
        // twice would duplicate its comments in the feed.
        var pageUrl = url + '/page/' + page
        if (seen[pageUrl]) { return }
        seen[pageUrl] = true
        urls.push(pageUrl)
      })

      return cb(null, urls)
    }
  })
}
{
"name": "forum-to-rss-scraper",
"version": "0.0.0",
"description": "Script to pull a forum topic into an RSS feed via screen scraping",
"main": "index.js",
"dependencies": {
"jsdom": "~0.8.10",
"underscore": "~1.5.2",
"async": "~0.2.9"
},
"engines": {
"node": "0.10.x"
},
"author": "Paul Sweeney <[email protected]>",
"license": "BSD"
}
web: node index.js
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment