Skip to content

Instantly share code, notes, and snippets.

@ds0nt
Last active August 29, 2015 14:25
Show Gist options
  • Save ds0nt/9cccc331b657a15d5e37 to your computer and use it in GitHub Desktop.
Save ds0nt/9cccc331b657a15d5e37 to your computer and use it in GitHub Desktop.
reddit scraper
#!/bin/babel-node
import jsdom from 'jsdom'
import fs from 'fs'
// Subreddit to scrape: the first command-line argument when one is given,
// otherwise the 'webware' default.
let subreddit = 'webware'
if (process.argv.length > 2) {
  subreddit = process.argv[2]
}
/**
 * Load a page through jsdom and hand back its window.
 *
 * Logs the URL being fetched, then adapts the callback-style `jsdom.env`
 * call into a promise.
 *
 * @param {string} url - Address passed to `jsdom.env` as its `url` option.
 * @returns {Promise<object>} Resolves with the window supplied to the `done`
 *   callback; rejects with the error when `done` reports one.
 */
function grab(url) {
  console.log('fetching', url)
  return new Promise((resolve, reject) => {
    // Single settle point: an error rejects, anything else resolves.
    const done = (error, win) => {
      if (error) {
        reject(error)
        return
      }
      resolve(win)
    }
    jsdom.env({ url: url, done: done })
  })
}
/**
 * Scrape the post links from a subreddit's listing page.
 *
 * Fetches `http://www.reddit.com/r/<subreddit>/` via `grab` and collects
 * every element matching `.thing a.title`.
 *
 * @param {string} subreddit - Subreddit name interpolated into the URL.
 * @returns {Promise<{posts: Array<{title: string, href: string}>, html: string}>}
 *   `posts` holds one `{ title, href }` record per matched link and `html`
 *   is the concatenated `outerHTML` of those links. Both are in document
 *   order, so `posts[i]` describes the i-th link in `html`.
 */
async function reddit(subreddit) {
  const { document } = await grab(`http://www.reddit.com/r/${subreddit}/`)
  // Copy the query result into a real array so map/join are available.
  const links = Array.from(document.querySelectorAll('.thing a.title'))
  const posts = links.map(link => ({
    // textContent rather than innerHTML: the title is stored as data, so it
    // must not carry HTML entities such as "&amp;" or nested markup.
    title: link.textContent,
    href: link.href,
  }))
  const html = links.map(link => link.outerHTML).join('')
  return { posts, html }
}
/**
 * Scrape the configured subreddit, print the posts, and append the results
 * to `reddit.json` (the posts as JSON) and `reddit.html` (the link markup).
 *
 * The writes are started but not awaited, so the returned promise settles
 * once the scrape is done and both appends have been scheduled. Because the
 * files are appended to, repeated runs accumulate output in the same files.
 *
 * @returns {Promise<void>} Rejects if the scrape itself fails.
 */
async function run() {
  const { posts, html } = await reddit(subreddit)
  console.log(posts)

  // Build a completion callback for one file: report success only when the
  // append really succeeded, and surface the error otherwise.
  const reportAppend = file => err => {
    if (err) {
      console.error(`failed to append file ${file}`, err)
      return
    }
    console.log(`appended file ${file}`)
  }

  fs.appendFile('reddit.json', JSON.stringify(posts), reportAppend('reddit.json'))
  fs.appendFile('reddit.html', html, reportAppend('reddit.html'))
}
// Kick off the scrape. A failed fetch or parse rejects the promise returned
// by run(); report it and mark the process as failed instead of leaving the
// rejection unhandled.
run().catch(err => {
  console.error(err)
  process.exitCode = 1
})
@ds0nt
Copy link
Author

ds0nt commented Jul 22, 2015

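Install the dependencies:
npm install --save babel jsdom
Run the script (the subreddit argument is optional and defaults to 'webware'):
./reddit.js <subreddit>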
[dsont@dsont scrapers]$ babel-node reddit-webware.js opensource
fetching http://www.reddit.com/r/opensource/
[ { title: 'Giving Back to Open Source',
    href: 'https://blog.frappe.io/blog/open-source/giving-back-to-open-source' },
  { title: 'OpenEduCat - Features',
    href: 'http://www.openeducat.org/page/features' },
  { title: 'RocksDB', href: 'http://rocksdb.org/' },
  { title: 'NSA Releases Linux-Based Open Source Infosec Tool',
    href: 'http://www.itnews.com.au/News/406509,nsa-releases-linux-based-open-source-infosec-tool.aspx' },
  { title: 'Overcoming the Massive Slowing the Internet of Things',
    href: 'http://sudosecurity.com/2015/07/17/overcoming-the-massive-slowing-the-internet-of-things/' },
  { title: 'TIL NSA has an official GitHub account',
    href: 'https://imgur.com/AlLAptP' },
  { title: 'Open Source, but Access denied. European commission at its best.',
    href: 'http://i.imgur.com/oeCub0F.png' },
  { title: 'Tango Lyrics Translation database, developed in Ruby on Rails (xpost from /r/tango)',
    href: 'https://github.com/alexvicegrab/tangoLyricsDB' },
  { title: 'ImGui', href: 'https://github.com/ocornut/imgui' },
  { title: 'I made an app and want it to be awesome, you can help.',
    href: 'http://www.reddit.com/r/opensource/comments/3ds8an/i_made_an_app_and_want_it_to_be_awesome_you_can/' },
  { title: 'share your favorite software projects here',
    href: 'http://jon.bubblenet.co/' },
  { title: 'Very old, but interesting article regarding Stardock\'s stance on open source',
    href: 'https://www.stardock.com/stardock/articles/opensource.html' },
  { title: 'Open Source Virtual Reality running on Mozilla\'s Firefox OS',
    href: 'https://firefoxoscentral.wordpress.com/2015/07/01/webvr-on-mobile-devices/' },
  { title: 'LF a free text-to-speech program; suggestions appreciated!',
    href: 'http://www.reddit.com/r/opensource/comments/3dx2uu/lf_a_free_texttospeech_program_suggestions/' },
  { title: 'New generation of robotics are industry-agnostic, open-source',
    href: 'http://www.zdnet.com/article/new-generation-of-robotics-are-industry-agnostic-open-source/' },
  { title: 'How dead is SeaMonkey?',
    href: 'https://blog.seamonkey-project.org/2015/07/08/how-dead-is-seamoneky/' },
  { title: 'Sleepless Nights and Public Data: BitCamp Hackathon',
    href: 'http://technology.finra.org/articles/finra-bitcamp-hackathon.html' },
  { title: 'Open source design job board',
    href: 'http://opensourcedesign.net/jobs/' },
  { title: 'OpenEduCat Educational ERP System - Free Demo',
    href: 'http://www.openeducat.org/page/demo' },
  { title: 'KeePassX » Blog Archiv – First 2.0 Beta released',
    href: 'https://www.keepassx.org/news/2015/07/503' },
  { title: 'Beyond Hadoop At Yahoo: Druid, Spark, &amp; Storm [x-post from /r/bigdata]',
    href: 'http://www.infoworld.com/article/2949168/hadoop/yahoo-struts-its-hadoop-stuff.html' },
  { title: '"This sounds really f\'ing crazy, and we like that, so we\'ll give you the money" - Andreessen Horowitz on OpenBazaar',
    href: 'https://hacked.com/p2p-e-commerce-network-openbazaar-ceo-gives-talk-packed-house-san-diego/' },
  { title: 'A web app I have been working on is almost finished. Should I use my real name or an online handle when releasing it on GitHub?',
    href: 'http://www.reddit.com/r/opensource/comments/3e61i5/a_web_app_i_have_been_working_on_is_almost/' },
  { title: 'Introducing Verified Content on TLDRLegal',
    href: 'https://tldrlegal.com/verified' },
  { title: 'Why I Am Pro-GPL',
    href: 'http://dustycloud.org/blog/why-i-am-pro-gpl/' } ]
appended file reddit.json
appended file reddit.html

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment