Last active
August 29, 2015 14:25
-
-
Save ds0nt/9cccc331b657a15d5e37 to your computer and use it in GitHub Desktop.
reddit scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/babel-node | |
| import jsdom from 'jsdom' | |
| import fs from 'fs' | |
// Subreddit to scrape: the first command-line argument, or 'webware' when no
// argument is given. `||` (not a length check) is deliberate so that an empty
// string argument also falls back to the default instead of producing the
// malformed URL ".../r//".
const subreddit = process.argv[2] || 'webware'
/**
 * Load a page through jsdom and expose the result as a Promise.
 *
 * Logs the URL being fetched, then calls `jsdom.env` with the URL and a
 * `done` callback that settles the returned promise.
 *
 * @param {string} url - Address handed to `jsdom.env`.
 * @returns {Promise<Object>} Resolves with the `window` passed to the `done`
 *   callback; rejects with the error passed to it.
 */
function grab(url) {
  console.log('fetching', url)

  // Builds the node-style (error, window) callback that settles the promise.
  const settleWith = (resolve, reject) => (error, window) => {
    if (error) {
      reject(error)
      return
    }
    resolve(window)
  }

  return new Promise((resolve, reject) => {
    jsdom.env({ url, done: settleWith(resolve, reject) })
  })
}
/**
 * Scrape the listing page of a subreddit for its post title links.
 *
 * Fetches `http://www.reddit.com/r/<subreddit>/` via `grab` and selects every
 * element matching `.thing a.title`.
 *
 * @param {string} subreddit - Subreddit name, interpolated into the URL as-is.
 * @returns {Promise<{posts: Array<{title: string, href: string}>, html: string}>}
 *   `posts` holds one entry per matched link, ordered last match first;
 *   `html` is the matched links' `outerHTML` concatenated in the order the
 *   selector returned them.
 */
async function reddit(subreddit) {
  const { document } = await grab(`http://www.reddit.com/r/${subreddit}/`)

  // querySelectorAll yields an array-like; copy it so array methods apply.
  const links = Array.from(document.querySelectorAll('.thing a.title'))

  // Posts are listed last-to-first (the order this function has always
  // produced), hence the reversed copy.
  const posts = links
    .slice()
    .reverse()
    .map((link) => ({
      // textContent rather than innerHTML: the title is stored as data, so it
      // must be plain text without escaped entities or nested markup.
      title: link.textContent,
      href: link.href,
    }))

  const html = links.map((link) => link.outerHTML).join('')

  return { posts, html }
}
/**
 * Append `data` to the file at `path` and log a confirmation once written.
 *
 * @param {string} path - File to append to.
 * @param {string} data - Text to append.
 * @returns {Promise<void>} Resolves after the append succeeded; rejects with
 *   the error reported by `fs.appendFile`.
 */
function appendAndReport(path, data) {
  return new Promise((resolve, reject) => {
    fs.appendFile(path, data, (err) => {
      if (err) {
        // Do not claim success when the write failed.
        reject(err)
        return
      }
      console.log(`appended file ${path}`)
      resolve()
    })
  })
}

/**
 * Scrape the configured subreddit, print the posts, and append the results
 * to `reddit.json` (JSON-serialised posts) and `reddit.html` (link markup).
 *
 * NOTE: every run appends another serialised array to `reddit.json`, so after
 * more than one run the file is a concatenation of arrays rather than a
 * single JSON document.
 *
 * @returns {Promise<void>} Resolves once both files were appended; rejects if
 *   scraping or either write fails.
 */
async function run() {
  const { posts, html } = await reddit(subreddit)
  console.log(posts)

  // The two writes are independent, so they run concurrently.
  await Promise.all([
    appendAndReport('reddit.json', JSON.stringify(posts)),
    appendAndReport('reddit.html', html),
  ])
}
// Script entry point. The promise must not be left floating: report any
// failure and signal it to the shell through a non-zero exit code.
run().catch((err) => {
  console.error(err)
  process.exitCode = 1
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
[dsont@dsont scrapers]$ babel-node reddit-webware.js opensource fetching http://www.reddit.com/r/opensource/ [ { title: 'Giving Back to Open Source', href: 'https://blog.frappe.io/blog/open-source/giving-back-to-open-source' }, { title: 'OpenEduCat - Features', href: 'http://www.openeducat.org/page/features' }, { title: 'RocksDB', href: 'http://rocksdb.org/' }, { title: 'NSA Releases Linux-Based Open Source Infosec Tool', href: 'http://www.itnews.com.au/News/406509,nsa-releases-linux-based-open-source-infosec-tool.aspx' }, { title: 'Overcoming the Massive Slowing the Internet of Things', href: 'http://sudosecurity.com/2015/07/17/overcoming-the-massive-slowing-the-internet-of-things/' }, { title: 'TIL NSA has an official GitHub account', href: 'https://imgur.com/AlLAptP' }, { title: 'Open Source, but Access denied. European commission at its best.', href: 'http://i.imgur.com/oeCub0F.png' }, { title: 'Tango Lyrics Translation database, developed in Ruby on Rails (xpost from /r/tango)', href: 'https://github.com/alexvicegrab/tangoLyricsDB' }, { title: 'ImGui', href: 'https://github.com/ocornut/imgui' }, { title: 'I made an app and want it to be awesome, you can help.', href: 'http://www.reddit.com/r/opensource/comments/3ds8an/i_made_an_app_and_want_it_to_be_awesome_you_can/' }, { title: 'share your favorite software projects here', href: 'http://jon.bubblenet.co/' }, { title: 'Very old, but interesting article regarding Stardock\'s stance on open source', href: 'https://www.stardock.com/stardock/articles/opensource.html' }, { title: 'Open Source Virtual Reality running on Mozilla\'s Firefox OS', href: 'https://firefoxoscentral.wordpress.com/2015/07/01/webvr-on-mobile-devices/' }, { title: 'LF a free text-to-speech program; suggestions appreciated!', href: 'http://www.reddit.com/r/opensource/comments/3dx2uu/lf_a_free_texttospeech_program_suggestions/' }, { title: 'New generation of robotics are industry-agnostic, open-source', href: 
'http://www.zdnet.com/article/new-generation-of-robotics-are-industry-agnostic-open-source/' }, { title: 'How dead is SeaMonkey?', href: 'https://blog.seamonkey-project.org/2015/07/08/how-dead-is-seamoneky/' }, { title: 'Sleepless Nights and Public Data: BitCamp Hackathon', href: 'http://technology.finra.org/articles/finra-bitcamp-hackathon.html' }, { title: 'Open source design job board', href: 'http://opensourcedesign.net/jobs/' }, { title: 'OpenEduCat Educational ERP System - Free Demo', href: 'http://www.openeducat.org/page/demo' }, { title: 'KeePassX » Blog Archiv – First 2.0 Beta released', href: 'https://www.keepassx.org/news/2015/07/503' }, { title: 'Beyond Hadoop At Yahoo: Druid, Spark, & Storm [x-post from /r/bigdata]', href: 'http://www.infoworld.com/article/2949168/hadoop/yahoo-struts-its-hadoop-stuff.html' }, { title: '"This sounds really f\'ing crazy, and we like that, so we\'ll give you the money" - Andreessen Horowitz on OpenBazaar', href: 'https://hacked.com/p2p-e-commerce-network-openbazaar-ceo-gives-talk-packed-house-san-diego/' }, { title: 'A web app I have been working on is almost finished. Should I use my real name or an online handle when releasing it on GitHub?', href: 'http://www.reddit.com/r/opensource/comments/3e61i5/a_web_app_i_have_been_working_on_is_almost/' }, { title: 'Introducing Verified Content on TLDRLegal', href: 'https://tldrlegal.com/verified' }, { title: 'Why I Am Pro-GPL', href: 'http://dustycloud.org/blog/why-i-am-pro-gpl/' } ] appended file reddit.json appended file reddit.html