Last active
May 31, 2020 00:44
-
-
Save reichert621/c17f733ecabf8983d48ef8fd86ca1027 to your computer and use it in GitHub Desktop.
Taro Example: Scraping recent YC job posts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const moment = require('moment'); | |
const request = require('superagent'); | |
const cheerio = require('cheerio'); | |
const _ = require('lodash'); | |
// Approximate length of each age unit, in hours. Months and years only need
// to be "longer than a day" for the recency cutoff, so rough values suffice.
const HOURS_PER_AGE_UNIT = new Map([
  ['second', 1 / 3600],
  ['minute', 1 / 60],
  ['hour', 1],
  ['day', 24],
  ['week', 24 * 7],
  ['month', 24 * 30],
  ['year', 24 * 365],
]);

/**
 * Converts a relative age label such as "16 hours ago" or "1 day ago" into
 * a number of hours.
 *
 * @param {string} ageText - Label in the form "<number> <unit> ago".
 * @returns {number} The age in hours, or NaN when the label is not a string
 *   in the expected form or uses an unknown unit.
 */
const parseAgeInHours = (ageText) => {
  if (typeof ageText !== 'string') {
    return NaN;
  }
  // The lazy unit group plus the optional "s" accepts both "hour" and "hours".
  const match = /^(\d+)\s+([a-z]+?)s?\s+ago$/i.exec(ageText.trim());
  if (match === null) {
    return NaN;
  }
  const hoursPerUnit = HOURS_PER_AGE_UNIT.get(match[2].toLowerCase());
  if (hoursPerUnit === undefined) {
    return NaN;
  }
  return Number.parseInt(match[1], 10) * hoursPerUnit;
};

/**
 * Tells whether an age label describes something strictly newer than the
 * given window.
 *
 * @param {string} ageText - Label such as "16 hours ago".
 * @param {number} [maxAgeHours=24] - Size of the recency window in hours.
 * @returns {boolean} True only when the label parses and is younger than the
 *   window; unparseable labels are never considered recent.
 */
const isRecentAge = (ageText, maxAgeHours = 24) => {
  // NaN compares false, so unparseable labels fall out here.
  return parseAgeInHours(ageText) < maxAgeHours;
};

/**
 * Fetches the Hacker News jobs page and returns the posts whose age label
 * is less than 24 hours.
 *
 * The age is decided purely from the label text, so the result does not
 * depend on when during the call the clock is read. A label of "1 day ago"
 * is exactly at the cutoff and is excluded.
 *
 * @returns {Promise<Array<{text: string, link: string, age: string}>>}
 *   One entry per recent post: link text, link href and the raw age label.
 * @throws {Error} When the request fails, or when the page does not yield
 *   exactly one age label per job link (the two lists are paired by index).
 */
const scraper = async () => {
  const url = 'https://news.ycombinator.com/jobs';
  const { text: html } = await request.get(url);
  const $ = cheerio.load(html);

  // .get() converts the cheerio collections into plain arrays.
  const links = $('.storylink')
    .map((i, el) => ({ href: $(el).attr('href'), text: $(el).text() }))
    .get();
  const ages = $('.age')
    .map((i, el) => $(el).text())
    .get();

  // Links and ages are matched by position, so differing counts would pair
  // posts with the wrong age. Fail loudly instead.
  if (links.length !== ages.length) {
    throw new Error(
      `Expected one age per job link, found ${links.length} links and ${ages.length} ages`
    );
  }

  return _.zip(links, ages)
    .map(([link, age]) => ({ text: link.text, link: link.href, age }))
    // item.age looks like "16 hours ago", "2 days ago", etc.
    .filter((item) => isRecentAge(item.age));
};
// Smoke run: call the scraper as soon as the module loads and print
// either the scraped posts or the failure.
scraper()
  .then((jobs) => console.log(jobs))
  .catch((error) => console.log(error));

/**
 * Entry point handed to the scheduler; simply delegates to the scraper.
 *
 * @returns {Promise<Array>} Whatever the scraper resolves with.
 */
function main() {
  return scraper();
}

// The function to run must be the module's default export,
// otherwise it cannot be deployed and scheduled.
module.exports = main;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment