Last active
February 10, 2017 10:27
-
-
Save pchrysa/e284153a4f226e2c7468aae0db26cf5c to your computer and use it in GitHub Desktop.
Scrape with Cheerio
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict';

/*
 * Scrapes an Italian name-day calendar (one page per month) and writes the
 * result to ./it_namedays.json as:
 *   { "data": [ { "names": ["..."], "date": "DD/MM" }, ... ] }
 *
 * The scraper relies on the page layout implied by the selectors below: each
 * day is introduced by a `div.daycal` element holding the day number, and the
 * sibling `<a>` elements that follow it (up to the next `div.daycal`) each
 * contain a `div.cateon` with one name.
 */

const fs = require('fs');
const request = require('sync-request');
const cheerio = require('cheerio');
const _ = require('lodash');

const months = ['gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno', 'luglio', 'agosto', 'settembre', 'ottobre', 'novembre', 'dicembre'];
const url = 'http://m.paginainizio.com/onomasticimob.php?mese=';

/**
 * Formats a number as a string of at least two digits ("9" -> "09").
 *
 * @param {number} value - Day or month number.
 * @returns {string} The number, left-padded with a single zero when below 10.
 */
function padTwoDigits(value) {
    return value < 10 ? `0${value}` : String(value);
}

/**
 * Downloads one month page and extracts its name-day entries.
 *
 * @param {string} monthName - Italian month name, appended to the base URL.
 * @param {number} monthIndex - Zero-based position of the month in `months`.
 * @returns {Array<{names: string[], date: string}>} One entry per `div.daycal`
 *     found on the page, in document order.
 * @throws If the request fails; `getBody` is expected to throw on an error
 *     status code (see the sync-request documentation).
 */
function scrapeMonth(monthName, monthIndex) {
    const response = request('GET', url + monthName);
    const $ = cheerio.load(response.getBody('utf8'));
    const monthNumber = padTwoDigits(monthIndex + 1);

    return $('div.daycal').toArray().map((dayElement) => {
        const dayNumber = padTwoDigits(Number($(dayElement).first().text()));

        // Everything between this day marker and the next one belongs to this
        // day. Only the real elements of the selection are inspected: a
        // `for...in` over the cheerio object would also visit `length` and
        // inherited methods.
        const names = $(dayElement)
            .nextUntil('div.daycal')
            .toArray()
            .filter((element) => element.name === 'a')
            .map((anchor) => {
                const rawName = $(anchor).children('div.cateon').first().text();
                return _.startCase(_.toLower(rawName));
            });

        return {
            names: names,
            date: `${dayNumber}/${monthNumber}`
        };
    });
}

const dates = [];
months.forEach((monthName, monthIndex) => {
    console.log(monthName, '!!month!!');
    scrapeMonth(monthName, monthIndex).forEach((entry) => dates.push(entry));
});

fs.writeFile('it_namedays.json', JSON.stringify({data: dates}, null, 4), function(err){
    if (err) {
        // Report the failure instead of claiming success, and make the
        // process exit with a non-zero status.
        console.error('Failed to write ./it_namedays.json file:', err);
        process.exitCode = 1;
        return;
    }
    console.log('File successfully written! - ./it_namedays.json file');
});
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "scrapper",
  "version": "1.0.0",
  "description": "Scrape an Italian name-days HTML site into JSON",
  "main": "index.js",
  "scripts": {
    "start": "node index.js"
  },
  "keywords": [
    "scraper",
    "html-to-json"
  ],
  "author": "Chrysa Papadopoulou",
  "license": "MIT",
  "dependencies": {
    "cheerio": "^0.22.0",
    "lodash": "^4.17.4",
    "sync-request": "^4.0.1"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment