Created
July 30, 2018 14:51
-
-
Save xnuk/0f2b62e1c1e9890a7c5822086b725212 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const {parse, SELECTOR, DATA, CONVERT} = require('crawl-it') | |
const http = require('http') | |
/**
 * Issue an HTTP GET request and pass the complete response body to `cb`.
 *
 * `cb` is only invoked for a 200 response. Any other status is reported on
 * stderr and its body is discarded; `cb` is not called in that case.
 *
 * @param {string|URL|object} url - Forwarded unchanged as the first argument of `http.get`.
 * @param {(body: string) => void} cb - Called once, on 'end', with the body decoded as UTF-8.
 * @returns {http.ClientRequest} Whatever `http.get` returns.
 */
const httpGet = (url, cb) => http.get(url, res => {
	if (res.statusCode !== 200) {
		console.error(`httpGet: unexpected status code ${res.statusCode}`)
		// An unread response keeps its socket busy; drain it so the
		// connection can be released.
		res.resume()
		return
	}
	res.setEncoding('utf8')
	let body = ''
	res.on('data', ch => body += ch)
	res.on('end', () => cb(body))
})
/**
 * Extraction template passed to crawl-it's `parse` together with the page HTML.
 *
 * Sections:
 *  - `meta`   : `where` / `when` fields looked up under `#header>h1`.
 *  - `grapes` : one entry per `li[id^="jsAnchorDust"]`, carrying a cleaned
 *               `name` and an `id`.
 *  - `script` : text of the last <script> in <body>, reduced to a list of
 *               `{id, data}` records scraped from the Google chart setup code.
 *
 * The top-level CONVERT attaches each chart's `data` to the `grapes` entry
 * with the same `id` and returns the `meta` fields merged with `grapes`.
 *
 * NOTE(review): the exact meaning of the SELECTOR / DATA / CONVERT keys and
 * of the `/text()` and `@id` selector suffixes is defined by crawl-it —
 * confirm against its documentation.
 */
const parseTemplate = {
	meta: {
		[SELECTOR]: '#header>h1',
		where: '.tit/text()',
		when: '.tim'
	},
	grapes: [{
		[SELECTOR]: 'li[id^="jsAnchorDust"]',
		name: {
			[DATA]: './.tit',
			// Drop the first "환경기준" occurrence and surrounding whitespace.
			[CONVERT]: title => title.replace(/환경기준/, '').trim()
		},
		id: '.graph2@id'
	}],
	// script parsing
	script: {
		[DATA]: 'body>script:last-of-type/text()',
		[CONVERT]: source => {
			// Only the text after the marker comment is inspected.
			const chartSection = source.split('//7가지 물질 차트생성')[1] || ''
			const chunks = chartSection.split('new google.visualization.DataTable();')
			const charts = []
			for (const chunk of chunks) {
				const [tablePart, renderPart] = chunk.split('new google.visualization.LineChart(')
				const rowsPart = tablePart.split('data.addRows([')[1]
				// A chunk without both an addRows call and a LineChart call is skipped.
				if (rowsPart == null || renderPart == null) continue

				const idMatch = renderPart.match(/getElementById\('([^']+)'\)\)/)
				const id = idMatch ? idMatch[1] : undefined

				const data = []
				for (const row of rowsPart.split(');')[0].split('[')) {
					const hit = row.match(/['"]([0-2]?[0-9])시['"]\s*,\s*([0-9\.]+)\s*,/)
					// hour becomes an integer; val stays the matched string.
					if (hit) data.push({hour: hit[1] | 0, val: hit[2]})
				}
				charts.push({id, data})
			}
			return charts
		}
	},
	[CONVERT]: ({meta, grapes, script}) => {
		for (const grape of grapes) {
			const chart = script.find(entry => entry.id === grape.id)
			if (chart != null) grape.data = chart.data
		}
		return {...meta, grapes}
	}
}
// Fetch the m.airkorea.or.kr main page for one fixed coordinate, run the HTML
// through parseTemplate and print the parsed result.
httpGet({
	hostname: 'm.airkorea.or.kr',
	// Hwanggan Station
	path: '/main?lat=36.2250075&lng=127.9120465',
	headers: {
		Accept: 'text/html;charset=utf-8',
		'Accept-Charset': 'utf-8',
		// User Agent is important. It should be given.
		'User-Agent': 'curl/7.54.0'
	}
}, body => {
	const result = parse(body, parseTemplate)
	// depth: null lifts console.dir's default depth limit of 2, which would
	// otherwise collapse deeply nested values to [Array] / [Object].
	console.dir(result, {depth: null})
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment