Last active
August 15, 2018 12:52
-
-
Save Astro36/aac89d7ef1be5cab53cff9d88c6e0a23 to your computer and use it in GitHub Desktop.
Naver News Crawler for Node.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const fs = require('fs'); | |
const jsdom = require('jsdom'); | |
const parallel = require('parallel-tasks'); | |
const path = require('path'); | |
const request = require('request'); | |
const { JSDOM } = jsdom; | |
/**
 * Formats a date as a compact, zero-padded `YYYYMMDD` string.
 *
 * The year, month and day are read with the local-time getters, so the result
 * reflects the time zone of the machine running the script.
 *
 * @param {Date} date - Date to format.
 * @returns {string} The date as `YYYYMMDD`, e.g. `20180815`.
 */
const formatDate = (date) => {
  // Pad every field (including the year) so the output keeps a fixed layout.
  const year = String(date.getFullYear()).padStart(4, '0');
  const month = String(date.getMonth() + 1).padStart(2, '0'); // getMonth() is 0-based
  const day = String(date.getDate()).padStart(2, '0');
  return `${year}${month}${day}`;
};
/**
 * Lookup table from a press outlet's Korean name to its numeric id.
 *
 * The crawler zero-pads the id to three digits and interpolates it into the
 * `oid` query parameter of the Naver News URLs it requests.
 * Frozen so the shared table cannot be altered at runtime.
 */
const NewsType = Object.freeze({
  경향신문: 32,
  국민일보: 5,
  동아일보: 20,
  문화일보: 21,
  서울신문: 81,
  세계일보: 22,
  조선일보: 23,
  중앙일보: 25,
  한겨레: 28,
  한국일보: 469,
});
/**
 * Resolves a press outlet name to the three-digit, zero-padded id used in the
 * `oid` query parameter.
 *
 * @param {string} type - A key of `NewsType`.
 * @returns {string} The id padded to three digits, e.g. `'032'`.
 * @throws {TypeError} If `type` is not an own key of `NewsType`.
 */
const toOfficeId = (type) => {
  // Own-property check so inherited names such as `toString` are not accepted.
  if (!Object.prototype.hasOwnProperty.call(NewsType, type)) {
    throw new TypeError(`Unknown news type: ${String(type)}`);
  }
  return NewsType[type].toString().padStart(3, '0');
};

/**
 * Reduces the inner HTML of an article body to plain text.
 *
 * @param {string} html - Inner HTML of the article body element.
 * @returns {string} The text with comments and tags removed and `<br>` runs turned into newlines.
 */
const extractArticleText = (html) => html
  // Lazy match: a greedy one would also delete text sitting between two comments.
  .replace(/<!--[\s\S]*?-->/g, '')
  // Drops only the first `//` line (presumably the comment inside the embedded script — verify against a live page).
  .replace(/\/\/[^\n]*\n/, '')
  .replace(/(?:<br>)+/g, '\n')
  .replace(/<(?:.|\n)*?>/gm, '')
  // Script source that remains as text once its surrounding tags have been stripped.
  .replace('function _flash_removeCallback() {}', '')
  .trim();

/**
 * Crawler for Naver News list and article pages of a single press outlet.
 */
class NewsCrawler {
  /**
   * Fetches today's article list for the outlet and reads the article id
   * (`aid` query parameter) from the first link in the list.
   *
   * @param {string} type - A key of `NewsType`.
   * @returns {Promise<number>} The numeric article id of the first listed article.
   *   Rejects with a `TypeError` for an unknown `type`, with the request error
   *   when the request fails, or with an `Error` when no link carrying an
   *   `aid` parameter is found on the page.
   */
  static getLastestArticleId(type) {
    return new Promise((resolve, reject) => {
      // A throw here happens inside the executor and therefore rejects the promise.
      const typeId = toOfficeId(type);
      const listUrl = `http://news.naver.com/main/list.nhn?mode=LPOD&mid=sec&oid=${typeId}&listType=title&date=${formatDate(new Date())}`;
      request.get(listUrl, (err, httpResponse, body) => {
        if (err) {
          reject(err);
          return;
        }
        try {
          const { document } = new JSDOM(body).window;
          const link = document.querySelector('.list_body > ul.type02 > li > a');
          // Capture only the digits so a trailing `&param=...` cannot turn the id into NaN.
          const match = link ? /[?&]aid=(\d+)/.exec(link.href) : null;
          if (!match) {
            throw new Error(`No article link with an aid parameter found for ${type}`);
          }
          resolve(Number(match[1]));
        } catch (parseError) {
          // Exceptions thrown inside the callback would otherwise leave the promise pending.
          reject(parseError);
        }
      });
    });
  }

  /**
   * Downloads `articleAmount` article pages, counting down from `articleId`,
   * and stores each page that has both a title and a body as
   * `news<index>.json` (`{ title, content }`) in a directory named after
   * `type` next to this file. `index` 0 corresponds to `articleId` itself.
   *
   * Individual articles are best-effort: a failed request, a page without the
   * expected elements, or a failed write is logged (or skipped) and does not
   * abort the crawl.
   *
   * @param {string} type - A key of `NewsType`.
   * @param {number} articleId - Article id to start from; lower ids follow.
   * @param {number} [articleAmount=100] - Number of consecutive ids to request.
   * @returns {Promise<boolean>} Resolves to `true` once every task has finished.
   *   Rejects with a `TypeError` for an unknown `type`, or with the file-system
   *   error if the output directory cannot be created.
   */
  static async run(type, articleId, articleAmount = 100) {
    const typeId = toOfficeId(type);
    const newsDir = path.join(__dirname, type);
    if (!fs.existsSync(newsDir)) {
      fs.mkdirSync(newsDir);
    }

    const urls = Array.from(
      { length: articleAmount },
      (_, index) => `http://news.naver.com/main/read.nhn?mode=LPOD&mid=sec&oid=${typeId}&aid=${(articleId - index).toString().padStart(10, '0')}`,
    );

    // Each task always settles, so one bad article cannot stall the batch.
    const createTask = (url, index) => () => new Promise((done) => {
      console.log(url);
      // `encoding: null` requests the raw response bytes so JSDOM decodes the
      // page itself (see the request and jsdom documentation).
      request.get({ url, encoding: null }, (err, httpResponse, body) => {
        if (err) {
          console.error(`Failed to fetch ${url}:`, err);
        } else {
          try {
            const { document } = new JSDOM(body).window;
            const titleElement = document.querySelector('.article_header > .article_info > #articleTitle');
            const contentElement = document.querySelector('#articleBody > #articleBodyContents');
            if (titleElement && contentElement) {
              const title = titleElement.innerHTML.trim();
              const content = extractArticleText(contentElement.innerHTML);
              fs.writeFileSync(path.join(newsDir, `news${index}.json`), JSON.stringify({ title, content }));
            }
          } catch (saveError) {
            console.error(`Failed to save ${url}:`, saveError);
          }
        }
        done();
      });
    });

    await parallel.run(urls.map(createTask));
    return true;
  }
}
// Script entry: crawl 1000 articles of one outlet, counting down from a fixed article id.
// To start from today's first listed article instead, resolve the id with
// `NewsCrawler.getLastestArticleId(type)` and pass it to `run`.
// NOTE(review): this runs whenever the file is loaded, including through `require` —
// confirm that is intended before reusing the module elsewhere.
(async () => {
  await NewsCrawler.run('경향신문', 2854930, 1000);
})().catch((err) => {
  // Report a failed crawl instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});

module.exports = NewsCrawler;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment