Skip to content

Instantly share code, notes, and snippets.

@kyujin-cho
Last active January 15, 2018 07:38
Show Gist options
  • Save kyujin-cho/47d96fe9bb8e1c0d8b09af5d622e32fc to your computer and use it in GitHub Desktop.
Save kyujin-cho/47d96fe9bb8e1c0d8b09af5d622e32fc to your computer and use it in GitHub Desktop.
네이버 정치/경제/사회 크롤링
import requests
import datetime
import time
from bs4 import BeautifulSoup as BSoup
from bs4 import NavigableString
# Crawl configuration.
#
# Each section maps to a two-element list: the first element is sent as the
# ``componentId`` of the article-list request, the second is used as the
# ``sid1`` query parameter of the article page URL.
cIds = {
    '정치': ['950203', '100'],  # politics
    '경제': ['949986', '101'],  # economy
    '사회': ['949987', '102'],  # society
}

# Article-list endpoint; placeholders are componentId, date and page number.
base_url = (
    'http://news.naver.com/main/mainNews.nhn'
    '?componentId={}&date={}&page={}'
)

# Single-article page; placeholders are sid1, office id and article id.
news_url = (
    'http://news.naver.com/main/read.nhn'
    '?mode=LSD&mid=shm&sid1={}&oid={}&aid={}'
)

# strftime pattern used to render a day into the list URL's ``date`` field.
dt_str = '%Y-%m-%d %H:%M:%S'

# Inclusive crawl window: ``d`` is the first day, ``last_date`` the last one
# (both at midnight).
d = datetime.datetime(2017, 1, 1)
last_date = datetime.datetime(2017, 12, 31)
# Headers sent with every article-list request.
LIST_HEADERS = {
    'Host': 'news.naver.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0',
}

# Text nodes containing any of these substrings are dropped from the output.
BOILERPLATE_MARKERS = ('본문 내용', 'TV플레이어')


def fetch_list_page(component_id, date_str, page):
    """Fetch one page of the article list and return the decoded JSON.

    Args:
        component_id: Value placed in the URL and in the POST form field
            ``componentId``.
        date_str: Day to list, already formatted with ``dt_str``.
        page: Page number to request (the first page is 1).

    Returns:
        The parsed JSON response body.
    """
    response = requests.post(
        base_url.format(component_id, date_str, page),
        data={'componentId': component_id},
        headers=LIST_HEADERS,
    )
    return response.json()


def fetch_all_items(component_id, date_str):
    """Return the list items from every page for one section and day.

    The first response reports the page count under
    ``pagerInfo.totalPages``; the remaining pages are then fetched and their
    ``itemList`` entries are concatenated into a single flat list, so no page
    is left out.
    """
    first_page = fetch_list_page(component_id, date_str, 1)
    items = list(first_page['itemList'])
    total_pages = int(first_page['pagerInfo']['totalPages'])
    for page in range(2, total_pages + 1):
        items.extend(fetch_list_page(component_id, date_str, page)['itemList'])
    return items


def clean_body_lines(texts):
    """Strip each text fragment and drop blank or boilerplate ones.

    Args:
        texts: Iterable of strings taken from the article body.

    Returns:
        A list of stripped, non-empty strings that contain none of
        ``BOILERPLATE_MARKERS``, in their original order.
    """
    lines = []
    for text in texts:
        line = text.strip()
        if not line:
            # Whitespace-only nodes (e.g. a lone newline) carry no content.
            continue
        if any(marker in line for marker in BOILERPLATE_MARKERS):
            continue
        lines.append(line)
    return lines


def print_article(section_id, item):
    """Download one article page and print its body text.

    Only text nodes that are direct children of the
    ``div#articleBodyContents`` element are printed. Pages without that
    element are skipped silently.

    Args:
        section_id: Value for the ``sid1`` parameter of the article URL.
        item: One entry of a list response; its ``officeId`` and
            ``articleId`` keys identify the article.
    """
    article = requests.get(
        news_url.format(section_id, item['officeId'], item['articleId'])
    )
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BSoup(article.text, 'html.parser')
    body = soup.find('div', {'id': 'articleBodyContents'})
    if body is None:
        return
    texts = [node for node in body if isinstance(node, NavigableString)]
    print('\n'.join(clean_body_lines(texts)))


def main():
    """Crawl every configured section for each day from ``d`` to ``last_date``.

    Both bounds are inclusive. For each day and section, all list pages are
    read and every listed article's body text is printed to stdout.
    """
    day = d
    while day <= last_date:
        date_str = day.strftime(dt_str)
        for component_id, section_id in cIds.values():
            for item in fetch_all_items(component_id, date_str):
                # NOTE: no pause between article requests; insert a
                # time.sleep() here if request throttling is needed.
                print_article(section_id, item)
        day += datetime.timedelta(days=1)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment