Last active
January 15, 2018 07:38
-
-
Save kyujin-cho/47d96fe9bb8e1c0d8b09af5d622e32fc to your computer and use it in GitHub Desktop.
네이버 정치/경제/사회 크롤링
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import datetime
import time

import requests
from bs4 import BeautifulSoup as BSoup
from bs4 import NavigableString
# Crawl configuration.
#
# Each section maps to a two-element list:
#   [0] componentId sent to the headline-list endpoint (base_url)
#   [1] sid1 section code used when building the article URL (news_url)
cIds = {
    '정치': ['950203', '100'],  # politics
    '경제': ['949986', '101'],  # economy
    '사회': ['949987', '102'],  # society
}

# Headline-list endpoint; placeholders in order: componentId, date, page.
base_url = 'http://news.naver.com/main/mainNews.nhn?componentId={}&date={}&page={}'
# Article page; placeholders in order: sid1, oid (officeId), aid (articleId).
news_url = 'http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1={}&oid={}&aid={}'

# strftime pattern used to render the `date` query parameter.
dt_str = '%Y-%m-%d %H:%M:%S'

# Crawl window: the main loop starts at `d` and advances one day at a time
# until it has processed `last_date` (inclusive).
last_date = datetime.datetime(2017, 12, 31, 0, 0, 0, 0)
d = datetime.datetime(2017, 1, 1, 0, 0, 0, 0)
# Headers sent with every headline-list request.
_LIST_HEADERS = {
    'Host': 'news.naver.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:58.0) Gecko/20100101 Firefox/58.0',
}

# Strings inside the article body that contain any of these markers are
# dropped from the printed text.
_BOILERPLATE_MARKERS = ('본문 내용', 'TV플레이어')


def _fetch_news_page(component_id, date_str, page):
    """POST for one page of the headline list and return the decoded JSON."""
    response = requests.post(
        base_url.format(component_id, date_str, page),
        data={'componentId': component_id},
        headers=_LIST_HEADERS,
    )
    return response.json()


def _fetch_news_items(component_id, date_str):
    """Return the items from *every* list page for one component and date.

    The first response carries ``pagerInfo.totalPages``; pages 2..N are then
    fetched and their ``itemList`` entries appended in page order.
    """
    first_page = _fetch_news_page(component_id, date_str, 1)
    items = list(first_page['itemList'])
    total_pages = int(first_page['pagerInfo']['totalPages'])
    for page in range(2, total_pages + 1):
        items.extend(_fetch_news_page(component_id, date_str, page)['itemList'])
    return items


def _extract_article_text(html):
    """Return the article body text, or ``None`` if the body div is missing.

    Only strings that are direct children of ``div#articleBodyContents`` are
    kept; nested tags are ignored. Whitespace-only strings and strings
    containing a boilerplate marker are dropped, and the remaining strings
    are stripped and joined with newlines.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BSoup(html, 'html.parser')
    body = soup.find('div', {'id': 'articleBodyContents'})
    if body is None:
        return None

    lines = []
    for node in body:
        if not isinstance(node, NavigableString):
            continue
        text = node.strip()
        if text and not any(marker in text for marker in _BOILERPLATE_MARKERS):
            lines.append(text)
    return '\n'.join(lines)


# Walk the crawl window one day at a time and print every article of every
# configured section.
while d <= last_date:
    date_str = d.strftime(dt_str)
    for component_id, section_id in cIds.values():
        for item in _fetch_news_items(component_id, date_str):
            article = requests.get(
                news_url.format(section_id, item['officeId'], item['articleId'])
            )
            text = _extract_article_text(article.text)
            if text is None:
                # Page has no article body container; nothing to print.
                continue
            print(text)
            # NOTE: requests are issued back to back with no delay; add a
            # time.sleep(...) here if throttling becomes necessary.
    d += datetime.timedelta(days=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment