Skip to content

Instantly share code, notes, and snippets.

@oiehot
Created November 4, 2017 04:38
Show Gist options
  • Save oiehot/f55c635e8e628ee339e4b32c7c93b409 to your computer and use it in GitHub Desktop.
Save oiehot/f55c635e8e628ee339e4b32c7c93b409 to your computer and use it in GitHub Desktop.
Naver cafe crawler (wip)
from selenium import webdriver
class Article():
    """One post row taken from a cafe search result list.

    The instance simply stores the values it is given; no validation or
    normalization is performed.
    """

    def __init__(self, id, title, author, date, view_count=0, like_count=0, contents=''):
        """Store the article fields.

        Args:
            id: Article identifier. (The name shadows the builtin, but it is
                part of the keyword interface callers use, so it is kept.)
            title: Article title.
            author: Author name/ID.
            date: Posting date or time, stored as given (not normalized).
            view_count: Number of views; must be numeric for ``__repr__``.
            like_count: Number of likes; must be numeric for ``__repr__``.
            contents: Article body text; empty by default.
        """
        self.id = id
        self.title = title
        self.author = author
        self.date = date
        self.view_count = view_count
        self.like_count = like_count
        self.contents = contents

    def __repr__(self):
        """Return a constructor-like summary (``contents`` is left out)."""
        template = (
            "Article(id='%s', title='%s', author='%s', date='%s', "
            "view_count=%d, like_count=%d)"
        )
        fields = (
            self.id,
            self.title,
            self.author,
            self.date,
            self.view_count,
            self.like_count,
        )
        return template % fields
class Naver():
    """Small Selenium helper that logs in to Naver and searches a cafe.

    Elements are located through the generic ``find_element(by, value)`` /
    ``find_elements(by, value)`` calls, because the ``find_element_by_*``
    shortcuts and ``switch_to_frame`` no longer exist in current Selenium
    releases.
    """

    # Locator strategy names accepted by ``find_element``; these are the same
    # string values that ``selenium.webdriver.common.by.By`` exposes.
    _BY_NAME = 'name'
    _BY_XPATH = 'xpath'
    _BY_CSS = 'css selector'

    def __init__(self, driver):
        """Keep the Selenium WebDriver instance used for every request."""
        self.driver = driver

    def login(self, id, pw):
        """Log in through the Naver login form.

        Args:
            id: Account ID typed into the ``id`` field.
            pw: Password typed into the ``pw`` field.
        """
        driver = self.driver
        driver.get('https://nid.naver.com/nidlogin.login')  # Login page.
        driver.find_element(self._BY_NAME, 'id').send_keys(id)  # Enter the ID.
        driver.find_element(self._BY_NAME, 'pw').send_keys(pw)  # Enter the password.
        # Click the login button.
        driver.find_element(self._BY_XPATH, '//*[@id="frmNIDLogin"]/fieldset/input').click()

    def cafe_search(self, name, keyword):
        """Search a cafe and return the articles listed on the result page.

        Args:
            name: Cafe name as used in the ``cafe.naver.com/<name>`` URL.
            keyword: Search keyword.

        Returns:
            A list of ``Article`` objects, one per selected result row.
        """
        # TODO: search every result page
        # TODO: crawl the article contents
        driver = self.driver
        driver.get('http://cafe.naver.com/%s' % name)  # Cafe main page.
        # Enter the search keyword.
        driver.find_element(self._BY_XPATH, '//*[@id="topLayerQueryInput"]').send_keys(keyword)
        # Click the search button.
        driver.find_element(self._BY_XPATH, '//*[@id="cafe-search"]/form/a').click()

        # The results live inside an iframe, so switch the driver context.
        iframe = driver.find_element(self._BY_XPATH, '//*[@id="cafe_main"]')
        driver.switch_to.frame(iframe)
        try:
            # Select the article rows: every third <tr>, starting at the first.
            rows = driver.find_elements(
                self._BY_CSS,
                '#main-area > div:nth-child(8) > form > table > tbody > tr:nth-of-type(3n+1)',
            )
            return [self._parse_row(row) for row in rows]
        finally:
            # Always leave the iframe, even when parsing a row fails, so the
            # driver stays usable for the next call.
            driver.switch_to.default_content()

    def _parse_row(self, row):
        """Build an ``Article`` from one result-table row element."""
        def cell_text(xpath):
            return row.find_element(self._BY_XPATH, xpath).text.strip()

        article_id = cell_text('td[1]/span')
        title = cell_text('td[2]/span')
        author = self._parse_author(cell_text('td[3]'))
        date = cell_text('td[4]')  # ex) '12:27', '2017.11.03'  # TODO: normalize
        view_count = self._parse_count(cell_text('td[5]'))
        like_count = self._parse_count(cell_text('td[6]'))
        return Article(article_id, title, author, date, view_count, like_count)

    @staticmethod
    def _parse_author(text):
        """Extract the author from the author cell text.

        The cell text looks like ``'<persona icon/ID area>\\n{ID}'``, so the
        second line is the author. When there is only one line, that line is
        returned instead of raising ``IndexError``.
        """
        lines = text.strip().split('\n')
        return lines[1] if len(lines) > 1 else lines[0]

    @staticmethod
    def _parse_count(text):
        """Convert a counter cell to ``int``.

        Thousands separators are removed first. Text that still is not an
        integer (for example an empty cell) yields 0, matching the default
        counts of ``Article``, so one odd row does not abort the whole search.
        """
        cleaned = text.strip().replace(',', '')
        try:
            return int(cleaned)
        except ValueError:
            return 0
if __name__ == '__main__':
    # Alternative driver (http://phantomjs.org/download.html):
    # driver = webdriver.PhantomJS('d:/project/a/bin/phantomjs/phantomjs.exe')
    # ChromeDriver downloads: https://sites.google.com/a/chromium.org/chromedriver/downloads
    # NOTE(review): recent Selenium releases may expect a Service object
    # instead of a positional driver path - confirm against the installed version.
    chrome = webdriver.Chrome('d:/project/a/bin/chrome_driver/chromedriver.exe')
    try:
        # driver.implicitly_wait(3)  # Set the implicit page-load wait time.
        naver = Naver(driver=chrome)
        naver.login(id='{ID}', pw='{PASSWORD}')
        articles = naver.cafe_search(name='joonggonara', keyword='라이젠')
        for article in articles:
            print(article)
    finally:
        # Close the browser and the chromedriver process even if a step fails.
        chrome.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment