|
from selenium import webdriver
from selenium.webdriver.common.by import By
|
|
|
class Article:
    """Plain record describing one article (post)."""

    def __init__(self, id, title, author, date, view_count=0, like_count=0, contents=''):
        """Store the article fields exactly as given.

        Args:
            id: Article identifier.
            title: Article title.
            author: Author name or ID.
            date: Posting date; stored as passed in, without normalization.
            view_count: Number of views (defaults to 0).
            like_count: Number of likes (defaults to 0).
            contents: Article body (defaults to an empty string).
        """
        self.id = id
        self.title = title
        self.author = author
        self.date = date
        self.view_count = view_count
        self.like_count = like_count
        self.contents = contents

    def __repr__(self):
        """Return a constructor-like summary of the article.

        ``contents`` is not included in the output. Every field is rendered
        with ``%r`` so that strings containing quotes are escaped correctly
        and non-integer counts (e.g. ``None``) cannot make the repr raise.
        """
        summary_fields = (
            self.id,
            self.title,
            self.author,
            self.date,
            self.view_count,
            self.like_count,
        )
        return (
            'Article(id=%r, title=%r, author=%r, date=%r, '
            'view_count=%r, like_count=%r)' % summary_fields
        )
|
|
|
class Naver:
    """Drive a Selenium WebDriver to log in to Naver and search a Naver cafe."""

    def __init__(self, driver):
        """Keep the WebDriver that every method of this object drives.

        Args:
            driver: A Selenium WebDriver instance.
        """
        self.driver = driver

    def login(self, id, pw):
        """Log in through the Naver login form.

        Args:
            id: Account ID typed into the form field named ``id``.
            pw: Password typed into the form field named ``pw``.
        """
        driver = self.driver
        driver.get('https://nid.naver.com/nidlogin.login')  # Login page.
        driver.find_element(By.NAME, 'id').send_keys(id)  # Enter the ID.
        driver.find_element(By.NAME, 'pw').send_keys(pw)  # Enter the password.
        # Click the login button.
        driver.find_element(By.XPATH, '//*[@id="frmNIDLogin"]/fieldset/input').click()

    def cafe_search(self, name, keyword):
        """Search a cafe and return the articles listed on the result page.

        Only the page that loads right after submitting the search is read,
        and article bodies are not fetched (see the TODOs below).

        Args:
            name: Cafe name appended to ``http://cafe.naver.com/``.
            keyword: Search term typed into the cafe search box.

        Returns:
            A list of ``Article`` objects, one per selected result row, with
            ``contents`` left at its default.

        Raises:
            ValueError: If a view/like cell holds text that is not an integer
                (after removing thousands separators).
        """
        # TODO: Search every result page.
        # TODO: Crawl the article bodies.
        driver = self.driver
        driver.get('http://cafe.naver.com/%s' % name)  # Cafe main page.
        # Enter the search keyword.
        driver.find_element(By.XPATH, '//*[@id="topLayerQueryInput"]').send_keys(keyword)
        # Click the search button.
        driver.find_element(By.XPATH, '//*[@id="cafe-search"]/form/a').click()

        # The results are rendered inside an iframe, so switch context to it.
        iframe = driver.find_element(By.XPATH, '//*[@id="cafe_main"]')
        driver.switch_to.frame(iframe)
        try:
            # Select the article rows (every third <tr>, starting at the first).
            rows = driver.find_elements(
                By.CSS_SELECTOR,
                '#main-area > div:nth-child(8) > form > table > tbody > tr:nth-of-type(3n+1)',
            )
            return [self._parse_row(row) for row in rows]
        finally:
            # Restore the top-level context even when a row fails to parse,
            # so the driver is never left stuck inside the iframe.
            driver.switch_to.default_content()

    @staticmethod
    def _cell_text(row, xpath):
        """Return the stripped text of the element at *xpath* relative to *row*."""
        return row.find_element(By.XPATH, xpath).text.strip()

    @staticmethod
    def _parse_author(text):
        """Return the second line of an author cell, or the only line if there is one."""
        # The cell text is expected to be '<label line>\n{ID}'.
        parts = text.strip().split('\n')
        return parts[1] if len(parts) > 1 else parts[0]

    @staticmethod
    def _parse_count(text):
        """Convert a counter cell to int, ignoring thousands separators; blank means 0."""
        digits = text.strip().replace(',', '')
        return int(digits) if digits else 0

    @staticmethod
    def _parse_row(row):
        """Build an ``Article`` from the cells of one result row."""
        article_id = Naver._cell_text(row, 'td[1]/span')
        title = Naver._cell_text(row, 'td[2]/span')
        author = Naver._parse_author(Naver._cell_text(row, 'td[3]'))
        # e.g. '12:27' or '2017.11.03'.  TODO: Normalize the date format.
        date = Naver._cell_text(row, 'td[4]')
        view_count = Naver._parse_count(Naver._cell_text(row, 'td[5]'))
        like_count = Naver._parse_count(Naver._cell_text(row, 'td[6]'))
        return Article(article_id, title, author, date, view_count, like_count)
|
|
|
if __name__ == '__main__':
    # Example run: log in, search one cafe, and print every article found.
    #
    # Alternative driver (PhantomJS, http://phantomjs.org/download.html):
    #   webdriver.PhantomJS('d:/project/a/bin/phantomjs/phantomjs.exe')
    # ChromeDriver downloads: https://sites.google.com/a/chromium.org/chromedriver/downloads
    # NOTE(review): newer Selenium releases may expect a Service object instead
    # of a driver path argument -- confirm against the installed version.
    chrome = webdriver.Chrome('d:/project/a/bin/chrome_driver/chromedriver.exe')
    try:
        # chrome.implicitly_wait(3)  # Optional: set an implicit wait (seconds).
        naver = Naver(driver=chrome)
        # '{ID}' / '{PASSWORD}' look like placeholders for real credentials.
        naver.login(id='{ID}', pw='{PASSWORD}')
        articles = naver.cafe_search(name='joonggonara', keyword='라이젠')
        for article in articles:
            print(article)
    finally:
        # Always shut down the browser and the chromedriver process, even
        # when login or the search raises.
        chrome.quit()