Created
November 23, 2017 09:19
-
-
Save dongkwan-kim/699edab9b6b7575cadf2fdf7aef3b4c7 to your computer and use it in GitHub Desktop.
Crawl news from http://mayor.seoul.go.kr/app/seoul/news.do
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from selenium import webdriver | |
from time import sleep | |
import csv | |
def run_driver(path):
    """Start a Chrome WebDriver and return it.

    Args:
        path: Passed straight to ``webdriver.Chrome``; the caller in this
            file supplies the location of the chromedriver executable.

    Returns:
        The Chrome driver, configured with an implicit wait of 3.
    """
    browser = webdriver.Chrome(path)
    # Give element lookups a short grace period before they fail.
    browser.implicitly_wait(3)
    return browser
def get_site(driver, url):
    """Navigate ``driver`` to ``url``.

    Args:
        driver: Object exposing a ``get(url)`` method (the WebDriver).
        url: Address to load.
    """
    target = url
    driver.get(target)
def click_prev(driver):
    """Click the element with class ``bx-prev`` on the current page.

    Args:
        driver: WebDriver positioned on the page to operate on.
    """
    prev_button = driver.find_element_by_class_name("bx-prev")
    prev_button.click()
def get_current_month(driver):
    """Return the text of the element matching ``.month-list.active``.

    Args:
        driver: WebDriver positioned on the page to read from.

    Returns:
        The ``text`` of the matched element.
    """
    return driver.find_element_by_css_selector(".month-list.active").text
def get_seoul_news_list(driver):
    """Return every element on the current page with class ``newspaper``.

    Args:
        driver: WebDriver positioned on the page to read from.

    Returns:
        Whatever ``find_elements_by_class_name("newspaper")`` yields.
    """
    return driver.find_elements_by_class_name("newspaper")
class SeoulNews:
    """One news entry scraped from a ``newspaper`` page element.

    Every instance attribute is a CSV column: ``url``, ``media``, ``date``,
    ``title``, ``category``, ``man_num``, ``man_text`` and ``month``.
    """

    def __init__(self, webelem, month):
        """Extract the entry's fields from its page element.

        Args:
            webelem: Element for a single news entry. It is queried for an
                ``a`` tag and for children with the classes ``media``,
                ``date``, ``title``, ``category`` and ``manifesto``.
            month: Label of the month the entry was listed under; stored
                unchanged.

        Raises:
            ValueError: If the ``manifesto`` text does not split into
                exactly two newline-separated parts.
        """
        self.url = webelem.find_element_by_tag_name("a").get_attribute("href")
        self.media = webelem.find_element_by_class_name("media").text
        self.date = webelem.find_element_by_class_name("date").text
        self.title = webelem.find_element_by_class_name("title").text
        self.category = webelem.find_element_by_class_name("category").text
        manifesto = webelem.find_element_by_class_name("manifesto").text
        # The two-way unpack requires exactly two lines in the manifesto
        # text; presumably a number line followed by a description line
        # (TODO confirm against the live page).
        self.man_num, self.man_text = manifesto.split("\n")
        self.month = month

    def __str__(self):
        """Return the entry's title."""
        return self.title

    def export_dict(self):
        """Return the entry's attributes as a new ``dict``.

        A shallow copy is returned rather than the instance's own
        ``__dict__`` so that callers can modify the result without
        changing this object.
        """
        return dict(self.__dict__)
def writer_csv(filename, fieldnames):
    """Create a CSV file, write its header row and return the writer.

    Args:
        filename: Path of the file to create (an existing file is
            truncated).
        fieldnames: Column names, in output order.

    Returns:
        A ``csv.DictWriter`` bound to the newly opened file.

    Note:
        The file object is owned by the returned writer and is not closed
        here; it is closed only when the writer is garbage collected or the
        interpreter exits.
    """
    # newline="" lets the csv module control line endings itself; without
    # it, rows end in "\r\r\n" on platforms that translate "\n".
    f = open(filename, "w", encoding="utf-8", newline="")
    wtr = csv.DictWriter(f, fieldnames=fieldnames)
    wtr.writeheader()
    return wtr
def main(num, driver_path="/Users/todo/chromedriver",
         url="http://mayor.seoul.go.kr/app/seoul/news.do",
         out_file="seoulnews.csv"):
    """Crawl ``num`` pages of news and write the entries to a CSV file.

    For each page, every ``newspaper`` element is converted to a
    ``SeoulNews`` and written as one CSV row. The "previous" control is
    then clicked to move to the next page to crawl.

    Args:
        num: Number of pages to visit.
        driver_path: Location of the chromedriver executable.
        url: Page to start crawling from.
        out_file: Path of the CSV file to create.

    Entries that fail to parse or write are skipped; their raw text is
    printed so they can be inspected afterwards.
    """
    fields = ["man_num", "man_text", "month", "title", "media", "date",
              "category", "url"]
    driver = run_driver(driver_path)
    try:
        get_site(driver, url)
        # The with-block guarantees buffered rows are flushed and the file
        # is closed; newline="" is what the csv module expects for files
        # it writes.
        with open(out_file, "w", encoding="utf-8", newline="") as out:
            wtr = csv.DictWriter(out, fieldnames=fields)
            wtr.writeheader()
            for _ in range(num):
                news_list = get_seoul_news_list(driver)
                month = get_current_month(driver)
                for news_elem in news_list:
                    try:
                        sn = SeoulNews(news_elem, month)
                        wtr.writerow(sn.export_dict())
                    except Exception:
                        # Best effort: report the entry and carry on.
                        # Exception (not a bare except) keeps Ctrl-C and
                        # SystemExit working.
                        print(news_elem.text)
                click_prev(driver)
                # Pause before reading the newly shown page.
                sleep(1)
    finally:
        # Always shut the browser down, even if the crawl fails midway.
        driver.quit()
# Script entry point: crawl 55 pages when run directly.
if __name__ == "__main__":
    main(55)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment