Skip to content

Instantly share code, notes, and snippets.

@chaewonkong
Created February 9, 2018 14:20
Show Gist options
  • Select an option

  • Save chaewonkong/6ea2d1b364a8dbf14ee9b4045beaa607 to your computer and use it in GitHub Desktop.

Select an option

Save chaewonkong/6ea2d1b364a8dbf14ee9b4045beaa607 to your computer and use it in GitHub Desktop.
호갱노노 Scraper
'''"호갱노노" News Scraper
Scrape (title, date, url) from the Google News search results for "호갱노노" from the past week,
and write the scraped data to "hogangnono.csv".
'''
import requests
import csv
from bs4 import BeautifulSoup
# Google (co.kr) news search URL scraped by this script, split into its
# query components for readability. Adjacent string literals are joined
# at compile time, so the resulting value is a single URL string.
source = (
    "https://www.google.co.kr/search"
    # q: the search term "호갱노노", percent-encoded as UTF-8
    "?q=%ED%98%B8%EA%B0%B1%EB%85%B8%EB%85%B8"
    # tbm=nws selects news results; tbs=qdr:w presumably limits them to the
    # past week (per the module docstring) -- confirm against Google's params
    "&newwindow=1&tbm=nws&source=lnt&tbs=qdr:w"
    # Remaining parameters were captured from the browser session as-is
    "&sa=X&ved=0ahUKEwiqjuTSpZPZAhUFKpQKHecZC7IQpwUIHA"
    "&biw=1327&bih=1299&dpr=2"
)
def get_news(page):
    '''Download a search-results page and extract news titles, dates and URLs.

    Args:
        page: URL of the search-results page to download.

    Returns:
        A tuple ``(titles, dates, urls)`` of three lists:
        the text of every ``h3 > a`` link, the text of every ``div > span``
        element, and the ``href`` of every ``h3 > a`` link. ``titles`` and
        ``urls`` always have the same length; ``dates`` comes from a
        separate, broader selector and may have a different length.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            does not complete within the timeout.
    '''
    # Send a browser-like User-Agent so the request resembles a regular browser
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome"}

    # Fetch the page the caller asked for (not the module-level URL), and
    # bound the wait so an unresponsive server cannot hang the script forever.
    response = requests.get(page, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Headline links carry both the title text and the article URL
    links = soup.select("h3 > a")
    titles = [link.text for link in links]
    urls = [link.get("href") for link in links]

    # Dates are taken from every span that is a direct child of a div
    dates = [span.text for span in soup.select("div > span")]

    return titles, dates, urls
def save_csv(page, file_name):
    '''Scrape a search-results page and write its news items to a CSV file.

    Args:
        page: URL of the search-results page; passed through to get_news().
        file_name: Path of the CSV file to create (overwritten if present).

    The file starts with a ``TITLE, DATE, URL`` header row, followed by one
    row per scraped title, written from the last scraped item to the first.
    If fewer dates than titles were scraped, the missing dates are written
    as empty strings instead of aborting with a partially written file.
    '''
    # get_news returns three parallel lists: titles, dates, urls
    titles, dates, urls = get_news(page)

    # newline="" lets the csv module control line endings (otherwise blank
    # rows appear on Windows); explicit UTF-8 so Korean titles can be written
    # regardless of the platform's default encoding. The context manager
    # closes the file even if writing fails.
    with open(file_name, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(("TITLE", "DATE", "URL"))
        # Walk the lists from the end, pairing items by their distance from
        # the tail; this keeps the last-to-first row order.
        for offset in range(1, len(titles) + 1):
            date = dates[-offset] if offset <= len(dates) else ""
            writer.writerow((titles[-offset], date, urls[-offset]))
# Script entry: collect (title, date, url) for the "호갱노노" news search and
# write the result to hogangnono.csv under ../pyWeb/
save_csv(page=source, file_name="../pyWeb/hogangnono.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment