Created
February 9, 2018 14:20
-
-
Save chaewonkong/6ea2d1b364a8dbf14ee9b4045beaa607 to your computer and use it in GitHub Desktop.
호갱노노 Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''"Hogangnono" (호갱노노) News Scraper

Scrape (title, date, url) from the Google News search results for
"호갱노노" within 1 week, and create "hogangnono.csv" to contain the
scraped data.
'''
import requests
import csv
from bs4 import BeautifulSoup
# Google News search URL for the query "호갱노노" (URL-encoded in the q=
# parameter), restricted to results from the past week (tbs=qdr:w).
source = "https://www.google.co.kr/search?q=%ED%98%B8%EA%B0%B1%EB%85%B8%EB%85%B8&newwindow=1&tbm=nws&source=lnt&tbs=qdr:w&sa=X&ved=0ahUKEwiqjuTSpZPZAhUFKpQKHecZC7IQpwUIHA&biw=1327&bih=1299&dpr=2"
def get_news(page):
    '''Return three lists (titles, dates, urls) scraped from a Google News
    search-result page.

    Args:
        page: URL of the Google News search-result page to fetch.

    Returns:
        A tuple of three parallel lists: news titles, publication dates,
        and article URLs, in the order they appear on the page.
    '''
    # Send a browser-like User-Agent so Google serves the normal HTML page
    # instead of treating the request as a bot.
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome"}
    # BUG FIX: the original ignored `page` and always fetched the
    # module-level `source` URL; fetch the requested page instead.
    req = requests.get(page, headers=headers)
    soup = BeautifulSoup(req.text, "html.parser")
    # Find the desired elements with CSS selectors: each result headline is
    # an <a> inside an <h3>; dates live in <span>s inside <div>s.
    news = soup.select("h3 > a")
    date = soup.select("div > span")
    # Titles and target URLs come from the same anchor elements.
    titles = [anchor.text for anchor in news]
    urls = [anchor.get("href") for anchor in news]
    # Publication date of each result.
    dates = [span.text for span in date]
    return titles, dates, urls
def save_csv(page, file_name):
    '''Scrape a search-result page and save the news to a CSV file.

    Args:
        page: URL of the Google News search-result page to scrape.
        file_name: path of the CSV file to create (overwritten if present).
    '''
    # Get parallel lists of titles, dates and urls from the search results.
    titles, dates, urls = get_news(page)
    # newline="" is required by the csv module to avoid blank rows on
    # Windows, and utf-8 keeps the Korean titles intact; `with` guarantees
    # the file is closed even if writing fails.
    with open(file_name, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(("TITLE", "DATE", "URL"))
        # BUG FIX: the original popped from the end of each list, which
        # wrote the rows in reverse order and destroyed the lists; zip
        # preserves the scraped order without mutation.
        for row in zip(titles, dates, urls):
            writer.writerow(row)
# Scrape title, date, url of the news about "호갱노노" from the weekly
# search results and save them as hogangnono.csv in the sibling pyWeb
# directory (runs on import — this file is meant to be executed as a script).
save_csv(source, "../pyWeb/hogangnono.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment