Created
February 9, 2018 14:20
-
-
Save chaewonkong/6ea2d1b364a8dbf14ee9b4045beaa607 to your computer and use it in GitHub Desktop.
호갱노노 Scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''"Hogangnono" (호갱노노) News Scraper

Scrape (title, date, url) from the Google News search results for
"호갱노노" within 1 week, and create "hogangnono.csv" to contain the
scraped data.
'''
import requests
import csv
from bs4 import BeautifulSoup
# Google News search URL for the query "호갱노노" (URL-encoded in the q=
# parameter), restricted to results from the past week (tbs=qdr:w).
source = "https://www.google.co.kr/search?q=%ED%98%B8%EA%B0%B1%EB%85%B8%EB%85%B8&newwindow=1&tbm=nws&source=lnt&tbs=qdr:w&sa=X&ved=0ahUKEwiqjuTSpZPZAhUFKpQKHecZC7IQpwUIHA&biw=1327&bih=1299&dpr=2"
def get_news(page):
    '''Return three lists (titles, dates, urls) scraped from a Google News
    search-result page.

    Args:
        page: URL of the Google News search-result page to fetch.

    Returns:
        A tuple of three parallel lists: news titles, publication dates,
        and article URLs, in the order they appear on the page.
    '''
    # Send a browser-like User-Agent so Google serves the normal HTML page
    # instead of treating the request as a bot.
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome"}
    # BUG FIX: the original ignored `page` and always fetched the
    # module-level `source` URL; fetch the requested page instead.
    req = requests.get(page, headers=headers)
    soup = BeautifulSoup(req.text, "html.parser")
    # Find the desired elements with CSS selectors: each result headline is
    # an <a> inside an <h3>; dates live in <span>s inside <div>s.
    news = soup.select("h3 > a")
    date = soup.select("div > span")
    # Titles and target URLs come from the same anchor elements.
    titles = [anchor.text for anchor in news]
    urls = [anchor.get("href") for anchor in news]
    # Publication date of each result.
    dates = [span.text for span in date]
    return titles, dates, urls
def save_csv(page, file_name):
    '''Scrape a search-result page and save the news to a CSV file.

    Args:
        page: URL of the Google News search-result page to scrape.
        file_name: path of the CSV file to create (overwritten if present).
    '''
    # Get parallel lists of titles, dates and urls from the search results.
    titles, dates, urls = get_news(page)
    # newline="" is required by the csv module to avoid blank rows on
    # Windows, and utf-8 keeps the Korean titles intact; `with` guarantees
    # the file is closed even if writing fails.
    with open(file_name, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(("TITLE", "DATE", "URL"))
        # BUG FIX: the original popped from the end of each list, which
        # wrote the rows in reverse order and destroyed the lists; zip
        # preserves the scraped order without mutation.
        for row in zip(titles, dates, urls):
            writer.writerow(row)
# Scrape title, date, url of the news about "호갱노노" from the weekly
# search results and save them as hogangnono.csv in the sibling pyWeb
# directory (runs on import — this file is meant to be executed as a script).
save_csv(source, "../pyWeb/hogangnono.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment