Created
June 20, 2017 12:30
-
-
Save imdkm/044b9f2fdda3bdb78ab0c77515698078 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os
from time import sleep

import requests
from bs4 import BeautifulSoup
# Inclusive range of chart years to scrape.
FIRST_YEAR = 1940
LAST_YEAR = 2016
# Directory the per-year CSV files are written to.
CSV_DIR = "./csv/"
# Header row placed at the top of every chart.
CHART_HEADER = ["rank", "artist name", "song title"]
# Seconds to pause after each successfully scraped year.
REQUEST_DELAY_SECONDS = 30
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT_SECONDS = 30


def build_chart(cells):
    """Group a flat list of table-cell texts into chart rows.

    The cells are consumed three at a time as (rank, artist name, song
    title). A trailing group with fewer than three cells is ignored instead
    of raising ``IndexError``.

    Args:
        cells: Sequence of strings, one per ``<td>`` element, in page order.

    Returns:
        A list of rows; the first row is the header, each following row is
        ``[rank, artist, title]``.
    """
    chart = [list(CHART_HEADER)]
    # Stop at len(cells) - 2 so that only complete triples are taken.
    for start in range(0, len(cells) - 2, 3):
        rank, artist, title = cells[start:start + 3]
        # Commas in artist/title are replaced with underscores, as the
        # original output did, so existing consumers of the CSV files that
        # split on "," keep working.
        chart.append([rank, artist.replace(",", "_"), title.replace(",", "_")])
    return chart


def fetch_chart(year, timeout=REQUEST_TIMEOUT_SECONDS):
    """Download and parse the chart page for one year.

    Args:
        year: Chart year used to build the page URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The chart rows (see ``build_chart``), or ``None`` when the page
        could not be fetched or did not answer with HTTP 200.
    """
    url = "http://billboardtop100of.com/" + str(year) + "-2/"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # A network failure for one year must not abort the whole run.
        print("url error: " + url + " (" + str(err) + ")")
        return None
    if response.status_code != 200:
        print("url error: " + url)
        return None
    soup = BeautifulSoup(response.content, "html.parser")
    return build_chart([cell.text for cell in soup.find_all("td")])


def save_chart(chart, year, directory=CSV_DIR):
    """Write the chart rows to ``<directory>/<year>.csv`` and return the path.

    Args:
        chart: List of rows to write.
        year: Chart year, used as the file name.
        directory: Output directory; created if it does not exist.
    """
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory, str(year) + ".csv")
    # newline="" stops the text layer from translating the "\n" line
    # terminator (it would become "\r\n" on Windows).
    with open(path, "w", encoding="utf-8", newline="") as f:
        csv.writer(f, lineterminator="\n").writerows(chart)
    return path


def main():
    """Scrape every year's chart and save each one as a CSV file."""
    for year in range(FIRST_YEAR, LAST_YEAR + 1):
        print("start scraping " + str(year) + "'s chart.")
        chart = fetch_chart(year)
        if chart is None:
            # Nothing was downloaded, so skip saving and the pause.
            continue
        save_chart(chart, year)
        print("-> " + str(year) + ".csv is finished. sleep for a while. \n")
        # Pause between years to avoid hammering the site.
        sleep(REQUEST_DELAY_SECONDS)
    print("whole process is done. check the directory")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment