Created
January 21, 2018 23:13
-
-
Save theriley106/ba40d28e3c317e38adba22d0bbe43a72 to your computer and use it in GitHub Desktop.
Script used to grab this Dataset: https://www.kaggle.com/theriley106/university-statistics/data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import json | |
def grabSite(url, timeout=30):
    """Fetch *url* with a desktop-browser User-Agent header.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The ``requests`` response object for the GET request.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/60.0.3112.113 Chrome/60.0.3112.113 Safari/537.36'}
    return requests.get(url, headers=headers, timeout=timeout)
def flattenSchool(school):
    """Merge one ranking item into a single flat dict.

    Starts from a copy of ``school['institution']`` and then, for every
    entry of ``school['searchData']``, stores that entry's ``'rawValue'``
    under the same key. A key present in both places ends up with the
    ``searchData`` raw value.

    Args:
        school: Mapping with ``'institution'`` and ``'searchData'`` keys.

    Returns:
        A new dict combining both sections.
    """
    information = dict(school['institution'])
    for key, val in school['searchData'].items():
        information[key] = val['rawValue']
    return information


if __name__ == '__main__':
    DB = []
    # Pages 1 through 16 of the rankings table.
    for i in range(1, 17):
        url = "https://www.usnews.com/best-colleges/rankings/national-universities?_mode=table&_page={}&format=json".format(i)
        # Call form prints the same text on Python 2 and Python 3.
        print(url)
        res = grabSite(url).json()
        for school in res['data']['items']:
            DB.append(flattenSchool(school))
    # Write the accumulated records once every page has been collected.
    with open('data.json', 'w') as fp:
        json.dump(DB, fp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment