""" | |
scraped from here: | |
https://en.wikipedia.org/wiki/Category:Musical_groups_by_city_in_the_United_States | |
TODO | |
[x] get cities from wikipedia | |
[x] get bands from each city | |
- there's subcollections (a band in a subcoll is not featured in the parent) | |
- try to identify next_pages add to Q. bfs style | |
[x] turn cities into coordinates city_to_coordinates | |
[x] make json file of cities, bands and give that to lucas? | |
[ ] find band's popularity? (sort by pop of wikipedia article? length of article?) | |
[ ] make map of all the cities/bands? | |
maybe lucas does this | |
#__PUSH__# gist -u f1baf287c29201b291547e84c77b421c #__file__# | |
""" | |
import json
import shelve
import time
from collections import deque

import requests
from bs4 import BeautifulSoup  # lxml must also be installed for the "lxml" parser
from geopy.geocoders import Nominatim
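
# Third-party dependencies (inferred from the imports above):
#   pip install requests beautifulsoup4 lxml geopy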


def main():
    """
    run whole scraper, output to all_cities.json
    store openstreetmaps api calls in a shelve
    """
    cities = get_cities()
    with shelve.open("cities.db") as db:
        for idx, city in enumerate(cities):
            city_name = city["city_name"]
            href = city["href"]
            # shelve acts as a persistent cache so reruns skip the geocoding API
            if city_name in db:
                city_location = db[city_name]
            else:
                city_location = city_name_to_location(city_name)
                db[city_name] = city_location
            if city_location is None:
                # geocode() returns None when Nominatim can't resolve the name
                print("could not geocode", city_name)
                continue
            try:
                bands_in_city = scrape_city_for_bands(city_location, href)
            except Exception:
                print("bad city", city_location, href)
                bands_in_city = []
            cities[idx]["address"] = city_location.address
            cities[idx]["lat"] = city_location.latitude
            cities[idx]["lon"] = city_location.longitude
            cities[idx]["bands"] = bands_in_city
            time.sleep(1.5)
    try:
        with open("all_cities.json", "w+") as f:
            json.dump(cities, f, indent=2)
    except Exception:
        breakpoint()
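
# Quick sanity check of the output (hypothetical usage, run after main()):
#   with open("all_cities.json") as f:
#       cities = json.load(f)
#   print(len(cities), "cities,", sum(len(c["bands"]) for c in cities), "bands")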


def city_name_to_location(city_name):
    """
    call openstreetmaps API (Nominatim) to turn a city name into a location object
    """
    geolocator = Nominatim(user_agent="band_locator_omega")
    return geolocator.geocode(city_name)
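
# Example (values illustrative): geopy returns a Location exposing the
# .address/.latitude/.longitude attributes that main() stores per city:
#   loc = city_name_to_location("Austin, Texas")
#   if loc:
#       print(loc.address, loc.latitude, loc.longitude)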


def get_cities():
    """
    get a list of all available US cities with listed bands. ~100 cities
    returns [ {city_name, href, bands: []} ]
    """
    URL = "https://en.wikipedia.org/wiki/Category:Musical_groups_by_city_in_the_United_States"
    html = requests.get(URL)
    soup = BeautifulSoup(html.text, features="lxml")
    elems = soup.findAll(attrs={"class": "mw-category-group"})
    cities = []
    for elem in elems:
        for a in elem.findAll("a"):
            # category titles look like "Musical groups from <city>"
            city_name = a.text.split("Musical groups from ")[1]
            href = "https://en.wikipedia.org" + a["href"]
            cities.append({"city_name": city_name, "href": href, "bands": []})
    return cities
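
# Example element of the returned list (shape only; values illustrative):
#   {"city_name": "Austin, Texas",
#    "href": "https://en.wikipedia.org/wiki/Category:Musical_groups_from_Austin,_Texas",
#    "bands": []}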


def scrape_city_for_bands(location, URL):
    """
    take starting url and do a bfs finding bands/subcategories/next pages
    returns [ {name: 'band_name', href: 'band_wiki_link'}, ... ]
    """
    bands = []
    seen_urls = set()
    Q = deque([URL])

    def rel2wiki(url):
        # hrefs scraped from the page are root-relative ("/wiki/...")
        return f"https://en.wikipedia.org{url}"

    while Q:
        cur_url = Q.popleft()
        html = requests.get(cur_url)
        soup = BeautifulSoup(html.text, features="lxml")
        # paginated categories: put the "next page" link at the front of the queue
        next_page_link = soup.find("a", string="next page")
        if next_page_link:
            link = rel2wiki(next_page_link["href"])
            if link not in seen_urls:
                Q.appendleft(link)
                seen_urls.add(link)
        # subcategories: a band in a subcategory is not listed in the parent
        subs = soup.find("div", attrs={"id": "mw-subcategories"})
        if subs:
            for sub_el in subs.findAll("a"):
                link = rel2wiki(sub_el["href"])
                if link not in seen_urls:
                    Q.append(link)
                    seen_urls.add(link)
        # band pages live under #mw-pages (absent on subcategory-only pages)
        elem = soup.find("div", attrs={"id": "mw-pages"})
        if elem is None:
            continue
        for a in elem.findAll("a"):
            band_name = a.text
            if band_name in ["learn more", "previous page", "next page"]:
                continue
            href = rel2wiki(a["href"])
            bands.append({"href": href, "name": band_name})
    print(f"found {len(bands)} in {location}")
    return bands
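

# Hedged sketch for the "find band's popularity" TODO above. One crude proxy
# (an assumption, not a settled design) is the raw size of the band's
# Wikipedia article: longer articles loosely correlate with better-known acts.
def estimate_band_popularity(band_href):
    """Return the length of the band's Wikipedia page HTML as a rough popularity score."""
    resp = requests.get(band_href)
    return len(resp.text) if resp.ok else 0

# Hypothetical usage: sort a city's bands from most to least "popular".
#   ranked = sorted(bands, key=lambda b: estimate_band_popularity(b["href"]), reverse=True)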


if __name__ == "__main__":
    main()