"""
scraped from here:
https://en.wikipedia.org/wiki/Category:Musical_groups_by_city_in_the_United_States
TODO
[x] get cities from wikipedia
[x] get bands from each city
- there are subcategories (a band in a subcategory is not listed in the parent category)
- try to identify "next page" links and add them to the queue, BFS style
[x] turn cities into coordinates city_to_coordinates
[x] make json file of cities, bands and give that to lucas?
[ ] find band's popularity? (sort by pop of wikipedia article? length of article? see the sketch after the imports)
[ ] make map of all the cities/bands?
maybe lucas does this
#__PUSH__# gist -u f1baf287c29201b291547e84c77b421c #__file__#
"""
import requests
import wikipediaapi  # imported but never used below
from bs4 import BeautifulSoup
import lxml  # not called directly; BeautifulSoup uses it via features="lxml"
import shelve
import time
import json
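
# One possible starting point for the "band popularity" TODO in the docstring
# above, using rendered article length as a crude popularity proxy. The
# "mw-content-text" div is Wikipedia's real content container, but treating
# its length as popularity is an assumption, not something this script does.
def band_article_length(band_href):
    """crude popularity proxy: character count of a band's article text"""
    html = requests.get(band_href)
    soup = BeautifulSoup(html.text, features="lxml")
    content = soup.find("div", attrs={"id": "mw-content-text"})
    return len(content.get_text()) if content else 0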


def main():
    """
    run the whole scraper, output to all_cities.json
    cache openstreetmap API calls in a shelve
    """
    cities = get_cities()
    with shelve.open("cities.db") as db:
        for idx, city in enumerate(cities):
            city_name = city["city_name"]
            bands = city["bands"]
            href = city["href"]
            if city_name in db:
                city_location = db[city_name]
            else:
                city_location = city_name_to_location(city_name)
                db[city_name] = city_location
            try:
                bands_in_city = scrape_city_for_bands(city_location, href)
            except Exception:
                print("bad city", city_location, href)
                bands_in_city = []
            cities[idx]["address"] = city_location.address
            cities[idx]["lat"] = city_location.latitude
            cities[idx]["lon"] = city_location.longitude
            cities[idx]["bands"] = bands_in_city
            time.sleep(1.5)  # stay under Nominatim's 1-request-per-second limit
    try:
        with open("all_cities.json", "w+") as f:
            json.dump(cities, f, indent=True)
    except Exception:
        breakpoint()  # drop into the debugger if the dump fails
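
# Illustrative shape of one all_cities.json record after main() runs; the
# values are invented, the keys follow the assignments above:
#   {"city_name": "Austin, Texas",
#    "href": "https://en.wikipedia.org/wiki/Category:Musical_groups_from_Austin,_Texas",
#    "address": "Austin, Travis County, Texas, United States",
#    "lat": 30.27, "lon": -97.74,
#    "bands": [{"href": "https://en.wikipedia.org/wiki/Some_Band", "name": "Some Band"}]}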


def city_name_to_location(city_name):
    """
    call the openstreetmap (Nominatim) API to turn a city name into a location object
    """
    from geopy.geocoders import Nominatim

    geolocator = Nominatim(user_agent="band_locator_omega")
    location = geolocator.geocode(city_name)
    return location
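
# Illustrative use of the geopy Location object consumed in main(); the
# address string and coordinates are approximate, not captured output:
#   loc = city_name_to_location("Austin, Texas")
#   loc.address    # "Austin, Travis County, Texas, United States"
#   loc.latitude   # ~30.27
#   loc.longitude  # ~-97.74
# geocode() returns None when Nominatim finds no match, which main() does
# not guard against.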


def get_cities():
    """
    get a list of all available US cities with listed bands (~100 cities)
    returns [ {city_name, href, bands: []} ]
    """
    URL = "https://en.wikipedia.org/wiki/Category:Musical_groups_by_city_in_the_United_States"
    html = requests.get(URL)
    soup = BeautifulSoup(html.text, features="lxml")
    elems = soup.findAll(attrs={"class": "mw-category-group"})
    cities = []
    for elem in elems:
        for a in elem.findAll("a"):
            # link text looks like "Musical groups from <city>"; keep the city part
            city_name = a.text.split("Musical groups from ")[1]
            href = "https://en.wikipedia.org" + a["href"]
            cities.append({"city_name": city_name, "href": href, "bands": []})
    return cities
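
# Illustrative entry, assuming the category page links read
# "Musical groups from Austin, Texas" (the shape follows the code above):
#   {"city_name": "Austin, Texas",
#    "href": "https://en.wikipedia.org/wiki/Category:Musical_groups_from_Austin,_Texas",
#    "bands": []}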


def scrape_city_for_bands(location, URL):
    """
    take a starting url and BFS over band pages, subcategories, and "next page" links
    returns [ {name: 'band_name', href: 'band_wiki_link'}, ... ]
    """
    from collections import deque

    bands = []
    seen_urls = {URL}  # seed with the start url so cycles back to it are skipped
    Q = deque([URL])
    # category hrefs are root-relative ("/wiki/..."), so just prepend the host
    rel2wiki = lambda url: f"https://en.wikipedia.org{url}"
    while Q:
        cur_url = Q.popleft()
        html = requests.get(cur_url)
        soup = BeautifulSoup(html.text, features="lxml")
        # paginated categories: visit "next page" before anything else in the queue
        next_page_link = soup.find("a", text="next page")
        if next_page_link:
            link = rel2wiki(next_page_link["href"])
            if link not in seen_urls:
                Q.appendleft(link)
                seen_urls.add(link)
        # subcategories go to the back of the queue
        subs = soup.find("div", attrs={"id": "mw-subcategories"})
        if subs:
            for sub_el in subs.findAll("a"):
                link = rel2wiki(sub_el["href"])
                if link not in seen_urls:
                    Q.append(link)
                    seen_urls.add(link)
        # band articles live under the "Pages in category" section, which
        # subcategory-only pages may lack
        elem = soup.find("div", attrs={"id": "mw-pages"})
        if elem:
            for a in elem.findAll("a"):
                band_name = a.text
                if band_name in ["learn more", "previous page", "next page"]:
                    continue
                href = rel2wiki(a["href"])
                bands.append({"href": href, "name": band_name})
    out_str = f"found {len(bands)} in {location}"
    print(out_str)
    return bands
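
# Illustrative return value (the band name and link are placeholders):
#   [{"href": "https://en.wikipedia.org/wiki/Some_Band", "name": "Some Band"}, ...]
# Design note: appendleft() puts "next page" links at the front of the queue, so
# a paginated category is read to the end before its subcategories are visited.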


if __name__ == "__main__":
    main()