Created
February 20, 2021 19:29
-
-
Save stuartlangridge/bdcffe7201e12d0a9521b4cf89b7c1e0 to your computer and use it in GitHub Desktop.
Pub quiz question: which country has produced the most million selling popstars per square km? https://twitter.com/dsquareddigest/status/1363092145301827586
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Rank countries by million-selling UK chart artists per unit of land area.

Artist nationality and country area are looked up on Wikidata.
"""
import os.path
import urllib.parse

import requests
import requests_cache

# Route every `requests` call through a cache named "million_sellers" stored
# beside this script, so repeated runs do not re-query Wikidata.
requests_cache.install_cache(
    os.path.join(os.path.dirname(__file__), 'million_sellers')
)
# Artist credit of each million-selling single, one entry per single, so an
# act with several million-sellers appears several times. Source:
# https://www.officialcharts.com/chart-news/the-best-selling-singles-of-all-time-on-the-official-uk-chart__21298/
artists = [
    "ELTON JOHN", "BAND AID", "QUEEN", "WINGS", "JOHN TRAVOLTA & OLIVIA NEWTON-JOHN",
    "FRANKIE GOES TO HOLLYWOOD", "BONEY M", "PHARRELL WILLIAMS", "THE BEATLES",
    "WHAM!", "WET WET WET", "BONEY M", "STEVIE WONDER", "ROBSON GREEN & JEROME FLYNN",
    "BRYAN ADAMS", "AQUA", "CHER", "THE BEATLES", "WILL YOUNG", "JOHN LENNON",
    "MARK RONSON FT BRUNO MARS", "SURVIVOR", "ROBIN THICKE/TI/PHARRELL",
    "PUFF DADDY & FAITH EVANS", "ADELE", "WHITNEY HOUSTON", "HUMAN LEAGUE",
    "JOHN TRAVOLTA & OLIVIA NEWTON-JOHN", "BADDIEL & SKINNER & LIGHTNING SEEDS",
    "FRANKIE GOES TO HOLLYWOOD", "COOLIO FEATURING L.V.", "CELINE DION",
    "BRITNEY SPEARS", "VARIOUS ARTISTS", "THE BEATLES",
    "MAROON 5 FEATURING CHRISTINA AGUILERA", "GEORGE MICHAEL", "CULTURE CLUB",
    "OASIS", "GOTYE FEATURING KIMBRA", "KEN DODD", "VILLAGE PEOPLE", "AVICII",
    "BLACK EYED PEAS", "DAFT PUNK FT PHARRELL WILLIAMS", "FUGEES", "JENNIFER RUSH",
    "DEXY'S MIDNIGHT RUNNERS", "BILL HALEY & HIS COMETS", "SHAGGY FEATURING RIKROK",
    "RIHANNA FEATURING CALVIN HARRIS", "THE SEEKERS", "THE BEATLES",
    "KINGS OF LEON", "THE BEATLES", "SPICE GIRLS", "ENGELBERT HUMPERDINCK",
    "CELINE DION", "ALL SAINTS", "SOFT CELL", "CARLY RAE JEPSEN",
    "TONY CHRISTIE FT PETER KAY", "BRUNO MARS", "GARETH GATES",
    "RUN-D.M.C. VS JASON NEVINS", "BLONDIE", "DAVID GUETTA FT SIA",
    "ALEXANDRA BURKE", "JAMES ARTHUR", "SLADE", "PSY", "JOHN LEGEND",
    "CLEAN BANDIT FT JESS GLYNNE", "LMFAO FEATURING LAUREN BENNETT AND GOONROCK",
    "EMINEM FEATURING RIHANNA", "ELVIS PRESLEY", "POGUES FT KIRSTY MACCOLL",
    "TOM JONES", "NEW ORDER", "PASSENGER", "ED SHEERAN", "PAUL ANKA",
    "ROBBIE WILLIAMS", "MARIAH CAREY", "JESSIE J FEATURING B.O.B",
    "ART GARFUNKEL", "STEPS",
    "MICHAEL JACKSON", "NATALIE IMBRUGLIA", "AEROSMITH", "KYLIE MINOGUE",
    "WHIGFIELD", "SNOW PATROL", "LADY GAGA", "MR ACKER BILK AND HIS PARAMOUNT JAZZ BAND",
    "HARRY BELAFONTE", "ENGELBERT HUMPERDINCK", "BAND AID 20",
    "RAY PARKER JR.", "RIHANNA", "DAVID SOUL", "BABYLON ZOO", "BOYZONE",
    "PINK FLOYD", "THE BEATLES", "FUN FT JANELLE MONAE", "ABBA",
    "ONEREPUBLIC", "GARY GLITTER", "SPICE GIRLS", "GNARLS BARKLEY",
    "EIFFEL 65", "DONNA SUMMER", "KATY PERRY", "IAN DURY AND THE BLOCKHEADS",
    "ED SHEERAN", "JOURNEY", "KINGS OF LEON", "TAKE THAT", "ROBSON & JEROME",
    "LEONA LEWIS", "SHAYNE WARD", "RIHANNA", "EMINEM",
    "THE RIGHTEOUS BROTHERS", "KATY PERRY", "TELETUBBIES", "FRANK IFIELD",
    "ADELE", "HEAR'SAY", "BLACK BOX", "GLORIA GAYNOR", "TAKE THAT",
    "IRENE CARA", "UB40", "NAUGHTY BOY FT SAM SMITH", "BILLY JOEL",
    "ROD STEWART", "CLIFF RICHARD & THE SHADOWS", "SWEDISH HOUSE MAFIA/MARTIN",
    "ADAM AND THE ANTS", "PITBULL FEATURING NE-YO, AFROJACK AND NAYER",
    "LADY GAGA",
    "BING CROSBY WITH THE KEN DARBY SINGERS AND JOHN SCOTT TROTTER ORCHESTRA",
    "TIGHT FIT", "PETER ANDRE FEATURING BUBBLER RANX", "ENRIQUE IGLESIAS",
    "THE ARCHIES", "CHERYL COLE", "ELTON JOHN & KIKI DEE", "LOU BEGA", "BLACK EYED PEAS",
    "JULIE COVINGTON", "CHRISTINA PERRI", "KILLERS", "BOB THE BUILDER", "BROTHERHOOD OF MAN",
    "ATOMIC KITTEN", "NO DOUBT", "GERRY & THE PACEMAKERS", "NICKI MINAJ", "ADELE",
    "ONE DIRECTION", "OASIS", "MATT CARDLE", "THE NEW SEEKERS", "SHOWADDYWADDY",
    "DAWN FEATURING TONY ORLANDO", "RICK ASTLEY", "KYLIE MINOGUE & JASON DONOVAN",
    "COLDPLAY", "BRUNO MARS", "THE SIMON PARK ORCHESTRA", "CEE LO GREEN", "FRANK SINATRA",
    "LITTLE JIMMY OSMOND", "LMFAO", "GOO GOO DOLLS",
]
# Keywords that separate a lead artist from guest artists in a chart credit.
# They are matched as whole, space-delimited words, and only the text before
# the keyword is looked up.
SPLITTERS = ["FT", "FEATURING"]
# Wikidata entity ids accepted as the "instance of" (P31) value of an artist.
# A search hit whose first P31 value is not a key here is discarded by
# wikidata(). The values are descriptive labels only; the lookup code tests
# key membership.
KINDS = {
    "Q5741069": "rock group",
    "Q5": "person",
    "Q215380": "musical group",
    "Q9212979": "musical duo",
    "Q641066": "girl group",
    "Q7623897": "all-female band",
    "Q71129815": "electronica duo",
    "Q216337": "boy band",
    "Q6619719": "fictional musical group",
}
def _fetch_entity(entityid):
    """Fetch one Wikidata entity document; return None if it cannot be read."""
    url = f"https://www.wikidata.org/wiki/Special:EntityData/{entityid}.json"
    try:
        return requests.get(url).json()["entities"][entityid]
    except (KeyError, ValueError):
        # ValueError: body was not JSON. KeyError: the response is keyed by a
        # different id than the one requested.
        return None


def _claim_entity_id(claims, prop, index=0):
    """Return the entity id held by claims[prop][index], or None.

    None covers a missing property, an out-of-range index, and a statement
    that has no "datavalue" entry.
    """
    try:
        return claims[prop][index]["mainsnak"]["datavalue"]["value"]["id"]
    except (KeyError, IndexError, TypeError):
        return None


def wikidata(q):
    """Resolve an artist name to its country and that country's area.

    Searches Wikidata for *q*, takes the best hit that does not look like an
    album (or, for the top hit, a "department"), and requires its first
    "instance of" (P31) value to be listed in KINDS. The country is taken
    from P27, else P495, else the P17 of the entity's P740 location.

    Args:
        q: Artist name to search for.

    Returns:
        A ``(q, country_name, area)`` tuple, where ``area`` is the country's
        first P2046 amount truncated to int, or None when any step of the
        lookup has no usable data.
    """
    query = urllib.parse.urlencode({
        "action": "query",
        "list": "search",
        "srsearch": q,
        "format": "json",
    })
    # Parse the response once instead of re-decoding it for every access.
    hits = requests.get(
        f"https://www.wikidata.org/w/api.php?{query}"
    ).json()["query"]["search"]
    if not hits:
        return None

    # Skip past hits that describe something other than the artist.
    pick = 0
    if "album" in hits[0]["snippet"] or "department" in hits[0]["snippet"]:
        pick = 1
        if len(hits) > 1 and "album" in hits[1]["snippet"]:
            pick = 2
    if pick >= len(hits):
        # Every available hit was rejected; nothing left to fall back on.
        return None
    entityid = hits[pick]["title"]

    entity = _fetch_entity(entityid)
    if entity is None:
        return None
    claims = entity.get("claims", {})
    # A missing P31 yields None, which is never a key of KINDS.
    if _claim_entity_id(claims, "P31") not in KINDS:
        return None

    country_entityid = (
        _claim_entity_id(claims, "P27") or _claim_entity_id(claims, "P495")
    )
    if country_entityid is None:
        # Fall back to the country of the entity's P740 location.
        location_entityid = _claim_entity_id(claims, "P740")
        if location_entityid is None:
            return None
        location = _fetch_entity(location_entityid)
        if location is None:
            return None
        # The last P17 statement is used, as in the original lookup.
        country_entityid = _claim_entity_id(
            location.get("claims", {}), "P17", index=-1
        )
        if country_entityid is None:
            return None

    country = _fetch_entity(country_entityid)
    if country is None:
        return None
    try:
        name = country["labels"]["en"]["value"]
        amount = country["claims"]["P2046"][0]["mainsnak"]["datavalue"]["value"]["amount"]
    except (KeyError, IndexError):
        return None
    # Amounts arrive as signed decimal strings such as "+242495"; float()
    # accepts the leading sign.
    # NOTE(review): the unit of the P2046 amount is not checked - confirm km².
    area = int(float(amount))
    return (q, name, area)
def _lead_artist(credit):
    """Return the part of a chart credit before any SPLITTERS keyword."""
    for splitter in SPLITTERS:
        credit = credit.split(f" {splitter} ")[0]
    return credit


# Group each distinct lead artist under the country Wikidata reports for it.
# dict.fromkeys() drops repeat credits while keeping first-seen order, so an
# act with several million-selling singles is counted once.
by_country = {}
for artist in dict.fromkeys(_lead_artist(credit) for credit in artists):
    res = wikidata(artist)
    if not res:
        continue
    artist, country, area = res
    if country not in by_country:
        by_country[country] = {"area": area, "artists": []}
    by_country[country]["artists"].append(artist)

# Density is expressed as artists per million km² to keep the numbers readable.
results = []
for c, entry in by_country.items():
    if entry["area"] <= 0:
        # The area is truncated to int, so very small countries can come back
        # as 0; a density cannot be computed for them.
        continue
    entry["artist_per_m"] = 1000000.0 * len(entry["artists"]) / entry["area"]
    results.append((c, entry["artists"], entry["area"], entry["artist_per_m"]))
results.sort(key=lambda n: n[3], reverse=True)

# Densest country first. Long artist lists are summarised as a count.
for country, country_artists, _area, artists_per_million_km in results:
    print(f"\033[1m{country}\033[0m ({artists_per_million_km:.2f} artists per mkm²)")
    if len(country_artists) > 5:
        print(len(country_artists), "artists")
    else:
        print(", ".join(x.title() for x in country_artists))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This makes a bunch of very dubious assumptions -- artists called "FRED FEATURING BLOGGS" are treated as just "FRED", Wikidata's "place of formation" and "country of origin" entries are correct, anyone without a Wikidata entry under the band name doesn't exist, which penalises duos a bit, and so on -- but hey, now you, gentle reader, have the script and can tweak it to your heart's content to make it rigorous (or at least to show some semblance of rigour!)