Skip to content

Instantly share code, notes, and snippets.

@sloev
Last active September 2, 2020 20:32
Show Gist options
  • Select an option

  • Save sloev/6e528f07f82b3a8af5bd7b8450ebf0b5 to your computer and use it in GitHub Desktop.

Select an option

Save sloev/6e528f07f82b3a8af5bd7b8450ebf0b5 to your computer and use it in GitHub Desktop.
import requests
# Keep a handle on the library's original ``requests.get`` so the retrying
# ``get`` wrapper defined below can delegate to it.
requests_get = requests.get
import OpenSSL
def get(*args, retries=5, **kwargs):
    """Issue a GET request, retrying on transient connection/TLS failures.

    Positional and keyword arguments are forwarded to the original
    ``requests.get`` (saved as ``requests_get``).

    Args:
        *args: Positional arguments for ``requests.get``.
        retries: How many times to retry after a failed attempt, so at most
            ``retries + 1`` requests are made.
        **kwargs: Keyword arguments for ``requests.get``. ``verify`` defaults
            to ``False`` unless the caller supplies it.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        OpenSSL.SSL.SysCallError, requests.exceptions.ConnectionError,
        requests.exceptions.SSLError,
        requests.exceptions.ChunkedEncodingError: re-raised from the last
            attempt once the retries are used up.
    """
    # Keep the historical default of unverified TLS, but let an explicit
    # ``verify=`` from the caller win instead of failing with a
    # duplicate-keyword TypeError.
    kwargs.setdefault("verify", False)
    while True:
        try:
            return requests_get(*args, **kwargs)
        except (
            OpenSSL.SSL.SysCallError,
            requests.exceptions.ConnectionError,
            requests.exceptions.SSLError,
            requests.exceptions.ChunkedEncodingError,
        ):
            # ``<= 0`` rather than ``not retries`` so a negative budget
            # cannot loop forever.
            if retries <= 0:
                raise
        # Linear back-off: 1s, 2s, ... 5s for the default budget of 5. The
        # clamp keeps the delay valid (time.sleep rejects negatives) when a
        # caller asks for more than six retries.
        sleep_s = max(1, 6 - retries)
        info(f"got ssl error, sleeping: {sleep_s}s (retries left: {retries})")
        time.sleep(sleep_s)
        retries -= 1
# NOTE(review): ``requests_get`` is the original ``requests.get`` saved at the
# top of the file, so this assignment leaves ``requests.get`` unchanged. If the
# intent was to install the retrying wrapper globally it would have to be
# ``requests.get = get`` -- confirm before changing.
requests.get = requests_get
import warnings
# Suppress every warning for the whole process.
warnings.filterwarnings("ignore")
import gzip
import json
from lxml import html
import logging
import time
from io import BytesIO
import metadata_parser
from requests.compat import chardet
import os
import _jsonnet
import tqdm
from collections import Counter
import click
from colorama.ansi import Fore, Back, Style, clear_line, Cursor, clear_screen, set_title
import shelve
# Bright ANSI colour prefixes used for console and progress-bar output.
GREEN = f"{Style.BRIGHT}{Fore.GREEN}"
RED = f"{Style.BRIGHT}{Fore.RED}"
MAGENTA = f"{Style.BRIGHT}{Fore.MAGENTA}"
def extract_meta(html_content):
    """Parse a page's meta tags and dispatch to the matching extractor.

    Args:
        html_content: HTML text of one crawled page.

    Returns:
        The list of row dicts produced by ``extract_album``,
        ``extract_musician`` or ``extract_song``, chosen by the page's
        ``og:type`` meta value. An empty list for ``"website"`` pages and
        for any unrecognised type.
    """
    page = metadata_parser.MetadataParser(html=html_content)
    meta = page.metadata["meta"]
    doc_type = meta.get("og:type", "UNKNOWN")
    if doc_type == "website":
        # Nothing is extracted from generic pages.
        return []
    if doc_type == "music.album":
        return extract_album(meta, html_content)
    if doc_type == "music.musician":
        return extract_musician(meta, html_content)
    if doc_type == "music.song":
        return extract_song(meta, html_content)
    # Name the offending type in the warning; previously the message ended
    # at the colon and carried no value.
    logging.warning("received unknown type: %s", doc_type)
    logging.error(html_content)
    return []
def _as_list(value):
    """Wrap a lone string in a list; return any other value unchanged."""
    return [value] if isinstance(value, str) else value


def extract_song(meta, html_content):
    """Build the output rows for a ``music.song`` page.

    Args:
        meta: Mapping of meta-tag names to values. ``og:url``, ``og:title``,
            ``music:release_date``, ``music:duration`` and ``twitter:image``
            are required. ``music:musician``, ``music:album`` and
            ``music:album:track`` are optional and may each be a single
            string or a list of strings.
        html_content: Unused; accepted so all extractors share a signature.

    Returns:
        A list of dicts: the ``song`` row first, then one ``musician_song``
        row per musician URL, then an ``album_song`` row followed by an
        ``album`` row for every (album URL, track number) pair.

    Raises:
        KeyError: if one of the required meta tags is missing.
    """
    song_url = meta["og:url"]
    song_id = song_url.rsplit("/", 1)[1]
    output_documents = [
        {
            "object_type": "song",
            "id": song_id,
            "url": song_url,
            "name": meta["og:title"],
            "release_date": meta["music:release_date"],
            # ``music:duration`` is multiplied by 1000 to fill ``duration_ms``.
            "duration_ms": int(meta["music:duration"]) * 1000,
            "image": meta["twitter:image"],
        }
    ]

    # Relation tags are optional: a page without them still yields its song
    # row instead of raising KeyError and aborting the caller.
    for musician_url in _as_list(meta.get("music:musician", [])):
        output_documents.append(
            {
                "object_type": "musician_song",
                "musician_id": musician_url.rsplit("/", 1)[1],
                "song_id": song_id,
            }
        )

    albums = _as_list(meta.get("music:album", []))
    album_tracks = _as_list(meta.get("music:album:track", []))
    # Albums and track numbers are paired positionally.
    for album_url, track_number in zip(albums, album_tracks):
        album_id = album_url.rsplit("/", 1)[1]
        output_documents.extend(
            [
                {
                    "object_type": "album_song",
                    "album_id": album_id,
                    "song_id": song_id,
                    "track_number": int(track_number),
                },
                {"object_type": "album", "id": album_id, "url": album_url},
            ]
        )
    return output_documents
def extract_musician(meta, html_content):
    """Build the output rows for a ``music.musician`` (artist) page.

    The artist data is read from the ``Spotify.Entity = {...};`` assignment
    embedded in the page rather than from the meta tags.

    Args:
        meta: Mapping of meta-tag names to values; only ``og:image`` is read.
        html_content: HTML text of the artist page.

    Returns:
        A list of dicts in this order: the ``musician`` row; one
        ``genre_musician`` row per genre; a ``musician_city`` + ``city`` pair
        per entry in ``insights.cities``; then, per top track, a ``musician``
        row and a ``musician_song`` row for each of its artists followed by
        ``song``, ``album`` and ``album_song`` rows.

    Raises:
        IndexError: if the page has no ``Spotify.Entity = {`` assignment.
        KeyError: if a required field is missing from the entity or ``meta``.
        Exception: whatever the JSON/jsonnet fallback raises when the
            embedded object cannot be parsed (logged, then re-raised).
    """
    # Cut out the text between "Spotify.Entity = {" and the first "};" and
    # put the braces back to get a standalone object literal.
    spotify_json_string = (
        "{" + html_content.split("Spotify.Entity = {", 1)[1].split("};", 1)[0] + "}"
    )
    try:
        entity = json.loads(spotify_json_string)
    except ValueError:
        # json.JSONDecodeError is a ValueError. Catching only that keeps
        # KeyboardInterrupt/SystemExit from being diverted into the fallback.
        try:
            entity = json.loads(
                _jsonnet.evaluate_snippet("snippet", spotify_json_string)
            )
        except Exception:
            logging.error(
                f"couldnt json load or jsonnet load object:\n{spotify_json_string}"
            )
            raise

    insights = entity["insights"]
    musician_id = entity["id"]
    output_documents = [
        {
            "object_type": "musician",
            "following": insights["following_count"],
            "followers": insights["follower_count"],
            "global_chart_position": insights["global_chart_position"],
            "artist_gid": insights["artist_gid"],
            "monthly_listeners": int(insights.get("monthly_listeners", 0)),
            "id": musician_id,
            "url": entity["href"],
            "name": entity["name"],
            "image": meta["og:image"],
        }
    ]

    output_documents.extend(
        {
            "object_type": "genre_musician",
            "genre": genre,
            "musician_id": musician_id,
        }
        for genre in entity.get("genres", [])
    )

    for city in insights.get("cities", []):
        # City rows are keyed by "<country>_<region>_<city>".
        city_code = "{country}_{region}_{city}".format(**city)
        output_documents.extend(
            [
                {
                    "object_type": "musician_city",
                    "musician_id": musician_id,
                    "city_id": city_code,
                    "listeners": city["listeners"],
                },
                {
                    "object_type": "city",
                    "id": city_code,
                    "country": city["country"],
                    "region": city["region"],
                    "city": city["city"],
                },
            ]
        )

    for track in entity.get("top_tracks", []):
        album = track["album"]
        artists = track["artists"]
        # Every artist credited on the track gets a (sparse) musician row...
        output_documents.extend(
            {
                "object_type": "musician",
                "url": artist["href"],
                "id": artist["id"],
                "name": artist["name"],
            }
            for artist in artists
        )
        # ...and a link row to the track.
        output_documents.extend(
            {
                "object_type": "musician_song",
                "musician_id": artist["id"],
                "song_id": track["id"],
            }
            for artist in artists
        )
        # First image URL if the album has any images, otherwise None.
        album_image = (album.get("images", None) or [{"url": None}])[0]["url"]
        output_documents.extend(
            [
                {
                    "object_type": "song",
                    "id": track["id"],
                    "url": track["href"],
                    "name": track["name"],
                    "release_date": album["release_date"],
                    "mp3_preview": track["preview_url"],
                    "duration_ms": int(track.get("duration_ms", 0)),
                    "explicit": track.get("explicit", False),
                    "popularity": track["popularity"],
                    "isrc": track.get("external_ids", {}).get("isrc", None),
                },
                {
                    "object_type": "album",
                    "id": album["id"],
                    "url": album["href"],
                    "name": album["name"],
                    "image": album_image,
                    "release_date": album["release_date"],
                },
                {
                    "object_type": "album_song",
                    "track_number": track["track_number"],
                    "album_id": album["id"],
                    "song_id": track["id"],
                },
            ]
        )
    return output_documents
def extract_album(meta, html_content):
    """Build the output rows for a ``music.album`` page.

    Args:
        meta: Mapping of meta-tag names to values. ``og:url``,
            ``twitter:title``, ``og:image``, ``music:release_date`` and
            ``twitter:player`` are required. ``music:song`` is optional and
            may be a single URL string or a list of URL strings. The mapping
            is not modified.
        html_content: Unused; accepted so all extractors share a signature.

    Returns:
        A list of dicts: the ``album`` row first, then an ``album_song`` row
        followed by a ``song`` row for every song URL. Track numbers are the
        1-based position of the URL in ``music:song``.

    Raises:
        KeyError: if one of the required meta tags is missing.
    """
    # ``get`` rather than ``pop`` so the caller's mapping is left intact.
    songs_raw = meta.get("music:song", [])
    if isinstance(songs_raw, str):
        songs_raw = [songs_raw]

    album_id = meta["og:url"].rsplit("/", 1)[1]
    output_documents = [
        {
            "object_type": "album",
            "id": album_id,
            "url": meta["og:url"],
            "name": meta["twitter:title"],
            "image": meta["og:image"],
            "release_date": meta["music:release_date"],
            "player_link": meta["twitter:player"],
        }
    ]
    # The artist id used to be derived from ``music:musician`` here but was
    # never emitted, and the derivation raised AttributeError whenever the
    # tag held a list of URLs. No artist row is produced, so it is not read.
    for track_number, song_url in enumerate(songs_raw, start=1):
        song_id = song_url.rsplit("/", 1)[1]
        output_documents.extend(
            [
                {
                    "object_type": "album_song",
                    "album_id": album_id,
                    "song_id": song_id,
                    "track_number": track_number,
                },
                # The URL is stored under "song_url" in these rows; the key
                # is kept as-is so the emitted schema does not change.
                {"object_type": "song", "id": song_id, "song_url": song_url},
            ]
        )
    return output_documents
def extract(filename, offset, length, url, **trash):
    """Fetch one archived record, extract its rows and print them as JSON lines.

    Downloads ``length`` bytes starting at ``offset`` of ``filename`` from the
    Common Crawl bucket with an HTTP ``Range`` request, gunzips them, and
    passes the response body to ``extract_meta``. Each resulting row is
    printed to stdout as one JSON line.

    Args:
        filename: Path of the archive file, appended to the bucket URL.
        offset: Byte offset of the record (int or numeric string).
        length: Byte length of the record (int or numeric string).
        url: Unused; accepted so an index entry can be splatted in.
        **trash: Any further index fields; ignored.

    Returns:
        The list of row dicts from ``extract_meta``; an empty list when the
        record does not split into the three expected sections.

    Raises:
        Exception: any failure while fetching, decoding or extracting is
            logged with its traceback and re-raised.
    """
    try:
        offset, length = int(offset), int(length)
        offset_end = offset + length - 1
        prefix = "https://commoncrawl.s3.amazonaws.com/"
        resp = get(
            prefix + filename,
            headers={"Range": "bytes={}-{}".format(offset, offset_end)},
        )
        with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
            data = f.read().decode()
        # Split on the first two blank lines only. The third section is the
        # page body, which may itself contain "\r\n\r\n"; an unlimited split
        # used to make such pages look malformed and get dropped.
        sections = data.strip().split("\r\n\r\n", 2)
        if len(sections) != 3:
            logging.error("no response in warc")
            # Callers iterate the result, so hand back an empty list, not None.
            return []
        _, _, response = sections
        rows = extract_meta(response)
        for row in rows:
            print(json.dumps(row), flush=True)
        return rows
    except Exception:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates untouched, as before.
        logging.exception("error")
        raise
def info(message):
    """Echo *message* to stderr in bright green, resetting the style after it."""
    coloured = "".join((GREEN, message, Style.RESET_ALL))
    click.echo(coloured, err=True)
@click.command()
def cli():
    """Extract Spotify metadata rows from Common Crawl captures of open.spotify.com.

    Rows are printed to stdout as JSON lines. Processed URLs are recorded in
    the ``progress.db`` shelf and skipped on later runs.
    """
    info("Initializing cdx toolkit")
    import cdx_toolkit

    cdx = cdx_toolkit.CDXFetcher(source="cc")
    spotify_root_url = "open.spotify.com/*"
    total_results = cdx.get_size_estimate(spotify_root_url)
    with shelve.open("progress.db") as progress_db:
        with tqdm.tqdm(
            total=total_results,
            desc="initializing cdx iterator",
            bar_format="%s{l_bar}%s{bar}%s{r_bar}%s"
            % (MAGENTA, GREEN, MAGENTA, Style.RESET_ALL),
        ) as pbar:
            pbar.refresh()
            for index, obj in enumerate(
                cdx.iter(spotify_root_url, filter=["status:200"])
            ):
                url = obj.data["url"]
                if progress_db.get(url, None):
                    pbar.set_description("{:15d} : skipping!".format(index))
                else:
                    # Treat a falsy result (None or an empty list) as "no
                    # rows" so counting below cannot fail on None.
                    results = extract(**obj.data) or []
                    type_counts = Counter(d["object_type"] for d in results)
                    results_counter = ", ".join(
                        "{}: {:3d}".format(*item) for item in type_counts.items()
                    )
                    # Record the URL only after extraction succeeded.
                    progress_db[url] = 1
                    pbar.set_description("{:15d} :{}".format(index, results_counter))
                pbar.update(1)
                pbar.refresh()


if __name__ == "__main__":
    cli()
{"object_type": "song", "id": "70qbVFpN7fMDZcUgVgkkuF", "url": "https://api.spotify.com/v1/tracks/70qbVFpN7fMDZcUgVgkkuF", "name": "Emepetreses", "release_date": "2005-07-01", "mp3_preview": "https://p.scdn.co/mp3-preview/3fb2eee55698e8846933e060b1d81cda1f5475ba?cid=162b7dc01f3a4a2ca32ed3cec83d1e02", "duration_ms": 193769, "explicit": false, "popularity": 8, "isrc": "ES7490500038"}
{"object_type": "album", "id": "72VG9hHR4dfRTQdOq1W6W4", "url": "https://api.spotify.com/v1/albums/72VG9hHR4dfRTQdOq1W6W4", "name": "Emepetreses", "image": "https://i.scdn.co/image/ab67616d0000b2738bd867b52c228d2e2ed1249d", "release_date": "2005-07-01"}
{"object_type": "album_song", "track_number": 7, "album_id": "72VG9hHR4dfRTQdOq1W6W4", "song_id": "70qbVFpN7fMDZcUgVgkkuF"}
{"object_type": "musician", "url": "https://api.spotify.com/v1/artists/00NhO1EJqUu5I9qSlOpNkc", "id": "00NhO1EJqUu5I9qSlOpNkc", "name": "Pablo Moro"}
{"object_type": "musician_song", "musician_id": "00NhO1EJqUu5I9qSlOpNkc", "song_id": "7tgfEUeI7g0k4Eq4e2opWN"}
{"object_type": "song", "id": "7tgfEUeI7g0k4Eq4e2opWN", "url": "https://api.spotify.com/v1/tracks/7tgfEUeI7g0k4Eq4e2opWN", "name": "Smoking Point", "release_date": "2007", "mp3_preview": "https://p.scdn.co/mp3-preview/458253db1a5e993a37e60b0f1f36a19815af86d6?cid=162b7dc01f3a4a2ca32ed3cec83d1e02", "duration_ms": 345533, "explicit": false, "popularity": 8, "isrc": "ES7490700058"}
{"object_type": "album", "id": "5qLRULrqsHOt8aFfKkn8XQ", "url": "https://api.spotify.com/v1/albums/5qLRULrqsHOt8aFfKkn8XQ", "name": "Smoking Point", "image": "https://i.scdn.co/image/ab67616d0000b273a3fdfe148361ae610e8d2f1a", "release_date": "2007"}
{"object_type": "album_song", "track_number": 11, "album_id": "5qLRULrqsHOt8aFfKkn8XQ", "song_id": "7tgfEUeI7g0k4Eq4e2opWN"}
{"object_type": "musician", "following": 0, "followers": 790823, "global_chart_position": 0, "artist_gid": "003da4723a97462c9162fb5d5c193536", "monthly_listeners": 2100804, "id": "00sazWvoTLOqg5MFwC68Um", "url": "https://api.spotify.com/v1/artists/00sazWvoTLOqg5MFwC68Um", "name": "Yann Tiersen", "image": "https://i.scdn.co/image/c600c139906fcf8341b1874e07c79b4186458231"}
{"object_type": "genre_musician", "genre": "bow pop", "musician_id": "00sazWvoTLOqg5MFwC68Um"}
{"object_type": "genre_musician", "genre": "compositional ambient", "musician_id": "00sazWvoTLOqg5MFwC68Um"}
{"object_type": "genre_musician", "genre": "french soundtrack", "musician_id": "00sazWvoTLOqg5MFwC68Um"}
{"object_type": "musician_city", "musician_id": "00sazWvoTLOqg5MFwC68Um", "city_id": "TR_34_Istanbul", "listeners": 55669}
{"object_type": "city", "id": "TR_34_Istanbul", "country": "TR", "region": "34", "city": "Istanbul"}
{"object_type": "musician_city", "musician_id": "00sazWvoTLOqg5MFwC68Um", "city_id": "FR_75_Paris", "listeners": 48868}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment