Last active
September 2, 2020 20:32
-
-
Save sloev/6e528f07f82b3a8af5bd7b8450ebf0b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| requests_get = requests.get | |
| import OpenSSL | |
def get(
    *args, retries=5, **kwargs,
):
    """Call ``requests_get`` with ``verify=False``, retrying on connection errors.

    Positional and keyword arguments are forwarded unchanged to
    ``requests_get``.  ``verify=False`` is always added, so callers must not
    pass ``verify`` themselves (Python would reject the duplicate keyword).

    Args:
        retries: How many times to retry after a failed attempt.  When it
            reaches zero (or is already zero/negative) the last error is
            re-raised.

    Returns:
        Whatever ``requests_get`` returns on the first successful attempt.

    Raises:
        OpenSSL.SSL.SysCallError, requests.exceptions.ConnectionError,
        requests.exceptions.SSLError,
        requests.exceptions.ChunkedEncodingError: once retries are exhausted.
    """
    # All four error types are handled identically, so catch them as one tuple.
    retryable = (
        OpenSSL.SSL.SysCallError,
        requests.exceptions.ConnectionError,
        requests.exceptions.SSLError,
        requests.exceptions.ChunkedEncodingError,
    )
    while True:
        try:
            return requests_get(*args, **kwargs, verify=False)
        except retryable:
            # `<= 0` (not just `== 0`) so a negative budget cannot loop forever.
            if retries <= 0:
                raise
        # Back off longer as the budget shrinks (1s, 2s, ... with the default
        # of 5).  Clamp at zero: time.sleep() rejects negative values, which
        # the raw `6 - retries` would produce for retries > 6.
        sleep_s = max(6 - retries, 0)
        info(f"got ssl error, sleeping: {sleep_s}s (retries left: {retries})")
        time.sleep(sleep_s)
        retries -= 1
| requests.get = requests_get | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
| import gzip | |
| import json | |
| from lxml import html | |
| import logging | |
| import time | |
| from io import BytesIO | |
| import metadata_parser | |
| from requests.compat import chardet | |
| import os | |
| import _jsonnet | |
| import tqdm | |
| from collections import Counter | |
| import click | |
| from colorama.ansi import Fore, Back, Style, clear_line, Cursor, clear_screen, set_title | |
| import shelve | |
# Colour prefixes built from colorama's bright style plus a foreground colour;
# output using them is terminated with Style.RESET_ALL elsewhere in this file.
GREEN = Style.BRIGHT + Fore.GREEN
RED = Style.BRIGHT + Fore.RED
MAGENTA = Style.BRIGHT + Fore.MAGENTA
def extract_meta(html_content):
    """Parse the page's meta tags and dispatch on its ``og:type``.

    Args:
        html_content: HTML text of the page.

    Returns:
        The list of documents produced by ``extract_album``,
        ``extract_musician`` or ``extract_song`` for the matching
        ``og:type``; an empty list for ``"website"`` pages and for
        unrecognised types (which are also logged).
    """
    page = metadata_parser.MetadataParser(html=html_content)
    meta = page.metadata["meta"]
    doc_type = meta.get("og:type", "UNKNOWN")

    if doc_type == "website":
        # Generic pages carry nothing we extract.
        return []
    if doc_type == "music.album":
        return extract_album(meta, html_content)
    if doc_type == "music.musician":
        return extract_musician(meta, html_content)
    if doc_type == "music.song":
        return extract_song(meta, html_content)

    # Include the offending type in the message; it used to be left blank.
    logging.warning("received unknown type: %s", doc_type)
    logging.error(html_content)
    return []
def extract_song(meta, html_content):
    """Build the documents describing a song page from its meta tags.

    Args:
        meta: Mapping of meta-tag names to values.  ``music:musician``,
            ``music:album`` and ``music:album:track`` may each be a single
            string or a list of strings.
        html_content: Unused; accepted so all extractors share a signature.

    Returns:
        A list starting with the ``song`` document, followed by one
        ``musician_song`` document per musician and an ``album_song`` +
        ``album`` pair per (album, track number) pairing.
    """

    def as_list(value):
        # A lone string stands for a one-element list.
        return [value] if isinstance(value, str) else value

    def last_segment(link):
        # The id is whatever follows the final "/" of the URL.
        return link.rsplit("/", 1)[1]

    song = {
        "object_type": "song",
        "id": last_segment(meta["og:url"]),
        "url": meta["og:url"],
        "name": meta["og:title"],
        "release_date": meta["music:release_date"],
        "duration_ms": int(meta["music:duration"]) * 1000,
        "image": meta["twitter:image"],
    }
    song_id = song["id"]
    documents = [song]

    documents += [
        {
            "object_type": "musician_song",
            "musician_id": last_segment(artist_link),
            "song_id": song_id,
        }
        for artist_link in as_list(meta["music:musician"])
    ]

    album_links = as_list(meta["music:album"])
    track_numbers = as_list(meta["music:album:track"])
    for album_link, number in zip(album_links, track_numbers):
        album_key = last_segment(album_link)
        documents.append(
            {
                "object_type": "album_song",
                "album_id": album_key,
                "song_id": song_id,
                "track_number": int(number),
            }
        )
        documents.append({"object_type": "album", "id": album_key, "url": album_link})

    return documents
def extract_musician(meta, html_content):
    """Build the documents describing an artist page.

    The artist data is read from the ``Spotify.Entity = {...};`` assignment
    embedded in the page: the text between that marker and the first ``};``
    is parsed as JSON, falling back to jsonnet when strict JSON parsing fails.

    Args:
        meta: Mapping of meta-tag names to values; only ``og:image`` is read.
        html_content: HTML text of the page containing the entity assignment.

    Returns:
        A list of documents in this order: the ``musician`` document, one
        ``genre_musician`` per genre, a ``musician_city`` + ``city`` pair per
        city, then for every top track its ``musician`` stubs,
        ``musician_song`` links, and a ``song``, ``album`` and ``album_song``
        document.

    Raises:
        IndexError: if the ``Spotify.Entity = {`` marker is not in the page.
        KeyError: if a required field is missing from the entity.
        Exception: whatever the jsonnet fallback raises (logged, re-raised).
    """
    entity_body = html_content.split("Spotify.Entity = {", 1)[1].split("};", 1)[0]
    spotify_json_string = "{" + entity_body + "}"

    try:
        entity = json.loads(spotify_json_string)
    except ValueError:
        # json.JSONDecodeError is a ValueError.  Catching only that (rather
        # than a bare except) lets KeyboardInterrupt/SystemExit propagate.
        try:
            entity = json.loads(
                _jsonnet.evaluate_snippet("snippet", spotify_json_string)
            )
        except Exception:
            logging.error(
                f"couldnt json load or jsonnet load object:\n{spotify_json_string}"
            )
            raise

    insights = entity["insights"]
    artist_meta = {
        "object_type": "musician",
        "following": insights["following_count"],
        "followers": insights["follower_count"],
        "global_chart_position": insights["global_chart_position"],
        "artist_gid": insights["artist_gid"],
        "monthly_listeners": int(insights.get("monthly_listeners", 0)),
        "id": entity["id"],
        "url": entity["href"],
        "name": entity["name"],
        "image": meta["og:image"],
    }
    musician_id = artist_meta["id"]
    output_documents = [artist_meta]

    output_documents.extend(
        {"object_type": "genre_musician", "genre": genre, "musician_id": musician_id}
        for genre in entity.get("genres", [])
    )

    for city in insights.get("cities", []):
        city_code = "{country}_{region}_{city}".format(**city)
        output_documents.append(
            {
                "object_type": "musician_city",
                "musician_id": musician_id,
                "city_id": city_code,
                "listeners": city["listeners"],
            }
        )
        output_documents.append(
            {
                "object_type": "city",
                "id": city_code,
                "country": city["country"],
                "region": city["region"],
                "city": city["city"],
            }
        )

    for track in entity.get("top_tracks", []):
        album = track["album"]
        artists = track["artists"]

        # Stub documents for every credited artist, then their song links.
        output_documents.extend(
            {
                "object_type": "musician",
                "url": artist["href"],
                "id": artist["id"],
                "name": artist["name"],
            }
            for artist in artists
        )
        output_documents.extend(
            {
                "object_type": "musician_song",
                "musician_id": artist["id"],
                "song_id": track["id"],
            }
            for artist in artists
        )

        # A missing or empty image list yields None rather than an IndexError.
        images = album.get("images", None) or [{"url": None}]
        output_documents.extend(
            [
                {
                    "object_type": "song",
                    "id": track["id"],
                    "url": track["href"],
                    "name": track["name"],
                    "release_date": album["release_date"],
                    "mp3_preview": track["preview_url"],
                    "duration_ms": int(track.get("duration_ms", 0)),
                    "explicit": track.get("explicit", False),
                    "popularity": track["popularity"],
                    "isrc": track.get("external_ids", {}).get("isrc", None),
                },
                {
                    "object_type": "album",
                    "id": album["id"],
                    "url": album["href"],
                    "name": album["name"],
                    "image": images[0]["url"],
                    "release_date": album["release_date"],
                },
                {
                    "object_type": "album_song",
                    "track_number": track["track_number"],
                    "album_id": album["id"],
                    "song_id": track["id"],
                },
            ]
        )
    return output_documents
def extract_album(meta, html_content):
    """Build the documents describing an album page from its meta tags.

    Args:
        meta: Mapping of meta-tag names to values.  ``music:song`` may be
            absent, a single URL string, or a list of URL strings.  The
            mapping is not modified.
        html_content: Unused; accepted so all extractors share a signature.

    Returns:
        A list starting with the ``album`` document, followed by an
        ``album_song`` + ``song`` pair per listed song.  Track numbers are the
        1-based position of the song in ``music:song``.

    Raises:
        KeyError: if a required meta tag is missing.
    """
    # .get() rather than .pop(): do not mutate the caller's mapping.
    songs_raw = meta.get("music:song", [])
    if isinstance(songs_raw, str):
        songs_raw = [songs_raw]

    album_meta = {
        "object_type": "album",
        "id": meta["og:url"].rsplit("/", 1)[1],
        "url": meta["og:url"],
        "name": meta["twitter:title"],
        "image": meta["og:image"],
        "release_date": meta["music:release_date"],
        "player_link": meta["twitter:player"],
    }
    output_documents = [album_meta]

    # The previous version also computed an artist id from "music:musician"
    # and never used it; that lookup raised AttributeError whenever the tag
    # held a list of musicians, so it has been removed.
    for track_number, song_url in enumerate(songs_raw, start=1):
        song_id = song_url.rsplit("/", 1)[1]
        output_documents.extend(
            [
                {
                    "object_type": "album_song",
                    "album_id": album_meta["id"],
                    "song_id": song_id,
                    "track_number": track_number,
                },
                # NOTE: this stub uses "song_url" while other song documents
                # in this file use "url"; kept as-is for output compatibility.
                {"object_type": "song", "id": song_id, "song_url": song_url},
            ]
        )
    return output_documents
def extract(filename, offset, length, url, **trash):
    """Fetch one gzip-compressed record by byte range and extract its documents.

    The record is downloaded from ``https://commoncrawl.s3.amazonaws.com/``
    with an HTTP ``Range`` header, gunzipped, and split into its three parts
    (first header block, second header block, response body).  The body is
    passed to ``extract_meta`` and every resulting document is printed to
    stdout as one JSON line.

    Args:
        filename: Path of the archive file, appended to the URL prefix.
        offset: Byte offset of the record in that file (int or numeric str).
        length: Byte length of the record (int or numeric str).
        url: Unused here; accepted so index rows can be splatted in.
        **trash: Any other index fields; ignored.

    Returns:
        The list of documents from ``extract_meta``, or an empty list when
        the record does not contain all three parts.

    Raises:
        Exception: any failure is logged with its traceback and re-raised.
    """
    try:
        offset, length = int(offset), int(length)
        offset_end = offset + length - 1
        prefix = "https://commoncrawl.s3.amazonaws.com/"
        resp = get(
            prefix + filename,
            headers={"Range": "bytes={}-{}".format(offset, offset_end)},
        )
        with gzip.GzipFile(fileobj=BytesIO(resp.content)) as f:
            data = f.read().decode()

        # Split only at the first two blank lines: the body itself may contain
        # "\r\n\r\n", and an unbounded split would then yield more than three
        # parts and wrongly discard a valid record.
        parts = data.strip().split("\r\n\r\n", 2)
        if len(parts) != 3:
            logging.error("no response in warc")
            # Return an empty list (not None) so callers can iterate it.
            return []
        _, _, response = parts

        rows = extract_meta(response)
        for row in rows:
            print(json.dumps(row), flush=True)
        return rows
    except Exception:
        # KeyboardInterrupt is not an Exception subclass, so it still
        # propagates without being logged, as before.
        logging.exception("error")
        raise
def info(message):
    """Write *message* to stderr, prefixed with GREEN and followed by a style reset."""
    colored = "".join((GREEN, message, Style.RESET_ALL))
    click.echo(colored, err=True)
@click.command()
def cli():
    """Iterate the Common Crawl index for open.spotify.com and extract every page.

    URLs already recorded in the ``progress.db`` shelf are skipped; each
    newly processed URL is recorded there after extraction, so an interrupted
    run can be resumed.  Progress is shown on a tqdm bar whose description
    summarises the document types found for the current URL.
    """
    info("Initializing cdx toolkit")
    import cdx_toolkit

    cdx = cdx_toolkit.CDXFetcher(source="cc")
    spotify_root_url = "open.spotify.com/*"
    total_results = cdx.get_size_estimate(spotify_root_url)
    bar_format = "%s{l_bar}%s{bar}%s{r_bar}%s" % (
        MAGENTA,
        GREEN,
        MAGENTA,
        Style.RESET_ALL,
    )

    with shelve.open("progress.db") as progress_db, tqdm.tqdm(
        total=total_results,
        desc="initializing cdx iterator",
        bar_format=bar_format,
    ) as pbar:
        pbar.refresh()
        for index, obj in enumerate(
            cdx.iter(spotify_root_url, filter=["status:200"])
        ):
            url = obj.data["url"]
            if progress_db.get(url, None):
                pbar.set_description("{:15d} : skipping!".format(index))
            else:
                # extract() may hand back None for a record without a
                # response part; treat that as "no documents" instead of
                # failing with a TypeError while counting.
                results = extract(**obj.data) or []
                type_counts = Counter(d["object_type"] for d in results)
                results_counter = ", ".join(
                    "{}: {:3d}".format(*item) for item in type_counts.items()
                )
                progress_db[url] = 1
                pbar.set_description("{:15d} :{}".format(index, results_counter))
            pbar.update(1)
            pbar.refresh()


if __name__ == "__main__":
    cli()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| {"object_type": "song", "id": "70qbVFpN7fMDZcUgVgkkuF", "url": "https://api.spotify.com/v1/tracks/70qbVFpN7fMDZcUgVgkkuF", "name": "Emepetreses", "release_date": "2005-07-01", "mp3_preview": "https://p.scdn.co/mp3-preview/3fb2eee55698e8846933e060b1d81cda1f5475ba?cid=162b7dc01f3a4a2ca32ed3cec83d1e02", "duration_ms": 193769, "explicit": false, "popularity": 8, "isrc": "ES7490500038"} | |
| {"object_type": "album", "id": "72VG9hHR4dfRTQdOq1W6W4", "url": "https://api.spotify.com/v1/albums/72VG9hHR4dfRTQdOq1W6W4", "name": "Emepetreses", "image": "https://i.scdn.co/image/ab67616d0000b2738bd867b52c228d2e2ed1249d", "release_date": "2005-07-01"} | |
| {"object_type": "album_song", "track_number": 7, "album_id": "72VG9hHR4dfRTQdOq1W6W4", "song_id": "70qbVFpN7fMDZcUgVgkkuF"} | |
| {"object_type": "musician", "url": "https://api.spotify.com/v1/artists/00NhO1EJqUu5I9qSlOpNkc", "id": "00NhO1EJqUu5I9qSlOpNkc", "name": "Pablo Moro"} | |
| {"object_type": "musician_song", "musician_id": "00NhO1EJqUu5I9qSlOpNkc", "song_id": "7tgfEUeI7g0k4Eq4e2opWN"} | |
| {"object_type": "song", "id": "7tgfEUeI7g0k4Eq4e2opWN", "url": "https://api.spotify.com/v1/tracks/7tgfEUeI7g0k4Eq4e2opWN", "name": "Smoking Point", "release_date": "2007", "mp3_preview": "https://p.scdn.co/mp3-preview/458253db1a5e993a37e60b0f1f36a19815af86d6?cid=162b7dc01f3a4a2ca32ed3cec83d1e02", "duration_ms": 345533, "explicit": false, "popularity": 8, "isrc": "ES7490700058"} | |
| {"object_type": "album", "id": "5qLRULrqsHOt8aFfKkn8XQ", "url": "https://api.spotify.com/v1/albums/5qLRULrqsHOt8aFfKkn8XQ", "name": "Smoking Point", "image": "https://i.scdn.co/image/ab67616d0000b273a3fdfe148361ae610e8d2f1a", "release_date": "2007"} | |
| {"object_type": "album_song", "track_number": 11, "album_id": "5qLRULrqsHOt8aFfKkn8XQ", "song_id": "7tgfEUeI7g0k4Eq4e2opWN"} | |
| {"object_type": "musician", "following": 0, "followers": 790823, "global_chart_position": 0, "artist_gid": "003da4723a97462c9162fb5d5c193536", "monthly_listeners": 2100804, "id": "00sazWvoTLOqg5MFwC68Um", "url": "https://api.spotify.com/v1/artists/00sazWvoTLOqg5MFwC68Um", "name": "Yann Tiersen", "image": "https://i.scdn.co/image/c600c139906fcf8341b1874e07c79b4186458231"} | |
| {"object_type": "genre_musician", "genre": "bow pop", "musician_id": "00sazWvoTLOqg5MFwC68Um"} | |
| {"object_type": "genre_musician", "genre": "compositional ambient", "musician_id": "00sazWvoTLOqg5MFwC68Um"} | |
| {"object_type": "genre_musician", "genre": "french soundtrack", "musician_id": "00sazWvoTLOqg5MFwC68Um"} | |
| {"object_type": "musician_city", "musician_id": "00sazWvoTLOqg5MFwC68Um", "city_id": "TR_34_Istanbul", "listeners": 55669} | |
| {"object_type": "city", "id": "TR_34_Istanbul", "country": "TR", "region": "34", "city": "Istanbul"} | |
| {"object_type": "musician_city", "musician_id": "00sazWvoTLOqg5MFwC68Um", "city_id": "FR_75_Paris", "listeners": 48868} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment