Skip to content

Instantly share code, notes, and snippets.

@ripiuk
Last active January 5, 2020 11:16
Show Gist options
  • Save ripiuk/89c17c3c64fbb8008a4d07251637141c to your computer and use it in GitHub Desktop.
Save ripiuk/89c17c3c64fbb8008a4d07251637141c to your computer and use it in GitHub Desktop.
Parse your spotify music pages and save all the artist, album and song names in the relevant csv files.
"""
requirements: requests-html==0.9.0
runtime: python-3.6
How to use:
1. Save these Spotify pages as HTML source from your browser: albums (https://open.spotify.com/collection/albums),
artists (https://open.spotify.com/collection/artists) and songs (https://open.spotify.com/collection/tracks).
Put them in the relevant directories, and scroll through every page completely before saving it.
2. Set the constants ARTISTS_PAGE_PATH, ALBUMS_PAGE_PATH and SONGS_PAGE_PATH to the local paths of the saved pages.
3. Run the script.
"""
import csv
from typing import List, Optional, Set, Tuple, Union

from requests_html import HTML
# Input: locations of the HTML pages saved from the browser.
ARTISTS_PAGE_PATH: str = "artists/Your Library - Artists.html"
ALBUMS_PAGE_PATH: str = "albums/Your Library - Albums.html"
SONGS_PAGE_PATH: str = "songs/Your Library - Songs.html"

# Output: names of the CSV files produced by the script.
ARTISTS_OUTPUT_FILE: str = "artists.csv"
ALBUMS_OUTPUT_FILE: str = "albums.csv"
SONGS_OUTPUT_FILE: str = "songs.csv"
class TooManyFlags(Exception):
    """Raised when mutually exclusive parsing flags are requested together."""
class SpotifyPagesParser:
    """
    Extract artist, album and song names from saved Spotify library pages.

    The three pages are read once, at construction time, from the paths held
    in ARTISTS_PAGE_PATH, ALBUMS_PAGE_PATH and SONGS_PAGE_PATH. A page whose
    file is missing is ignored and contributes nothing to the results.
    """

    def __init__(self):
        self.artists_page_content = self._read_file(ARTISTS_PAGE_PATH)
        self.albums_page_content = self._read_file(ALBUMS_PAGE_PATH)
        self.songs_page_content = self._read_file(SONGS_PAGE_PATH)

    @staticmethod
    def _read_file(file_path: str) -> Optional[str]:
        """
        Read a saved page, ignoring files that do not exist

        :param file_path: path to the html file
        :return: page content, or None if the file was not found
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f'The file "{file_path}" does not exists. This page will be ignored.')
            return None

    @staticmethod
    def _find_section(page: Optional[str], selector: str):
        """
        Parse a page and locate the element the names are extracted from

        :param page: page content, may be None or empty
        :param selector: CSS selector of the wanted element
        :return: the first matching element, or None if the page is empty or nothing matches
        """
        if not page:
            return None
        return HTML(html=page).find(selector, first=True)

    def artists_page(self) -> Set[str]:
        """
        Parse page with artists

        :return: set of artist names on the page (empty if the page or its content section is missing)
        """
        content = self._find_section(self.artists_page_content, 'section.contentSpacing')
        if content is None:
            return set()
        return {span.text for span in content.find('span')}

    def albums_page(self, get_albums: bool = False) -> Union[Set[str], Set[Tuple[str, str]]]:
        """
        Parse page with albums

        :param get_albums: a flag for parsing not only artist names, but also their albums
        :return: set of artist names on the page, or set of tuples {(artist_1, his_album_1), (..)..},
        if get_albums flag is True (empty if the page or its content section is missing)
        """
        content = self._find_section(self.albums_page_content, 'section.contentSpacing')
        if content is None:
            return set()
        links = content.find('span > a')
        # Artist and album anchors share the same selector; tell them apart by their target url.
        artist_names = [link.text for link in links if '/artist/' in str(link.links)]
        if not get_albums:
            return set(artist_names)
        album_names = [link.text for link in links if '/album/' in str(link.links)]
        # NOTE(review): pairing is purely positional (n-th artist link with n-th album link);
        # presumably each album entry has exactly one artist link - confirm against the page markup.
        return set(zip(artist_names, album_names))

    def songs_page(
            self, get_songs: bool = False, get_albums: bool = False
    ) -> Union[Set[str], Set[Tuple[str, str]], Set[Tuple[str, str, str]]]:
        """
        Parse page with songs

        :param get_songs: a flag for parsing artist names together with their albums and songs
        :param get_albums: a flag for parsing artist names together with their albums
        :return: set of artist names on the page, or {(artist_1, his_album_1), (..)..}, if the get_albums flag is True,
        or {(artist_1, his_album_1, song_in_the_album_1), (..)..}, if the get_songs flag is True
        (empty if the page or its track list is missing)
        :raises TooManyFlags: if both flags are set
        """
        # Validate the arguments first, so a wrong call is reported even when the page is missing.
        if get_songs and get_albums:
            raise TooManyFlags('The flags "get_songs" and "get_albums" can not be identified simultaneously')
        content = self._find_section(self.songs_page_content, 'section.tracklist-container')
        if content is None:
            return set()
        # Rows are matched positionally; zip stops at the shorter of the two link lists.
        artist_and_album_names = [
            (artist.text, album.text)
            for artist, album in zip(
                content.find('a.tracklist-row__artist-name-link'),
                content.find('a.tracklist-row__album-name-link'),
            )
        ]
        if get_albums:
            return set(artist_and_album_names)
        if get_songs:
            song_names = content.find('div.tracklist-name')
            return {
                (artist_name, album_name, song_name.text)
                for (artist_name, album_name), song_name in zip(artist_and_album_names, song_names)
            }
        return {artist_name for artist_name, _ in artist_and_album_names}

    def get_artist_names(self) -> List[str]:
        """
        :return: a sorted list of artist names collected from all three pages,
        e.g. ['Anathema', 'David Gilmour', ...]
        """
        return sorted(self.artists_page() | self.albums_page() | self.songs_page())

    def get_artist_album_names(self) -> List[Tuple[str, str]]:
        """
        :return: a sorted list of artist names and their albums collected from the albums and songs pages
        e.g. [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
        """
        return sorted(self.albums_page(get_albums=True) | self.songs_page(get_albums=True))

    def get_artist_album_song_names(self) -> List[Tuple[str, str, str]]:
        """
        :return: a sorted list of artist names, their albums and songs
        e.g. [('Anathema', 'Judgement', 'Anyone, Anywhere'),
        ('David Gilmour', 'On An Island', 'A Pocketful Of Stones'), ...]
        """
        return sorted(self.songs_page(get_songs=True))

    @staticmethod
    def save_to_file(path: str, headers: List[str], data: Union[List[str], List[Tuple]]) -> None:
        """
        Write the header and the data rows to a csv file; nothing is written if data is empty

        :param path: e.g. 'some/path/artists.csv'
        :param headers: e.g. ['Artist', 'Album', 'Song']
        :param data: e.g ['Anathema', 'David Gilmour', ...]
        or [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
        :return: None
        """
        if not data:
            return
        # newline='' lets the csv module control line endings itself, as its documentation requires;
        # without it extra blank rows appear on platforms that translate '\n' on write.
        with open(path, 'w', encoding='utf-8', newline='') as file:
            csv_out = csv.writer(file)
            csv_out.writerow(headers)
            # A bare string would be split into one column per character, so wrap it in a list.
            csv_out.writerows([el] if isinstance(el, str) else el for el in data)

    def save_all(self) -> None:
        """
        Write the artists, albums and songs csv files to their configured output paths

        :return: None
        """
        self.save_to_file(path=ARTISTS_OUTPUT_FILE, headers=['Artist'], data=self.get_artist_names())
        self.save_to_file(path=ALBUMS_OUTPUT_FILE, headers=['Artist', 'Album'], data=self.get_artist_album_names())
        self.save_to_file(path=SONGS_OUTPUT_FILE, headers=['Artist', 'Album', 'Song'],
                          data=self.get_artist_album_song_names())
if __name__ == '__main__':
    # Parse the saved pages and write the artists, albums and songs csv files.
    SpotifyPagesParser().save_all()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment