Last active
January 5, 2020 11:16
-
-
Save ripiuk/89c17c3c64fbb8008a4d07251637141c to your computer and use it in GitHub Desktop.
Parse your spotify music pages and save all the artist, album and song names in the relevant csv files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
requirements: requests-html==0.9.0
runtime: python-3.6

How to use:
1. Download these Spotify pages as html source from your browser: albums (https://open.spotify.com/collection/albums),
   artists (https://open.spotify.com/collection/artists) and songs (https://open.spotify.com/collection/tracks)
   to the relevant directories (scroll each page all the way down before saving it).
2. Set your local paths to the pages in the constants ARTISTS_PAGE_PATH, ALBUMS_PAGE_PATH and SONGS_PAGE_PATH.
3. Run the script.
""" | |
import csv
from typing import List, Optional, Set, Tuple, Union

from requests_html import HTML
# Input: locally saved Spotify library pages (html source).
ARTISTS_PAGE_PATH = 'artists/Your Library - Artists.html'
ALBUMS_PAGE_PATH = 'albums/Your Library - Albums.html'
SONGS_PAGE_PATH = 'songs/Your Library - Songs.html'

# Output: csv files written by SpotifyPagesParser.save_all().
ARTISTS_OUTPUT_FILE = 'artists.csv'
ALBUMS_OUTPUT_FILE = 'albums.csv'
SONGS_OUTPUT_FILE = 'songs.csv'
class TooManyFlags(Exception):
    """Raised when mutually exclusive parsing flags are requested together."""
class SpotifyPagesParser:
    """
    Extract artist, album and song names from locally saved Spotify library pages.

    The three pages are read once, at construction time, from ARTISTS_PAGE_PATH,
    ALBUMS_PAGE_PATH and SONGS_PAGE_PATH. A page whose file is missing is stored
    as None and simply contributes nothing to the results.
    """

    def __init__(self):
        self.artists_page_content = self._read_file(ARTISTS_PAGE_PATH)
        self.albums_page_content = self._read_file(ALBUMS_PAGE_PATH)
        self.songs_page_content = self._read_file(SONGS_PAGE_PATH)

    @staticmethod
    def _read_file(file_path: str) -> Optional[str]:
        """
        Read a saved page, ignoring files that do not exist

        :param file_path: path to the html file
        :return: page content, or None when the file is missing
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f'The file "{file_path}" does not exists. This page will be ignored.')
            return None

    @staticmethod
    def _find_container(page: Optional[str], selector: str):
        """
        Locate the element that wraps the listing on a saved page

        :param page: raw html of the page (None or '' when the page is unavailable)
        :param selector: css selector of the wrapping element
        :return: the first matching element, or None when the page is empty or nothing matches
        """
        if not page:
            return None
        # find(..., first=True) yields None instead of raising when nothing matches
        return HTML(html=page).find(selector, first=True)

    def artists_page(self) -> Set[str]:
        """
        Parse page with artists

        :return: set of artist names on the page (empty if the page or its container is missing)
        """
        content = self._find_container(self.artists_page_content, 'section.contentSpacing')
        if content is None:
            return set()
        return {span.text for span in content.find('span')}

    def albums_page(self, get_albums: bool = False) -> Union[Set[str], Set[Tuple[str, str]]]:
        """
        Parse page with albums

        :param get_albums: a flag for parsing not only artist names, but also their albums
        :return: set of artist names on the page, or set of tuples {(artist_1, his_album_1), (..)..},
            if get_albums flag is True (empty if the page or its container is missing)
        """
        content = self._find_container(self.albums_page_content, 'section.contentSpacing')
        if content is None:
            return set()
        links = content.find('span > a')
        artist_names = [link.text for link in links if '/artist/' in str(link.links)]
        if not get_albums:
            return set(artist_names)
        album_names = [link.text for link in links if '/album/' in str(link.links)]
        # NOTE(review): artists and albums are paired purely by position - this assumes
        # exactly one artist link per album on the page; confirm against the real markup.
        return set(zip(artist_names, album_names))

    def songs_page(self, get_songs: bool = False, get_albums: bool = False) -> \
            Union[Set[str], Set[Tuple[str, str]], Set[Tuple[str, str, str]]]:
        """
        Parse page with songs

        :param get_songs: a flag for parsing not only artist names, but also their albums and songs
        :param get_albums: a flag for parsing not only artist names, but also their albums
        :return: set of artist names on the page, or {(artist_1, his_album_1), (..)..}, if the get_albums flag is True,
            or {(artist_1, his_album_1, song_in_the_album_1), (..)..}, if the get_songs flag is True
            (empty if the page or its container is missing)
        :raises TooManyFlags: if both get_songs and get_albums are set
        """
        # Validate the arguments first so a bad call is reported even when the page file is absent.
        if get_songs and get_albums:
            raise TooManyFlags('The flags "get_songs" and "get_albums" can not be identified simultaneously')
        content = self._find_container(self.songs_page_content, 'section.tracklist-container')
        if content is None:
            return set()
        # NOTE(review): rows are reassembled by position - this assumes one artist link,
        # one album link and one name element per track; confirm against the real markup.
        artist_album_pairs = [
            (artist.text, album.text)
            for artist, album in zip(
                content.find('a.tracklist-row__artist-name-link'),
                content.find('a.tracklist-row__album-name-link'),
            )
        ]
        if get_albums:
            return set(artist_album_pairs)
        if get_songs:
            songs = content.find('div.tracklist-name')
            return {(artist, album, song.text) for (artist, album), song in zip(artist_album_pairs, songs)}
        return {artist for artist, _ in artist_album_pairs}

    def get_artist_names(self) -> list:
        """
        :return: a sorted list of artist names e.g. ['Anathema', 'David Gilmour', ...]
        """
        return sorted(self.artists_page() | self.albums_page() | self.songs_page())

    def get_artist_album_names(self) -> list:
        """
        :return: a sorted list of artist names and their albums
            e.g. [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
        """
        return sorted(self.albums_page(get_albums=True) | self.songs_page(get_albums=True))

    def get_artist_album_song_names(self) -> list:
        """
        :return: a sorted list of artist names, their albums and songs
            e.g. [('Anathema', 'Judgement', 'Anyone, Anywhere'),
            ('David Gilmour', 'On An Island', 'A Pocketful Of Stones'), ...]
        """
        return sorted(self.songs_page(get_songs=True))

    @staticmethod
    def save_to_file(path: str, headers: List[str], data: Union[List[str], List[Tuple]]) -> None:
        """
        Write the rows to a csv file; nothing is written when data is empty

        :param path: e.g. 'some/path/artists.csv'
        :param headers: e.g. ['Artist', 'Album', 'Song']
        :param data: e.g ['Anathema', 'David Gilmour', ...]
            or [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
        :return: None
        """
        if not data:
            return
        # newline='' lets the csv module control line endings (avoids '\r\r\n' on Windows)
        with open(path, 'w', encoding='utf-8', newline='') as file:
            csv_out = csv.writer(file)
            csv_out.writerow(headers)
            # a bare string would be split into characters by writerow, so wrap it in a one-item row
            csv_out.writerows([el] if isinstance(el, str) else el for el in data)

    def save_all(self) -> None:
        """
        Write the artists, albums and songs csv files to their configured output paths

        :return: None
        """
        self.save_to_file(path=ARTISTS_OUTPUT_FILE, headers=['Artist'], data=self.get_artist_names())
        self.save_to_file(path=ALBUMS_OUTPUT_FILE, headers=['Artist', 'Album'], data=self.get_artist_album_names())
        self.save_to_file(path=SONGS_OUTPUT_FILE, headers=['Artist', 'Album', 'Song'],
                          data=self.get_artist_album_song_names())
if __name__ == '__main__':
    # Script entry point: read the saved pages and write all three csv files.
    SpotifyPagesParser().save_all()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment