ripiuk · January 5, 2020 11:16
diff --git a/spotify_get_music_info.py b/spotify_get_music_info.py
 """
 requirements: requests-html==0.9.0
 runtime: python-3.6

 How to use:
 1. Download these spotify pages as html source from your browser: albums (https://open.spotify.com/collection/albums),
   artists (https://open.spotify.com/collection/artists) and songs (https://open.spotify.com/collection/tracks)
   to the relevant directories (scroll all the pages in advance).
 2. Enter your local paths to the pages under the constant values (ARTISTS_PAGE_PATH, ALBUMS_PAGE_PATH, SONGS_PAGE_PATH).
 3. Run the script.
 """
 import csv
 from typing import List, Optional, Set, Tuple, Union

 from requests_html import HTML

 # Local paths of the manually saved Spotify library pages (see "How to use" above).
 # Each one is read once when SpotifyPagesParser is constructed; a missing file is ignored.
 ARTISTS_PAGE_PATH = 'artists/Your Library - Artists.html'
 ALBUMS_PAGE_PATH = 'albums/Your Library - Albums.html'
 SONGS_PAGE_PATH = 'songs/Your Library - Songs.html'

 # CSV files written by SpotifyPagesParser.save_all(), relative to the working directory.
 ARTISTS_OUTPUT_FILE = 'artists.csv'
 ALBUMS_OUTPUT_FILE = 'albums.csv'
 SONGS_OUTPUT_FILE = 'songs.csv'


 class TooManyFlags(Exception):
    """Raised when mutually exclusive parsing flags are passed together."""


 class SpotifyPagesParser:
    """
    Extract artist, album and song names from locally saved Spotify library pages
    and export them as CSV files.

    The three pages are read once at construction time. A page whose file is missing
    is stored as None and contributes an empty set to every result.
    """

    def __init__(self):
        self.artists_page_content = self._read_file(ARTISTS_PAGE_PATH)
        self.albums_page_content = self._read_file(ALBUMS_PAGE_PATH)
        self.songs_page_content = self._read_file(SONGS_PAGE_PATH)

    @staticmethod
    def _read_file(file_path: str) -> Optional[str]:
        """
        Read a saved html page, ignoring not existing files
        :param file_path: path to the html file
        :return: page content, or None if the file does not exist
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return file.read()
        except FileNotFoundError:
            print(f'The file "{file_path}" does not exists. This page will be ignored.')
            return None

    def artists_page(self) -> Set[str]:
        """
        Parse page with artists
        :return: set of artist names on the page (empty if the page is missing
        or does not contain the expected section)
        """
        page = self.artists_page_content

        if not page:
            return set()

        content = HTML(html=page).find('section.contentSpacing', first=True)
        if content is None:
            # find(..., first=True) yields None when nothing matches the selector
            return set()

        return {span.text for span in content.find('span')}

    def albums_page(self, get_albums: bool = False) -> Union[Set[str], Set[Tuple[str, str]]]:
        """
        Parse page with albums
        :param get_albums: a flag for parsing not only artist names, but also their albums
        :return: set of artist names on the page, or set of tuple {(artist_1, his_album_1), (..)..},
        if get_albums flag is True
        """
        page = self.albums_page_content

        if not page:
            return set()

        content = HTML(html=page).find('section.contentSpacing', first=True)
        if content is None:
            # find(..., first=True) yields None when nothing matches the selector
            return set()

        # Artist and album links share the same markup; they are told apart by their href
        links = content.find('span > a')
        artist_names = [link.text for link in links if '/artist/' in str(link.links)]

        if get_albums:
            album_names = [link.text for link in links if '/album/' in str(link.links)]
            # Artists and albums are paired by position on the page
            return set(zip(artist_names, album_names))

        return set(artist_names)

    def songs_page(self, get_songs: bool = False, get_albums: bool = False) -> \
            Union[Set[str], Set[Tuple[str, str]], Set[Tuple[str, str, str]]]:
        """
        Parse page with songs
        :param get_songs: a flag for parsing not only artist names, but also their albums and songs
        :param get_albums: a flag for parsing not only artist names, but also their albums
        :return: set of artist names on the page, or {(artist_1, his_album_1), (..)..}, if the get_albums flag is True,
        or {(artist_1, his_album_1, song_in_the_album_1), (..)..}, if the get_songs flag is True
        :raises TooManyFlags: if both flags are set and the songs page is available
        """
        page = self.songs_page_content

        if not page:
            return set()

        if get_songs and get_albums:
            raise TooManyFlags('The flags "get_songs" and "get_albums" can not be identified simultaneously')

        content = HTML(html=page).find('section.tracklist-container', first=True)
        if content is None:
            # find(..., first=True) yields None when nothing matches the selector
            return set()

        # Artist and album links are paired by position; zip stops at the shorter list
        artist_album_pairs = [
            (artist.text, album.text)
            for artist, album in zip(
                content.find('a.tracklist-row__artist-name-link'),
                content.find('a.tracklist-row__album-name-link'),
            )
        ]

        if get_albums:
            return set(artist_album_pairs)

        if get_songs:
            songs = content.find('div.tracklist-name')
            return {(artist, album, song.text)
                    for (artist, album), song in zip(artist_album_pairs, songs)}

        return {artist for artist, _ in artist_album_pairs}

    def get_artist_names(self) -> List[str]:
        """
        :return: a sorted list of artist names e.g. ['Anathema', 'David Gilmour', ...]
        """
        return sorted(self.artists_page() | self.albums_page() | self.songs_page())

    def get_artist_album_names(self) -> List[Tuple[str, str]]:
        """
        :return: a sorted list of artist names and their albums
        e.g. [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
        """
        return sorted(self.albums_page(get_albums=True) | self.songs_page(get_albums=True))

    def get_artist_album_song_names(self) -> List[Tuple[str, str, str]]:
        """
        :return: a sorted list of artist names, their albums and songs
        e.g. [('Anathema', 'Judgement', 'Anyone, Anywhere'),
              ('David Gilmour', 'On An Island', 'A Pocketful Of Stones'), ...]
        """
        return sorted(self.songs_page(get_songs=True))

    @staticmethod
    def save_to_file(path: str, headers: List[str], data: Union[List[str], List[Tuple]]) -> None:
        """
        Write the rows to a csv file; nothing is written when data is empty
        :param path: e.g. 'some/path/artists.csv'
        :param headers: e.g. ['Artist', 'Album', 'Song']
        :param data: e.g ['Anathema', 'David Gilmour', ...]
        or [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
        :return: None
        """
        if not data:
            # Nothing was parsed: do not create (or truncate) the output file
            return

        # newline='' leaves line endings to the csv module; without it every row
        # is followed by an extra blank line on Windows
        with open(path, 'w', encoding='utf-8', newline='') as file:
            csv_out = csv.writer(file)
            csv_out.writerow(headers)
            # A bare string would be written one character per column, so wrap it in a list
            csv_out.writerows([row] if isinstance(row, str) else row for row in data)

    def save_all(self) -> None:
        """
        Write the artists, artist/album and artist/album/song csv files
        :return: None
        """
        self.save_to_file(path=ARTISTS_OUTPUT_FILE, headers=['Artist'], data=self.get_artist_names())
        self.save_to_file(path=ALBUMS_OUTPUT_FILE, headers=['Artist', 'Album'], data=self.get_artist_album_names())
        self.save_to_file(path=SONGS_OUTPUT_FILE, headers=['Artist', 'Album', 'Song'],
                          data=self.get_artist_album_song_names())


 if __name__ == '__main__':
    # Script entry point: read the saved pages and write all csv reports
    SpotifyPagesParser().save_all()
	"""
	requirements: requests-html==0.9.0
	runtime: python-3.6

	How to use:
	1. Download these spotify pages as html source from your browser: albums (https://open.spotify.com/collection/albums),
	artists (https://open.spotify.com/collection/artists) and songs (https://open.spotify.com/collection/tracks)
	to the relevant directories (scroll all the pages in advance).
	2. Enter your local paths to the pages under the constant values (ARTISTS_PAGE_PATH, ALBUMS_PAGE_PATH, SONGS_PAGE_PATH).
	3. Run the script.
	"""
	import csv
	from typing import Set, Tuple, List

	from requests_html import HTML

	# Local paths of the manually saved Spotify library pages (see "How to use" above).
	# Each one is read once when SpotifyPagesParser is constructed; a missing file is ignored.
	ARTISTS_PAGE_PATH = 'artists/Your Library - Artists.html'
	ALBUMS_PAGE_PATH = 'albums/Your Library - Albums.html'
	SONGS_PAGE_PATH = 'songs/Your Library - Songs.html'

	# CSV files written by SpotifyPagesParser.save_all(), relative to the working directory.
	ARTISTS_OUTPUT_FILE = 'artists.csv'
	ALBUMS_OUTPUT_FILE = 'albums.csv'
	SONGS_OUTPUT_FILE = 'songs.csv'


	class TooManyFlags(Exception):
	    """Raised when mutually exclusive parsing flags are passed together."""


	class SpotifyPagesParser:
	    """
	    Extract artist, album and song names from locally saved Spotify library pages
	    and export them as CSV files.

	    The three pages are read once at construction time. A page whose file is missing
	    is stored as None and contributes an empty set to every result.
	    """

	    def __init__(self):
	        self.artists_page_content = self._read_file(ARTISTS_PAGE_PATH)
	        self.albums_page_content = self._read_file(ALBUMS_PAGE_PATH)
	        self.songs_page_content = self._read_file(SONGS_PAGE_PATH)

	    @staticmethod
	    def _read_file(file_path: str) -> Optional[str]:
	        """
	        Read a saved html page, ignoring not existing files
	        :param file_path: path to the html file
	        :return: page content, or None if the file does not exist
	        """
	        try:
	            with open(file_path, 'r', encoding='utf-8') as file:
	                return file.read()
	        except FileNotFoundError:
	            print(f'The file "{file_path}" does not exists. This page will be ignored.')
	            return None

	    def artists_page(self) -> Set[str]:
	        """
	        Parse page with artists
	        :return: set of artist names on the page (empty if the page is missing
	        or does not contain the expected section)
	        """
	        page = self.artists_page_content

	        if not page:
	            return set()

	        content = HTML(html=page).find('section.contentSpacing', first=True)
	        if content is None:
	            # find(..., first=True) yields None when nothing matches the selector
	            return set()

	        return {span.text for span in content.find('span')}

	    def albums_page(self, get_albums: bool = False) -> Union[Set[str], Set[Tuple[str, str]]]:
	        """
	        Parse page with albums
	        :param get_albums: a flag for parsing not only artist names, but also their albums
	        :return: set of artist names on the page, or set of tuple {(artist_1, his_album_1), (..)..},
	        if get_albums flag is True
	        """
	        page = self.albums_page_content

	        if not page:
	            return set()

	        content = HTML(html=page).find('section.contentSpacing', first=True)
	        if content is None:
	            # find(..., first=True) yields None when nothing matches the selector
	            return set()

	        # Artist and album links share the same markup; they are told apart by their href
	        links = content.find('span > a')
	        artist_names = [link.text for link in links if '/artist/' in str(link.links)]

	        if get_albums:
	            album_names = [link.text for link in links if '/album/' in str(link.links)]
	            # Artists and albums are paired by position on the page
	            return set(zip(artist_names, album_names))

	        return set(artist_names)

	    def songs_page(self, get_songs: bool = False, get_albums: bool = False) -> \
	            Union[Set[str], Set[Tuple[str, str]], Set[Tuple[str, str, str]]]:
	        """
	        Parse page with songs
	        :param get_songs: a flag for parsing not only artist names, but also their albums and songs
	        :param get_albums: a flag for parsing not only artist names, but also their albums
	        :return: set of artist names on the page, or {(artist_1, his_album_1), (..)..}, if the get_albums flag is True,
	        or {(artist_1, his_album_1, song_in_the_album_1), (..)..}, if the get_songs flag is True
	        :raises TooManyFlags: if both flags are set and the songs page is available
	        """
	        page = self.songs_page_content

	        if not page:
	            return set()

	        if get_songs and get_albums:
	            raise TooManyFlags('The flags "get_songs" and "get_albums" can not be identified simultaneously')

	        content = HTML(html=page).find('section.tracklist-container', first=True)
	        if content is None:
	            # find(..., first=True) yields None when nothing matches the selector
	            return set()

	        # Artist and album links are paired by position; zip stops at the shorter list
	        artist_album_pairs = [
	            (artist.text, album.text)
	            for artist, album in zip(
	                content.find('a.tracklist-row__artist-name-link'),
	                content.find('a.tracklist-row__album-name-link'),
	            )
	        ]

	        if get_albums:
	            return set(artist_album_pairs)

	        if get_songs:
	            songs = content.find('div.tracklist-name')
	            return {(artist, album, song.text)
	                    for (artist, album), song in zip(artist_album_pairs, songs)}

	        return {artist for artist, _ in artist_album_pairs}

	    def get_artist_names(self) -> List[str]:
	        """
	        :return: a sorted list of artist names e.g. ['Anathema', 'David Gilmour', ...]
	        """
	        return sorted(self.artists_page() | self.albums_page() | self.songs_page())

	    def get_artist_album_names(self) -> List[Tuple[str, str]]:
	        """
	        :return: a sorted list of artist names and their albums
	        e.g. [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
	        """
	        return sorted(self.albums_page(get_albums=True) | self.songs_page(get_albums=True))

	    def get_artist_album_song_names(self) -> List[Tuple[str, str, str]]:
	        """
	        :return: a sorted list of artist names, their albums and songs
	        e.g. [('Anathema', 'Judgement', 'Anyone, Anywhere'),
	              ('David Gilmour', 'On An Island', 'A Pocketful Of Stones'), ...]
	        """
	        return sorted(self.songs_page(get_songs=True))

	    @staticmethod
	    def save_to_file(path: str, headers: List[str], data: Union[List[str], List[Tuple]]) -> None:
	        """
	        Write the rows to a csv file; nothing is written when data is empty
	        :param path: e.g. 'some/path/artists.csv'
	        :param headers: e.g. ['Artist', 'Album', 'Song']
	        :param data: e.g ['Anathema', 'David Gilmour', ...]
	        or [('Anathema', 'Judgement'), ('David Gilmour', 'On An Island'), ...]
	        :return: None
	        """
	        if not data:
	            # Nothing was parsed: do not create (or truncate) the output file
	            return

	        # newline='' leaves line endings to the csv module; without it every row
	        # is followed by an extra blank line on Windows
	        with open(path, 'w', encoding='utf-8', newline='') as file:
	            csv_out = csv.writer(file)
	            csv_out.writerow(headers)
	            # A bare string would be written one character per column, so wrap it in a list
	            csv_out.writerows([row] if isinstance(row, str) else row for row in data)

	    def save_all(self) -> None:
	        """
	        Write the artists, artist/album and artist/album/song csv files
	        :return: None
	        """
	        self.save_to_file(path=ARTISTS_OUTPUT_FILE, headers=['Artist'], data=self.get_artist_names())
	        self.save_to_file(path=ALBUMS_OUTPUT_FILE, headers=['Artist', 'Album'], data=self.get_artist_album_names())
	        self.save_to_file(path=SONGS_OUTPUT_FILE, headers=['Artist', 'Album', 'Song'],
	                          data=self.get_artist_album_song_names())


	if __name__ == '__main__':
	    # Script entry point: read the saved pages and write all csv reports
	    spotify = SpotifyPagesParser()
	    spotify.save_all()