Skip to content

Instantly share code, notes, and snippets.

@bonelifer
Last active April 1, 2025 23:18
Show Gist options
  • Save bonelifer/4059100046084923660b23848b3c38f3 to your computer and use it in GitHub Desktop.
Save bonelifer/4059100046084923660b23848b3c38f3 to your computer and use it in GitHub Desktop.
Script to Fetch MusicBrainz IDs (MBIDs) for Artists from a CSV File
#!/usr/bin/env python3
"""
Script to Fetch MusicBrainz IDs (MBIDs) for Artists from a CSV File
Purpose:
This script reads a CSV file containing artist names, extracts unique artist names
from the columns 'Artist Name 1' to 'Artist Name 6', and queries the MusicBrainz API
for the MusicBrainz ID (MBID) of each artist. It then saves the results to a JSON file.
Steps:
1. Load a CSV file containing song details, which includes up to six columns for artist names.
2. Extract artist names from these columns, handling cases where multiple artists are listed in a single cell.
3. Query the MusicBrainz API to retrieve the MBID for each unique artist.
4. Each MBID that is found is recorded as a {"MusicBrainzId": "<mbid>"} object; artists with no match are reported on the console and skipped.
5. The script respects the MusicBrainz API rate limit by introducing a delay between requests.
6. The results are saved in a formatted JSON file named 'artists_mbids.json'.
"""
import os # Import os to handle file path checks
import pandas as pd # Import pandas to read CSV and manipulate data
import requests # Import requests for making HTTP requests to the MusicBrainz API
import time # Import time for adding delays to prevent hitting rate limits
import json # Import json for formatting the output as JSON
# --- User-editable settings ---
# Location of the CSV export to read; change this to where your CSV file is located.
file_path = r"./musiclibrarysongs.csv"
# Application identifier and contact address. Both are combined into the
# User-Agent header that accompanies every MusicBrainz request.
app_name = "GoogleTakeoutMusicToMBid/1.0"  # A unique name describing this script
email = "[email protected]"  # Replace with your actual email address
# Load the CSV export into `df`. On any failure an explanation is printed and
# `df` is deliberately left undefined: the rest of the script checks for the
# existence of `df` to decide whether there is anything to process.
if not os.path.isfile(file_path):
    # The file doesn't exist: report it and suggest what to do
    print(f"Error: The file at '{file_path}' could not be found.")
    print("Please ensure the file exists at the specified location and try again.")
else:
    try:
        # on_bad_lines='skip' drops malformed rows instead of aborting the load.
        # dtype=str keeps every cell as text, so numeric-looking artist names
        # (e.g. "311") are not converted to numbers, which cannot be split on commas.
        df = pd.read_csv(file_path, on_bad_lines='skip', dtype=str)
    except Exception as e:
        # Parsing, decoding and permission problems all end up here; report
        # them and carry on without `df`.
        print(f"An error occurred while trying to load the file: {e}")
        print("Please check the file format or permissions and try again.")
    else:
        print("File successfully loaded.")
def split_artist_cells(cells):
    """
    Splits raw artist cells into a set of unique, cleaned artist names.
    Parameters:
    cells (iterable): Non-null cell values; each may list several artists separated by commas.
    Returns:
    set: The distinct artist names, with surrounding whitespace removed and blank entries dropped.
    """
    names = set()
    for cell in cells:
        # str() protects against cells that were parsed as numbers rather than text.
        for piece in str(cell).split(','):
            name = piece.strip()
            # "A,,B" or a trailing comma would otherwise produce an empty
            # name and a pointless API lookup later on.
            if name:
                names.add(name)
    return names


# If the file is successfully loaded, we proceed to process the data
# (`df` is only ever assigned by a successful read of the CSV file).
if 'df' in locals():
    # Strip leading/trailing spaces from column names so lookups by name work.
    df.columns = df.columns.str.strip()
    # A set is used so that duplicate artist names collapse automatically.
    artist_names = set()
    # Artist names live in the columns 'Artist Name 1' to 'Artist Name 6'.
    for i in range(1, 7):
        artist_column = f'Artist Name {i}'
        # Not every export has all six columns; skip the missing ones.
        if artist_column in df.columns:
            artist_names |= split_artist_cells(df[artist_column].dropna())
# Headers sent with each API request. MusicBrainz requires a custom
# User-Agent, which lets it identify who is making the requests.
headers = {'User-Agent': f'{app_name} ({email})'}  # Replace with your own details
# MusicBrainz endpoint used to search for artists by name
api_endpoint = "https://musicbrainz.org/ws/2/artist/"
# Collects one entry per artist whose MusicBrainz ID (MBID) was found
mbids = []
def get_artist_mbid(artist_name):
    """
    Queries the MusicBrainz API to retrieve the MusicBrainz ID (MBID) for a given artist.
    Parameters:
    artist_name (str): The name of the artist to search for.
    Returns:
    str: The MusicBrainz ID (MBID) for the first matching artist, or None if the name is
    blank, no match is found, or the request/response could not be processed.
    """
    # A blank name cannot identify an artist; skip the network round trip entirely.
    if artist_name is None or not str(artist_name).strip():
        return None
    # Define the query parameters for the API request
    query_params = {
        'query': artist_name,  # The search query string to find the artist by name
        'fmt': 'json',  # Ask for a JSON response
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run;
        # without it, requests waits indefinitely for a response.
        response = requests.get(
            api_endpoint,
            params=query_params,
            headers=headers,
            timeout=30,
        )
        response.raise_for_status()  # Raise an error if the HTTP request failed
        data = response.json()  # Parse the response as JSON
        artists = data.get('artists')
        if artists:
            # Results are taken in the order the API returns them; use the first one.
            return artists[0]['id']
    except requests.exceptions.RequestException as e:
        # Network failure, timeout or HTTP error status
        print(f"Error while fetching data for artist '{artist_name}': {e}")
    except Exception as e:
        # Best-effort: anything else (e.g. an unexpected payload shape) is
        # reported and treated as "not found" so the remaining artists still run.
        print(f"Unexpected error while processing artist '{artist_name}': {e}")
    # Return None if no artist was found or an error occurred
    return None
def fetch_mbids(names, lookup=None, delay=1.0):
    """
    Looks up every artist name and collects the MBIDs that were found.
    Parameters:
    names (iterable of str): Artist names to resolve.
    lookup (callable): Maps an artist name to its MBID, or to None when there is no match.
    Defaults to get_artist_mbid.
    delay (float): Seconds to wait between consecutive lookups, to stay within the API rate limit.
    Returns:
    list: One {"MusicBrainzId": mbid} dict per resolved artist, ordered by artist name.
    """
    if lookup is None:
        lookup = get_artist_mbid
    found = []
    # Sorting makes the output order reproducible; iterating a set directly
    # can produce a different order on every run.
    for position, artist in enumerate(sorted(names)):
        if position:
            # Pause between requests only; there is nothing to wait for
            # before the first lookup or after the last one.
            time.sleep(delay)
        mbid = lookup(artist)
        if mbid:
            found.append({"MusicBrainzId": mbid})
        else:
            # No MBID for this artist: log it and move on
            print(f"MBID not found for artist: {artist}")
    return found


# Only run when the CSV was loaded (`df` is only assigned by a successful read);
# otherwise there are no artist names and no output file should be written.
if 'df' in locals():
    # Fetch the MBIDs, waiting 1 second between requests to respect the rate limit
    mbids = fetch_mbids(artist_names)
    # Serialize before opening the file so a serialization problem cannot
    # leave a truncated output file behind. indent=4 pretty-prints the array.
    json_output = json.dumps(mbids, indent=4)
    # Name of the output file where the JSON data will be saved
    output_file = "artists_mbids.json"
    try:
        # UTF-8 encoding handles any special characters properly
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json_file.write(json_output)
        print(f"JSON array of MusicBrainz IDs has been saved to '{output_file}'.")
    except OSError as e:
        # Opening or writing the file failed (permissions, disk full, bad path, ...)
        print(f"An error occurred while writing the output file: {e}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment