Last active
December 15, 2021 14:46
-
-
Save bnekolny/8ec96faa0d57913eea6bb7848c06b912 to your computer and use it in GitHub Desktop.
MLFlow migration script from filesystem to database tracking data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import yaml | |
## Execution for me was:
##   `python migrate_data.py > ./migration_inserts.sql`
##   `sed -i '' 's/None/NULL/' ./migration_inserts.sql`
##   `psql -f ./migration_inserts.sql`
## NOTE: A few things to know about this script:
## - Artifacts were stored remotely, so there is no artifact migration
## - run source_type is always LOCAL for us, so I skipped the int -> str mapping
## - run status is always FINISHED for us, so I skipped the int -> str mapping
## - run source_version is never set
## - lifecycle_stage (experiments and runs) is always active for us, so I skipped the int -> str mapping
## - metric timestamp is made up, since it was tracked as an array in the filesystem and as an epoch in the DB
# Root of the MLflow file-store directory to migrate; edit before running.
rootDir = 'MLFLOW_DATA_DIRECTORY_ROOT'


def sql_text(value):
    """Render *value* as a single-quoted SQL string literal.

    ``None`` becomes the bare keyword ``NULL`` so the output needs no
    post-processing, and embedded single quotes are doubled so that names,
    parameter values or URIs containing ``'`` cannot break (or inject into)
    the generated statement.
    """
    if value is None:
        return "NULL"
    return "'" + str(value).replace("'", "''") + "'"


def sql_int(value):
    """Render *value* as an unquoted SQL integer literal (``NULL`` for None).

    Raises ``ValueError``/``TypeError`` when *value* is not integer-like,
    instead of emitting a statement that would only fail later in the
    database.
    """
    if value is None:
        return "NULL"
    return str(int(value))


def load_meta(directory):
    """Parse ``<directory>/meta.yaml`` and return its contents.

    ``yaml.safe_load`` is used instead of ``yaml.load``: calling ``yaml.load``
    without a ``Loader`` is rejected by PyYAML >= 6 and can construct
    arbitrary objects on older versions. Only plain keys are read from the
    result, so the safe loader is sufficient.
    """
    with open(os.path.join(directory, "meta.yaml"), "r") as meta_file:
        return yaml.safe_load(meta_file)


def experiment_insert(experiment_id, experiment):
    """Return the INSERT statement for one experiment.

    *experiment* is the parsed experiment ``meta.yaml``; its ``name`` and
    ``artifact_location`` keys are required. ``lifecycle_stage`` is always
    written as ``active``.
    """
    return (
        "INSERT INTO experiments (experiment_id, name, artifact_location, lifecycle_stage) "
        "VALUES ({0}, {1}, {2}, {3});"
    ).format(
        sql_int(experiment_id),
        sql_text(experiment['name']),
        sql_text(experiment['artifact_location']),
        sql_text('active'))


def run_insert(run_uuid, run, experiment_id):
    """Return the INSERT statement for one run.

    *run* is the parsed run ``meta.yaml``. ``source_type``, ``status``,
    ``source_version`` and ``lifecycle_stage`` are written as the fixed values
    ``LOCAL``, ``FINISHED``, the empty string and ``active`` rather than being
    mapped from the file.
    """
    return (
        "INSERT INTO runs ("
        "run_uuid, name, source_type, source_name, entry_point_name, user_id, status, "
        "start_time, end_time, source_version, lifecycle_stage, artifact_uri, experiment_id"
        ") VALUES ( {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12});"
    ).format(
        sql_text(run_uuid),
        sql_text(run['name']),
        sql_text('LOCAL'),
        sql_text(run['source_name']),
        sql_text(run['entry_point_name']),
        sql_text(run['user_id']),
        sql_text('FINISHED'),
        sql_int(run['start_time']),
        sql_int(run['end_time']),
        sql_text(''),
        sql_text('active'),
        sql_text(run['artifact_uri']),
        sql_int(experiment_id))


def metric_inserts(run_dir, run_uuid, start_time):
    """Yield one INSERT statement per data line of every file in ``metrics/``.

    The file name is the metric key. Each line must hold exactly two
    whitespace-separated fields, ``<counter> <value>``; the stored timestamp
    is synthesised as ``start_time + counter``. Blank lines are skipped, and a
    missing ``metrics`` directory yields nothing.
    """
    metrics_dir = os.path.join(run_dir, "metrics")
    if not os.path.isdir(metrics_dir):
        return
    # Sorted so that repeated executions produce identical output.
    for metric in sorted(os.listdir(metrics_dir)):
        with open(os.path.join(metrics_dir, metric), "r") as metric_file:
            for line in metric_file:
                if not line.strip():
                    continue
                counter, val = line.split()
                yield (
                    "INSERT INTO metrics ("
                    "key, value, timestamp, run_uuid"
                    ") VALUES ( {0}, {1}, {2}, {3} );"
                ).format(
                    sql_text(metric),
                    sql_text(val),
                    int(start_time) + int(counter),
                    sql_text(run_uuid))


def param_inserts(run_dir, run_uuid):
    """Yield one INSERT statement per line of every file in ``params/``.

    The file name is the parameter key and the stripped line is its value.
    A missing ``params`` directory yields nothing.
    """
    params_dir = os.path.join(run_dir, "params")
    if not os.path.isdir(params_dir):
        return
    for param in sorted(os.listdir(params_dir)):
        with open(os.path.join(params_dir, param), "r") as param_file:
            for line in param_file:
                yield (
                    "INSERT INTO params ("
                    "key, value, run_uuid"
                    ") VALUES ( {0}, {1}, {2} );"
                ).format(
                    sql_text(param),
                    sql_text(line.strip()),
                    sql_text(run_uuid))


def generate_inserts(root_dir):
    """Walk *root_dir* and yield every INSERT statement in dependency order.

    Each experiment's statement is yielded before its runs, and each run's
    statement before that run's metrics and params. The ``.trash`` directory
    and any entry that is not a directory (``meta.yaml`` itself, stray files)
    are skipped.
    """
    for experiment_id in sorted(os.listdir(root_dir)):
        experiment_dir = os.path.join(root_dir, experiment_id)
        if experiment_id == '.trash' or not os.path.isdir(experiment_dir):
            continue
        yield experiment_insert(experiment_id, load_meta(experiment_dir))

        for run_uuid in sorted(os.listdir(experiment_dir)):
            run_dir = os.path.join(experiment_dir, run_uuid)
            if not os.path.isdir(run_dir):
                continue
            run = load_meta(run_dir)
            yield run_insert(run_uuid, run, experiment_id)
            for statement in metric_inserts(run_dir, run_uuid, run['start_time']):
                yield statement
            for statement in param_inserts(run_dir, run_uuid):
                yield statement


def main():
    """Print the migration SQL for ``rootDir`` to stdout, one statement per line."""
    for statement in generate_inserts(rootDir):
        print(statement)


if __name__ == "__main__":
    main()
@bnekolny can you please advise which MLflow version this script is suited for?
I had originally written this for 0.9.0, but I haven't kept up to know if it's still applicable
Thank you for your reply.
So you fully migrated to 0.9.0 from previous versions (e.g. 0.8.x)?
This migration was written to move from MLflow data being stored in the filesystem to MLflow data being stored in a database. So while I more than likely did perform a version update, I don't think there was much significance to the version change.
The driver for this script was to take the existing content from the filesystem and get it shuffled into the database to be used by the new DB data backend.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you for your reply.
So you fully migrated to 0.9.0 from previous versions(e.g. 0.8.x)?