Skip to content

Instantly share code, notes, and snippets.

@hartikainen
Last active July 27, 2019 00:34
Show Gist options
  • Save hartikainen/0eeaca69e6a8b2e039d8a1b3dec59802 to your computer and use it in GitHub Desktop.
Save hartikainen/0eeaca69e6a8b2e039d8a1b3dec59802 to your computer and use it in GitHub Desktop.
import glob
import os
from pprint import pprint
import re
import shutil
# Anchored patterns for the logger output files looked for in a trial
# directory. They are applied with `re.match` to bare file names.
RESULT_FILE_REGEXES = (
    r"^result\.json$",
    r"^progress\.csv$",
    # e.g. events.out.tfevents.1562394433.ray-hopp-2-head-4ba37bcf
    # (a numeric field followed by an arbitrary suffix).
    r"^events\.out\.tfevents\.\d+\..+$",
)

# Anchored patterns for the parameter files looked for in a trial directory.
PARAMS_FILE_REGEXES = (
    r"^params\.json$",
    r"^params\.pkl$",
)

# NOTE: the trailing comma is required. Without it this is a plain string,
# and iterating over it yields single characters instead of patterns.
CHECKPOINT_DIRECTORY_REGEXES = (
    r"^checkpoint_\d+$",
)
def is_result_file(filename):
    """Return True if `filename` matches any of RESULT_FILE_REGEXES."""
    for pattern in RESULT_FILE_REGEXES:
        if re.match(pattern, filename):
            return True
    return False
def is_params_file(filename):
    """Return True if `filename` matches any of PARAMS_FILE_REGEXES."""
    for pattern in PARAMS_FILE_REGEXES:
        if re.match(pattern, filename):
            return True
    return False
def is_checkpoint_directory(dirname):
    """Return True if `dirname` matches any of CHECKPOINT_DIRECTORY_REGEXES."""
    # TODO(hartikainen): might want to check the contents of this directory.
    # e.g. check `.tune_metadata`, etc.
    for pattern in CHECKPOINT_DIRECTORY_REGEXES:
        if re.match(pattern, dirname):
            return True
    return False
def is_trial_directory(root_dir):
    """Return True if `root_dir` looks like the directory of a single trial.

    A directory counts as a trial when it directly contains at least one
    result file (per `is_result_file`) and at least one params file (per
    `is_params_file`). Sub-directories are not inspected.

    Args:
        root_dir: Path of the directory to inspect.

    Returns:
        bool: False when `root_dir` cannot be listed (missing, unreadable or
        not a directory), or when either kind of file is absent.
    """
    # os.walk yields nothing for a path it cannot list. Use a default so
    # that StopIteration does not escape; when this function is called from
    # a generator expression it would otherwise surface as a RuntimeError.
    top_level = next(os.walk(root_dir), None)
    if top_level is None:
        return False
    _, _, files = top_level

    # Logger outputs expected in a trial directory:
    #   json logger: params.json, result.json, params.pkl
    #   csv logger: progress.csv
    #   tf logger: events.out.tfevents.1562394433.ray-hopp-2-head-4ba37bcf
    #   log_syncxurz09ic.log
    has_result_file = any(is_result_file(filename) for filename in files)
    has_params_file = any(is_params_file(filename) for filename in files)

    # TODO(hartikainen): checkpoint directories are not taken into account
    # here. When they are, match on the bare directory name: the checkpoint
    # pattern is anchored at the start, so a joined path would never match.
    # TODO(hartikainen): might want to check if "^log_sync\\d{8}.log$" exists
    return has_result_file and has_params_file
def is_experiment_directory(root, directories, files):
    """Return True if `root` looks like an experiment directory.

    Two criteria are tried in order:
      1) `root` directly contains a file matching "experiment_state*.json".
      2) `root` has sub-directories and every one of them is a trial
         directory according to `is_trial_directory`.

    Args:
        root: Path of the directory being examined.
        directories: Names of the immediate sub-directories of `root`.
        files: Names of the files directly in `root`. Currently unused; the
            state files are looked up on disk with `glob`.

    Returns:
        bool: Whether either criterion holds.

    Raises:
        AssertionError: If more than one experiment state file is found.
    """
    # 1) experiment_state.json exists -> is experiment
    # Escape `root` so that glob metacharacters ("[", "*", "?") occurring in
    # the directory path are matched literally instead of as wildcards.
    experiment_state_paths = glob.glob(
        os.path.join(glob.escape(root), "experiment_state*.json"))

    if experiment_state_paths:
        # TODO(hartikainen): This needs to be fixed. In general, a directory
        # can have multiple experiment state files. Softlearning experiment
        # directories shouldn't though.
        assert len(experiment_state_paths) == 1, experiment_state_paths
        return True

    # 2) All the subfolders are trials -> is experiment
    if directories and all(
            is_trial_directory(os.path.join(root, directory))
            for directory in directories):
        return True

    return False
def find_all_experiment_directories(root_dir):
    """Recursively collect the experiment directories under `root_dir`.

    The search stops descending as soon as a directory is recognised as an
    experiment by `is_experiment_directory`; directories nested inside an
    experiment are not reported separately.

    Args:
        root_dir: Directory to start from. A leading "~" is expanded.

    Returns:
        tuple: Paths of the experiment directories found. Empty when
        `root_dir` cannot be listed (missing, unreadable or not a directory).
    """
    root_dir = os.path.expanduser(root_dir)

    # os.walk yields nothing for a path it cannot list; treat that as
    # "nothing found" instead of letting StopIteration escape.
    top_level = next(os.walk(root_dir), None)
    if top_level is None:
        return ()
    root, directories, files = top_level

    if is_experiment_directory(root, directories, files):
        return (root_dir, )

    # Accumulate in a list: summing tuples re-copies the partial result at
    # every step.
    all_experiment_directories = []
    for directory in directories:
        all_experiment_directories.extend(
            find_all_experiment_directories(os.path.join(root, directory)))

    return tuple(all_experiment_directories)
def prune_and_copy_directory(source_base_directory,
                             target_base_directory,
                             relative_directory):
    """Copy a pruned version of a directory tree, preserving its layout.

    Walks `source_base_directory/relative_directory` and copies every file
    whose name matches one of the keep-patterns below to the same location
    relative to `target_base_directory`. Directories that contain no
    matching file are not created in the target.

    Args:
        source_base_directory: Base directory the source tree lives under.
        target_base_directory: Base directory to copy into; created on
            demand.
        relative_directory: Path of the tree to copy, relative to
            `source_base_directory`.
    """
    files_to_keep_regexes = (
        r"^params\.json$",
        r"^result\.json$",
        r"^progress\.csv$",
        # Any suffix made of word characters and dashes is accepted between
        # "experiment_state" and ".json".
        r"^experiment_state[\w-]*\.json$",
        r"^experiment_state$",
    )

    source_directory = os.path.join(source_base_directory, relative_directory)
    for root, _, files in os.walk(source_directory):
        files_to_copy = [
            file_name
            for file_name in files
            if any(re.match(regex, file_name)
                   for regex in files_to_keep_regexes)
        ]
        if not files_to_copy:
            continue

        # relpath strips only the leading base directory, unlike
        # str.replace, which would also remove later occurrences of the
        # same text in the path.
        target_directory = os.path.normpath(os.path.join(
            target_base_directory,
            os.path.relpath(root, source_base_directory)))
        os.makedirs(target_directory, exist_ok=True)

        for file_name in files_to_copy:
            shutil.copyfile(
                os.path.join(root, file_name),
                os.path.join(target_directory, file_name))
def main(base_directory="~/ray_results",
         target_directory="/tmp/data_for_richard"):
    """Copy a pruned version of every experiment found under a directory.

    Finds the experiment directories below `base_directory`, prints them,
    and copies each one to the same relative location under
    `target_directory` using `prune_and_copy_directory`.

    Args:
        base_directory: Directory to search. A leading "~" is expanded.
        target_directory: Directory the pruned copies are written to.
    """
    base_directory = os.path.expanduser(base_directory)

    experiment_directories = find_all_experiment_directories(base_directory)
    pprint(experiment_directories)

    for experiment_directory in experiment_directories:
        # relpath strips only the leading base directory, unlike
        # str.replace, which would also remove later occurrences of the
        # same text in the path.
        relative_experiment_directory = os.path.relpath(
            experiment_directory, base_directory)
        prune_and_copy_directory(
            base_directory,
            target_directory,
            relative_experiment_directory)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment