copy redis
# there are a few different ways to copy data from one redis instance to another
# from what i've read, you can set up your new instance as a replica of the old
# and let redis replication do its thing
# or, you can copy the underlying dump file (dump.rdb) from one instance's file system to the other's
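# (for reference, a rough sketch of the replication approach, assuming redis >= 5
#  and network access to both instances; redis < 5 uses SLAVEOF instead of REPLICAOF:
#    redis-cli -h new-host REPLICAOF old-host 6379
#    ... wait until INFO replication shows master_link_status:up ...
#    redis-cli -h new-host REPLICAOF NO ONE
#  )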
# however, sometimes you don't manage your own redis instances, or maybe you just don't want to bother
# with any of that. you could just copy the data.
# this script does that.
# there were some other scripts online already:
# - https://github.com/jeremyfa/node-redis-dump
# - https://github.com/yaauie/redis-copy
# the first one ran out of memory. it's a reported issue. the owner says 🤷🏽‍♂️ sorry, find another tool.
# the second one was really slow.
# so, i wrote this one.
# it uses redis pipelines, dump, restore, and larger-than-default batch sizes to reduce network latency effects.
# it also sets a default ttl on each key it copies over. for my use case these are all good things, but you can
# adjust the ttl and batch sizes pretty easily below.
# I run python 3.6
import redis
# copy data FROM source
source_redis_url = 'redis://user:pass@host:port'
# copy data TO destination
destination_redis_url = 'redis://user2:pass2@host2:port2'
# redis scan count default is 10.
# this variable is used for a couple of related things:
# 1. how many keys are requested from redis on each scan
# 2. how many commands are dispatched to redis in each pipeline execution
# generally speaking, i've found i can significantly reduce network latency effects
# by batching things this way, e.g., pay the cost of a round trip across the network every
# N commands instead of every 1 command.
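# (back-of-envelope with assumed numbers: copying 1,000,000 keys over a 1 ms round trip
#  costs roughly 2,000,000 ms (~33 minutes) one dump + one restore at a time, vs about
#  3,000 round trips (~3 seconds) in batches of 1000, counting one scan, one dump
#  pipeline, and one restore pipeline per batch, and ignoring server-side work.)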
scan_count = 1000
# 72 hour expiration (ttl param is in ms). set this to 0 for no ttl expiration on your keys.
default_ttl = 60 * 60 * 72 * 1000  # = 259,200,000 ms
# decode_responses must stay False: dump/restore payloads are binary
source_conn = redis.StrictRedis.from_url(source_redis_url, decode_responses=False)
destination_conn = redis.StrictRedis.from_url(destination_redis_url, decode_responses=False)
# pipelines dump calls to keys
# returns zipped list of keys and serialized dump results
# [[key1, serialized_result1], [key2, serialized_result2],...]
def dump_keys(conn, keys):
    pipe = conn.pipeline()
    for key in keys:
        pipe.dump(key)
    results = pipe.execute()
    # materialize so the result matches the comment above and can be iterated more than once
    zipped_results = list(zip(keys, results))
    return zipped_results
# pipelines restore calls
# takes in zipped list of keys and serialized dump values
# restores serialized value at key, with replacement, and a default ttl of 72 hours
def restore_dump_values(conn, zipped_results):
    pipe = conn.pipeline()
    for key, value in zipped_results:
        # a key deleted or expired between scan and dump comes back as None; skip it
        if value is None:
            continue
        pipe.restore(name=key, ttl=default_ttl, value=value, replace=True)
    result = pipe.execute()
    return result
def dump_and_restore(source_conn, destination_conn, keys):
    # dump all the keys
    zipped_results = dump_keys(conn=source_conn, keys=keys)
    # restore the serialized results
    restore_result = restore_dump_values(conn=destination_conn, zipped_results=zipped_results)
    return restore_result
total_run = 0
total_success = 0
keys = []
for key in source_conn.scan_iter(count=scan_count):
    keys.append(key)
    bucket_count = len(keys)
    if bucket_count >= scan_count:
        res = dump_and_restore(source_conn=source_conn, destination_conn=destination_conn, keys=keys)
        total_run = total_run + len(res)
        successes = [result for result in res if result == b'OK']
        total_success += len(successes)
        keys = []
        print(f"total_run {total_run}")
        print(f"total_success {total_success}")
# flush whatever is left over from the last partial batch
res = dump_and_restore(source_conn=source_conn, destination_conn=destination_conn, keys=keys)
total_run = total_run + len(res)
successes = [result for result in res if result == b'OK']
total_success += len(successes)
print(f"total_run {total_run}")
print(f"total_success {total_success}")