Last active
July 25, 2025 06:59
-
-
Save jazzl0ver/c87c5ebfd76c07b56ffe8448f40e737b to your computer and use it in GitHub Desktop.
Firecamp Cassandra restore script (aws cli v2 is required!)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # | |
| # https://gist.github.com/jazzl0ver/c87c5ebfd76c07b56ffe8448f40e737b | |
| # | |
| # Firecamp Cassandra restore script | |
| # Example: | |
| # ./fc_cass_restore.sh -r us-east-1 -c firecamp-qa -s cass-qa -d 2019-05-28 -u bd751a2269a44a2e52898bc0dd5cb2ac -o firecamp-uat -k no | |
| # where: | |
| # -r - region | |
| # -c - firecamp cluster name (MUST NOT match the cluster where backup was taken) | |
| # -s - firecamp cluster's service name (MAY match the name of the backed up service, but not recommended) | |
| # -d - backup creation date (Created tag of the volumes snapshots) | |
| # -u - ServiceUUID (ServiceUUID tag of the volumes snapshots) | |
| # -o - origin firecamp cluster name (MUST match the cluster where backup was taken) | |
| # -k - keep the new cluster name after restoration (yes/no) | |
| # | |
| # Script should be executed on an instance within the same VPC as Firecamp cluster and in AppSecurityGroup | |
| # Dependencies: awscli, docker, firecamp-service-cli, firecamp-volume-replace, jq | |
| # | |
| # The script does not change anything. It just creates new volumes from snapshots and prints | |
| # commands needed to restore the backed up volumes (made by fc_cass_backup.sh). | |
| # | |
| # Following policy should be assigned to the instance (or a user) where the script is executed: | |
| # { | |
| # "Version": "2012-10-17", | |
| # "Statement": [ | |
| # { | |
| # "Sid": "CreateVolumeFromSnapshot", | |
| # "Effect": "Allow", | |
| # "Action": "ec2:CreateVolume", | |
| # "Resource": "arn:aws:ec2:*::snapshot/*", | |
| # "Condition": { | |
| # "StringLike": { | |
| # "aws:ResourceTag/Name": "firecamp-*" | |
| # } | |
| # } | |
| # }, | |
| # { | |
| # "Sid": "CreateVolumeNewResource", | |
| # "Effect": "Allow", | |
| # "Action": "ec2:CreateVolume", | |
| # "Resource": "arn:aws:ec2:*:*:volume/*", | |
| # "Condition": { | |
| # "StringLike": { | |
| # "aws:RequestTag/Name": "firecamp-*" | |
| # } | |
| # } | |
| # }, | |
| # { | |
| # "Sid": "CreateSnapshotFromVolume", | |
| # "Effect": "Allow", | |
| # "Action": "ec2:CreateSnapshot", | |
| # "Resource": "arn:aws:ec2:*:277022112796:volume/*" | |
| # }, | |
| # { | |
| # "Sid": "CreateSnapshotResource", | |
| # "Effect": "Allow", | |
| # "Action": "ec2:CreateSnapshot", | |
| # "Resource": "arn:aws:ec2:*::snapshot/*", | |
| # "Condition": { | |
| # "StringLike": { | |
| # "aws:RequestTag/Name": "firecamp-*" | |
| # } | |
| # } | |
| # }, | |
| # { | |
| # "Sid": "CopySnapshot", | |
| # "Effect": "Allow", | |
| # "Action": "ec2:CopySnapshot", | |
| # "Resource": "arn:aws:ec2:*::snapshot/*" | |
| # }, | |
| # { | |
| # "Sid": "VisualEditor2", | |
| # "Effect": "Allow", | |
| # "Action": [ | |
| # "dynamodb:GetItem", | |
| # "dynamodb:Query", | |
| # "dynamodb:UpdateItem", | |
| # "ec2:CreateSnapshot", | |
| # "ec2:CopySnapshot" | |
| # ], | |
| # "Resource": [ | |
| # "arn:aws:dynamodb:*:*:table/firecamp-*", | |
| # "arn:aws:ec2:*::snapshot/*" | |
| # ] | |
| # }, | |
| # { | |
| # "Sid": "VisualEditor3", | |
| # "Effect": "Allow", | |
| # "Action": [ | |
| # "ec2:DeleteSnapshot" | |
| # ], | |
| # "Resource": [ | |
| # "arn:aws:ec2:*::snapshot/*" | |
| # ], | |
| # "Condition": { | |
| # "StringLike": { | |
| # "aws:ResourceTag/Name": "firecamp-*" | |
| # } | |
| # } | |
| # }, | |
| # { | |
| # "Sid": "VisualEditor4", | |
| # "Effect": "Allow", | |
| # "Action": [ | |
| # "ec2:DescribeInstances", | |
| # "ec2:CreateTags", | |
| # "ec2:DescribeVolumes", | |
| # "sts:GetCallerIdentity", | |
| # "ec2:DescribeSnapshots" | |
| # ], | |
| # "Resource": "*" | |
| # } | |
| # ] | |
| # } | |
| # | |
| # | |
# Modify FCCLI and FCVR vars to the actual paths
FCCLI=~ec2-user/firecamp/3.0/firecamp-service-cli
FCVR=~ec2-user/firecamp/3.0/firecamp-volume-replace

#-- Do not modify below

# Both Firecamp tools must exist and be executable; otherwise stop with a
# non-zero status and tell the operator which tool goes into which directory.
for required_tool in "$FCCLI" "$FCVR"; do
  if [[ ! -x "$required_tool" ]]; then
    echo "Download $(basename "$required_tool") tool into $(dirname "$required_tool") before using this script" >&2
    exit 1
  fi
done
unset required_tool

# Make binaries under /usr/local/bin reachable for the aws check below and
# for the rest of the script.
PATH=$PATH:/usr/local/bin

# Reduce the first word of "aws --version" to "aws-cli/<major>". stderr is
# merged in so that a missing binary, or a CLI that reports its version on
# stderr, results in the clean error below instead of a broken test.
aws_cli_major=$(aws --version 2>&1 | cut -f1 -d' ' | cut -f1 -d.)
if [[ "$aws_cli_major" != "aws-cli/2" ]]; then
  echo "AWS CLI v2 is required" >&2
  exit 1
fi
unset aws_cli_major
# Parse the command line. The leading ':' in the optstring puts getopts into
# silent mode, so unknown options and missing option arguments both end up in
# the catch-all arm below.
while getopts :s:c:r:d:u:o:k: opt; do
  case "$opt" in
    r) region="$OPTARG" ;;
    c) cluster="$OPTARG" ;;
    s) servicename="$OPTARG" ;;
    d) created="$OPTARG" ;;
    u) uuid="$OPTARG" ;;
    o) origcluster="$OPTARG" ;;
    k) keepnew="$OPTARG" ;;
    *)
      echo "=== Error with Options Input. Cause of failure is most likely that an unsupported parameter was passed or a parameter was passed without a corresponding option." 1>&2
      exit 64
      ;;
  esac
done

# Every option is mandatory. Usage errors exit with 64, the same status the
# getopts error arm above uses.
if [[ -z "$region" || -z "$cluster" || -z "$servicename" || -z "$created" \
  || -z "$uuid" || -z "$origcluster" || -z "$keepnew" ]]; then
  echo "Not enough parameters, exiting..." >&2
  exit 64
fi

# The rest of the script only compares -k against the literal strings "yes"
# and "no"; any other value would silently skip parts of both variants of the
# restore plan, so reject it up front.
if [[ "$keepnew" != "yes" && "$keepnew" != "no" ]]; then
  echo "Option -k must be 'yes' or 'no', exiting..." >&2
  exit 64
fi
#######################################
# Re-evaluates a shell expression until its output is something other than 0.
#
# All arguments are joined with spaces and passed to eval, so the expression
# may contain pipes; it can be given as one quoted string or as several
# words. Blanks are stripped from the output before it is compared (some
# "wc -l" implementations pad the number with spaces).
#
# Globals:
#   WAIT_UNTIL_INTERVAL (read, optional) - seconds to sleep between
#                                          attempts, defaults to 5
# Arguments:
#   $@ - the expression to evaluate
# Returns:
#   0 once the expression printed anything but "0" (including nothing at all)
#######################################
wait_until() {
  local interval=${WAIT_UNTIL_INTERVAL:-5}
  local result

  # A loop instead of self-recursion: a long wait must not keep nesting
  # function calls.
  while :; do
    result=$(eval "$*" | sed 's/ //g')
    [[ "$result" == 0 ]] || return 0
    sleep "$interval"
  done
}
# Command lists that are filled while volumes are created and printed at the
# end of the script. Entries are separated by a literal "\n" sequence that is
# expanded when the lists are printed. All three start empty so that nothing
# inherited from the environment leaks into the printed commands.
replace_cmd=""
delete_cmd=""
restore_cmd=""

# Show the operator how to create and stop the target service, then wait for
# confirmation (Enter) before anything is queried or created.
cat <<EOF

=== Before running this script make sure to create new C* service (hit Ctrl-C after the 1st 'wait the service containers running, RunningCount 1' line) and stop it:
$FCCLI -region=$region -cluster=$cluster -op=create-service -service-type=cassandra -service-name=$servicename -replicas=3 -volume-size=2 -journal-volume-size=1 -cas-heap-size=512
$FCCLI -region=$region -cluster=$cluster -op=stop-service -service-type=cassandra -service-name=$servicename

*** Hit Ctrl-C to exit or Enter to continue
EOF
read -r
# Space-separated seed list taken from the CASSANDRA_SEEDS line of the
# get-service output; it is used later in the printed per-node cqlsh loop.
seeds=$("$FCCLI" -region="$region" -cluster="$cluster" -op=get-service -service-type=cassandra -service-name="$servicename" \
  | grep CASSANDRA_SEEDS | cut -f2 -d= | sed -e 's/,/ /g')

#######################################
# Creates a gp2 volume from a snapshot, waits until it is reported as
# available and prints the id of the new volume.
# Globals:   region (read)
# Arguments: $1 - snapshot id
#            $2 - availability zone for the new volume
#            $3 - value for the volume's Name tag
# Outputs:   the new volume id on stdout
# Returns:   1 if create-volume returned no volume id, 0 otherwise
#######################################
create_volume_from_snapshot() {
  local snapshot_id=$1 zone=$2 name_tag=$3
  local new_volume_id

  new_volume_id=$(aws --region="$region" ec2 create-volume \
    --snapshot-id "$snapshot_id" --availability-zone "$zone" --volume-type gp2 \
    --tag-specifications "ResourceType=volume,Tags=[{Key=Name,Value=$name_tag}]" \
    --query 'VolumeId' --output text)
  [[ -n "$new_volume_id" ]] || return 1

  # Poll describe-volumes until its output mentions "available".
  wait_until "aws --region=$region ec2 describe-volumes --volume-ids $new_volume_id | grep available | wc -l"
  printf '%s\n' "$new_volume_id"
}

accountid=""
member=""; az=""; pvolid=""; jvolid=""

# list-members output is reduced to "Key:value" tokens, one per line. The four
# attributes of a member are collected one token at a time; as soon as all of
# them are known the member is processed and the collectors are reset.
# The token stream is read from fd 3 so that commands inside the loop cannot
# consume it through stdin.
while IFS=: read -r -u 3 key val; do
  case "$key" in
    MemberName)      member=$val ;;
    AvailableZone)   az=$val ;;
    PrimaryVolumeID) pvolid=$val ;;
    JournalVolumeID) jvolid=$val ;;
  esac
  [[ -n "$member" && -n "$az" && -n "$pvolid" && -n "$jvolid" ]] || continue

  # The replacement volumes get the same Name tag as the volumes they replace.
  pvolName=$(aws --region="$region" ec2 describe-volumes --volume-ids "$pvolid" --query 'Volumes[].Tags[?Key==`Name`].Value' --output text)
  jvolName=$(aws --region="$region" ec2 describe-volumes --volume-ids "$jvolid" --query 'Volumes[].Tags[?Key==`Name`].Value' --output text)
  if [[ -z "$pvolName" || -z "$jvolName" ]]; then
    echo "Can't get volume name for $pvolid or $jvolid" >&2
    exit 1
  fi

  echo "Creating volumes from snapshots for $member:"

  # The account id does not change between members: look it up only once.
  if [[ -z "$accountid" ]]; then
    accountid=$(aws sts get-caller-identity --output text --query 'Account')
  fi

  # Snapshots are looked up in the region given with -r; create-volume below
  # runs in that same region and can only use snapshots that exist there.
  sshots=$(aws --region="$region" ec2 describe-snapshots --owner-ids "$accountid" \
    --filters "Name=tag:Created,Values=$created" \
              "Name=tag:ServiceUUID,Values=$uuid" \
              "Name=tag:AvailableZone,Values=$az" \
    --output json)

  # any() yields each snapshot at most once, even when several of its tag
  # values contain the word.
  spvolid=$(printf '%s' "$sshots" | jq -r '.Snapshots[] | select(any(.Tags[]?; .Value | test("Primary"))) | .SnapshotId')
  sjvolid=$(printf '%s' "$sshots" | jq -r '.Snapshots[] | select(any(.Tags[]?; .Value | test("Journal"))) | .SnapshotId')
  if [[ -z "$spvolid" || -z "$sjvolid" ]]; then
    echo "Something went wrong - can't find Primary/Journal tags in snapshots" >&2
    exit 1
  fi
  # More than one id means the filters are ambiguous; a volume cannot be
  # created from a list of snapshots.
  if [[ "$spvolid" == *$'\n'* || "$sjvolid" == *$'\n'* ]]; then
    echo "Something went wrong - more than one Primary/Journal snapshot matches" >&2
    exit 1
  fi

  npvolid=$(create_volume_from_snapshot "$spvolid" "$az" "$pvolName") \
    || { echo "Can't create volume from snapshot $spvolid" >&2; exit 1; }
  printf '\tPrimary volume created - %s\n' "$npvolid"

  njvolid=$(create_volume_from_snapshot "$sjvolid" "$az" "$jvolName") \
    || { echo "Can't create volume from snapshot $sjvolid" >&2; exit 1; }
  printf '\tJournal volume created - %s\n' "$njvolid"

  # Queue the commands that are printed at the end of the script: swap the
  # service over to the new volumes, swap it back, delete the old volumes.
  # The trailing \n is kept literal on purpose; it is expanded when printed.
  replace_cmd+="$FCVR -cluster=$cluster -service-name=$servicename -bad-volumeid=$pvolid -new-volumeid=$npvolid 2>> fcvr.replace.log\n"
  replace_cmd+="$FCVR -cluster=$cluster -service-name=$servicename -bad-volumeid=$jvolid -new-volumeid=$njvolid 2>> fcvr.replace.log\n"
  restore_cmd+="$FCVR -cluster=$cluster -service-name=$servicename -bad-volumeid=$npvolid -new-volumeid=$pvolid 2>> fcvr.restore.log\n"
  restore_cmd+="$FCVR -cluster=$cluster -service-name=$servicename -bad-volumeid=$njvolid -new-volumeid=$jvolid 2>> fcvr.restore.log\n"
  delete_cmd+="aws --region $region ec2 delete-volume --volume-id $pvolid\n"
  delete_cmd+="aws --region $region ec2 delete-volume --volume-id $jvolid\n"

  member=""; az=""; pvolid=""; jvolid=""
done 3< <("$FCCLI" -region="$region" -cluster="$cluster" -op=list-members -service-type=cassandra -service-name="$servicename" \
  | grep -oE "(MemberName|AvailableZone|PrimaryVolumeID|JournalVolumeID):[a-z0-9-]+")
#######################################
# Prints the manual restore procedure: the commands the operator has to run to
# swap the volumes, patch the ECS task definition, fix up Cassandra's system
# tables and finally delete (or bring back) the old volumes. Nothing is
# executed here; the commands are only printed.
#
# Globals (all read): region, cluster, servicename, origcluster, keepnew,
#   seeds, FCCLI, replace_cmd, delete_cmd, restore_cmd
# Outputs: the procedure on stdout
#######################################
print_restore_plan() {
  # Building blocks of the printed commands. "\$taskdef" and the $(...) in
  # family_expr must reach the output unexpanded: they are evaluated later by
  # the operator's shell, not by this script.
  local family_expr='$(echo $taskdef | cut -f2 -d/ | cut -f1 -d:)'
  local current_taskdef="\$(aws --region $region ecs describe-services --cluster $cluster --services $servicename | jq -r '.services[].taskDefinition')"
  local describe_taskdef="aws --region $region ecs describe-task-definition --task-definition \$taskdef | jq -r '.taskDefinition' | jq -r 'del(.taskDefinitionArn,.requiresAttributes,.compatibilities,.status,.family,.revision,.registeredAt,.registeredBy)'"
  local deregister_taskdef="aws --region $region ecs deregister-task-definition --task-definition \$taskdef --no-paginate >/dev/null"
  local register_taskdef="aws --region $region ecs register-task-definition --family $family_expr --cli-input-json file://taskdef.json >/dev/null"
  local update_service="aws --region $region ecs update-service --cluster $cluster --service $servicename --task-definition $family_expr >/dev/null"
  local start_service="$FCCLI -region=$region -cluster=$cluster -op=start-service -service-type=cassandra -service-name=$servicename"
  local stop_service="$FCCLI -region=$region -cluster=$cluster -op=stop-service -service-type=cassandra -service-name=$servicename"

  # CQL run against every seed node. The cluster name stored in system.local
  # is only rewritten when the new cluster name is to be kept.
  local update_cql=""
  if [ "$keepnew" = "yes" ]; then
    update_cql="UPDATE system.local SET cluster_name = '$cluster' WHERE key = 'local'; "
  fi
  update_cql="${update_cql}TRUNCATE system.peers; TRUNCATE system.peers_v2;"

  cat <<EOF

=== Replace volumes using the following commands:
EOF
  # %b expands the literal "\n" separators stored in the command lists.
  printf '%b\n' "$replace_cmd"

  # The patched task definition overrides the entrypoint so that the cluster
  # name in service.conf is switched from the new cluster to the original one
  # before Cassandra starts. "sed -i -e" (not "-ie") edits the file in place
  # without leaving a backup copy with an "e" suffix behind.
  cat <<EOF

=== Modify task definition and service:
taskdef=$current_taskdef
$describe_taskdef | jq -r '.containerDefinitions[] += { "entryPoint": ["/bin/bash","-c","sed -i -e \"s/CLUSTER=$cluster/CLUSTER=$origcluster/g\" /data/conf/service.conf && /docker-entrypoint.sh cassandra -f" ] }' > taskdef.json
EOF
  if [ "$keepnew" = "no" ]; then
    printf '%s\n' "$deregister_taskdef"
  fi

  cat <<EOF
$register_taskdef

=== Update ECS service:
$update_service

=== Start C*:
$start_service

=== Wait for 30 seconds to get C* up...
sleep 30

=== Save JMX creds
$FCCLI -region=$region -cluster=$cluster -op=get-service -service-type=cassandra -service-name=$servicename | grep JMX_REMOTE | cut -f2 -d= > /tmp/.jmx.pass

=== Update each C* node
for seed in \$(echo "$seeds"); do /bin/docker run -ti --rm --mount type=bind,source=/tmp/.jmx.pass,destination=/root/jmx.pass harisekhon/cassandra-dev cqlsh -u sp \$seed -e "$update_cql"; done

=== Delete JMX creds
/bin/rm /tmp/.jmx.pass

=== Stop C*:
$stop_service

EOF

  # When the new cluster name is kept, the entrypoint override is dropped
  # again: the task definition saved in \$taskdef is re-registered unpatched
  # and the patched revision (taskdef2) is deregistered.
  if [ "$keepnew" = "yes" ]; then
    cat <<EOF
=== Modify task definition and service:
$describe_taskdef > taskdef.json
$deregister_taskdef
$register_taskdef

=== Update ECS service:
taskdef2=$current_taskdef
$update_service
aws --region $region ecs deregister-task-definition --task-definition \$taskdef2 --no-paginate >/dev/null

EOF
  fi

  cat <<EOF
=== Start C*:
$start_service

=== Make sure everything is alright and (if yes) run the following commands to delete the old volumes:
EOF
  printf '%b\n' "$delete_cmd"

  cat <<EOF

=== To revert changes back, use the following commands:
EOF
  printf '%b\n' "$restore_cmd"
}

print_restore_plan
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment