Skip to content

Instantly share code, notes, and snippets.

@ramondeklein
Created March 2, 2021 18:49
Show Gist options
  • Save ramondeklein/fb016893c500685c976dd1e89a0b5873 to your computer and use it in GitHub Desktop.
Save ramondeklein/fb016893c500685c976dd1e89a0b5873 to your computer and use it in GitHub Desktop.
Rename table in AWS glue (including partition scheme)
# This GIST can be used to rename a table in AWS Glue and keep the
# partitioning scheme intact.
import boto3
region = "eu-west-1"
database_name = "rdk-test"
table_name = "yt_uncompressed"
new_table_name = "nyc_taxiset"
client = boto3.client("glue", region_name=region)
response = client.get_table(DatabaseName=database_name, Name=table_name)
table_input = response["Table"]
table_input["Name"] = new_table_name
table_input.pop("CreatedBy")
table_input.pop("CreateTime")
table_input.pop("UpdateTime")
table_input.pop("DatabaseName")
table_input.pop("IsRegisteredWithLakeFormation")
table_input.pop("CatalogId")
client.create_table(DatabaseName=database_name, TableInput=table_input)
response = client.get_table(DatabaseName=database_name, Name=new_table_name)
table_input = response["Table"]
paginator = client.get_paginator('get_partitions')
response = paginator.paginate(
DatabaseName=database_name,
TableName=table_name
)
partitions = list()
for page in response:
for p in page['Partitions']:
copy = p.copy()
copy.pop("DatabaseName")
copy.pop("TableName")
copy.pop("CreationTime")
copy.pop("CatalogId")
partitions.append(copy)
client.batch_create_partition(
CatalogId=table_input["CatalogId"],
DatabaseName=database_name,
TableName=new_table_name,
PartitionInputList=partitions
)
@msropp
Copy link

msropp commented Dec 14, 2022

This is a super helpful script. Thank you! 🥳

One update (probably a change in AWS since you wrote the original): I had to also do a table_input.pop() on "VersionId". 😄

@duke8585
Copy link

duke8585 commented Jun 27, 2023

This is a super helpful script. Thank you! 🥳

One update (probably a change in AWS since you wrote the original): I had to also do a table_input.pop() on "VersionId". 😄

+1

and you might wanna add a batched add partitions à la

    batch_size = 50  # NOTE must be lower than 100

    while len(partitions) > 0:
        partitions_batch = partitions[0:batch_size]
        del partitions[0:batch_size]

        partition_list = sorted([p.get("Values")[0] for p in partitions_batch])
        print(f"+++\n\n adding partitions to {new_table_name}: {partition_list} \n\n+++")

        client.batch_create_partition(
            CatalogId=table_input["CatalogId"],
            DatabaseName=database_name,
            TableName=new_table_name,
            PartitionInputList=partitions_batch,
        )

@tommydevrijer-lumen
Copy link

tommydevrijer-lumen commented Sep 3, 2024

I have a version to be used as a glue job which is more late-2024 friendly that could be useful for people:

import boto3
import logging
from botocore.exceptions import ClientError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

region = "eu-west-1"
database_name = "lamp-production"
table_name = "recrawledmodel_output"
new_table_name = "model_output"
partition_batch_size = 100  # AWS Glue allows a maximum of 100 partitions per batch

def rename_glue_table():
    try:
        client = boto3.client("glue", region_name=region)

        logger.info(f"Fetching the table '{table_name}' from the database '{database_name}'")
        response = client.get_table(DatabaseName=database_name, Name=table_name)
        table_input = response["Table"]

        # Prepare the table input by removing unnecessary or invalid fields
        table_input = {
            key: value for key, value in table_input.items()
            if key in {
                "Name", "Description", "Owner", "LastAccessTime", "LastAnalyzedTime", "Retention",
                "StorageDescriptor", "PartitionKeys", "ViewOriginalText", "ViewExpandedText",
                "TableType", "Parameters", "TargetTable", "ViewDefinition"
            }
        }

        table_input["Name"] = new_table_name

        logger.info(f"Creating the new table '{new_table_name}'")
        client.create_table(DatabaseName=database_name, TableInput=table_input)

        logger.info(f"Fetching the partitions for the original table '{table_name}'")
        paginator = client.get_paginator('get_partitions')
        response = paginator.paginate(DatabaseName=database_name, TableName=table_name)

        partitions = []
        for page in response:
            for p in page['Partitions']:
                copy = p.copy()
                copy.pop("DatabaseName", None)
                copy.pop("TableName", None)
                copy.pop("CreationTime", None)
                copy.pop("CatalogId", None)
                partitions.append(copy)

        # Batch partitions into groups of 100
        for i in range(0, len(partitions), partition_batch_size):
            batch = partitions[i:i + partition_batch_size]
            logger.info(f"Creating partitions batch {i // partition_batch_size + 1}/{-(-len(partitions) // partition_batch_size)}")

            try:
                client.batch_create_partition(
                    DatabaseName=database_name,
                    TableName=new_table_name,
                    PartitionInputList=batch
                )
            except ClientError as e:
                logger.error(f"Failed to create partition batch {i // partition_batch_size + 1}: {e}")
                raise

        logger.info(f"Table '{new_table_name}' created successfully with all partitions")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    rename_glue_table()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment