Created
September 17, 2025 22:00
-
-
Save EcZachly/29520e2fb5c47cc7e33239209534e181 to your computer and use it in GitHub Desktop.
Tabular to AWS Glue Conversion Script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # Register Iceberg tables from config.json to AWS Glue Catalog | |
| # Usage: ./register-tables-glue-fixed.sh [config.json] | |
| set -e | |
| CONFIG_FILE="${1:-config.json}" | |
| AWS_REGION="${AWS_REGION:-us-west-2}" | |
| if [[ ! -f "$CONFIG_FILE" ]]; then | |
| echo "Error: Config file $CONFIG_FILE not found" | |
| exit 1 | |
| fi | |
| echo "Creating Glue database: $GLUE_DATABASE" | |
| aws glue create-database \ | |
| --database-input Name="$GLUE_DATABASE",Description="Iceberg tables from Tabular.io migration" \ | |
| --region "$AWS_REGION" 2>/dev/null || echo "Database $GLUE_DATABASE already exists" | |
| echo "Processing tables from $CONFIG_FILE..." | |
| # Extract table information and register each one | |
| jq -r '.[] | "\(.table_name)|\(."metadata-location")|\(.metadata.location)|\(.metadata.schemas[0].fields | map("\(.name):\(.type)") | join(","))|\(.metadata["partition-specs"][0].fields // [] | map("\(.name):\(.transform):\(.["source-id"])") | join(","))"' "$CONFIG_FILE" | \ | |
| while IFS='|' read -r table_name metadata_location data_location schema_fields partition_fields; do | |
| # Extract namespace and table name | |
| if [[ "$table_name" == *"."* ]]; then | |
| namespace=$(echo "$table_name" | cut -d'.' -f1) | |
| table=$(echo "$table_name" | cut -d'.' -f2) | |
| else | |
| namespace="default" | |
| table="$table_name" | |
| fi | |
| aws glue create-database \ | |
| --database-input Name="${namespace}",Description="Iceberg tables from Tabular.io migration" \ | |
| --region "$AWS_REGION" 2>/dev/null || echo "Database ${namespace} already exists" | |
| # Clean table name for Glue (alphanumeric and underscores only) | |
| clean_table=$(echo "${table}") | |
| echo "Registering table: $clean_table" | |
| # Build column definitions for Glue | |
| columns="" | |
| if [[ -n "$schema_fields" ]]; then | |
| IFS=',' read -ra FIELDS <<< "$schema_fields" | |
| for field in "${FIELDS[@]}"; do | |
| if [[ -n "$field" ]]; then | |
| field_name=$(echo "$field" | cut -d':' -f1) | |
| field_type=$(echo "$field" | cut -d':' -f2) | |
| if [[ -n "$columns" ]]; then | |
| columns="${columns}," | |
| fi | |
| columns="${columns}{\"Name\":\"$field_name\",\"Type\":\"$hive_type\"}" | |
| fi | |
| done | |
| fi | |
| # Build partition key definitions for Glue | |
| partition_keys="" | |
| if [[ -n "$partition_fields" ]]; then | |
| IFS=',' read -ra PARTITIONS <<< "$partition_fields" | |
| for partition_field in "${PARTITIONS[@]}"; do | |
| if [[ -n "$partition_field" ]]; then | |
| partition_name=$(echo "$partition_field" | cut -d':' -f1) | |
| # Find the source field type from schema | |
| source_field_type="string" # default | |
| IFS=',' read -ra FIELDS <<< "$schema_fields" | |
| for field in "${FIELDS[@]}"; do | |
| field_name=$(echo "$field" | cut -d':' -f1) | |
| field_type=$(echo "$field" | cut -d':' -f2) | |
| if [[ "$field_name" == "$partition_name" ]]; then | |
| source_field_type="$field_type" | |
| break | |
| fi | |
| done | |
| if [[ -n "$partition_keys" ]]; then | |
| partition_keys="${partition_keys}," | |
| fi | |
| partition_keys="${partition_keys}{\"Name\":\"$partition_name\",\"Type\":\"$hive_type\"}" | |
| fi | |
| done | |
| fi | |
| # Build table input JSON with optional partition keys | |
| table_input="{ | |
| \"Name\": \"$clean_table\", | |
| \"StorageDescriptor\": { | |
| \"Columns\": [$columns], | |
| \"Location\": \"$data_location\", | |
| \"InputFormat\": \"org.apache.iceberg.mr.mapred.IcebergInputFormat\", | |
| \"OutputFormat\": \"org.apache.iceberg.mr.mapred.IcebergOutputFormat\", | |
| \"SerdeInfo\": { | |
| \"SerializationLibrary\": \"org.apache.iceberg.mr.hive.HiveIcebergSerDe\" | |
| } | |
| }, | |
| \"Parameters\": { | |
| \"table_type\": \"ICEBERG\", | |
| \"metadata_location\": \"$metadata_location\", | |
| \"EXTERNAL\": \"TRUE\" | |
| }" | |
| # Add partition keys if they exist | |
| if [[ -n "$partition_keys" ]]; then | |
| table_input="${table_input},\"PartitionKeys\": [$partition_keys]" | |
| echo " └─ Partitioned by: $(echo "$partition_fields" | sed 's/:[^,]*//g')" | |
| fi | |
| table_input="${table_input}}" | |
| # Register table in Glue | |
| aws glue create-table \ | |
| --database-name "${namespace}" \ | |
| --region "$AWS_REGION" \ | |
| --table-input "$table_input" 2>/dev/null && echo "✓ Registered $clean_table" || echo "✗ Failed to register $clean_table" | |
| done | |
| echo "Table registration complete!" | |
| echo "View tables with: aws glue get-tables --database-name $GLUE_DATABASE --region $AWS_REGION" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment