Skip to content

Instantly share code, notes, and snippets.

@EcZachly
Created September 17, 2025 22:00
Show Gist options
  • Select an option

  • Save EcZachly/29520e2fb5c47cc7e33239209534e181 to your computer and use it in GitHub Desktop.

Select an option

Save EcZachly/29520e2fb5c47cc7e33239209534e181 to your computer and use it in GitHub Desktop.
Tabular to AWS Glue Conversion Script
#!/bin/bash
# Register Iceberg tables from config.json to AWS Glue Catalog
# Usage: ./register-tables-glue-fixed.sh [config.json]
set -e
CONFIG_FILE="${1:-config.json}"
AWS_REGION="${AWS_REGION:-us-west-2}"
if [[ ! -f "$CONFIG_FILE" ]]; then
echo "Error: Config file $CONFIG_FILE not found"
exit 1
fi
echo "Creating Glue database: $GLUE_DATABASE"
aws glue create-database \
--database-input Name="$GLUE_DATABASE",Description="Iceberg tables from Tabular.io migration" \
--region "$AWS_REGION" 2>/dev/null || echo "Database $GLUE_DATABASE already exists"
echo "Processing tables from $CONFIG_FILE..."
# Extract table information and register each one
jq -r '.[] | "\(.table_name)|\(."metadata-location")|\(.metadata.location)|\(.metadata.schemas[0].fields | map("\(.name):\(.type)") | join(","))|\(.metadata["partition-specs"][0].fields // [] | map("\(.name):\(.transform):\(.["source-id"])") | join(","))"' "$CONFIG_FILE" | \
while IFS='|' read -r table_name metadata_location data_location schema_fields partition_fields; do
# Extract namespace and table name
if [[ "$table_name" == *"."* ]]; then
namespace=$(echo "$table_name" | cut -d'.' -f1)
table=$(echo "$table_name" | cut -d'.' -f2)
else
namespace="default"
table="$table_name"
fi
aws glue create-database \
--database-input Name="${namespace}",Description="Iceberg tables from Tabular.io migration" \
--region "$AWS_REGION" 2>/dev/null || echo "Database ${namespace} already exists"
# Clean table name for Glue (alphanumeric and underscores only)
clean_table=$(echo "${table}")
echo "Registering table: $clean_table"
# Build column definitions for Glue
columns=""
if [[ -n "$schema_fields" ]]; then
IFS=',' read -ra FIELDS <<< "$schema_fields"
for field in "${FIELDS[@]}"; do
if [[ -n "$field" ]]; then
field_name=$(echo "$field" | cut -d':' -f1)
field_type=$(echo "$field" | cut -d':' -f2)
if [[ -n "$columns" ]]; then
columns="${columns},"
fi
columns="${columns}{\"Name\":\"$field_name\",\"Type\":\"$hive_type\"}"
fi
done
fi
# Build partition key definitions for Glue
partition_keys=""
if [[ -n "$partition_fields" ]]; then
IFS=',' read -ra PARTITIONS <<< "$partition_fields"
for partition_field in "${PARTITIONS[@]}"; do
if [[ -n "$partition_field" ]]; then
partition_name=$(echo "$partition_field" | cut -d':' -f1)
# Find the source field type from schema
source_field_type="string" # default
IFS=',' read -ra FIELDS <<< "$schema_fields"
for field in "${FIELDS[@]}"; do
field_name=$(echo "$field" | cut -d':' -f1)
field_type=$(echo "$field" | cut -d':' -f2)
if [[ "$field_name" == "$partition_name" ]]; then
source_field_type="$field_type"
break
fi
done
if [[ -n "$partition_keys" ]]; then
partition_keys="${partition_keys},"
fi
partition_keys="${partition_keys}{\"Name\":\"$partition_name\",\"Type\":\"$hive_type\"}"
fi
done
fi
# Build table input JSON with optional partition keys
table_input="{
\"Name\": \"$clean_table\",
\"StorageDescriptor\": {
\"Columns\": [$columns],
\"Location\": \"$data_location\",
\"InputFormat\": \"org.apache.iceberg.mr.mapred.IcebergInputFormat\",
\"OutputFormat\": \"org.apache.iceberg.mr.mapred.IcebergOutputFormat\",
\"SerdeInfo\": {
\"SerializationLibrary\": \"org.apache.iceberg.mr.hive.HiveIcebergSerDe\"
}
},
\"Parameters\": {
\"table_type\": \"ICEBERG\",
\"metadata_location\": \"$metadata_location\",
\"EXTERNAL\": \"TRUE\"
}"
# Add partition keys if they exist
if [[ -n "$partition_keys" ]]; then
table_input="${table_input},\"PartitionKeys\": [$partition_keys]"
echo " └─ Partitioned by: $(echo "$partition_fields" | sed 's/:[^,]*//g')"
fi
table_input="${table_input}}"
# Register table in Glue
aws glue create-table \
--database-name "${namespace}" \
--region "$AWS_REGION" \
--table-input "$table_input" 2>/dev/null && echo "✓ Registered $clean_table" || echo "✗ Failed to register $clean_table"
done
echo "Table registration complete!"
echo "View tables with: aws glue get-tables --database-name $GLUE_DATABASE --region $AWS_REGION"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment