Last active
June 14, 2018 13:23
-
-
Save iconara/6f2a95d589b54202156a73f5bf0558ae to your computer and use it in GitHub Desktop.
Quick and dirty script to find spurious files in the prefix of a Glue table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'aws-sdk-glue'
require 'aws-sdk-s3'
# Splits an S3 URI into its bucket name and object key (or key prefix).
#
# @param s3_uri [String] a URI of the form "s3://bucket/key"
# @return [Array<String>] `[bucket, key]`. `key` is an empty string when the
#   URI names only a bucket ("s3://bucket" or "s3://bucket/"). An empty array
#   is returned when the input is not an s3:// URI, so that
#   `bucket, key = split_s3_uri(x)` leaves both variables nil.
def split_s3_uri(s3_uri)
  # The bucket is everything up to the first slash; the slash and the key are
  # optional so that bucket-only URIs still parse instead of yielding nils.
  match = %r{\As3://([^/]+)/?(.*)\z}.match(s3_uri)
  match ? match.captures : []
end
# Usage: <script> DATABASE TABLE
#
# Prints (to stdout) the key of every S3 object that lives under the table's
# location but not under the location of any of the table's partitions.
# Partitions whose location is not under the table's location are reported
# on stderr and otherwise ignored.
database, table_name = ARGV.take(2)
abort("Usage: #{$PROGRAM_NAME} DATABASE TABLE") unless database && table_name

glue = Aws::Glue::Client.new
s3 = Aws::S3::Client.new

table = glue.get_table(database_name: database, name: table_name).table
# A table without partition keys has no partition prefixes to compare against.
exit if table.partition_keys.empty?

table_location = table.storage_descriptor.location
table_bucket, table_prefix = split_s3_uri(table_location.to_s)
unless table_bucket && table_prefix
  abort("Could not parse table location as an S3 URI: #{table_location.inspect}")
end
# Normalize to a trailing slash so that "foo" does not also match "foobar/...".
# An empty prefix (table at the bucket root) must stay empty: "/" would match
# no keys at all.
table_prefix += '/' unless table_prefix.empty? || table_prefix.end_with?('/')

# Collect the key prefix of every partition that lives under the table prefix.
valid_prefixes = []
glue.get_partitions(database_name: database, table_name: table_name).each_page do |page|
  page.partitions.each do |partition|
    location = partition.storage_descriptor.location
    # `to_s` lets a missing location fall through to the warning below.
    bucket, prefix = split_s3_uri(location.to_s)
    if bucket == table_bucket && prefix && prefix.start_with?(table_prefix)
      prefix += '/' unless prefix.empty? || prefix.end_with?('/')
      valid_prefixes << prefix
    else
      $stderr.puts("Partition location outside table prefix! (#{location} not in #{table_location})")
    end
  end
end

# Walk every object under the table prefix and report those that do not
# belong to any known partition.
s3.list_objects_v2(bucket: table_bucket, prefix: table_prefix).each_page do |page|
  page.contents.each do |obj|
    # start_with? accepts several candidates; with none it returns false,
    # so every object is reported when there are no valid partitions.
    next if obj.key.start_with?(*valid_prefixes)

    $stdout.puts(obj.key)
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment