# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @fabrizioc1
# Created May 26, 2018 05:10
# Show Gist options
#   - Save fabrizioc1/2ad760a4cd83c1742f0d3d2e6f3da8ad to your computer and use it in GitHub Desktop.
#
# Expand hadoop path
import itertools
import re
def expand_hadoop_path(path):
    """Expand every ``{{a,b,...}}`` group in *path* into concrete paths.

    Each double-brace group lists comma-separated alternatives. The result
    contains one path for every combination of alternatives, with each
    group replaced by exactly one of its values.

    Args:
        path: A path string that may contain zero or more ``{{...}}``
            groups, e.g. ``'s3://bucket/{{high,low}}/2018{{03,04}}/'``.

    Returns:
        A list of expanded path strings. The first group varies fastest
        and the last group slowest. A path without any brace group is
        returned unchanged as a single-element list.

    Example:
        >>> expand_hadoop_path('/data/{{a,b}}/{{x,y}}')
        ['/data/a/x', '/data/b/x', '/data/a/y', '/data/b/y']
    """
    # Splitting on a capturing group keeps the captured text in the result:
    # even indexes hold the literal text between groups, odd indexes hold
    # the comma-separated alternatives found inside each ``{{...}}``.
    pieces = re.split(r'\{\{([^}]+)\}\}', path)
    literals = pieces[0::2]
    choices = [piece.split(',') for piece in pieces[1::2]]

    expanded_paths = []
    # itertools.product varies its *last* argument fastest; feeding the
    # groups in reverse makes the first group in the path vary fastest.
    # With no groups at all, product() yields one empty combination, so
    # the original path comes back as the only result.
    for reversed_combo in itertools.product(*reversed(choices)):
        combo = reversed_combo[::-1]
        parts = [literals[0]]
        # Groups are substituted by position, so a group whose text
        # appears twice in the path is still expanded independently.
        for value, literal in zip(combo, literals[1:]):
            parts.append(value)
            parts.append(literal)
        expanded_paths.append(''.join(parts))
    return expanded_paths
# Regression cases: each templated path maps to every concrete path that
# expand_hadoop_path must produce for it.
testing_data = {
    's3://bucket/{{highlq,lowlq}}/201804/{{bid,nobid}}/': [
        's3://bucket/highlq/201804/bid/',
        's3://bucket/lowlq/201804/bid/',
        's3://bucket/highlq/201804/nobid/',
        's3://bucket/lowlq/201804/nobid/',
    ],
    's3://bucket/{{highlq,midlq,lowlq}}/201804/{{bid,nobid}}/': [
        's3://bucket/highlq/201804/bid/',
        's3://bucket/midlq/201804/bid/',
        's3://bucket/lowlq/201804/bid/',
        's3://bucket/highlq/201804/nobid/',
        's3://bucket/midlq/201804/nobid/',
        's3://bucket/lowlq/201804/nobid/',
    ],
    's3://bucket/{{highlq,lowlq}}/2018{{03,04}}/{{bid,nobid}}/': [
        's3://bucket/highlq/201803/bid/',
        's3://bucket/lowlq/201803/bid/',
        's3://bucket/highlq/201804/bid/',
        's3://bucket/lowlq/201804/bid/',
        's3://bucket/highlq/201803/nobid/',
        's3://bucket/lowlq/201803/nobid/',
        's3://bucket/highlq/201804/nobid/',
        's3://bucket/lowlq/201804/nobid/',
    ],
}

# Sorted lists are compared instead of sets: the check stays independent of
# output order but also fails if an expansion is emitted more than once.
for path, expected in testing_data.items():
    # "%s" formatting puts the offending path into the failure message.
    assert sorted(expand_hadoop_path(path)) == sorted(expected), \
        "expand_hadoop_path failed on %s" % (path,)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment