Rewriting data pipelines in bash
#!/bin/bash
set -e
path=$1;
schema=$2;
# String manipulation to extract "s3://first_part_of_path/"
path_prefix=$(echo "$path" | sed 's_\(^s3://[^/]*/\).*$_\1_g');
# Get the names of the files to download
aws s3 ls --recursive "$path" | grep "part-" | awk '{print $4}' |
# Start one download process per file, running at most 8 in parallel
xargs -n1 -P8 -I{} aws s3 cp "$path_prefix{}" - |
# Get the 53rd field (i.e. contexts) of the TSV
awk -F'\t' '{print $53}' |
# Print every innermost "data" JSON whose schema is $schema
jq -c '.data | .[] | select(.schema == "'"$schema"'") | .data' |
# Split the resulting JSONs into files prefixed "filteredjsons_" of at most 100000 lines each
split -l 100000 - filteredjsons_;
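To make the two trickiest steps concrete, here is a quick sketch of what the sed prefix extraction and the jq filter do. The bucket path, schema URI and contexts JSON below are made-up examples rather than values from the original pipeline; the jq step assumes the contexts field holds an outer "data" array of {schema, data} pairs, which is what the filter above implies.

# The sed expression keeps only the "s3://bucket/" prefix of the path (example path is hypothetical):
echo "s3://my-bucket/enriched/good/run=2016-01-18/" | sed 's_\(^s3://[^/]*/\).*$_\1_g'
# prints: s3://my-bucket/

# The jq filter iterates over the outer "data" array and keeps only the inner
# "data" objects whose schema matches the one requested (sample JSON is made up):
echo '{"data":[{"schema":"iglu:com.acme/my_context/jsonschema/1-0-0","data":{"id":1}},{"schema":"iglu:com.acme/other/jsonschema/1-0-0","data":{"id":2}}]}' |
jq -c '.data | .[] | select(.schema == "iglu:com.acme/my_context/jsonschema/1-0-0") | .data'
# prints: {"id":1}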
#!/bin/bash
# This version differs in that it parallelizes not only the downloads but also the processing of the files
set -e
path=$1;
schema=$2;
# String manipulation to extract "s3://first_part_of_path/"
path_prefix=$(echo "$path" | sed 's_\(^s3://[^/]*/\).*$_\1_g');
# Download a given file and extract the data JSONs from only those contexts with the required schema
function process_file() {
    full_path=$1;
    schema=$2;
    >&2 echo "Downloading $full_path";
    # Copy the file to stdout
    aws s3 cp "$full_path" - |
    # Get the 53rd field (i.e. contexts) of the TSV
    awk -F'\t' '{print $53}' |
    # Print every innermost "data" JSON whose schema is $schema
    jq -c '.data | .[] | select(.schema == "'"$schema"'") | .data';
    >&2 echo "Finished processing $full_path";
}
# Required to call process_file using xargs
export -f process_file
# Get the names of the files to download
aws s3 ls --recursive "$path" | grep "part-" | awk '{print $4}' |
# Start up to 8 parallel processes to extract the JSONs
xargs -n1 -P8 -I{} bash -c "process_file $path_prefix{} $schema" |
# Split the resulting JSONs into files prefixed "filteredjsons_" of at most 100000 lines each
split -l 100000 - filteredjsons_;
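The non-obvious part of this version is that xargs cannot run a shell function directly, which is why process_file is exported with export -f and then invoked through bash -c. A minimal sketch of that pattern, using a throwaway function name:

# Hypothetical example: "shout" stands in for process_file
function shout() { echo "processing $1"; }
export -f shout
# Each input line becomes one invocation of shout in a child bash, up to two at a time
printf '%s\n' a b c | xargs -n1 -P2 -I{} bash -c 'shout {}'
# prints "processing a", "processing b", "processing c" (order may vary under parallelism)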
Usage: pass the S3 path containing the part- files as the first argument and the schema to filter for as the second. The extracted JSONs will end up in files whose names have the prefix filteredjsons_. The scripts assume jq and the AWS CLI are installed.
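For example, assuming one of the scripts above is saved as filter_contexts.sh (the script name, bucket path and schema URI here are placeholders, not values from the original):

./filter_contexts.sh s3://my-bucket/enriched/good/ iglu:com.acme/my_context/jsonschema/1-0-0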