Rewriting data pipelines in bash
#!/bin/bash
set -e
path=$1;
schema=$2;
# String manipulation to extract "s3://first_part_of_path/"
path_prefix=$(echo "$path" | sed 's_\(^s3://[^/]*/\).*$_\1_g');
# Get the names of the files to download
aws s3 ls --recursive "$path" | grep "part-" | awk '{print $4}' |
# Start one download process per file, running at most 8 in parallel
xargs -n1 -P8 -I{} aws s3 cp "$path_prefix{}" - |
# Get the 53rd field (i.e. contexts) of the TSV
awk -F'\t' '{print $53}' |
# Print every innermost "data" JSON whose schema is $schema
jq -c '.data | .[] | select(.schema == "'"$schema"'") | .data' |
# Split the resulting JSONs into files prefixed "filteredjsons_" of at most 100000 lines each
split -l 100000 - filteredjsons_;
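To make the two trickiest steps concrete, here is a quick sketch of what the sed prefix extraction and the jq filter do. The bucket path, schema URI and contexts JSON below are made-up examples rather than values from the original pipeline; the jq step assumes the contexts field holds an outer "data" array of {schema, data} pairs, which is what the filter above implies.

# The sed expression keeps only the "s3://bucket/" prefix of the path (example path is hypothetical):
echo "s3://my-bucket/enriched/good/run=2016-01-18/" | sed 's_\(^s3://[^/]*/\).*$_\1_g'
# prints: s3://my-bucket/

# The jq filter iterates over the outer "data" array and keeps only the inner
# "data" objects whose schema matches the one requested (sample JSON is made up):
echo '{"data":[{"schema":"iglu:com.acme/my_context/jsonschema/1-0-0","data":{"id":1}},{"schema":"iglu:com.acme/other/jsonschema/1-0-0","data":{"id":2}}]}' |
jq -c '.data | .[] | select(.schema == "iglu:com.acme/my_context/jsonschema/1-0-0") | .data'
# prints: {"id":1}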
#!/bin/bash
# This version differs in that it parallelizes not only the downloads but also the processing of the files
set -e
path=$1;
schema=$2;
# String manipulation to extract "s3://first_part_of_path/"
path_prefix=$(echo "$path" | sed 's_\(^s3://[^/]*/\).*$_\1_g');
# Download a given file and extract the data JSONs from only those contexts with the required schema
function process_file() {
    full_path=$1;
    schema=$2;
    >&2 echo "Downloading $full_path";
    # Copy the file to stdout
    aws s3 cp "$full_path" - |
    # Get the 53rd field (i.e. contexts) of the TSV
    awk -F'\t' '{print $53}' |
    # Print every innermost "data" JSON whose schema is $schema
    jq -c '.data | .[] | select(.schema == "'"$schema"'") | .data';
    >&2 echo "Finished processing $full_path";
}
# Required to call process_file using xargs
export -f process_file
# Get the names of the files to download
aws s3 ls --recursive "$path" | grep "part-" | awk '{print $4}' |
# Start up to 8 parallel processes to extract the JSONs
xargs -n1 -P8 -I{} bash -c "process_file $path_prefix{} $schema" |
# Split the resulting JSONs into files prefixed "filteredjsons_" of at most 100000 lines each
split -l 100000 - filteredjsons_;
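The non-obvious part of this version is that xargs cannot run a shell function directly, which is why process_file is exported with export -f and then invoked through bash -c. A minimal sketch of that pattern, using a throwaway function name:

# Hypothetical example: "shout" stands in for process_file
function shout() { echo "processing $1"; }
export -f shout
# Each input line becomes one invocation of shout in a child bash, up to two at a time
printf '%s\n' a b c | xargs -n1 -P2 -I{} bash -c 'shout {}'
# prints "processing a", "processing b", "processing c" (order may vary under parallelism)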
Usage: pass the S3 path containing the part- files as the first argument and the schema to filter for as the second. The extracted JSONs will end up in files whose names have the prefix filteredjsons_. The scripts assume jq and the AWS CLI are installed.
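For example, assuming one of the scripts above is saved as filter_contexts.sh (the script name, bucket path and schema URI here are placeholders, not values from the original):

./filter_contexts.sh s3://my-bucket/enriched/good/ iglu:com.acme/my_context/jsonschema/1-0-0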