Created
January 11, 2017 18:02
-
-
Save klyr/78e7b7da6fbd46ff7c40c70bf7f29fe4 to your computer and use it in GitHub Desktop.
Compact multiple PNDA Avro files into one
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Compact the Avro files of every hourly PNDA dataset partition into a single
# compacted.avro file.
#
# For each HDFS directory whose path ends in "hour=NN" below DATASETS_GLOB:
#   1. concatenate all *.avro files into compacted.avro.new,
#   2. hand the new file to gobblin:pnda,
#   3. delete exactly the files that were concatenated,
#   4. rename compacted.avro.new to compacted.avro.
#
# Requires bash 4+ (mapfile, arrays) and the `hdfs` / `hadoop` CLIs on PATH.
# Paths are taken from the last whitespace-separated field of `hdfs dfs -ls`
# output, so paths containing whitespace are not supported.

# -e: abort on the first failing command; pipefail: a failing stage inside a
# pipeline counts as a failure too.
set -eo pipefail

# Quoted so the glob is expanded by HDFS, not by the local shell.
readonly DATASETS_GLOB='/user/pnda/PNDA_datasets/datasets/source=*'
readonly AVRO_TOOLS_JAR=/opt/cloudera/parcels/CDH/lib/avro/avro-tools.jar

# Capture the listing in a plain assignment so that a failing `hdfs` aborts
# the script instead of silently producing an empty partition list.
listing=$(hdfs dfs -ls -R "${DATASETS_GLOB}")
mapfile -t partitions < <(awk '/^d.*hour=..$/ {print $NF}' <<<"${listing}")

for d in "${partitions[@]}"; do
  out="${d}/compacted.avro.new"
  final="${d}/compacted.avro"

  # List the directory itself (not a glob): this succeeds for an empty
  # directory, so a non-zero status here is a real error.
  dir_listing=$(hdfs dfs -ls "${d}")
  mapfile -t files < <(awk '/^-/ {print $NF}' <<<"${dir_listing}")

  inputs=()
  stale=0
  for f in "${files[@]}"; do
    case "${f}" in
      "${out}") stale=1 ;;
      *.avro) inputs+=("${f}") ;;
    esac
  done

  # A leftover output file means an earlier run died mid-way. It may hold the
  # only copy of the data (if the inputs were already removed), so never
  # overwrite or delete it automatically.
  if ((stale)); then
    printf 'warn: %s already exists, skipping %s (manual check needed)\n' \
      "${out}" "${d}" >&2
    continue
  fi

  # Nothing to compact: no Avro files at all, or only the result of a
  # previous compaction.
  if ((${#inputs[@]} == 0)); then
    continue
  fi
  if ((${#inputs[@]} == 1)) && [[ "${inputs[0]}" == "${final}" ]]; then
    continue
  fi

  printf '%s\n' "--- In '${d}' Compacting files '${inputs[*]}' to '${out}'"
  hadoop jar "${AVRO_TOOLS_JAR}" concat "${inputs[@]}" "${out}"
  hdfs dfs -chown gobblin:pnda "${out}"
  # Remove only the files that went into the concat. Re-globbing *.avro here
  # would also delete files that arrived after the listing was taken.
  hdfs dfs -rm -skipTrash "${inputs[@]}"
  # The rename must follow the delete: a previous compacted.avro (if any) was
  # one of the inputs and has to be gone before the new file takes its name.
  hdfs dfs -mv "${out}" "${final}"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment