Created
July 17, 2022 00:43
-
-
Save AlexAtkinson/c5b75d7ad838fead4cb90551e2bfc782 to your computer and use it in GitHub Desktop.
BASH: Mock data generator with some basic ETL for simple jobs. (Handles fields with commas.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# genMockData.sh
# ------------------------------------------------------------------------------
# Gets some free sample data from https://www.briandunning.com/sample-data/,
# then Extracts, Transforms and Loads the data. (ETL in BASH...)
#
# Usage:    genMockData.sh [region]
#   region:   ca (default), us, uk, au
# Output:   "first_name,last_name,zip,email" on stdout, one line per record.
#           The header row goes through the same transformation as the data.
# Files:    <region>-500.zip, <region>-500.csv and <region>-500.normalized.csv
#           are created in the current directory and reused on later runs.
# Requires: wget, unzip, tr, gawk (FPAT is a gawk extension).

set -euo pipefail

# Print an error message to stderr and abort.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

region=${1:-ca} # ca (default), us, uk, au - supply as arg1
size=500        # 500 sample size is free
file="${region}-${size}"

# Get the data (skipped when the archive is already present).
if [[ ! -f "$file.zip" ]]; then
  wget -q "https://www.briandunning.com/sample-data/$file.zip" \
    || die "download of $file.zip failed"
fi

# Extract (skipped when the CSV is already present).
if [[ ! -f "$file.csv" ]]; then
  unzip -q "$file.zip" || die "could not extract $file.zip"
  [[ -f "$file.csv" ]] || die "$file.zip did not contain $file.csv"
fi

# Normalize for ^M: every carriage return becomes a newline.
# A failed run must not leave a truncated file behind, because the existence
# check above would treat it as a valid cache on the next run.
if [[ ! -f "$file.normalized.csv" ]]; then
  if ! tr '\r' '\n' < "$file.csv" > "$file.normalized.csv"; then
    rm -f -- "$file.normalized.csv"
    die "could not normalize $file.csv"
  fi
fi

# Note: Using AWK as this data set has commas within some fields. FPAT
# describes what a field looks like (either a run of non-commas or a
# double-quoted string) instead of what separates fields.
# Extract and transform fields as necessary:
# - zip is getting uppercased and spaces removed
# - emails are being lowercased
# Fields keep their surrounding double quotes, if any.
gawk -v FPAT='[^,]*|"[^"]*"' '
  # Blank records appear when the input had CRLF line endings, since each
  # CR became an extra newline. They carry no data.
  $0 == "" { next }

  # The first non-blank record is treated as the header row and is used to
  # locate the zip and email columns by name. If no header cell matches,
  # the positions stay at 8 and 11, the values this script always used.
  # NOTE(review): the names zip/postal/post/email are assumed from the
  # published sample files - confirm against the downloaded header.
  !have_cols {
    zip_col = 8
    email_col = 11
    for (i = 1; i <= NF; i++) {
      name = tolower($i)
      gsub(/"/, "", name)
      if (name ~ /^(zip|postal|post)$/)
        zip_col = i
      else if (name == "email")
        email_col = i
    }
    have_cols = 1
  }

  {
    first_name = $1
    second_name = $2
    zip = toupper($zip_col)
    gsub(/ /, "", zip)   # gsub, not sub: drop every space, not only the first
    email = tolower($email_col)
    print first_name "," second_name "," zip "," email
  }
' "$file.normalized.csv"

# An alternate approach would be defining human-readable vars outside of awk
# for each column, like 'zip=8', and handing them to awk with -v:
#   gawk -v FPAT='[^,]*|"[^"]*"' -v z="$zip" -v e="$email" \
#     '{ zc = toupper($z); gsub(/ /, "", zc); print $1 "," $2 "," zc "," tolower($e) }' \
#     "$file.normalized.csv"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment