Skip to content

Instantly share code, notes, and snippets.

@whyvez
Created May 29, 2015 21:45
Show Gist options
  • Select an option

  • Save whyvez/643f69c273782781a9d3 to your computer and use it in GitHub Desktop.

Select an option

Save whyvez/643f69c273782781a9d3 to your computer and use it in GitHub Desktop.
bawlk " " update
#!/usr/bin/awk -F, -f
# generated by: awk-csvalid https://github.com/tesera/awk-csvalid
# awk-csvalid csv toolset generator: https://github.com/tesera/awk-csvalid
# usage:
# validate: awk -f action=validate validator.awk > violations.txt
# create table: awk -v action=table -f validator.awk | psql afgo_dev
# sanitize csv: awk -v action=sanitize -f validator.awk > sanitized.csv
# insert sql: awk -v action=insert -f validator.awk | psql afgo_dev
# awk is a simple unix text file parser: http://www.gnu.org/software/gawk/manual/gawk.html
# awk primer:
# NR = number/index current record
# RS = record seperator new line i.e. \n
# FS = field seperator i.e. ,
# /pattern/ { expression } = if pattern is truthy run expression
BEGIN {
FS=","; OFS=","; err_count=0; cats["na"]=0;
if(!action) action = "validate"
summary_header="file_name,field_name,rule,message,violation_count"
CSVFILENAME = CSVFILENAME ? CSVFILENAME : FILENAME
FPAT = "([^,]+)|(\"[^\"]+\")"
}
# builtin helper functions
function eql(x,y) {v=1; for (i in x) v=(v&&x[i]==y[i]); return v;}
function are_headers_valid(valid_headers) { v=0; split($0, h, ","); split(valid_headers, vh, "|"); return eql(h, vh); }
function is_unique(i, val) { if (vals[i,val]) { return 0; } else { vals[i,val] = 1; return 1; } }
function is_numeric(x){ return(x==x+0) }
function print_cats(categories) { for (category in categories) { if (categories[category]) print " " category ": " categories[category]; } }
function log_err(cat) { cats[cat]++; err_count++; }
#get rid of the evil windows cr
{ sub("\r$", "") }
# for quoted values parse them out correctly
# and store the index for insert
{
for (i = 1; i <= NF; i++) {
wasquoted[i] = 0
if (substr($i, 1, 1) == "\"") {
len = length($i)
$i = substr($i, 2, len - 2)
wasquoted[i] = 1
}
}
}
# make header index/map
NR > 1 {
datum=$22
company_plot_number=$2
latitude=$23
company_stand_number=$3
longitude=$24
establishment_year=$4
natural_subregion=$25
establishment_month=$5
ecosite_guide=$26
establishment_day=$6
ecosite=$27
fmu=$7
ecosite_phase=$28
fma=$8
plot_comment=$29
ats_township=$9
ats_range=$10
ats_meridian=$11
ats_section=$12
opening_number=$13
sampling_unit_number=$14
topographic_position=$15
elevation=$16
slope=$17
aspect=$18
x_coord=$19
y_coord=$20
utm_zone=$21
company=$1
}
# awk rules based on user csv ruleset
NR == 1 && action == "validate" { headers="company|company_plot_number|company_stand_number|establishment_year|establishment_month|establishment_day|fmu|fma|ats_township|ats_range|ats_meridian|ats_section|opening_number|sampling_unit_number|topographic_position|elevation|slope|aspect|x_coord|y_coord|utm_zone|datum|latitude|longitude|natural_subregion|ecosite_guide|ecosite|ecosite_phase|plot_comment"; if (!are_headers_valid(headers)) { gsub(/\|/, FS, headers); print RS "INVALID HEADERS IN " CSVFILENAME RS "WAS: " RS $0 RS "EXPECTED:" RS headers RS; exit 0; } }
NR == 1 && action == "validate:summary" { headers="company|company_plot_number|company_stand_number|establishment_year|establishment_month|establishment_day|fmu|fma|ats_township|ats_range|ats_meridian|ats_section|opening_number|sampling_unit_number|topographic_position|elevation|slope|aspect|x_coord|y_coord|utm_zone|datum|latitude|longitude|natural_subregion|ecosite_guide|ecosite|ecosite_phase|plot_comment"; if (!are_headers_valid(headers)) { violations[CSVFILENAME FS "headers" FS "names" FS "csv headers are invalid" FS "error"]=1; exit 0; } }
action ~ /validate/ && NR > 1 { pkey=company "-" company_plot_number; if(keys[pkey]) { if (dupkeys[pkey]) dupkeys[pkey]++; else dupkeys[pkey] = 1 } else { keys[pkey] = NR } }
action == "validate" && NR > 1 && company == "" { log_err("error"); print "Field company in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && company == "" { key=CSVFILENAME FS "company" FS "required" FS "value is required but was empty" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && company != "" && company !~ /^(AINS|GOA|APLY|ALPC|ANC|BLUE|CFPL|CFS|DAIS|FOFP|MDFP|MWWC|SLPC|SPRA|SUND|SFPI|HLFP|TOLK|TOSL|UOA|VAND|WFML|WYGP|WYPM|UNKN|HPFP)$/ { log_err("error"); print "company in " CSVFILENAME " line " NR " should match the following pattern /^(AINS|GOA|APLY|ALPC|ANC|BLUE|CFPL|CFS|DAIS|FOFP|MDFP|MWWC|SLPC|SPRA|SUND|SFPI|HLFP|TOLK|TOSL|UOA|VAND|WFML|WYGP|WYPM|UNKN|HPFP)$/ and was " company " " RS $0 RS; }
action == "validate:summary" && NR > 1 && company != "" && company !~ /^(AINS|GOA|APLY|ALPC|ANC|BLUE|CFPL|CFS|DAIS|FOFP|MDFP|MWWC|SLPC|SPRA|SUND|SFPI|HLFP|TOLK|TOSL|UOA|VAND|WFML|WYGP|WYPM|UNKN|HPFP)$/ { key=CSVFILENAME FS "company" FS "pattern" FS "value should match: /^(AINS|GOA|APLY|ALPC|ANC|BLUE|CFPL|CFS|DAIS|FOFP|MDFP|MWWC|SLPC|SPRA|SUND|SFPI|HLFP|TOLK|TOSL|UOA|VAND|WFML|WYGP|WYPM|UNKN|HPFP)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && company_plot_number == "" { log_err("error"); print "Field company_plot_number in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && company_plot_number == "" { key=CSVFILENAME FS "company_plot_number" FS "required" FS "value is required but was empty" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && !is_unique(company_plot_number) { log_err("error"); print "Field company_plot_number in " CSVFILENAME " line " NR " is a duplicate and should be unique" RS $0 RS; }
action == "validate:summary" && NR > 1 && !is_unique(company_plot_number) { key=CSVFILENAME FS "company_plot_number" FS "unique" FS "value should be unique but had duplicates" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && company_plot_number != "" && length(company_plot_number) > 15 { log_err("error"); print "company_plot_number length in " CSVFILENAME " line " NR " should be less than 15 and was " length(company_plot_number) " " RS $0 RS; }
action == "validate:summary" && NR > 1 && company_plot_number != "" && length(company_plot_number) > 15 { key=CSVFILENAME FS "company_plot_number" FS "maxLength" FS "max length is: 15" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && company_stand_number != "" && length(company_stand_number) > 15 { log_err("error"); print "company_stand_number length in " CSVFILENAME " line " NR " should be less than 15 and was " length(company_stand_number) " " RS $0 RS; }
action == "validate:summary" && NR > 1 && company_stand_number != "" && length(company_stand_number) > 15 { key=CSVFILENAME FS "company_stand_number" FS "maxLength" FS "max length is: 15" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_year && !is_numeric(establishment_year) { log_err("error"); print "Field establishment_year in " CSVFILENAME " line " NR " should be a numeric but was " establishment_year " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_year && !is_numeric(establishment_year) { key=CSVFILENAME FS "establishment_year" FS "type" FS "max length is: 15" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_year == "" { log_err("error"); print "Field establishment_year in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_year == "" { key=CSVFILENAME FS "establishment_year" FS "required" FS "value is required but was empty" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_year != "" && establishment_year < 1900 { log_err("error"); print "establishment_year in " CSVFILENAME " line " NR " should be greater than 1900 and was " establishment_year " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_year != "" && establishment_year < 1900 { key=CSVFILENAME FS "establishment_year" FS "minimum" FS "value should be greater than: 1900" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_year != "" && establishment_year > 2050 { log_err("error"); print "establishment_year in " CSVFILENAME " line " NR " should be less than 2050 and was " establishment_year " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_year != "" && establishment_year > 2050 { key=CSVFILENAME FS "establishment_year" FS "maximum" FS "value should be less than: 2050" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_month && !is_numeric(establishment_month) { log_err("error"); print "Field establishment_month in " CSVFILENAME " line " NR " should be a numeric but was " establishment_month " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_month && !is_numeric(establishment_month) { key=CSVFILENAME FS "establishment_month" FS "type" FS "value should be less than: 2050" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_month != "" && establishment_month < 1 { log_err("error"); print "establishment_month in " CSVFILENAME " line " NR " should be greater than 1 and was " establishment_month " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_month != "" && establishment_month < 1 { key=CSVFILENAME FS "establishment_month" FS "minimum" FS "value should be greater than: 1" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_month != "" && establishment_month > 12 { log_err("error"); print "establishment_month in " CSVFILENAME " line " NR " should be less than 12 and was " establishment_month " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_month != "" && establishment_month > 12 { key=CSVFILENAME FS "establishment_month" FS "maximum" FS "value should be less than: 12" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_day && !is_numeric(establishment_day) { log_err("error"); print "Field establishment_day in " CSVFILENAME " line " NR " should be a numeric but was " establishment_day " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_day && !is_numeric(establishment_day) { key=CSVFILENAME FS "establishment_day" FS "type" FS "value should be less than: 12" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_day != "" && establishment_day < 1 { log_err("error"); print "establishment_day in " CSVFILENAME " line " NR " should be greater than 1 and was " establishment_day " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_day != "" && establishment_day < 1 { key=CSVFILENAME FS "establishment_day" FS "minimum" FS "value should be greater than: 1" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && establishment_day != "" && establishment_day > 31 { log_err("error"); print "establishment_day in " CSVFILENAME " line " NR " should be less than 31 and was " establishment_day " " RS $0 RS; }
action == "validate:summary" && NR > 1 && establishment_day != "" && establishment_day > 31 { key=CSVFILENAME FS "establishment_day" FS "maximum" FS "value should be less than: 31" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && fmu != "" && fmu !~ /^(A10|A11|A12|A13|A14|A15|A4|A5|A6|A7|A8|A9|B10|B11|B9|BO1|BO2|C4|C5|CO1|CO2|E1|E10|E11|E13|E14|E15|E2|E3|E4|E5|E6|E7|E8|E9|EO1|F1|F10|F11|F14|F20|F21|F23|F24|F25|FO1|G1|G10|G11|G12|G13|G14|G15|G16|G2|G3|G4|G5|G6|G7|G8|G9|GO1|GO2|GO3|GO4|H1|L1|L11|L2|L3|L8|L9|LO1|M1|M10|M3|M4|M7|M9|NA|P1|P10|P11|P13|P14|P19|P20|P21|P3|P4|P5|P6|P7|P8|P9|PO1|PO2|PO3|R1|R10|R11|R12|R13|R14|R2|R2U|R3|R4|R4Y|RO1|S10|S11|S14|S15|S16|S17|S18|S19|S20|S21|S22|S23|S24|S25|S7|S9|SO1|SO2|W1|W10|W11|W13|W14|W15|W2|W3|W4|W5|W6|W8|WO1|WO2|WO3|E14)$/ { log_err("error"); print "fmu in " CSVFILENAME " line " NR " should match the following pattern /^(A10|A11|A12|A13|A14|A15|A4|A5|A6|A7|A8|A9|B10|B11|B9|BO1|BO2|C4|C5|CO1|CO2|E1|E10|E11|E13|E14|E15|E2|E3|E4|E5|E6|E7|E8|E9|EO1|F1|F10|F11|F14|F20|F21|F23|F24|F25|FO1|G1|G10|G11|G12|G13|G14|G15|G16|G2|G3|G4|G5|G6|G7|G8|G9|GO1|GO2|GO3|GO4|H1|L1|L11|L2|L3|L8|L9|LO1|M1|M10|M3|M4|M7|M9|NA|P1|P10|P11|P13|P14|P19|P20|P21|P3|P4|P5|P6|P7|P8|P9|PO1|PO2|PO3|R1|R10|R11|R12|R13|R14|R2|R2U|R3|R4|R4Y|RO1|S10|S11|S14|S15|S16|S17|S18|S19|S20|S21|S22|S23|S24|S25|S7|S9|SO1|SO2|W1|W10|W11|W13|W14|W15|W2|W3|W4|W5|W6|W8|WO1|WO2|WO3|E14)$/ and was " fmu " " RS $0 RS; }
action == "validate:summary" && NR > 1 && fmu != "" && fmu !~ /^(A10|A11|A12|A13|A14|A15|A4|A5|A6|A7|A8|A9|B10|B11|B9|BO1|BO2|C4|C5|CO1|CO2|E1|E10|E11|E13|E14|E15|E2|E3|E4|E5|E6|E7|E8|E9|EO1|F1|F10|F11|F14|F20|F21|F23|F24|F25|FO1|G1|G10|G11|G12|G13|G14|G15|G16|G2|G3|G4|G5|G6|G7|G8|G9|GO1|GO2|GO3|GO4|H1|L1|L11|L2|L3|L8|L9|LO1|M1|M10|M3|M4|M7|M9|NA|P1|P10|P11|P13|P14|P19|P20|P21|P3|P4|P5|P6|P7|P8|P9|PO1|PO2|PO3|R1|R10|R11|R12|R13|R14|R2|R2U|R3|R4|R4Y|RO1|S10|S11|S14|S15|S16|S17|S18|S19|S20|S21|S22|S23|S24|S25|S7|S9|SO1|SO2|W1|W10|W11|W13|W14|W15|W2|W3|W4|W5|W6|W8|WO1|WO2|WO3|E14)$/ { key=CSVFILENAME FS "fmu" FS "pattern" FS "value should match: /^(A10|A11|A12|A13|A14|A15|A4|A5|A6|A7|A8|A9|B10|B11|B9|BO1|BO2|C4|C5|CO1|CO2|E1|E10|E11|E13|E14|E15|E2|E3|E4|E5|E6|E7|E8|E9|EO1|F1|F10|F11|F14|F20|F21|F23|F24|F25|FO1|G1|G10|G11|G12|G13|G14|G15|G16|G2|G3|G4|G5|G6|G7|G8|G9|GO1|GO2|GO3|GO4|H1|L1|L11|L2|L3|L8|L9|LO1|M1|M10|M3|M4|M7|M9|NA|P1|P10|P11|P13|P14|P19|P20|P21|P3|P4|P5|P6|P7|P8|P9|PO1|PO2|PO3|R1|R10|R11|R12|R13|R14|R2|R2U|R3|R4|R4Y|RO1|S10|S11|S14|S15|S16|S17|S18|S19|S20|S21|S22|S23|S24|S25|S7|S9|SO1|SO2|W1|W10|W11|W13|W14|W15|W2|W3|W4|W5|W6|W8|WO1|WO2|WO3|E14)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && fma != "" && length(fma) > 30 { log_err("error"); print "fma length in " CSVFILENAME " line " NR " should be less than 30 and was " length(fma) " " RS $0 RS; }
action == "validate:summary" && NR > 1 && fma != "" && length(fma) > 30 { key=CSVFILENAME FS "fma" FS "maxLength" FS "max length is: 30" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_township && !is_numeric(ats_township) { log_err("error"); print "Field ats_township in " CSVFILENAME " line " NR " should be a numeric but was " ats_township " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_township && !is_numeric(ats_township) { key=CSVFILENAME FS "ats_township" FS "type" FS "max length is: 30" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_township == "" { log_err("warning"); print "Field ats_township in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_township == "" { key=CSVFILENAME FS "ats_township" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_township != "" && ats_township < 1 { log_err("error"); print "ats_township in " CSVFILENAME " line " NR " should be greater than 1 and was " ats_township " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_township != "" && ats_township < 1 { key=CSVFILENAME FS "ats_township" FS "minimum" FS "value should be greater than: 1" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_township != "" && ats_township > 126 { log_err("error"); print "ats_township in " CSVFILENAME " line " NR " should be less than 126 and was " ats_township " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_township != "" && ats_township > 126 { key=CSVFILENAME FS "ats_township" FS "maximum" FS "value should be less than: 126" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_range && !is_numeric(ats_range) { log_err("error"); print "Field ats_range in " CSVFILENAME " line " NR " should be a numeric but was " ats_range " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_range && !is_numeric(ats_range) { key=CSVFILENAME FS "ats_range" FS "type" FS "value should be less than: 126" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_range == "" { log_err("warning"); print "Field ats_range in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_range == "" { key=CSVFILENAME FS "ats_range" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_range != "" && ats_range < 1 { log_err("error"); print "ats_range in " CSVFILENAME " line " NR " should be greater than 1 and was " ats_range " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_range != "" && ats_range < 1 { key=CSVFILENAME FS "ats_range" FS "minimum" FS "value should be greater than: 1" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_range != "" && ats_range > 26 { log_err("error"); print "ats_range in " CSVFILENAME " line " NR " should be less than 26 and was " ats_range " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_range != "" && ats_range > 26 { key=CSVFILENAME FS "ats_range" FS "maximum" FS "value should be less than: 26" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_meridian && !is_numeric(ats_meridian) { log_err("error"); print "Field ats_meridian in " CSVFILENAME " line " NR " should be a numeric but was " ats_meridian " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_meridian && !is_numeric(ats_meridian) { key=CSVFILENAME FS "ats_meridian" FS "type" FS "value should be less than: 26" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_meridian == "" { log_err("warning"); print "Field ats_meridian in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_meridian == "" { key=CSVFILENAME FS "ats_meridian" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_meridian != "" && ats_meridian < 4 { log_err("error"); print "ats_meridian in " CSVFILENAME " line " NR " should be greater than 4 and was " ats_meridian " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_meridian != "" && ats_meridian < 4 { key=CSVFILENAME FS "ats_meridian" FS "minimum" FS "value should be greater than: 4" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_meridian != "" && ats_meridian > 6 { log_err("error"); print "ats_meridian in " CSVFILENAME " line " NR " should be less than 6 and was " ats_meridian " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_meridian != "" && ats_meridian > 6 { key=CSVFILENAME FS "ats_meridian" FS "maximum" FS "value should be less than: 6" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_section && !is_numeric(ats_section) { log_err("error"); print "Field ats_section in " CSVFILENAME " line " NR " should be a numeric but was " ats_section " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_section && !is_numeric(ats_section) { key=CSVFILENAME FS "ats_section" FS "type" FS "value should be less than: 6" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_section == "" { log_err("warning"); print "Field ats_section in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_section == "" { key=CSVFILENAME FS "ats_section" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_section != "" && ats_section < 1 { log_err("error"); print "ats_section in " CSVFILENAME " line " NR " should be greater than 1 and was " ats_section " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_section != "" && ats_section < 1 { key=CSVFILENAME FS "ats_section" FS "minimum" FS "value should be greater than: 1" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ats_section != "" && ats_section > 36 { log_err("error"); print "ats_section in " CSVFILENAME " line " NR " should be less than 36 and was " ats_section " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ats_section != "" && ats_section > 36 { key=CSVFILENAME FS "ats_section" FS "maximum" FS "value should be less than: 36" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && opening_number != "" && length(opening_number) > 11 { log_err("error"); print "opening_number length in " CSVFILENAME " line " NR " should be less than 11 and was " length(opening_number) " " RS $0 RS; }
action == "validate:summary" && NR > 1 && opening_number != "" && length(opening_number) > 11 { key=CSVFILENAME FS "opening_number" FS "maxLength" FS "max length is: 11" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && sampling_unit_number != "" && length(sampling_unit_number) > 3 { log_err("error"); print "sampling_unit_number length in " CSVFILENAME " line " NR " should be less than 3 and was " length(sampling_unit_number) " " RS $0 RS; }
action == "validate:summary" && NR > 1 && sampling_unit_number != "" && length(sampling_unit_number) > 3 { key=CSVFILENAME FS "sampling_unit_number" FS "maxLength" FS "max length is: 3" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && topographic_position && !is_numeric(topographic_position) { log_err("error"); print "Field topographic_position in " CSVFILENAME " line " NR " should be a numeric but was " topographic_position " " RS $0 RS; }
action == "validate:summary" && NR > 1 && topographic_position && !is_numeric(topographic_position) { key=CSVFILENAME FS "topographic_position" FS "type" FS "max length is: 3" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && topographic_position != "" && topographic_position !~ /^(1|2|3|4|5|6|7)$/ { log_err("error"); print "topographic_position in " CSVFILENAME " line " NR " should match the following pattern /^(1|2|3|4|5|6|7)$/ and was " topographic_position " " RS $0 RS; }
action == "validate:summary" && NR > 1 && topographic_position != "" && topographic_position !~ /^(1|2|3|4|5|6|7)$/ { key=CSVFILENAME FS "topographic_position" FS "pattern" FS "value should match: /^(1|2|3|4|5|6|7)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && elevation && !is_numeric(elevation) { log_err("error"); print "Field elevation in " CSVFILENAME " line " NR " should be a numeric but was " elevation " " RS $0 RS; }
action == "validate:summary" && NR > 1 && elevation && !is_numeric(elevation) { key=CSVFILENAME FS "elevation" FS "type" FS "value should match: /^(1|2|3|4|5|6|7)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && elevation != "" && elevation < 0 { log_err("error"); print "elevation in " CSVFILENAME " line " NR " should be greater than 0 and was " elevation " " RS $0 RS; }
action == "validate:summary" && NR > 1 && elevation != "" && elevation < 0 { key=CSVFILENAME FS "elevation" FS "minimum" FS "value should be greater than: 0" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && elevation != "" && elevation > 3747 { log_err("error"); print "elevation in " CSVFILENAME " line " NR " should be less than 3747 and was " elevation " " RS $0 RS; }
action == "validate:summary" && NR > 1 && elevation != "" && elevation > 3747 { key=CSVFILENAME FS "elevation" FS "maximum" FS "value should be less than: 3747" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && slope && !is_numeric(slope) { log_err("error"); print "Field slope in " CSVFILENAME " line " NR " should be a numeric but was " slope " " RS $0 RS; }
action == "validate:summary" && NR > 1 && slope && !is_numeric(slope) { key=CSVFILENAME FS "slope" FS "type" FS "value should be less than: 3747" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && slope != "" && slope < 0 { log_err("error"); print "slope in " CSVFILENAME " line " NR " should be greater than 0 and was " slope " " RS $0 RS; }
action == "validate:summary" && NR > 1 && slope != "" && slope < 0 { key=CSVFILENAME FS "slope" FS "minimum" FS "value should be greater than: 0" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && slope != "" && slope > 200 { log_err("error"); print "slope in " CSVFILENAME " line " NR " should be less than 200 and was " slope " " RS $0 RS; }
action == "validate:summary" && NR > 1 && slope != "" && slope > 200 { key=CSVFILENAME FS "slope" FS "maximum" FS "value should be less than: 200" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && aspect != "" && aspect !~ /^(N|E|S|W|NE|SE|SW|NW|NA)$/ { log_err("error"); print "aspect in " CSVFILENAME " line " NR " should match the following pattern /^(N|E|S|W|NE|SE|SW|NW|NA)$/ and was " aspect " " RS $0 RS; }
action == "validate:summary" && NR > 1 && aspect != "" && aspect !~ /^(N|E|S|W|NE|SE|SW|NW|NA)$/ { key=CSVFILENAME FS "aspect" FS "pattern" FS "value should match: /^(N|E|S|W|NE|SE|SW|NW|NA)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && x_coord && !is_numeric(x_coord) { log_err("error"); print "Field x_coord in " CSVFILENAME " line " NR " should be a numeric but was " x_coord " " RS $0 RS; }
action == "validate:summary" && NR > 1 && x_coord && !is_numeric(x_coord) { key=CSVFILENAME FS "x_coord" FS "type" FS "value should match: /^(N|E|S|W|NE|SE|SW|NW|NA)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && x_coord != "" && x_coord < -1030400 { log_err("error"); print "x_coord in " CSVFILENAME " line " NR " should be greater than -1030400 and was " x_coord " " RS $0 RS; }
action == "validate:summary" && NR > 1 && x_coord != "" && x_coord < -1030400 { key=CSVFILENAME FS "x_coord" FS "minimum" FS "value should be greater than: -1030400" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && x_coord != "" && x_coord > -111700 { log_err("error"); print "x_coord in " CSVFILENAME " line " NR " should be less than -111700 and was " x_coord " " RS $0 RS; }
action == "validate:summary" && NR > 1 && x_coord != "" && x_coord > -111700 { key=CSVFILENAME FS "x_coord" FS "maximum" FS "value should be less than: -111700" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && y_coord && !is_numeric(y_coord) { log_err("error"); print "Field y_coord in " CSVFILENAME " line " NR " should be a numeric but was " y_coord " " RS $0 RS; }
action == "validate:summary" && NR > 1 && y_coord && !is_numeric(y_coord) { key=CSVFILENAME FS "y_coord" FS "type" FS "value should be less than: -111700" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && y_coord != "" && y_coord < 5643600 { log_err("error"); print "y_coord in " CSVFILENAME " line " NR " should be greater than 5643600 and was " y_coord " " RS $0 RS; }
action == "validate:summary" && NR > 1 && y_coord != "" && y_coord < 5643600 { key=CSVFILENAME FS "y_coord" FS "minimum" FS "value should be greater than: 5643600" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && y_coord != "" && y_coord > 6702500 { log_err("error"); print "y_coord in " CSVFILENAME " line " NR " should be less than 6702500 and was " y_coord " " RS $0 RS; }
action == "validate:summary" && NR > 1 && y_coord != "" && y_coord > 6702500 { key=CSVFILENAME FS "y_coord" FS "maximum" FS "value should be less than: 6702500" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && utm_zone == "" { log_err("warning"); print "Field utm_zone in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && utm_zone == "" { key=CSVFILENAME FS "utm_zone" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && utm_zone != "" && utm_zone !~ /^(UTM11|UTM12)$/ { log_err("error"); print "utm_zone in " CSVFILENAME " line " NR " should match the following pattern /^(UTM11|UTM12)$/ and was " utm_zone " " RS $0 RS; }
action == "validate:summary" && NR > 1 && utm_zone != "" && utm_zone !~ /^(UTM11|UTM12)$/ { key=CSVFILENAME FS "utm_zone" FS "pattern" FS "value should match: /^(UTM11|UTM12)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && datum == "" { log_err("warning"); print "Field datum in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && datum == "" { key=CSVFILENAME FS "datum" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && datum != "" && datum !~ /^(NAD27|NAD83)$/ { log_err("error"); print "datum in " CSVFILENAME " line " NR " should match the following pattern /^(NAD27|NAD83)$/ and was " datum " " RS $0 RS; }
action == "validate:summary" && NR > 1 && datum != "" && datum !~ /^(NAD27|NAD83)$/ { key=CSVFILENAME FS "datum" FS "pattern" FS "value should match: /^(NAD27|NAD83)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && latitude && !is_numeric(latitude) { log_err("error"); print "Field latitude in " CSVFILENAME " line " NR " should be a numeric but was " latitude " " RS $0 RS; }
action == "validate:summary" && NR > 1 && latitude && !is_numeric(latitude) { key=CSVFILENAME FS "latitude" FS "type" FS "value should match: /^(NAD27|NAD83)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && latitude != "" && latitude < 49 { log_err("error"); print "latitude in " CSVFILENAME " line " NR " should be greater than 49 and was " latitude " " RS $0 RS; }
action == "validate:summary" && NR > 1 && latitude != "" && latitude < 49 { key=CSVFILENAME FS "latitude" FS "minimum" FS "value should be greater than: 49" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && latitude != "" && latitude > 60 { log_err("error"); print "latitude in " CSVFILENAME " line " NR " should be less than 60 and was " latitude " " RS $0 RS; }
action == "validate:summary" && NR > 1 && latitude != "" && latitude > 60 { key=CSVFILENAME FS "latitude" FS "maximum" FS "value should be less than: 60" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && longitude && !is_numeric(longitude) { log_err("error"); print "Field longitude in " CSVFILENAME " line " NR " should be a numeric but was " longitude " " RS $0 RS; }
action == "validate:summary" && NR > 1 && longitude && !is_numeric(longitude) { key=CSVFILENAME FS "longitude" FS "type" FS "value should be less than: 60" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && longitude != "" && longitude < -120 { log_err("error"); print "longitude in " CSVFILENAME " line " NR " should be greater than -120 and was " longitude " " RS $0 RS; }
action == "validate:summary" && NR > 1 && longitude != "" && longitude < -120 { key=CSVFILENAME FS "longitude" FS "minimum" FS "value should be greater than: -120" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && longitude != "" && longitude > -110 { log_err("error"); print "longitude in " CSVFILENAME " line " NR " should be less than -110 and was " longitude " " RS $0 RS; }
action == "validate:summary" && NR > 1 && longitude != "" && longitude > -110 { key=CSVFILENAME FS "longitude" FS "maximum" FS "value should be less than: -110" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && natural_subregion == "" { log_err("error"); print "Field natural_subregion in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && natural_subregion == "" { key=CSVFILENAME FS "natural_subregion" FS "required" FS "value is required but was empty" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && natural_subregion != "" && natural_subregion !~ /^(CM|DM|NM|BSA|PAD|LBH|UBH|AP|ALP|SA|MT|UF|LF|KU|FP|PRP|CP|DMG|FF|NF|MG)$/ { log_err("error"); print "natural_subregion in " CSVFILENAME " line " NR " should match the following pattern /^(CM|DM|NM|BSA|PAD|LBH|UBH|AP|ALP|SA|MT|UF|LF|KU|FP|PRP|CP|DMG|FF|NF|MG)$/ and was " natural_subregion " " RS $0 RS; }
action == "validate:summary" && NR > 1 && natural_subregion != "" && natural_subregion !~ /^(CM|DM|NM|BSA|PAD|LBH|UBH|AP|ALP|SA|MT|UF|LF|KU|FP|PRP|CP|DMG|FF|NF|MG)$/ { key=CSVFILENAME FS "natural_subregion" FS "pattern" FS "value should match: /^(CM|DM|NM|BSA|PAD|LBH|UBH|AP|ALP|SA|MT|UF|LF|KU|FP|PRP|CP|DMG|FF|NF|MG)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ecosite_guide == "" { log_err("warning"); print "Field ecosite_guide in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ecosite_guide == "" { key=CSVFILENAME FS "ecosite_guide" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ecosite_guide != "" && ecosite_guide !~ /^(N|WC|SW)$/ { log_err("error"); print "ecosite_guide in " CSVFILENAME " line " NR " should match the following pattern /^(N|WC|SW)$/ and was " ecosite_guide " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ecosite_guide != "" && ecosite_guide !~ /^(N|WC|SW)$/ { key=CSVFILENAME FS "ecosite_guide" FS "pattern" FS "value should match: /^(N|WC|SW)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ecosite == "" { log_err("warning"); print "Field ecosite in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ecosite == "" { key=CSVFILENAME FS "ecosite" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ecosite != "" && ecosite !~ /^(a|b|c|d|e|f|g|h|i|j|k|l|m|n)$/ { log_err("error"); print "ecosite in " CSVFILENAME " line " NR " should match the following pattern /^(a|b|c|d|e|f|g|h|i|j|k|l|m|n)$/ and was " ecosite " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ecosite != "" && ecosite !~ /^(a|b|c|d|e|f|g|h|i|j|k|l|m|n)$/ { key=CSVFILENAME FS "ecosite" FS "pattern" FS "value should match: /^(a|b|c|d|e|f|g|h|i|j|k|l|m|n)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ecosite_phase == "" { log_err("warning"); print "Field ecosite_phase in " CSVFILENAME " line " NR " is required" RS $0 RS; }
action == "validate:summary" && NR > 1 && ecosite_phase == "" { key=CSVFILENAME FS "ecosite_phase" FS "required" FS "value is required but was empty" FS "warning"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && ecosite_phase != "" && ecosite_phase !~ /^(a1|b1|b2|b3|b4|c1|d1|d2|d3|e1|e2|e3|f1|f2|f3|g1|h1|i1|i2|j1|j2|k1|k2|k3|l1|h2|j3|a2|a3|g2|h3|b5|c2|c3|c4|c5|d4|e4|f4|f5|f6|g3|i3|l2|l3|m1|m2|m3|n1)$/ { log_err("error"); print "ecosite_phase in " CSVFILENAME " line " NR " should match the following pattern /^(a1|b1|b2|b3|b4|c1|d1|d2|d3|e1|e2|e3|f1|f2|f3|g1|h1|i1|i2|j1|j2|k1|k2|k3|l1|h2|j3|a2|a3|g2|h3|b5|c2|c3|c4|c5|d4|e4|f4|f5|f6|g3|i3|l2|l3|m1|m2|m3|n1)$/ and was " ecosite_phase " " RS $0 RS; }
action == "validate:summary" && NR > 1 && ecosite_phase != "" && ecosite_phase !~ /^(a1|b1|b2|b3|b4|c1|d1|d2|d3|e1|e2|e3|f1|f2|f3|g1|h1|i1|i2|j1|j2|k1|k2|k3|l1|h2|j3|a2|a3|g2|h3|b5|c2|c3|c4|c5|d4|e4|f4|f5|f6|g3|i3|l2|l3|m1|m2|m3|n1)$/ { key=CSVFILENAME FS "ecosite_phase" FS "pattern" FS "value should match: /^(a1|b1|b2|b3|b4|c1|d1|d2|d3|e1|e2|e3|f1|f2|f3|g1|h1|i1|i2|j1|j2|k1|k2|k3|l1|h2|j3|a2|a3|g2|h3|b5|c2|c3|c4|c5|d4|e4|f4|f5|f6|g3|i3|l2|l3|m1|m2|m3|n1)$/" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
action == "validate" && NR > 1 && plot_comment != "" && length(plot_comment) > 250 { log_err("error"); print "plot_comment length in " CSVFILENAME " line " NR " should be less than 250 and was " length(plot_comment) " " RS $0 RS; }
action == "validate:summary" && NR > 1 && plot_comment != "" && length(plot_comment) > 250 { key=CSVFILENAME FS "plot_comment" FS "maxLength" FS "max length is: 250" FS "error"; if(!violations[key]) { violations[key]=0; } violations[key]++; }
# sanitize rules
action ~ /^(sanitize|insert)$/ && NR > 1 {
if (x_coord == "") $19 = "\\N"
if (aspect == "") $18 = "\\N"
if (topographic_position == "") $15 = "\\N"
if (elevation == "") $16 = "\\N"
if (ats_range == "") $10 = "\\N"
if (ats_section == "") $12 = "\\N"
if (latitude == "") $23 = "\\N"
if (sampling_unit_number == "") $14 = "\\N"
if (ecosite_guide == "") $26 = "\\N"
if (ats_township == "") $9 = "\\N"
if (ecosite == "") $27 = "\\N"
if (opening_number == "") $13 = "\\N"
if (utm_zone == "") $21 = "\\N"
if (fmu == "") $7 = "\\N"
if (ats_meridian == "") $11 = "\\N"
if (natural_subregion == "") $25 = "\\N"
if (y_coord == "") $20 = "\\N"
if (company_plot_number == "") $2 = "\\N"
if (establishment_day == "") $6 = "\\N"
if (longitude == "") $24 = "\\N"
if (slope == "") $17 = "\\N"
if (datum == "") $22 = "\\N"
if (establishment_year == "") $4 = "\\N"
if (ecosite_phase == "") $28 = "\\N"
if (establishment_month == "") $5 = "\\N"
if (company_stand_number == "") $3 = "\\N"
if (company == "") $1 = "\\N"
if (fma == "") $8 = "\\N"
if (plot_comment == "") $29 = "\\N"
}
# action handlers
action == "insert" && NR == 1 {
print "COPY plot (" addfields FS "source_row_index" FS $0 ") FROM stdin;"
}
action == "insert" && NR > 1 {
record = addvals "\t" NR
for (i = 1; i < NF; i++) {
if (wasquoted[i] == 1) {
$i = "'" $i "'"
}
record = record "\t" $i
}
print record
}
action == "table" && NR == 1 {
print "CREATE TABLE IF NOT EXISTS plot (company ,company_plot_number ,company_stand_number ,establishment_year ,establishment_month ,establishment_day ,fmu ,fma ,ats_township ,ats_range ,ats_meridian ,ats_section ,opening_number ,sampling_unit_number ,topographic_position ,elevation ,slope ,aspect ,x_coord ,y_coord ,utm_zone ,datum ,latitude ,longitude ,natural_subregion ,ecosite_guide ,ecosite ,ecosite_phase ,plot_comment , CONSTRAINT plot_pkey PRIMARY KEY (company,company_plot_number) );"
}
action == "sanitize" { print }
# la fin
END {
if (action == "validate:summary" && length(dupkeys) > 0) for (dup in dupkeys) { violation=CSVFILENAME FS "company|company_plot_number" FS "duplicate" FS dup " violates pkey" FS "error"; violations[violation] = dupkeys[dup]}
if (action == "validate:summary") { if (length(violations) > 0) for (violation in violations) { print violation FS violations[violation]; } }
if (action == "validate" && options["summary"] == "true") { print RS "violation summary: " RS " counts: " RS " total: " err_count; print_cats(cats); }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment