Last active
February 10, 2023 21:33
-
-
Save otheus/92162e3a764d2697c3272b98b2663a94 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/awk -f | |
## Awk script to compare to SQL (postgres) dumps for which each line of input is a row | |
## and has been preprocessed by | |
## paste -d $'\a' file1 file2 | |
## The BEL symbol is used by this program to quickly split the input | |
## | |
## Sometimes, numbers differ by some kind of rounding error / floating-point implementation | |
## Ignore that error by subtracting the two values and seeing if they are < maxdiff, | |
## maxdiff = 1 / (10 ^ (length-after-decimal-point(shortest-value)) | |
## Consider: | |
## 4.2 vs 4.19998 | |
## The shortest number is 4.2, its length is | |
## Notes: | |
## d is the global *d*iff counter | |
## p is the *p*osition / field that first had a difference | |
## i is a loop variable,usually current field | |
## L is the array of fields from the current line of the *L*eft-file | |
## R is " " " " " " " " " " " *R*ight-file | |
## clhs is the number of fields in L | |
## crhs is the number of fields in R | |
BEGIN { | |
FS="\a"; | |
DECIMAL_SEP="."; | |
FIELD_SEP="\t"; # "\t" for postgresql; for mysql, maybe ", "; | |
MAX_DIFFS=10; | |
DEBUG=0; | |
# Efficiently fill out our table of maximum tolerances of values | |
Maxdiffs[1] = 0.1; | |
for (i=2; i<31; ++i) | |
Maxdiffs[i] = Maxdiffs[i-1] / 10; | |
p=-1; | |
} | |
# if -v start=...., skip until that line | |
NR < (0 + start) { next } | |
# When pairs don't match, investigate further... | |
("_" $1) != ("_" $2) { | |
if (DEBUG>1) print "Line",NR ": Input lines differed somehow. Investigating..."; | |
p=0; # p is field# where difference was found; 0 means whole line | |
# split each half into tab-delimited fields | |
clhs=split($1,L,FIELD_SEP); | |
crhs=split($2,R,FIELD_SEP); | |
if (clhs == crhs) { | |
if (DEBUG>1) print "Line",NR ": Same number of tokens in each line, delimited by '" FIELD_SEP "'"; | |
## compare field by field | |
p = -1; # if we don't set p in the loop below, no real differences | |
# Compare each field, until a difference is found | |
for (i=1; i<=clhs && p<0; ++i) { | |
# Hint: force this compare to be string-based | |
if (("_" L[i]) != ("_" R[i])) { | |
if (DEBUG>1) print "Line",NR ": Field",i,"differs somehow"; | |
## They differ... but are they numbers? | |
if ( \ | |
L[i] ~ /^-?[0-9]+\.[0-9]+([eE][-+][0-9]+)?$/ && \ | |
R[i] ~ /^-?[0-9]+\.[0-9]+([eE][-+][0-9]+)?$/ \ | |
) { | |
# both fields are floating-point numbers, compare loosely | |
# strip exponent part | |
sub(/[eE].*/,"",L[i]);sub(/[eE].*/,"",R[i]); | |
# determine precision of shortest value | |
precision=( \ | |
length(L[i]) < length(R[i]) ? \ | |
length(L[i]) - index(L[i],DECIMAL_SEP) : \ | |
length(R[i]) - index(R[i],DECIMAL_SEP) \ | |
); | |
# look up the maxdiff from our table | |
maxdiff=Maxdiffs[precision]; | |
diff=(L[1] - R[1]); | |
if (diff > maxdiff || diff < -maxdiff) { | |
if (DEBUG) print "Line",NR ": Numbers differed at",i,"between",L[i],"and",R[i],"differing more than",maxdiff; | |
p=i; | |
} | |
else { | |
if (DEBUG) print "Line",NR ": Numbers differed at",i,"between",L[i],"and",R[i],"but differed less than",maxdiff; | |
} | |
} | |
else { | |
if (DEBUG) print "Line",NR ": Strings or ints differed at",i,"between",L[i],"and",R[i]; | |
p=i; | |
} | |
} | |
else { | |
if (DEBUG) print "Line",NR ": No differences found"; | |
} | |
} | |
} | |
# else, field count is different, so whole line is. | |
else { | |
if (DEBUG) print "Line",NR ": Number of fields in line differ"; | |
} | |
} | |
p>=0 { | |
++d; # bump total diffs count | |
# Output a little header for each non-matching records | |
print "Line",NR,"diffs found so far:",d,(p ? "here at field: " p : "" ); | |
# Output the lines that didnt match | |
print $1; print $2; print ""; | |
p=-1; | |
} | |
# Progress counter | |
NR % 100000 == 0 { print "Line",NR } | |
d > MAX_DIFFS { exit(1);} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## NOTE: THE WHITESPACES SHOULD BE TABS | |
75747358 1 53 2011-03-29 23:00:00+00 7.428 | |
75747359 1 53 2011-03-29 23:30:00+00 5.757 | |
75747360 1 53 2011-03-30 00:00:00+00 6.739 | |
75747361 1 53 2011-03-30 00:30:00+00 6.109 | |
75747362 1 53 2011-03-30 01:00:00+00 6.736 extra-field | |
75747363 1 53 2011-03-30 01:30:00+00 7.576 | |
75747364 1 53 2011-03-30 02:00:00+00 6.789 | |
75747365 1 53 2011-03-30 02:30:00+00 6.386e+2 | |
75747366 1 53 2011-03-30 03:00:00+00 6.016E-2 | |
75747367 1 53 2011-03-30 03:30:00+00 6.336 | |
75747368 1 53 2011-03-30 04:00:00+00 6.1 | |
75747374 1 53 2011-03-30 07:00:00+00 5.9412 | |
75747375 1 53 2011-03-30 07:30:00+00 6.137803249 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## NOTE: THE WHITESPACES SHOULD BE TABS | |
75747358 1 53 2011-03-28 23:00:00+00 7.428 | |
75747359 1 53 2011-03-29 23:30:00+01 5.757 | |
75747360 1 54 2011-03-30 00:00:00+00 6.74 | |
75747361 1 53 2011-03-30 00:30:00+00 6.109 | |
75747362 1 53 2011-03-30 01:00:00+00 6.73599999999999977 | |
75747363 1 53 2011-03-30 01:30:00+00 7.576e+10 | |
75747364 1 53 2011-03-30 02:00:00+00 6.789e-10 | |
75747365 1 53 2011-03-30 02:30:00+00 6.38600000000000012e+2 | |
75747366 1 53 2011-03-30 03:00:00+00 6.01600000000000001E-2 | |
75747367 1 53 2011-03-30 03:30:00+00 6.3360000000000003 | |
75747368 1 53 2011-03-30 04:00:00+00 6.0999999999999993 | |
75747374 1 53 2011-03-30 07:00:00+00 5.94099999999999984 | |
75747375 1 53 2011-03-30 07:30:00+00 6.13780324900000007 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment