Last active
January 2, 2016 15:49
-
-
Save ace-dent/8326202 to your computer and use it in GitHub Desktop.
Pack small strings or Codes (e.g. 2 letter ISO country codes), into a compact 'magic string' minimized by: 1> Running Codes into each other (eliminating matching first and last letters) 'AB'+'BA' => 'ABA' ; 2> Avoiding spaces / code break characters, by joining Codes 'AA,BB' => 'AABB';Only where 1&2 generates allowable sequences (unique, with no…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* MAGICSTRINGPACKER | |
* | |
* Pack small strings or Codes (e.g. 2 letter ISO country codes), into a compact 'magic string' minimized by: | |
* 1> Running Codes into each other (eliminating matching first and last letters) 'AB'+'BA' => 'ABA' ; | |
* 2> Avoiding spaces / code break characters, by joining Codes 'AA,BB' => 'AABB'; | |
* Only where 1&2 generates allowable sequences (unique, with no collisions). | |
* This is useful for matching data against a compact reference (magic string), where hashing isn't an option. | |
* E.g. Checking country codes in a spreadsheet against the list of EU countries. | |
* | |
* This software is provided 'as-is', without any express or implied warranty. | |
* | |
* @author Andrew C.E. Dent | |
* | |
* @license GPL | |
* @license http://opensource.org/licenses/gpl-license.php GNU Public License | |
* @version 1.2.1 | |
*/ | |
// The_Beginning: | |
echo "Start... \n"; | |
$verbose = true; // Provide useful feedback for testing. Set to False for batch runs. | |
// Small strings -or codes- that will be packed | |
$codes_to_pack = array( | |
// Country codes for EU VAT | |
"AT","BE","BG","CY","CZ","DE","DK","EE","ES", | |
"FI","FR","\GB","EL","HU","IE","IT","LT","LU", // VAT codes. Greece uses language code EL not country code GR | |
"LV","MT","NL","PL","PT","RO","SE","SI","SK", | |
//"EU", // General non-domicile EU registration | |
"HR", // Croatia joined EU 1st July 2013 | |
"UK","IM","GR", // Extra country codes for checking Shipping destination against | |
"MC" // Monaco, French dependency | |
); | |
// To select the most commonly occuring code and fix its position at the start of the magic string | |
// prefix with a slash ('GB' > '\GB'). This hack prevents string matches and may reduce compression. | |
// Any slashes are ignored for statistics and stripped out at the end of the process. | |
// This is useful in practice when matching data against the magic string, for early matching and escape. | |
// Strings that are allowable in resulting compact form (unused Codes) | |
// These codes may be generated when we remove the break character between two input codes | |
// e.g. AA+BB > AABB, as 'AB' substring does not collide with another valid code. | |
$codes_allowed = array( | |
// Subset of codes -not- in use from ISO 3166-1:2006. | |
// Selection based on EU countries. Some commented out for smaller array and speed gain. | |
// "AB","AC","AH","AJ","AK","AP",//"AV","AY", | |
"BC","BK","BL","BP","BQ","BU",//"BX", | |
"CB","CE","CJ","CP","CQ","CT",//"CW", | |
"EA","EB","ED","EF","EI","EJ","EK",/*"EL" - Greece*/ "EM","EN","EO","EP","EQ",//"EV","EW","EX","EY","EZ", | |
"GC","GJ","GK","GO",//"GV","GX","GZ", | |
"IA","IB","IC","IF","IG","IH","II","IJ","IK","IP","IU",//"IV","IW","IX","IY","IZ", | |
"KA","KB","KC","KD","KF","KJ","KK","KL","KO","KQ","KS","KT","KU",//"KV","KX", | |
"LD","LE","LF","LG","LH","LJ","LL","LM","LN","LO","LP","LQ",//"LW","LX","LZ", | |
"MB","MF","MI","MJ", | |
"OA","OB","OC","OD","OE","OF","OG","OH","OI","OJ","OK","OL","ON","OP","OQ","OR","OS","OT","OU",//"OV","OW","OX","OY","OZ", | |
//"QB", "QC", "QD", "QE", "QF", "QG", "QH", "QI", "QJ", "QK", "QL" | |
"RA","RB","RC","RD","RF","RG","RH","RI","RJ","RK","RL","RM","RN","RP","RQ","RR","RT",//"RV","RX","RY","RZ", | |
"SF","SP","SQ","SS","SU",//"SW","SX", | |
"TA","TB","TE","TI","TQ","TS","TU",//"TX","TY", | |
"UB","UC","UD","UE","UF","UH","UI","UJ","UK","UL","UN","UO","UP","UQ","UR","UT","UU",//"UV","UW","UX", | |
"VB","VD","VF","VH","VJ","VK","VL","VM","VO","VP","VQ","VR","VS","VT",//"VV","VW","VX","VY","VZ", | |
"YA","YB","YC","YD","YF","YG","YH","YI","YJ","YK","YL","YM","YN","YO","YP","YQ","YR","YS",//"YV","YW","YX","YY","YZ", | |
"ZB","ZC","ZD","ZE","ZF","ZG","ZH","ZI","ZJ","ZK","ZL","ZN","ZO","ZP","ZQ","ZR","ZS","ZT","ZU",//"ZV","ZX","ZY" | |
// Include the following historically 'reserved' and private codes, as we know they wont appear in our data set. | |
// Some commented out for smaller array and speed gain. | |
//"AA", "OO" | |
"CS","TP","YU","EU" | |
//"QM", "QN", "QO", "QP", "QQ", "QR", "QS", "QT", "QU", // "QV", "QW", "QX", "QY", "QZ", | |
//"XA", "XB", "XC", "XD", "XE", "XF", "XG", "XH", "XI", "XJ", "XK", "XL", "XM", "XN", | |
//"XO", "XP", "XQ", "XR", "XS", "XT", "XU", // "XV", "XW", "XX", "XY", "XZ", | |
//"ZZ" | |
); | |
// Add in the Codes to Pack as Allowable duplicates in the final magic string. | |
// (Remove following line if each code can only occur once). | |
$codes_allowed = array_merge($codes_allowed, $codes_to_pack); | |
// Benchmarking - Starting details | |
$start_codes = implode(".", $codes_to_pack); // Starting string with delimited Codes | |
$start_length = strlen(stripslashes($start_codes)); // Ignore slashes for Stats- stripped out at the end | |
$total_codes = count($codes_to_pack); | |
if ($verbose) { | |
echo "\n"; | |
echo $total_codes . " codes using " . $start_length . " chars (~" . round(8*$start_length/$total_codes,1) . " bits/code) -> \n"; | |
echo $start_codes . "\n"; | |
} | |
// @TODO: We should scan and clean up arrays to remove duplicates. | |
// Remove Allowed codes that cannot be matched against Codes to Pack - faster later | |
if ($verbose) echo "\nReduce list of Allowed Codes that cannot be matched: "; | |
foreach ($codes_allowed as &$code) { | |
$last = substr($code,-1,1) ; // Last letter of string | |
$check = false; | |
foreach ($codes_to_pack as $code_check) { | |
$first_check = substr($code_check,0,1) ; // First letter of string | |
if ($last == $first_check) $check = true; // The code was matched | |
} | |
unset($code_check); // Break the reference with the last element | |
if ($check == false) { | |
if ($verbose) echo $code. " - "; | |
$code = null; | |
} | |
} | |
unset($code); // Break the reference with the last element | |
// Remove Codes that cannot be merged in Round 1- faster matching later | |
/* Currently this approach creates local areas of low compression in the final string. | |
It could be possible to transform these strings, substituting characters. | |
Then requires code overhead to decode. | |
*/ | |
if ($verbose) echo "\n\nIgnore codes with no matching letters for first round: "; | |
$removed_codes = array (); // We will store removed codes to add back to the final output | |
foreach ($codes_to_pack as &$code) { | |
$first = substr($code,0,1) ; // First letter of string | |
$last = substr($code,-1,1) ; // Last letter of string | |
$check = false; | |
foreach ($codes_to_pack as $code_check) { | |
$first_check = substr($code_check,0,1) ; // First letter of string | |
$last_check = substr($code_check,-1,1) ; // Last letter of string | |
if ($first == $last_check || $last == $first_check) $check = true; // The code was matched | |
} | |
unset($code_check); // Break the reference with the last element | |
if ($check == false) { | |
if ($verbose) echo $code. " (". $first . $last . ") - "; | |
$removed_codes[] = $code; | |
$code = null; | |
} | |
} | |
unset($code); // Break the reference with the last element | |
if ($verbose) echo "\n"; | |
// Round 1 - Merge strings by matching (and combining) first and last letters | |
/* Current algorithm is quite lazy. A 'greedy' approach may yield higher local compression: | |
e.g. IESEE > IE ES SE EE > 10bits/code | |
SIEESELV > SI IE EE ES SE EL LV > 9.14bits/code | |
BGBEESIMCZ > BG GB BE EE ES SI IM MC CZ > 8.89bits/code | |
In testing, where the numbers of codes to compact are small, a greedy approach yields more | |
orphan codes that are uncompacted, balancing out any gains of the few long runs. Every merge saves 2 bytes, | |
whereas in Round 2, each join only saves 1 byte. It seems to be a better strategy to be lazy... | |
*/ | |
$input = $codes_to_pack; | |
for ($i = 1; $i <= 5; $i++) { | |
shuffle($input); // Randomize order of strings in each sweep | |
$result = array(); | |
foreach ($input as &$code) { | |
$first = substr($code,0,1) ; // First letter of string | |
$last = substr($code,-1,1) ; // Last letter of string | |
foreach ($input as &$code_check) { | |
$first_check = substr($code_check,0,1) ; // First letter of string | |
$last_check = substr($code_check,-1,1) ; // Last letter of string | |
if ($last == $first_check && $code != $code_check ) { | |
// Check this match first, to prefer 'GB+BG>GBG' vs 'GB+BG>BGB' | |
$result[] = $code . substr($code_check,1); | |
$code = null; | |
$code_check = null; | |
break; | |
} else if ($first == $last_check && $code != $code_check ) { | |
$result[] = $code_check . substr($code,1); | |
$code = null; | |
$code_check = null; | |
break; | |
} | |
} | |
unset($code_check); // Break the reference with the last element | |
} | |
unset($code); // Break the reference with the last element | |
// Tidy up - Find any codes that weren't matched and add to the $results array for another pass | |
foreach ($input as &$code) { | |
if ($code != null) { | |
$result[] = $code; | |
$code = null; | |
} | |
} | |
if ($verbose) { | |
echo "\n Run-".$i." : "; | |
echo implode(".", $result); | |
} | |
unset($input); | |
$input = $result; | |
unset($result); | |
} | |
// Add back in the unmatching codes removed at the start | |
$codes_to_pack = array_merge($codes_to_pack, $removed_codes); // Fix the original array | |
$result = array_merge($input, $removed_codes); // Working array | |
unset($input); | |
unset($removed_codes); | |
if ($verbose) echo "\n *** \n"; | |
// Last round - remove space between codes, if consecutive letters are in Allowed list | |
$input = $result; | |
for ($i = 1; $i <= 20; $i++) { | |
shuffle($input); | |
$result = array(); | |
foreach ($input as &$code) { | |
$last = substr($code,-1,1) ; // Last letter of string | |
foreach ($input as &$code_check) { | |
$first_check = substr($code_check,0,1) ; // First letter of string | |
foreach ($codes_allowed as $code_available) { | |
$combo = $last . $first_check ; | |
if ($combo == $code_available && $code != $code_check) { | |
// @TODO : | |
// Test if joining of the two strings duplicates a code | |
// that may be replaced elsewhere (although unlikely with few codes) | |
// Find the original, existing string e.g. ES | |
// If string length = 2, delete ES -> - | |
// Check start of strings ESAA -> -SAA | |
// Check end of strings AAES -> AAE- | |
// Else we cannot remove AESA -> AESA | |
// Only bother to test for one letter, as two is very unlikely for code length=2... | |
// Finally combine the two strings | |
// @TODO : | |
// Redundant check to see if first / last letter is equal then compress instead of concatenate | |
// IE + EE > IEE ... or AA + AA = AA | |
// This shouldn't happen after the 1st pass!... Check we don't overly compress e.g. 'EE' | |
$result[] = $code . $code_check; // Concatenate strings | |
$code = null; | |
$code_check = null; | |
break; | |
} | |
} | |
unset($code_available); // Break the reference with the last element | |
} | |
unset($code_check); // Break the reference with the last element | |
} | |
unset($code); // Break the reference with the last element | |
// Tidy up - Find any codes that weren't matched and add to the $results array for another pass | |
foreach ($input as &$code) { | |
if ($code != null) { | |
$result[] = $code; | |
$code = null; | |
} | |
} | |
if ($verbose) { | |
echo "\n Run-".$i." : "; | |
echo implode(".", $result); | |
} | |
unset($input); | |
$input = $result; | |
unset($result); | |
} | |
// Finish up | |
$result = $input; | |
unset($input); | |
if ($verbose) echo "\n"; | |
$magic_string = stripslashes( implode(".", $result) ); // Remove any slashes | |
unset($result); | |
$magic_length = strlen($magic_string); | |
// Test for duplicates | |
/* | |
Currently we don't do anything about this. Best is to tackle at the point repeated strings are generated. | |
Also edge case of repeated runs of letters, | |
e.g. AA+AB BA+AA > AAB BAA, AAB+BAA > AABAA | |
*/ | |
if ($verbose) { | |
$result = array (); // Array to store codes for duplication check | |
$result_inter_codes = array (); // Array to store 'inter-codes' (adjacent non-coding letters) | |
// TODO - This currently only works for fixed 2 character codes | |
// Should be rewritten for variable input code lengths... | |
for ($i = 1; $i < $magic_length; $i++) { | |
$code = substr($magic_string,$i-1,2) ; // Grab possible 2 character code | |
$check = false; | |
foreach ($codes_to_pack as $code_check) { | |
if (stripslashes($code_check) == $code) { | |
$check = true; // The code was matched | |
$result[] = $code; // Codes get logged | |
} | |
} | |
unset($code_check); // Break the reference with the last element | |
if ($check == false) $result_inter_codes[] = $code; // Log inter-code | |
} | |
asort($result); | |
asort($result_inter_codes); | |
// If we have more Codes than we started with, duplicates are present... | |
if (count($result) != $total_codes) echo "\nWARNING: Possible duplicate codes... "; | |
echo "\nCodes, from adjacent letters: \n "; | |
echo implode("-", $result); | |
echo " ". count($result)." codes \n\n"; | |
echo "Non-matching codes, from adjacent letters: \n "; | |
echo implode("-", $result_inter_codes); | |
echo " ". count($result_inter_codes)." codes \n\n"; | |
} | |
if ($verbose) { | |
echo $magic_string . " <- " . $magic_length . " chars (~" . round(8*$magic_length/$total_codes,1) . " bits/code).\n"; | |
echo "Compressed: " . round(($start_length - $magic_length) / $start_length * 100) . "% of original. \n"; | |
echo round((800*$magic_length)/($total_codes*10),0) . "% Theoretical limit (10 bits/code). \n"; | |
// Theoretical limit applies to country codes | |
} | |
// Set here conditions to Save the packed magic string generated | |
// For our data, we want the most common code first 'GB' for early matching and escape. | |
if ($magic_length < 49 || $magic_length < 50 && substr($magic_string,0,2) == "GB" ) { | |
$filename = 'magic_string.txt'; | |
$somecontent = "\n" . $magic_length . " - ". $magic_string; | |
// Let's make sure the file exists and is writable first. | |
if (is_writable($filename)) { | |
// We're opening $filename in append mode. | |
// The file pointer is at the bottom of the file hence | |
// that's where $somecontent will go when we fwrite() it. | |
if (!$handle = fopen($filename, 'a')) { | |
echo "Cannot open file ($filename)"; | |
exit; | |
} | |
// Write $somecontent to our opened file. | |
if (fwrite($handle, $somecontent) === FALSE) { | |
echo "Cannot write to file ($filename)"; | |
exit; | |
} | |
echo "Success, wrote ( $somecontent ) to file ($filename)"; | |
fclose($handle); | |
} else { | |
echo "The file $filename is not writable"; | |
} | |
} | |
echo " ...End \n"; | |
// Cleanup and unset all variables | |
$keys = array(); | |
foreach($GLOBALS as $k => $v){ | |
$keys[] = $k; | |
} | |
for($t=1;$keys[$t];$t++){ | |
unset($$keys[$t]); | |
} | |
unset($k); unset($v); unset($t); | |
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment