Last active
August 29, 2015 13:57
-
-
Save buchgr/9486590 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package palantir; | |
import java.util.Comparator; | |
public class FileNameComparator implements Comparator<String> | |
{ | |
/* | |
* DESCRIPTION: | |
* Numbers in the string are compared by their natural order | |
* and not their ASCII order. (2 < 10) | |
* | |
* The comparison is case insensitive. | |
* | |
* Spaces are ignored completely, to make ordering of cases | |
* like "New York" and "Newmark" more intuitive. | |
* | |
* All the non alphanumeric characters are considered smaller | |
* than the alphanumeric characters. The order between the non | |
* alphanumeric characters is still defined by their ASCII code | |
* tough. | |
* | |
* If two strings are equal according to the above criteria, | |
* shorter strings (ignoring whitespaces and leading zeros) are | |
* considered smaller. | |
* | |
* The algorithm runs in one pass and constant space. | |
* | |
* POSSIBLE IMPROVEMENTS: | |
* I believe there is two main "paths" to improve the algorithm: | |
* 1) The devil is in the detail and there are a lot of | |
* corner cases were "intuitive sorting" as performed | |
* by humans might lead to different results. | |
* - Should file extensions be ignored or only be used | |
* in the case the strings are identical but of different filetype? | |
* - Should paranthesis, braces and brackets be considered equal? | |
* - If two strings are equal, should we look at their white spaces | |
* and leading zeros and define an order on them? | |
* - Full Unicode Support | |
* There are probably several dozens of such cases, I believe a good | |
* way to handle them would be to closely look at/mimic the behaviour | |
* of Windows Explorer and Finder, as most people are used to their way | |
* of sorting and I am quite certain that Microsoft and Apple have done | |
* a lot of research/gathered lots of feedback to find the most intuitive | |
* sorting algorithm. | |
* Another way to improve things would be to perform testing with real | |
* people and ask them about the "intuitiveness" of some example sorts. | |
* | |
* 2) Depending on the application, the scenarios and additional information | |
* one has, it might be reasonable to apply e.g. NLP techniques to assign | |
* a special order to e.g. synonyms and maybe ignore words like "a", "the", | |
* etc. | |
* For a normal file browser this would probably lead to confusion, but for | |
* specialized apps it might make sense. | |
*/ | |
public int compare(String s1, String s2) | |
{ | |
if (s1 == null && s2 != null) | |
return -1; | |
if (s1 != null && s2 == null) | |
return 1; | |
if (s1 == null && s2 == null) | |
return 0; | |
// this flag tells if the method is currently | |
// comparing numbers "number compare mode" | |
boolean numcmpmode = false; | |
int s1pos = 0, | |
s2pos = 0, | |
numcmpres = 0, | |
s1len = s1.length(), | |
s2len = s2.length(); | |
while (s1pos < s1len && s2pos < s2len) | |
{ | |
char c1 = s1.charAt(s1pos), c2 = s2.charAt(s2pos); | |
// skip leading zeros | |
if (c1 == '0' && !numcmpmode) | |
s1pos++; | |
else if (c2 == '0' && !numcmpmode) | |
s2pos++; | |
else | |
{ | |
if (isDigit(c1) && isDigit(c2)) | |
{ | |
if (!numcmpmode) | |
numcmpmode = true; | |
// find the first digit where the numbers mismatch | |
// this is only relevant if the numbers have the same number | |
// of digits. | |
if (numcmpres == 0) | |
numcmpres = compareIgnoreCase(c1, c2); | |
s1pos++; | |
s2pos++; | |
} | |
else | |
{ | |
if (numcmpmode) | |
{ | |
// the number in s1 is longer | |
if (isDigit(c1)) | |
return 1; | |
// the number in s2 is longer | |
if (isDigit(c2)) | |
return -1; | |
// the numbers have the same length | |
if (numcmpres != 0) | |
return numcmpres; | |
// the numbers are equal | |
numcmpmode = false; | |
} | |
// ignore spaces | |
if (c1 == ' ') | |
s1pos++; | |
else if (c2 == ' ') | |
s2pos++; | |
else | |
{ | |
int mismatch = compareIgnoreCase(c1, c2); | |
if (mismatch != 0) | |
return mismatch; | |
s1pos++; | |
s2pos++; | |
} | |
} | |
} | |
} | |
if (s1pos < s1len) | |
{ | |
// s1 is longer than s2 and s2 ends with | |
// a number. check if s1 contains the bigger number. | |
if (numcmpmode && isDigit(s1.charAt(s1pos))) | |
return 1; | |
// ignore trailing whitespace | |
while (s1pos < s1len && s1.charAt(s1pos) == ' ') | |
s1pos++; | |
} | |
else if (s2pos < s2len) | |
{ | |
if (numcmpmode && isDigit(s2.charAt(s2pos))) | |
return -1; | |
// ignore trailing whitespace | |
while (s2pos < s2len && s2.charAt(s2pos) == ' ') | |
s2pos++; | |
} | |
// the last numbers in the two strings have the | |
// same length and one string ends with its number. | |
// e.g. "test500" and "test600t" | |
if (numcmpres != 0) | |
return numcmpres; | |
// if two strings are equal, | |
// the shorter one is considered smaller. | |
return (s1.length() - s1pos) + (s2pos - s2.length()); | |
} | |
private int compareIgnoreCase(char c1, char c2) | |
{ | |
// when comparing a alphanumeric character with a special | |
// character, the special character is always smaller. | |
if (!isLetterOrDigit(c1) && isLetterOrDigit(c2)) | |
return -1; | |
if (isLetterOrDigit(c1) && !isLetterOrDigit(c2)) | |
return 1; | |
// if its upper case, make it lower case | |
if (isUpperCase(c1)) | |
c1 += 32; | |
if (isUpperCase(c2)) | |
c2 += 32; | |
return c1 - c2; | |
} | |
private boolean isLetterOrDigit(char c) | |
{ | |
return isDigit(c) || isUpperCase(c) || c > 96 && c < 123 /* lower case */; | |
} | |
private boolean isUpperCase(char c) | |
{ | |
return c > 64 && c < 91; | |
} | |
private boolean isDigit(char c) | |
{ | |
return c > 47 && c < 58; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package palantir; | |
import static org.junit.Assert.*; | |
import org.junit.Test; | |
import java.util.Arrays; | |
public class FileNameComparatorTest | |
{ | |
private FileNameComparator fnc = new FileNameComparator(); | |
@Test | |
public void testNumbers() | |
{ | |
// basic properties | |
assertTrue(fnc.compare("test", "test") == 0); | |
assertTrue(fnc.compare("test1", "test2") < 0); | |
assertTrue(fnc.compare("test2", "test1") > 0); | |
assertTrue(fnc.compare("test2", "test3") < 0); | |
assertTrue(fnc.compare("test3", "test4") < 0); | |
assertTrue(fnc.compare("test2", "test4") < 0); | |
// numbers with different number of digits | |
assertTrue(fnc.compare("121", "12") > 0); | |
assertTrue(fnc.compare("1", "137") < 0); | |
assertTrue(fnc.compare("260790", "60790") > 0); | |
assertTrue(fnc.compare("1.2.3.4.5.6.7.8.10", "1.2.3.4.5.6.7.9.10") < 0); | |
// numbers with same number of digits | |
assertTrue(fnc.compare("120", "121") < 0); | |
assertTrue(fnc.compare("1024", "1023") > 0); | |
assertTrue(fnc.compare("t1025", "t1026") < 0); | |
assertTrue(fnc.compare("file512txt", "file511txt") > 0); | |
// large numbers | |
assertTrue(fnc.compare("file222232244629420445529739893461909967206666939096499764990979600.txt", "file222232244629420445529739893461909967206666939096499764990979600.txt") == 0); | |
// 250 digit number | |
assertTrue(fnc.compare("file5781538327977828897150909166778407659250458379645823062042492461576758526757490910073628008613977550546382774775570888130029763571528699574717583228939535960234464230882573615930384979100379102915657483866755371559811718767760594919456971354184113721", | |
"file6325427919960049066585247837578372385418559154923477553398129089734082978096069693032859784967901396775824948679013568274245239986849282715816927424255093730637896848500237375779410539868274303393510928400955586603945601202203906813552017713600173613") | |
< 0); | |
// leading zeros | |
assertTrue(fnc.compare("02.txt", "0001.txt") > 0); | |
assertTrue(fnc.compare("file000000100.txt", "file00100.txt") == 0); | |
assertTrue(fnc.compare("hello00012world0000000013.txt", "hello012world00014.txt") < 0); | |
// non leading zeros and shorter is considered smaller | |
assertTrue(fnc.compare("0", "0000000") < 0); | |
// examples from the problem definition | |
assertTrue(fnc.compare("1 2 10", "1 10 2") < 0); | |
assertTrue(fnc.compare("1210", "1102") > 0); | |
assertTrue(fnc.compare("1/5", "1/20") < 0); | |
assertTrue(fnc.compare("1&5", "1$20") != 0); | |
assertTrue(fnc.compare("1/00", "1/000") < 0); | |
assertTrue(fnc.compare("-5", "-4") > 0); | |
// null inputs. | |
// I just defined it this way, thinking that null | |
// is smaller than everything else, but one can define it however | |
assertTrue(fnc.compare(null, "") < 0); | |
assertTrue(fnc.compare("", null) > 0); | |
assertTrue(fnc.compare(null, null) == 0); | |
} | |
@Test | |
public void testCaseInsensitivity() | |
{ | |
assertTrue(fnc.compare("FILE10.TXT", "file10.txt") == 0); | |
assertTrue(fnc.compare("hEllOWOrld.DoC", "HeLLowoRLD.dOc") == 0); | |
assertTrue(fnc.compare("A", "a") == 0); | |
} | |
@Test | |
public void testSpaces() | |
{ | |
assertTrue(fnc.compare("New York", "Newmark") > 0); | |
assertTrue(fnc.compare("Hello World ", "HelloWorld") == 0); | |
assertTrue(fnc.compare(" H e l l o W or l d ", "Hello World") == 0); | |
} | |
@Test | |
public void testSorting() | |
{ | |
// example from problem definition | |
String[] files = new String[] {"test2.txt", "test10.txt", "test1.txt"}; | |
String[] expected = new String[] {"test1.txt", "test2.txt", "test10.txt"}; | |
Arrays.sort(files, fnc); | |
assertArrayEquals(expected, files); | |
files = new String[] {"PICTURE2.JPG", "Picture1.jpg", "Text12.doc", "picture001.jpg", "Text13.doc", "Picture20.JPG"}; | |
expected = new String[] {"Picture1.jpg", "picture001.jpg", "PICTURE2.JPG", "Picture20.JPG", "Text12.doc", "Text13.doc"}; | |
Arrays.sort(files,fnc); | |
assertArrayEquals(expected, files); | |
files = new String[] {"File 20.jpeg", "File12.jpeg", "File1.jpeg", "File100.jpeg", "File3.jpeg", " File 5.jpeg"}; | |
expected = new String[] {"File1.jpeg", "File3.jpeg", " File 5.jpeg", "File12.jpeg", "File 20.jpeg", "File100.jpeg"}; | |
Arrays.sort(files, fnc); | |
assertArrayEquals(expected, files); | |
files = new String[] {"$HelloWorld.txt", "HelloWorld.txt", "Hello_World.txt", "$$HelloWorld.txt", "@ABC.doc", | |
"Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt", "[[[]]]!.doc", "1.docx", | |
"1.xlsx", "1.txt"}; | |
expected = new String[] {"$$HelloWorld.txt", "$HelloWorld.txt", "@ABC.doc", "[[[]]]!.doc", "1.docx", "1.txt", | |
"1.xlsx", "Hello_World.txt", "Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt", | |
"HelloWorld.txt"}; | |
Arrays.sort(files, fnc); | |
assertArrayEquals(expected, files); | |
files = new String[] {"WirelessKeyView.chm", "wirelesskeyview (1).zip", "wirelesskeyview.zip", "WirelessKeyView.cfg", | |
"WirelessKeyView.exe", "wirelesskeyview-x64.zip"}; | |
expected = new String[] {"wirelesskeyview (1).zip", "wirelesskeyview-x64.zip", "WirelessKeyView.cfg", "WirelessKeyView.chm", | |
"WirelessKeyView.exe", "wirelesskeyview.zip"}; | |
Arrays.sort(files, fnc); | |
assertArrayEquals(expected, files); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment