Skip to content

Instantly share code, notes, and snippets.

@buchgr
Last active August 29, 2015 13:57
Show Gist options
  • Save buchgr/9486590 to your computer and use it in GitHub Desktop.
Save buchgr/9486590 to your computer and use it in GitHub Desktop.
package palantir;
import java.util.Comparator;
public class FileNameComparator implements Comparator<String>
{
/*
* DESCRIPTION:
* Numbers in the string are compared by their natural order
* and not their ASCII order. (2 < 10)
*
* The comparison is case insensitive.
*
* Spaces are ignored completely, to make ordering of cases
* like "New York" and "Newmark" more intuitive.
*
* All the non alphanumeric characters are considered smaller
* than the alphanumeric characters. The order between the non
* alphanumeric characters is still defined by their ASCII code
* tough.
*
* If two strings are equal according to the above criteria,
* shorter strings (ignoring whitespaces and leading zeros) are
* considered smaller.
*
* The algorithm runs in one pass and constant space.
*
* POSSIBLE IMPROVEMENTS:
* I believe there is two main "paths" to improve the algorithm:
* 1) The devil is in the detail and there are a lot of
* corner cases were "intuitive sorting" as performed
* by humans might lead to different results.
* - Should file extensions be ignored or only be used
* in the case the strings are identical but of different filetype?
* - Should paranthesis, braces and brackets be considered equal?
* - If two strings are equal, should we look at their white spaces
* and leading zeros and define an order on them?
* - Full Unicode Support
* There are probably several dozens of such cases, I believe a good
* way to handle them would be to closely look at/mimic the behaviour
* of Windows Explorer and Finder, as most people are used to their way
* of sorting and I am quite certain that Microsoft and Apple have done
* a lot of research/gathered lots of feedback to find the most intuitive
* sorting algorithm.
* Another way to improve things would be to perform testing with real
* people and ask them about the "intuitiveness" of some example sorts.
*
* 2) Depending on the application, the scenarios and additional information
* one has, it might be reasonable to apply e.g. NLP techniques to assign
* a special order to e.g. synonyms and maybe ignore words like "a", "the",
* etc.
* For a normal file browser this would probably lead to confusion, but for
* specialized apps it might make sense.
*/
public int compare(String s1, String s2)
{
if (s1 == null && s2 != null)
return -1;
if (s1 != null && s2 == null)
return 1;
if (s1 == null && s2 == null)
return 0;
// this flag tells if the method is currently
// comparing numbers "number compare mode"
boolean numcmpmode = false;
int s1pos = 0,
s2pos = 0,
numcmpres = 0,
s1len = s1.length(),
s2len = s2.length();
while (s1pos < s1len && s2pos < s2len)
{
char c1 = s1.charAt(s1pos), c2 = s2.charAt(s2pos);
// skip leading zeros
if (c1 == '0' && !numcmpmode)
s1pos++;
else if (c2 == '0' && !numcmpmode)
s2pos++;
else
{
if (isDigit(c1) && isDigit(c2))
{
if (!numcmpmode)
numcmpmode = true;
// find the first digit where the numbers mismatch
// this is only relevant if the numbers have the same number
// of digits.
if (numcmpres == 0)
numcmpres = compareIgnoreCase(c1, c2);
s1pos++;
s2pos++;
}
else
{
if (numcmpmode)
{
// the number in s1 is longer
if (isDigit(c1))
return 1;
// the number in s2 is longer
if (isDigit(c2))
return -1;
// the numbers have the same length
if (numcmpres != 0)
return numcmpres;
// the numbers are equal
numcmpmode = false;
}
// ignore spaces
if (c1 == ' ')
s1pos++;
else if (c2 == ' ')
s2pos++;
else
{
int mismatch = compareIgnoreCase(c1, c2);
if (mismatch != 0)
return mismatch;
s1pos++;
s2pos++;
}
}
}
}
if (s1pos < s1len)
{
// s1 is longer than s2 and s2 ends with
// a number. check if s1 contains the bigger number.
if (numcmpmode && isDigit(s1.charAt(s1pos)))
return 1;
// ignore trailing whitespace
while (s1pos < s1len && s1.charAt(s1pos) == ' ')
s1pos++;
}
else if (s2pos < s2len)
{
if (numcmpmode && isDigit(s2.charAt(s2pos)))
return -1;
// ignore trailing whitespace
while (s2pos < s2len && s2.charAt(s2pos) == ' ')
s2pos++;
}
// the last numbers in the two strings have the
// same length and one string ends with its number.
// e.g. "test500" and "test600t"
if (numcmpres != 0)
return numcmpres;
// if two strings are equal,
// the shorter one is considered smaller.
return (s1.length() - s1pos) + (s2pos - s2.length());
}
private int compareIgnoreCase(char c1, char c2)
{
// when comparing a alphanumeric character with a special
// character, the special character is always smaller.
if (!isLetterOrDigit(c1) && isLetterOrDigit(c2))
return -1;
if (isLetterOrDigit(c1) && !isLetterOrDigit(c2))
return 1;
// if its upper case, make it lower case
if (isUpperCase(c1))
c1 += 32;
if (isUpperCase(c2))
c2 += 32;
return c1 - c2;
}
private boolean isLetterOrDigit(char c)
{
return isDigit(c) || isUpperCase(c) || c > 96 && c < 123 /* lower case */;
}
private boolean isUpperCase(char c)
{
return c > 64 && c < 91;
}
private boolean isDigit(char c)
{
return c > 47 && c < 58;
}
}
package palantir;
import static org.junit.Assert.*;
import org.junit.Test;
import java.util.Arrays;
public class FileNameComparatorTest
{
private FileNameComparator fnc = new FileNameComparator();
@Test
public void testNumbers()
{
// basic properties
assertTrue(fnc.compare("test", "test") == 0);
assertTrue(fnc.compare("test1", "test2") < 0);
assertTrue(fnc.compare("test2", "test1") > 0);
assertTrue(fnc.compare("test2", "test3") < 0);
assertTrue(fnc.compare("test3", "test4") < 0);
assertTrue(fnc.compare("test2", "test4") < 0);
// numbers with different number of digits
assertTrue(fnc.compare("121", "12") > 0);
assertTrue(fnc.compare("1", "137") < 0);
assertTrue(fnc.compare("260790", "60790") > 0);
assertTrue(fnc.compare("1.2.3.4.5.6.7.8.10", "1.2.3.4.5.6.7.9.10") < 0);
// numbers with same number of digits
assertTrue(fnc.compare("120", "121") < 0);
assertTrue(fnc.compare("1024", "1023") > 0);
assertTrue(fnc.compare("t1025", "t1026") < 0);
assertTrue(fnc.compare("file512txt", "file511txt") > 0);
// large numbers
assertTrue(fnc.compare("file222232244629420445529739893461909967206666939096499764990979600.txt", "file222232244629420445529739893461909967206666939096499764990979600.txt") == 0);
// 250 digit number
assertTrue(fnc.compare("file5781538327977828897150909166778407659250458379645823062042492461576758526757490910073628008613977550546382774775570888130029763571528699574717583228939535960234464230882573615930384979100379102915657483866755371559811718767760594919456971354184113721",
"file6325427919960049066585247837578372385418559154923477553398129089734082978096069693032859784967901396775824948679013568274245239986849282715816927424255093730637896848500237375779410539868274303393510928400955586603945601202203906813552017713600173613")
< 0);
// leading zeros
assertTrue(fnc.compare("02.txt", "0001.txt") > 0);
assertTrue(fnc.compare("file000000100.txt", "file00100.txt") == 0);
assertTrue(fnc.compare("hello00012world0000000013.txt", "hello012world00014.txt") < 0);
// non leading zeros and shorter is considered smaller
assertTrue(fnc.compare("0", "0000000") < 0);
// examples from the problem definition
assertTrue(fnc.compare("1 2 10", "1 10 2") < 0);
assertTrue(fnc.compare("1210", "1102") > 0);
assertTrue(fnc.compare("1/5", "1/20") < 0);
assertTrue(fnc.compare("1&5", "1$20") != 0);
assertTrue(fnc.compare("1/00", "1/000") < 0);
assertTrue(fnc.compare("-5", "-4") > 0);
// null inputs.
// I just defined it this way, thinking that null
// is smaller than everything else, but one can define it however
assertTrue(fnc.compare(null, "") < 0);
assertTrue(fnc.compare("", null) > 0);
assertTrue(fnc.compare(null, null) == 0);
}
@Test
public void testCaseInsensitivity()
{
assertTrue(fnc.compare("FILE10.TXT", "file10.txt") == 0);
assertTrue(fnc.compare("hEllOWOrld.DoC", "HeLLowoRLD.dOc") == 0);
assertTrue(fnc.compare("A", "a") == 0);
}
@Test
public void testSpaces()
{
assertTrue(fnc.compare("New York", "Newmark") > 0);
assertTrue(fnc.compare("Hello World ", "HelloWorld") == 0);
assertTrue(fnc.compare(" H e l l o W or l d ", "Hello World") == 0);
}
@Test
public void testSorting()
{
// example from problem definition
String[] files = new String[] {"test2.txt", "test10.txt", "test1.txt"};
String[] expected = new String[] {"test1.txt", "test2.txt", "test10.txt"};
Arrays.sort(files, fnc);
assertArrayEquals(expected, files);
files = new String[] {"PICTURE2.JPG", "Picture1.jpg", "Text12.doc", "picture001.jpg", "Text13.doc", "Picture20.JPG"};
expected = new String[] {"Picture1.jpg", "picture001.jpg", "PICTURE2.JPG", "Picture20.JPG", "Text12.doc", "Text13.doc"};
Arrays.sort(files,fnc);
assertArrayEquals(expected, files);
files = new String[] {"File 20.jpeg", "File12.jpeg", "File1.jpeg", "File100.jpeg", "File3.jpeg", " File 5.jpeg"};
expected = new String[] {"File1.jpeg", "File3.jpeg", " File 5.jpeg", "File12.jpeg", "File 20.jpeg", "File100.jpeg"};
Arrays.sort(files, fnc);
assertArrayEquals(expected, files);
files = new String[] {"$HelloWorld.txt", "HelloWorld.txt", "Hello_World.txt", "$$HelloWorld.txt", "@ABC.doc",
"Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt", "[[[]]]!.doc", "1.docx",
"1.xlsx", "1.txt"};
expected = new String[] {"$$HelloWorld.txt", "$HelloWorld.txt", "@ABC.doc", "[[[]]]!.doc", "1.docx", "1.txt",
"1.xlsx", "Hello_World.txt", "Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt",
"HelloWorld.txt"};
Arrays.sort(files, fnc);
assertArrayEquals(expected, files);
files = new String[] {"WirelessKeyView.chm", "wirelesskeyview (1).zip", "wirelesskeyview.zip", "WirelessKeyView.cfg",
"WirelessKeyView.exe", "wirelesskeyview-x64.zip"};
expected = new String[] {"wirelesskeyview (1).zip", "wirelesskeyview-x64.zip", "WirelessKeyView.cfg", "WirelessKeyView.chm",
"WirelessKeyView.exe", "wirelesskeyview.zip"};
Arrays.sort(files, fnc);
assertArrayEquals(expected, files);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment