buchgr · August 29, 2015 13:57
diff --git a/FileNameComparator.java b/FileNameComparator.java
 package palantir;

 import java.util.Comparator;

 public class FileNameComparator implements Comparator<String>
 {
    /*
     * DESCRIPTION:
     * Numbers in the string are compared by their natural order
     * and not their ASCII order. (2 < 10)
     *
     * The comparison is case insensitive.
     *
     * Spaces are ignored completely, to make ordering of cases
     * like "New York" and "Newmark" more intuitive.
     *
     * All the non alphanumeric characters are considered smaller
     * than the alphanumeric characters. The order between the non
     * alphanumeric characters is still defined by their ASCII code
     * tough.
     *
     * If two strings are equal according to the above criteria,
     * shorter strings (ignoring whitespaces and leading zeros) are
     * considered smaller.
     *
     * The algorithm runs in one pass and constant space.
     *
     * POSSIBLE IMPROVEMENTS:
     * I believe there is two main "paths" to improve the algorithm:
     *      1) The devil is in the detail and there are a lot of
     *         corner cases were "intuitive sorting" as performed
     *         by humans might lead to different results.
     *          -   Should file extensions be ignored or only be used
     *              in the case the strings are identical but of different filetype?
     *          -   Should paranthesis, braces and brackets be considered equal?
     *          -   If two strings are equal, should we look at their white spaces
     *              and leading zeros and define an order on them?
     *          -   Full Unicode Support
     *         There are probably several dozens of such cases, I believe a good
     *         way to handle them would be to closely look at/mimic the behaviour
     *         of Windows Explorer and Finder, as most people are used to their way
     *         of sorting and I am quite certain that Microsoft and Apple have done
     *         a lot of research/gathered lots of feedback to find the most intuitive
     *         sorting algorithm.
     *         Another way to improve things would be to perform testing with real
     *         people and ask them about the "intuitiveness" of some example sorts.
     *
     *      2) Depending on the application, the scenarios and additional information
     *          one has, it might be reasonable to apply e.g. NLP techniques to assign
     *          a special order to e.g. synonyms and maybe ignore words like "a", "the",
     *          etc.
     *          For a normal file browser this would probably lead to confusion, but for
     *          specialized apps it might make sense.
     */
    public int compare(String s1, String s2)
    {
        if (s1 == null && s2 != null)
            return -1;
        if (s1 != null && s2 == null)
            return 1;
        if (s1 == null && s2 == null)
            return 0;

        // this flag tells if the method is currently
        // comparing numbers "number compare mode"
        boolean numcmpmode = false;

        int s1pos = 0,
                s2pos = 0,
                numcmpres = 0,
                s1len = s1.length(),
                s2len = s2.length();

        while (s1pos < s1len && s2pos < s2len)
        {
            char c1 = s1.charAt(s1pos), c2 = s2.charAt(s2pos);

            // skip leading zeros
            if (c1 == '0' && !numcmpmode)
                s1pos++;
            else if (c2 == '0' && !numcmpmode)
                s2pos++;
            else
            {
                if (isDigit(c1) && isDigit(c2))
                {
                    if (!numcmpmode)
                        numcmpmode = true;

                    // find the first digit where the numbers mismatch
                    // this is only relevant if the numbers have the same number
                    // of digits.
                    if (numcmpres == 0)
                        numcmpres = compareIgnoreCase(c1, c2);

                    s1pos++;
                    s2pos++;
                }
                else
                {
                    if (numcmpmode)
                    {
                        // the number in s1 is longer
                        if (isDigit(c1))
                            return 1;
                        // the number in s2 is longer
                        if (isDigit(c2))
                            return -1;
                        // the numbers have the same length
                        if (numcmpres != 0)
                            return numcmpres;

                        // the numbers are equal
                        numcmpmode = false;
                    }

                    // ignore spaces
                    if (c1 == ' ')
                        s1pos++;
                    else if (c2 == ' ')
                        s2pos++;
                    else
                    {
                        int mismatch = compareIgnoreCase(c1, c2);
                        if (mismatch != 0)
                            return mismatch;

                        s1pos++;
                        s2pos++;
                    }
                }
            }
        }

        if (s1pos < s1len)
        {
            // s1 is longer than s2 and s2 ends with
            // a number. check if s1 contains the bigger number.
            if (numcmpmode && isDigit(s1.charAt(s1pos)))
                return 1;

            // ignore trailing whitespace
            while (s1pos < s1len && s1.charAt(s1pos) == ' ')
                s1pos++;
        }
        else if (s2pos < s2len)
        {
            if (numcmpmode && isDigit(s2.charAt(s2pos)))
                return -1;

            // ignore trailing whitespace
            while (s2pos < s2len && s2.charAt(s2pos) == ' ')
                s2pos++;
        }

        // the last numbers in the two strings have the
        // same length and one string ends with its number.
        // e.g. "test500" and "test600t"
        if (numcmpres != 0)
            return numcmpres;

        // if two strings are equal,
        // the shorter one is considered smaller.
        return (s1.length() - s1pos) + (s2pos - s2.length());
    }

    private int compareIgnoreCase(char c1, char c2)
    {
        // when comparing a alphanumeric character with a special
        // character, the special character is always smaller.
        if (!isLetterOrDigit(c1) && isLetterOrDigit(c2))
            return -1;
        if (isLetterOrDigit(c1) && !isLetterOrDigit(c2))
            return 1;

        // if its upper case, make it lower case
        if (isUpperCase(c1))
            c1 += 32;
        if (isUpperCase(c2))
            c2 += 32;

        return c1 - c2;
    }

    private boolean isLetterOrDigit(char c)
    {
        return isDigit(c) || isUpperCase(c) || c > 96 && c < 123 /* lower case */;
    }

    private boolean isUpperCase(char c)
    {
        return c > 64 && c < 91;
    }

    private boolean isDigit(char c)
    {
        return c > 47 && c < 58;
    }
 }
diff --git a/FileNameComparatorTest.java b/FileNameComparatorTest.java
 package palantir;

 import static org.junit.Assert.*;
 import org.junit.Test;
 import java.util.Arrays;

 public class FileNameComparatorTest
 {
    private FileNameComparator fnc = new FileNameComparator();

    @Test
    public void testNumbers()
    {
        // basic properties
        assertTrue(fnc.compare("test", "test") == 0);
        assertTrue(fnc.compare("test1", "test2") < 0);
        assertTrue(fnc.compare("test2", "test1") > 0);

        assertTrue(fnc.compare("test2", "test3") < 0);
        assertTrue(fnc.compare("test3", "test4") < 0);
        assertTrue(fnc.compare("test2", "test4") < 0);

        // numbers with different number of digits
        assertTrue(fnc.compare("121", "12") > 0);
        assertTrue(fnc.compare("1", "137") < 0);
        assertTrue(fnc.compare("260790", "60790") > 0);
        assertTrue(fnc.compare("1.2.3.4.5.6.7.8.10", "1.2.3.4.5.6.7.9.10") < 0);

        // numbers with same number of digits
        assertTrue(fnc.compare("120", "121") < 0);
        assertTrue(fnc.compare("1024", "1023") > 0);
        assertTrue(fnc.compare("t1025", "t1026") < 0);
        assertTrue(fnc.compare("file512txt", "file511txt") > 0);

        // large numbers
        assertTrue(fnc.compare("file222232244629420445529739893461909967206666939096499764990979600.txt", "file222232244629420445529739893461909967206666939096499764990979600.txt") == 0);
        // 250 digit number
        assertTrue(fnc.compare("file5781538327977828897150909166778407659250458379645823062042492461576758526757490910073628008613977550546382774775570888130029763571528699574717583228939535960234464230882573615930384979100379102915657483866755371559811718767760594919456971354184113721",
                                "file6325427919960049066585247837578372385418559154923477553398129089734082978096069693032859784967901396775824948679013568274245239986849282715816927424255093730637896848500237375779410539868274303393510928400955586603945601202203906813552017713600173613")
                                < 0);

        // leading zeros
        assertTrue(fnc.compare("02.txt", "0001.txt") > 0);
        assertTrue(fnc.compare("file000000100.txt", "file00100.txt") == 0);
        assertTrue(fnc.compare("hello00012world0000000013.txt", "hello012world00014.txt") < 0);
        // non leading zeros and shorter is considered smaller
        assertTrue(fnc.compare("0", "0000000") < 0);

        // examples from the problem definition
        assertTrue(fnc.compare("1 2 10", "1 10 2") < 0);
        assertTrue(fnc.compare("1210", "1102") > 0);
        assertTrue(fnc.compare("1/5", "1/20") < 0);
        assertTrue(fnc.compare("1&5", "1$20") != 0);
        assertTrue(fnc.compare("1/00", "1/000") < 0);
        assertTrue(fnc.compare("-5", "-4") > 0);

        // null inputs.
        // I just defined it this way, thinking that null
        // is smaller than everything else, but one can define it however
        assertTrue(fnc.compare(null, "") < 0);
        assertTrue(fnc.compare("", null) > 0);
        assertTrue(fnc.compare(null, null) == 0);
    }

    @Test
    public void testCaseInsensitivity()
    {
        assertTrue(fnc.compare("FILE10.TXT", "file10.txt") == 0);
        assertTrue(fnc.compare("hEllOWOrld.DoC", "HeLLowoRLD.dOc") == 0);
        assertTrue(fnc.compare("A", "a") == 0);
    }

    @Test
    public void testSpaces()
    {
        assertTrue(fnc.compare("New York", "Newmark") > 0);
        assertTrue(fnc.compare("Hello World ", "HelloWorld") == 0);
        assertTrue(fnc.compare(" H e l   l o    W  or  l d     ", "Hello World") == 0);
    }

    @Test
    public void testSorting()
    {
        // example from problem definition
        String[] files = new String[] {"test2.txt", "test10.txt", "test1.txt"};
        String[] expected = new String[] {"test1.txt", "test2.txt", "test10.txt"};

        Arrays.sort(files, fnc);
        assertArrayEquals(expected, files);

        files = new String[] {"PICTURE2.JPG", "Picture1.jpg", "Text12.doc", "picture001.jpg", "Text13.doc", "Picture20.JPG"};
        expected = new String[] {"Picture1.jpg", "picture001.jpg", "PICTURE2.JPG", "Picture20.JPG", "Text12.doc", "Text13.doc"};

        Arrays.sort(files,fnc);
        assertArrayEquals(expected, files);


        files = new String[] {"File 20.jpeg", "File12.jpeg", "File1.jpeg", "File100.jpeg", "File3.jpeg", " File  5.jpeg"};
        expected = new String[] {"File1.jpeg",  "File3.jpeg",  " File  5.jpeg",  "File12.jpeg", "File 20.jpeg",  "File100.jpeg"};

        Arrays.sort(files, fnc);
        assertArrayEquals(expected, files);


        files = new String[] {"$HelloWorld.txt", "HelloWorld.txt", "Hello_World.txt", "$$HelloWorld.txt", "@ABC.doc",
                                "Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt", "[[[]]]!.doc", "1.docx",
                                "1.xlsx", "1.txt"};
        expected = new String[] {"$$HelloWorld.txt", "$HelloWorld.txt", "@ABC.doc", "[[[]]]!.doc", "1.docx", "1.txt",
                                    "1.xlsx", "Hello_World.txt", "Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt",
                                    "HelloWorld.txt"};

        Arrays.sort(files, fnc);
        assertArrayEquals(expected, files);

        files = new String[] {"WirelessKeyView.chm", "wirelesskeyview (1).zip", "wirelesskeyview.zip", "WirelessKeyView.cfg",
                                "WirelessKeyView.exe", "wirelesskeyview-x64.zip"};

        expected = new String[] {"wirelesskeyview (1).zip", "wirelesskeyview-x64.zip", "WirelessKeyView.cfg", "WirelessKeyView.chm",
                                    "WirelessKeyView.exe", "wirelesskeyview.zip"};
        Arrays.sort(files, fnc);
        assertArrayEquals(expected, files);
    }
 }
	package palantir;

	import java.util.Comparator;

	public class FileNameComparator implements Comparator<String>
	{
	/*
	* DESCRIPTION:
	* Numbers in the string are compared by their natural order
	* and not their ASCII order. (2 < 10)
	*
	* The comparison is case insensitive.
	*
	* Spaces are ignored completely, to make ordering of cases
	* like "New York" and "Newmark" more intuitive.
	*
	* All the non alphanumeric characters are considered smaller
	* than the alphanumeric characters. The order between the non
	* alphanumeric characters is still defined by their ASCII code
	* tough.
	*
	* If two strings are equal according to the above criteria,
	* shorter strings (ignoring whitespaces and leading zeros) are
	* considered smaller.
	*
	* The algorithm runs in one pass and constant space.
	*
	* POSSIBLE IMPROVEMENTS:
	* I believe there is two main "paths" to improve the algorithm:
	* 1) The devil is in the detail and there are a lot of
	* corner cases were "intuitive sorting" as performed
	* by humans might lead to different results.
	* - Should file extensions be ignored or only be used
	* in the case the strings are identical but of different filetype?
	* - Should paranthesis, braces and brackets be considered equal?
	* - If two strings are equal, should we look at their white spaces
	* and leading zeros and define an order on them?
	* - Full Unicode Support
	* There are probably several dozens of such cases, I believe a good
	* way to handle them would be to closely look at/mimic the behaviour
	* of Windows Explorer and Finder, as most people are used to their way
	* of sorting and I am quite certain that Microsoft and Apple have done
	* a lot of research/gathered lots of feedback to find the most intuitive
	* sorting algorithm.
	* Another way to improve things would be to perform testing with real
	* people and ask them about the "intuitiveness" of some example sorts.
	*
	* 2) Depending on the application, the scenarios and additional information
	* one has, it might be reasonable to apply e.g. NLP techniques to assign
	* a special order to e.g. synonyms and maybe ignore words like "a", "the",
	* etc.
	* For a normal file browser this would probably lead to confusion, but for
	* specialized apps it might make sense.
	*/
	public int compare(String s1, String s2)
	{
	if (s1 == null && s2 != null)
	return -1;
	if (s1 != null && s2 == null)
	return 1;
	if (s1 == null && s2 == null)
	return 0;

	// this flag tells if the method is currently
	// comparing numbers "number compare mode"
	boolean numcmpmode = false;

	int s1pos = 0,
	s2pos = 0,
	numcmpres = 0,
	s1len = s1.length(),
	s2len = s2.length();

	while (s1pos < s1len && s2pos < s2len)
	{
	char c1 = s1.charAt(s1pos), c2 = s2.charAt(s2pos);

	// skip leading zeros
	if (c1 == '0' && !numcmpmode)
	s1pos++;
	else if (c2 == '0' && !numcmpmode)
	s2pos++;
	else
	{
	if (isDigit(c1) && isDigit(c2))
	{
	if (!numcmpmode)
	numcmpmode = true;

	// find the first digit where the numbers mismatch
	// this is only relevant if the numbers have the same number
	// of digits.
	if (numcmpres == 0)
	numcmpres = compareIgnoreCase(c1, c2);

	s1pos++;
	s2pos++;
	}
	else
	{
	if (numcmpmode)
	{
	// the number in s1 is longer
	if (isDigit(c1))
	return 1;
	// the number in s2 is longer
	if (isDigit(c2))
	return -1;
	// the numbers have the same length
	if (numcmpres != 0)
	return numcmpres;

	// the numbers are equal
	numcmpmode = false;
	}

	// ignore spaces
	if (c1 == ' ')
	s1pos++;
	else if (c2 == ' ')
	s2pos++;
	else
	{
	int mismatch = compareIgnoreCase(c1, c2);
	if (mismatch != 0)
	return mismatch;

	s1pos++;
	s2pos++;
	}
	}
	}
	}

	if (s1pos < s1len)
	{
	// s1 is longer than s2 and s2 ends with
	// a number. check if s1 contains the bigger number.
	if (numcmpmode && isDigit(s1.charAt(s1pos)))
	return 1;

	// ignore trailing whitespace
	while (s1pos < s1len && s1.charAt(s1pos) == ' ')
	s1pos++;
	}
	else if (s2pos < s2len)
	{
	if (numcmpmode && isDigit(s2.charAt(s2pos)))
	return -1;

	// ignore trailing whitespace
	while (s2pos < s2len && s2.charAt(s2pos) == ' ')
	s2pos++;
	}

	// the last numbers in the two strings have the
	// same length and one string ends with its number.
	// e.g. "test500" and "test600t"
	if (numcmpres != 0)
	return numcmpres;

	// if two strings are equal,
	// the shorter one is considered smaller.
	return (s1.length() - s1pos) + (s2pos - s2.length());
	}

	private int compareIgnoreCase(char c1, char c2)
	{
	// when comparing a alphanumeric character with a special
	// character, the special character is always smaller.
	if (!isLetterOrDigit(c1) && isLetterOrDigit(c2))
	return -1;
	if (isLetterOrDigit(c1) && !isLetterOrDigit(c2))
	return 1;

	// if its upper case, make it lower case
	if (isUpperCase(c1))
	c1 += 32;
	if (isUpperCase(c2))
	c2 += 32;

	return c1 - c2;
	}

	private boolean isLetterOrDigit(char c)
	{
	return isDigit(c) \|\| isUpperCase(c) \|\| c > 96 && c < 123 /* lower case */;
	}

	private boolean isUpperCase(char c)
	{
	return c > 64 && c < 91;
	}

	private boolean isDigit(char c)
	{
	return c > 47 && c < 58;
	}
	}
	package palantir;

	import static org.junit.Assert.*;
	import org.junit.Test;
	import java.util.Arrays;

	public class FileNameComparatorTest
	{
	private FileNameComparator fnc = new FileNameComparator();

	@Test
	public void testNumbers()
	{
	// basic properties
	assertTrue(fnc.compare("test", "test") == 0);
	assertTrue(fnc.compare("test1", "test2") < 0);
	assertTrue(fnc.compare("test2", "test1") > 0);

	assertTrue(fnc.compare("test2", "test3") < 0);
	assertTrue(fnc.compare("test3", "test4") < 0);
	assertTrue(fnc.compare("test2", "test4") < 0);

	// numbers with different number of digits
	assertTrue(fnc.compare("121", "12") > 0);
	assertTrue(fnc.compare("1", "137") < 0);
	assertTrue(fnc.compare("260790", "60790") > 0);
	assertTrue(fnc.compare("1.2.3.4.5.6.7.8.10", "1.2.3.4.5.6.7.9.10") < 0);

	// numbers with same number of digits
	assertTrue(fnc.compare("120", "121") < 0);
	assertTrue(fnc.compare("1024", "1023") > 0);
	assertTrue(fnc.compare("t1025", "t1026") < 0);
	assertTrue(fnc.compare("file512txt", "file511txt") > 0);

	// large numbers
	assertTrue(fnc.compare("file222232244629420445529739893461909967206666939096499764990979600.txt", "file222232244629420445529739893461909967206666939096499764990979600.txt") == 0);
	// 250 digit number
	assertTrue(fnc.compare("file5781538327977828897150909166778407659250458379645823062042492461576758526757490910073628008613977550546382774775570888130029763571528699574717583228939535960234464230882573615930384979100379102915657483866755371559811718767760594919456971354184113721",
	"file6325427919960049066585247837578372385418559154923477553398129089734082978096069693032859784967901396775824948679013568274245239986849282715816927424255093730637896848500237375779410539868274303393510928400955586603945601202203906813552017713600173613")
	< 0);

	// leading zeros
	assertTrue(fnc.compare("02.txt", "0001.txt") > 0);
	assertTrue(fnc.compare("file000000100.txt", "file00100.txt") == 0);
	assertTrue(fnc.compare("hello00012world0000000013.txt", "hello012world00014.txt") < 0);
	// non leading zeros and shorter is considered smaller
	assertTrue(fnc.compare("0", "0000000") < 0);

	// examples from the problem definition
	assertTrue(fnc.compare("1 2 10", "1 10 2") < 0);
	assertTrue(fnc.compare("1210", "1102") > 0);
	assertTrue(fnc.compare("1/5", "1/20") < 0);
	assertTrue(fnc.compare("1&5", "1$20") != 0);
	assertTrue(fnc.compare("1/00", "1/000") < 0);
	assertTrue(fnc.compare("-5", "-4") > 0);

	// null inputs.
	// I just defined it this way, thinking that null
	// is smaller than everything else, but one can define it however
	assertTrue(fnc.compare(null, "") < 0);
	assertTrue(fnc.compare("", null) > 0);
	assertTrue(fnc.compare(null, null) == 0);
	}

	@Test
	public void testCaseInsensitivity()
	{
	assertTrue(fnc.compare("FILE10.TXT", "file10.txt") == 0);
	assertTrue(fnc.compare("hEllOWOrld.DoC", "HeLLowoRLD.dOc") == 0);
	assertTrue(fnc.compare("A", "a") == 0);
	}

	@Test
	public void testSpaces()
	{
	assertTrue(fnc.compare("New York", "Newmark") > 0);
	assertTrue(fnc.compare("Hello World ", "HelloWorld") == 0);
	assertTrue(fnc.compare(" H e l l o W or l d ", "Hello World") == 0);
	}

	@Test
	public void testSorting()
	{
	// example from problem definition
	String[] files = new String[] {"test2.txt", "test10.txt", "test1.txt"};
	String[] expected = new String[] {"test1.txt", "test2.txt", "test10.txt"};

	Arrays.sort(files, fnc);
	assertArrayEquals(expected, files);

	files = new String[] {"PICTURE2.JPG", "Picture1.jpg", "Text12.doc", "picture001.jpg", "Text13.doc", "Picture20.JPG"};
	expected = new String[] {"Picture1.jpg", "picture001.jpg", "PICTURE2.JPG", "Picture20.JPG", "Text12.doc", "Text13.doc"};

	Arrays.sort(files,fnc);
	assertArrayEquals(expected, files);


	files = new String[] {"File 20.jpeg", "File12.jpeg", "File1.jpeg", "File100.jpeg", "File3.jpeg", " File 5.jpeg"};
	expected = new String[] {"File1.jpeg", "File3.jpeg", " File 5.jpeg", "File12.jpeg", "File 20.jpeg", "File100.jpeg"};

	Arrays.sort(files, fnc);
	assertArrayEquals(expected, files);


	files = new String[] {"$HelloWorld.txt", "HelloWorld.txt", "Hello_World.txt", "$$HelloWorld.txt", "@ABC.doc",
	"Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt", "[[[]]]!.doc", "1.docx",
	"1.xlsx", "1.txt"};
	expected = new String[] {"$$HelloWorld.txt", "$HelloWorld.txt", "@ABC.doc", "[[[]]]!.doc", "1.docx", "1.txt",
	"1.xlsx", "Hello_World.txt", "Hello_World1.txt", "Hello_World2.txt", "Hello_World10.txt",
	"HelloWorld.txt"};

	Arrays.sort(files, fnc);
	assertArrayEquals(expected, files);

	files = new String[] {"WirelessKeyView.chm", "wirelesskeyview (1).zip", "wirelesskeyview.zip", "WirelessKeyView.cfg",
	"WirelessKeyView.exe", "wirelesskeyview-x64.zip"};

	expected = new String[] {"wirelesskeyview (1).zip", "wirelesskeyview-x64.zip", "WirelessKeyView.cfg", "WirelessKeyView.chm",
	"WirelessKeyView.exe", "wirelesskeyview.zip"};
	Arrays.sort(files, fnc);
	assertArrayEquals(expected, files);
	}
	}