fire-eggs · July 17, 2020 01:02 · fire-eggs · Jul 15, 2020 · Albrecht-S · Jul 16, 2020
diff --git a/isDir_unittest.cpp b/isDir_unittest.cpp
 /*
 1. modify the const BASE to the drive letter you want to test on.
 1a. To build, the additional option "/utf-8" needs to be added to the compiler options.
    See project settings, compiler, "command line".
 2. execute the following commands in a console window at the root of said drive

 chcp 65001
 mkdir isdirtest
 cd isdirtest
 mkdir testǼ
 mkdir test♥
 echo. 2> file♥.txt
 echo. 2> file.txt
 cd test♥
 echo. 2> file♥.txt
 echo. 2> file.txt

 3. Run the program.
 */

 #define _CRT_SECURE_NO_WARNINGS

 #include <string>
 #include <windows.h>
 #include <fileapi.h> // GetFileAttributes
 #include <algorithm> // replace

 // Drive prefix that is prepended to every absolute path in testVals and that
 // main() changes into before running the tests. Edit this to the drive on
 // which the "isdirtest" fixture tree (see the header comment) was created.
 const std::string BASE = "E:";

 // Size, in bytes, of the scratch path buffer used by isdirORIG().
 #define FL_PATH_MAX 2048

 /*
  * Decodes the single UTF-8 sequence that starts at p and returns its value.
  *
  *   p    first byte of the sequence; it is always dereferenced.
  *   end  one past the last readable byte. When it is NULL the bounds checks
  *        on the continuation bytes are skipped and p[1]..p[3] are read as
  *        needed.
  *   len  when non-NULL, receives the number of bytes consumed (1 to 4).
  *
  * Every rejected sequence ends up at the FAIL label, which reports a length
  * of 1 and returns 0xfffd (or the raw lead byte if ERRORS_TO_ISO8859_1 is
  * non-zero). A caller that advances by *len therefore always makes progress.
  *
  * ERRORS_TO_CP1252, STRICT_RFC3629 and ERRORS_TO_ISO8859_1 are not defined in
  * the visible part of this file; unless they are supplied on the compiler
  * command line the preprocessor treats them as 0 and those sections are
  * compiled out. Note that the #if sections open and close braces of the
  * surrounding if/else chain, so the chain reads differently depending on
  * which macros are set.
  */
 unsigned fl_utf8decode(const char* p, const char* end, int* len)
 {
    /* Lead byte, read as unsigned so the range comparisons below are valid. */
    unsigned char c = *(const unsigned char*)p;
    if (c < 0x80) {
        /* Single-byte value: returned unchanged. */
        if (len) *len = 1;
        return c;
 #if ERRORS_TO_CP1252
    }
    else if (c < 0xa0) {
        /* Optional: map stray bytes 0x80..0x9f through a cp1252 table
           (the table itself is not declared in this file). */
        if (len) *len = 1;
        return cp1252[c - 0x80];
 #endif
    }
    else if (c < 0xc2) {
        /* Bytes below 0xc2 are not accepted as the start of a sequence. */
        goto FAIL;
    }
    /* Every remaining case needs a second byte of the form 10xxxxxx that
       lies before end (when end is given). */
    if ((end && p + 1 >= end) || (p[1] & 0xc0) != 0x80) goto FAIL;
    if (c < 0xe0) {
        /* Two-byte sequence: 5 payload bits + 6 payload bits. */
        if (len) *len = 2;
        return
            ((p[0] & 0x1f) << 6) +
            ((p[1] & 0x3f));
    }
    else if (c == 0xe0) {
        /* A second byte below 0xa0 would yield a value under 0x800, which
           fits in two bytes: reject the overlong form. */
        if (((const unsigned char*)p)[1] < 0xa0) goto FAIL;
        goto UTF8_3;
 #if STRICT_RFC3629
    }
    else if (c == 0xed) {
        /* RFC 3629 says surrogate chars are illegal. */
        if (((const unsigned char*)p)[1] >= 0xa0) goto FAIL;
        goto UTF8_3;
    }
    else if (c == 0xef) {
        /* 0xfffe and 0xffff are also illegal characters */
        /* NOTE(review): p[2] is read here before the bounds check at
           UTF8_3 has run. */
        if (((const unsigned char*)p)[1] == 0xbf &&
            ((const unsigned char*)p)[2] >= 0xbe) goto FAIL;
        goto UTF8_3;
 #endif
    }
    else if (c < 0xf0) {
    UTF8_3:
        /* Three-byte sequence: the third byte must exist and be 10xxxxxx. */
        if ((end && p + 2 >= end) || (p[2] & 0xc0) != 0x80) goto FAIL;
        if (len) *len = 3;
        return
            ((p[0] & 0x0f) << 12) +
            ((p[1] & 0x3f) << 6) +
            ((p[2] & 0x3f));
    }
    else if (c == 0xf0) {
        /* A second byte below 0x90 would yield a value under 0x10000, which
           fits in three bytes: reject the overlong form. */
        if (((const unsigned char*)p)[1] < 0x90) goto FAIL;
        goto UTF8_4;
    }
    else if (c < 0xf4) {
    UTF8_4:
        /* Four-byte sequence: third and fourth bytes must exist and be
           10xxxxxx. */
        if ((end && p + 3 >= end) || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80) goto FAIL;
        if (len) *len = 4;
 #if STRICT_RFC3629
        /* RFC 3629 says all codes ending in fffe or ffff are illegal: */
        /* (FAIL resets *len to 1, so the 4 stored above does not leak out.) */
        if ((p[1] & 0xf) == 0xf &&
            ((const unsigned char*)p)[2] == 0xbf &&
            ((const unsigned char*)p)[3] >= 0xbe) goto FAIL;
 #endif
        return
            ((p[0] & 0x07) << 18) +
            ((p[1] & 0x3f) << 12) +
            ((p[2] & 0x3f) << 6) +
            ((p[3] & 0x3f));
    }
    else if (c == 0xf4) {
        if (((const unsigned char*)p)[1] > 0x8f) goto FAIL; /* after 0x10ffff */
        goto UTF8_4;
    }
    else {
        /* Lead bytes 0xf5..0xff fall through to here; all the goto FAIL
           statements above land here as well. */
    FAIL:
        if (len) *len = 1;
 #if ERRORS_TO_ISO8859_1
        return c;
 #else
        return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
 #endif
    }
 }

 /*
  * Converts srclen bytes of UTF-8 at src into UTF-16 code units in dst.
  *
  *   dst     output buffer; it is not touched when dstlen is 0.
  *   dstlen  capacity of dst in 16-bit units, including room for the
  *           terminating 0.
  *
  * Returns the number of UTF-16 units the complete conversion requires, not
  * counting the terminator. If that number is smaller than dstlen the whole
  * string was written and 0-terminated; otherwise dst holds a 0-terminated
  * prefix and the return value tells the caller how much room is needed.
  * Values of 0x10000 and above are written as a surrogate pair.
  */
 unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
    unsigned short* dst, unsigned dstlen)
 {
    const char* cur = src;
    const char* const stop = src + srclen;
    unsigned units = 0;

    if (dstlen != 0) {
        while (true) {
            if (cur >= stop) {
                /* Everything fitted: terminate and report the length. */
                dst[units] = 0;
                return units;
            }

            if ((*cur & 0x80) == 0) {
                /* Plain ASCII byte. */
                dst[units] = *cur;
                ++cur;
            }
            else {
                int consumed;
                const unsigned codepoint = fl_utf8decode(cur, stop, &consumed);
                cur += consumed;

                if (codepoint >= 0x10000) {
                    /* Needs two units; stop filling if both plus the
                       terminator do not fit. */
                    if (units + 2 >= dstlen) {
                        dst[units] = 0;
                        units += 2;
                        break;
                    }
                    dst[units] = (((codepoint - 0x10000u) >> 10) & 0x3ff) | 0xd800;
                    ++units;
                    dst[units] = (codepoint & 0x3ff) | 0xdc00;
                }
                else {
                    dst[units] = codepoint;
                }
            }

            ++units;
            if (units == dstlen) {
                /* Buffer is full: the last slot becomes the terminator. */
                dst[units - 1] = 0;
                break;
            }
        }
    }

    /* Output is full (or absent): only count what the remainder needs. */
    while (cur < stop) {
        if ((*cur & 0x80) != 0) {
            int consumed;
            const unsigned codepoint = fl_utf8decode(cur, stop, &consumed);
            cur += consumed;
            units += (codepoint >= 0x10000) ? 2u : 1u;
        }
        else {
            ++cur;
            ++units;
        }
    }
    return units;
 }

 // Conversion buffer shared by the isdir* variants. It is grown with realloc()
 // on every conversion and is never freed by the code in this file.
 static wchar_t* wbuf = nullptr;

 // Converts a UTF-8 string to a 0-terminated wide string stored in `wbuf`.
 //
 //   utf8  source bytes.
 //   wbuf  buffer pointer to (re)allocate; updated in place. This parameter
 //         shadows the file-level variable of the same name.
 //   lg    number of source bytes to convert; a negative value means
 //         "use strlen(utf8)".
 //
 // Returns the (possibly moved) buffer. The result of realloc() is not
 // checked. The buffer is handed to the converter as unsigned short*, which
 // assumes wchar_t is 16 bits wide (NOTE(review): true on Windows; confirm
 // before building elsewhere).
 static wchar_t* utf8_to_wchar(const char* utf8, wchar_t*& wbuf, int lg = -1) {
    unsigned byteCount;
    if (lg < 0)
        byteCount = static_cast<unsigned>(strlen(utf8));
    else
        byteCount = static_cast<unsigned>(lg);

    // First pass with no output buffer only measures; +1 for the terminator.
    unsigned unitCount = fl_utf8toUtf16(utf8, byteCount, nullptr, 0) + 1;
    wbuf = static_cast<wchar_t*>(realloc(wbuf, unitCount * sizeof(wchar_t)));

    // Second pass performs the conversion into the resized buffer.
    unitCount = fl_utf8toUtf16(utf8, byteCount,
                               reinterpret_cast<unsigned short*>(wbuf), unitCount);
    wbuf[unitCount] = 0;
    return wbuf;
 }


 int isdirGFAW(const char* n)
 {
    utf8_to_wchar(n, wbuf);
    DWORD res = GetFileAttributesW(wbuf);
    return res != INVALID_FILE_ATTRIBUTES && (res & FILE_ATTRIBUTE_DIRECTORY);
 }

 // Returns 1 if c is a path separator ('/' or '\\'), otherwise 0.
 inline int isdirsep(char c)
 {
    switch (c) {
    case '/':
    case '\\':
        return 1;
    default:
        return 0;
    }
 }

 // Directory test based on the narrow-character _stat().
 //
 //   n  path to test. It is passed to _stat() as narrow bytes, without any
 //      conversion to wide characters.
 //
 // Before the call the path is normalized (only when it fits in the local
 // buffer):
 //   - a bare drive spec ("X:", "X:/" or "X:\") becomes "X:/";
 //   - otherwise one trailing separator is removed.
 //
 // Returns non-zero when _stat() succeeds and reports a directory, else 0.
 int isdirORIG(const char* n)
 {
    struct _stat s;
    char fn[FL_PATH_MAX];
    int length = (int)strlen(n);

    // Paths too long for fn are handed to _stat() untouched.
    if (length < (int)(sizeof(fn) - 1)) {
        // isalpha() requires a value representable as unsigned char; a plain
        // char holding a UTF-8 lead byte is negative, which is undefined
        // behavior (and asserts in the MSVC debug runtime).
        if (length < 4 && isalpha(static_cast<unsigned char>(n[0])) && n[1] == ':' &&
            (isdirsep(n[2]) || !n[2])) {
            // Always use X:/ for drive letters
            fn[0] = n[0];
            strcpy(fn + 1, ":/");
            n = fn;
        }
        else if (length > 0 && isdirsep(n[length - 1])) {
            // Strip trailing slash from name...
            length--;
            memcpy(fn, n, length);
            fn[length] = '\0';
            n = fn;
        }
    }
    return !_stat(n, &s) && (s.st_mode & _S_IFDIR);
 }

 // Directory test based on the wide-character _wstat().
 //
 //   n  UTF-8 path to test.
 //
 // One trailing separator is ignored; a path that is empty afterwards yields
 // 0. A bare drive spec ("X:") is replaced by "X:/". The resulting bytes are
 // converted to wide characters (into the shared wbuf) and handed to _wstat().
 //
 // Returns non-zero when _wstat() succeeds and reports a directory, else 0.
 int isdirWStat(const char* n)
 {
    char fn[4]; // used for drive letter only: "X:/"
    int length = (int)strlen(n);

    // Strip trailing slash from name...
    if (length > 0 && isdirsep(n[length - 1]))
        length--;
    if (length < 1)
        return 0;

    // isalpha() requires a value representable as unsigned char; a plain char
    // holding a UTF-8 lead byte is negative, which is undefined behavior (and
    // asserts in the MSVC debug runtime).
    if (length == 2 && isalpha(static_cast<unsigned char>(n[0])) && n[1] == ':') {
        fn[0] = n[0];
        strcpy(fn + 1, ":/");
        n = fn;
        length = 3;
    }

    // Convert exactly `length` bytes so a stripped separator stays excluded.
    utf8_to_wchar(n, wbuf, length);

    struct _stat s;
    return (!_wstat(wbuf, &s) && (s.st_mode & _S_IFDIR));
 }

 // Test table: each entry pairs a path with the expected isdir result
 // (1 = directory, 0 = not a directory).
 // NOTE: write these paths with the '/' separator only. testBothSlashes()
 // runs every entry a second time with each '/' replaced by a backslash.
 std::pair<std::string, int> testVals [] =
 {
    // drive root, filesystem root and relative directories
    {BASE, 1},
    {BASE + "/", 1},
    {"/", 1},
    {".", 1},
    {"..", 1},

    // variations on an ASCII folder
    {BASE + "/isdirtest", 1},
    {BASE + "/isdirtest/", 1},
    {BASE + "/isdirtest/.", 1},
    {BASE + "/isdirtest/..", 1},
    {BASE + "/isdirtest/./", 1},
    {BASE + "/isdirtest/../", 1},

    // variations on a non-ASCII folder (U+01FC, a two-byte UTF-8 sequence)
    {BASE + "/isdirtest/testǼ", 1},
    {BASE + "/isdirtest/testǼ/", 1},
    {BASE + "/isdirtest/testǼ/.", 1},
    {BASE + "/isdirtest/testǼ/..", 1},
    {BASE + "/isdirtest/testǼ/./", 1},
    {BASE + "/isdirtest/testǼ/../", 1},

    // variations on another non-ASCII folder (U+2665, a three-byte UTF-8 sequence)
    {BASE + "/isdirtest/test♥", 1},
    {BASE + "/isdirtest/test♥/", 1},
    {BASE + "/isdirtest/test♥/.", 1},
    {BASE + "/isdirtest/test♥/..", 1},
    {BASE + "/isdirtest/test♥/./", 1},
    {BASE + "/isdirtest/test♥/../", 1},

    // files within an ASCII folder
    {BASE + "/isdirtest/file.txt", 0},
    {BASE + "/isdirtest/file♥.txt", 0},

    // files within a non-ASCII folder
    {BASE + "/isdirtest/test♥/file.txt", 0},
    {BASE + "/isdirtest/test♥/file♥.txt", 0},
 };

 // Runs all three isdir implementations on `name` and prints one line for
 // each implementation whose result differs from `expect`. Prints nothing
 // when every implementation agrees with the expectation.
 void testOne(const char* name, int expect)
 {
    const bool origMatches = (isdirORIG(name) == expect);
    if (!origMatches) {
        printf("isdirORIG fail: %s\n", name);
    }

    const bool gfawMatches = (isdirGFAW(name) == expect);
    if (!gfawMatches) {
        printf("isdirGFAW fail: %s\n", name);
    }

    const bool wstatMatches = (isdirWStat(name) == expect);
    if (!wstatMatches) {
        printf("isdirWStat fail: %s\n", name);
    }
 }

 // Tests `name` as given and, if it contains at least one '/', tests it again
 // with every '/' replaced by a backslash.
 //
 //   name    path to test; taken by value because it is modified in place.
 //   expect  expected isdir result, forwarded to testOne().
 void testBothSlashes(std::string name, int expect)
 {
    testOne(name.c_str(), expect);

    // Nothing to convert: the backslash variant would be the same string.
    if (name.find('/') == std::string::npos)
        return;

    std::replace(name.begin(), name.end(), '/', '\\');
    testOne(name.c_str(), expect);
 }

 int main()
 {
    SetConsoleOutputCP(65001); // make sure we can display unicode

    SetCurrentDirectory((BASE + "/").c_str());
    char buf[2048];
    GetCurrentDirectory(2048, buf);
    printf("cwd = %s\n", buf);

    int count = sizeof(testVals) / sizeof(testVals[0]);
    for (int i = 0; i < count; i++)
    {
        auto name = testVals[i].first;
        int expect = testVals[i].second;
        testBothSlashes(name, expect);
    }
 }
	/*
	1. modify the const BASE to the drive letter you want to test on.
	1a. To build, the additional option "/utf-8" needs to be added to the compiler options.
	See project settings, compiler, "command line".
	2. execute the following commands in a console window at the root of said drive

	chcp 65001
	mkdir isdirtest
	cd isdirtest
	mkdir testǼ
	mkdir test♥
	echo. 2> file♥.txt
	echo. 2> file.txt
	cd test♥
	echo. 2> file♥.txt
	echo. 2> file.txt

	3. Run the program.
	*/

	#define _CRT_SECURE_NO_WARNINGS

	#include <string>
	#include <windows.h>
	#include <fileapi.h> // GetFileAttributes
	#include <algorithm> // replace

	const std::string BASE = "E:";

	#define FL_PATH_MAX 2048

	unsigned fl_utf8decode(const char* p, const char* end, int* len)
	{
	unsigned char c = *(const unsigned char*)p;
	if (c < 0x80) {
	if (len) *len = 1;
	return c;
	#if ERRORS_TO_CP1252
	}
	else if (c < 0xa0) {
	if (len) *len = 1;
	return cp1252[c - 0x80];
	#endif
	}
	else if (c < 0xc2) {
	goto FAIL;
	}
	if ((end && p + 1 >= end) || (p[1] & 0xc0) != 0x80) goto FAIL;
	if (c < 0xe0) {
	if (len) *len = 2;
	return
	((p[0] & 0x1f) << 6) +
	((p[1] & 0x3f));
	}
	else if (c == 0xe0) {
	if (((const unsigned char*)p)[1] < 0xa0) goto FAIL;
	goto UTF8_3;
	#if STRICT_RFC3629
	}
	else if (c == 0xed) {
	/* RFC 3629 says surrogate chars are illegal. */
	if (((const unsigned char*)p)[1] >= 0xa0) goto FAIL;
	goto UTF8_3;
	}
	else if (c == 0xef) {
	/* 0xfffe and 0xffff are also illegal characters */
	if (((const unsigned char*)p)[1] == 0xbf &&
	((const unsigned char*)p)[2] >= 0xbe) goto FAIL;
	goto UTF8_3;
	#endif
	}
	else if (c < 0xf0) {
	UTF8_3:
	if ((end && p + 2 >= end) || (p[2] & 0xc0) != 0x80) goto FAIL;
	if (len) *len = 3;
	return
	((p[0] & 0x0f) << 12) +
	((p[1] & 0x3f) << 6) +
	((p[2] & 0x3f));
	}
	else if (c == 0xf0) {
	if (((const unsigned char*)p)[1] < 0x90) goto FAIL;
	goto UTF8_4;
	}
	else if (c < 0xf4) {
	UTF8_4:
	if ((end && p + 3 >= end) || (p[2] & 0xc0) != 0x80 || (p[3] & 0xc0) != 0x80) goto FAIL;
	if (len) *len = 4;
	#if STRICT_RFC3629
	/* RFC 3629 says all codes ending in fffe or ffff are illegal: */
	if ((p[1] & 0xf) == 0xf &&
	((const unsigned char*)p)[2] == 0xbf &&
	((const unsigned char*)p)[3] >= 0xbe) goto FAIL;
	#endif
	return
	((p[0] & 0x07) << 18) +
	((p[1] & 0x3f) << 12) +
	((p[2] & 0x3f) << 6) +
	((p[3] & 0x3f));
	}
	else if (c == 0xf4) {
	if (((const unsigned char*)p)[1] > 0x8f) goto FAIL; /* after 0x10ffff */
	goto UTF8_4;
	}
	else {
	FAIL:
	if (len) *len = 1;
	#if ERRORS_TO_ISO8859_1
	return c;
	#else
	return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
	#endif
	}
	}

	unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
	unsigned short* dst, unsigned dstlen)
	{
	const char* p = src;
	const char* e = src + srclen;
	unsigned count = 0;
	if (dstlen) for (;;) {
	if (p >= e) { dst[count] = 0; return count; }
	if (!(*p & 0x80)) { /* ascii */
	dst[count] = *p++;
	}
	else {
	int len; unsigned ucs = fl_utf8decode(p, e, &len);
	p += len;
	if (ucs < 0x10000) {
	dst[count] = ucs;
	}
	else {
	/* make a surrogate pair: */
	if (count + 2 >= dstlen) { dst[count] = 0; count += 2; break; }
	dst[count] = (((ucs - 0x10000u) >> 10) & 0x3ff) | 0xd800;
	dst[++count] = (ucs & 0x3ff) | 0xdc00;
	}
	}
	if (++count == dstlen) { dst[count - 1] = 0; break; }
	}
	/* we filled dst, measure the rest: */
	while (p < e) {
	if (!(*p & 0x80)) p++;
	else {
	int len; unsigned ucs = fl_utf8decode(p, e, &len);
	p += len;
	if (ucs >= 0x10000) ++count;
	}
	++count;
	}
	return count;
	}

	static wchar_t* wbuf = NULL;

	static wchar_t* utf8_to_wchar(const char* utf8, wchar_t*& wbuf, int lg = -1) {
	unsigned len = (lg >= 0) ? (unsigned)lg : (unsigned)strlen(utf8);
	unsigned wn = fl_utf8toUtf16(utf8, len, NULL, 0) + 1; // Query length
	wbuf = (wchar_t*)realloc(wbuf, sizeof(wchar_t) * wn);
	wn = fl_utf8toUtf16(utf8, len, (unsigned short*)wbuf, wn); // Convert string
	wbuf[wn] = 0;
	return wbuf;
	}


	int isdirGFAW(const char* n)
	{
	utf8_to_wchar(n, wbuf);
	DWORD res = GetFileAttributesW(wbuf);
	return res != INVALID_FILE_ATTRIBUTES && (res & FILE_ATTRIBUTE_DIRECTORY);
	}

	inline int isdirsep(char c) { return c == '/' || c == '\\'; }

	int isdirORIG(const char* n)
	{
	struct _stat s;
	char fn[FL_PATH_MAX];
	int length;
	length = (int)strlen(n);
	// This workaround brought to you by the fine folks at Microsoft!
	// (read lots of sarcasm in that...)
	if (length < (int)(sizeof(fn) - 1)) {
	if (length < 4 && isalpha(n[0]) && n[1] == ':' &&
	(isdirsep(n[2]) || !n[2])) {
	// Always use D:/ for drive letters
	fn[0] = n[0];
	strcpy(fn + 1, ":/");
	n = fn;
	}
	else if (length > 0 && isdirsep(n[length - 1])) {
	// Strip trailing slash from name...
	length--;
	memcpy(fn, n, length);
	fn[length] = '\0';
	n = fn;
	}
	}
	return !_stat(n, &s) && (s.st_mode & _S_IFDIR);
	}

	int isdirWStat(const char* n)
	{
	char fn[4]; // used for drive letter only: "X:/"
	int length = (int)strlen(n);
	// Strip trailing slash from name...
	if (length > 0 && isdirsep(n[length - 1]))
	length--;
	if (length < 1)
	return 0;

	// This workaround brought to you by the fine folks at Microsoft!
	// (read lots of sarcasm in that...)

	if (length == 2 && isalpha(n[0]) && n[1] == ':') {
	fn[0] = n[0];
	strcpy(fn + 1, ":/");
	n = fn;
	length = 3;
	}

	// convert filename to wide chars using *length*
	utf8_to_wchar(n, wbuf, length);

	struct _stat s;
	return (!_wstat(wbuf, &s) && (s.st_mode & _S_IFDIR));
	}

	// NOTE: these paths should only use the '/' separator. The test will
	// repeat test each path using the backslash separator.
	std::pair<std::string, int> testVals [] =
	{
	{BASE, 1},
	{BASE + "/", 1},
	{"/", 1},
	{".", 1},
	{"..", 1},

	// variations on an ASCII folder
	{BASE + "/isdirtest", 1},
	{BASE + "/isdirtest/", 1},
	{BASE + "/isdirtest/.", 1},
	{BASE + "/isdirtest/..", 1},
	{BASE + "/isdirtest/./", 1},
	{BASE + "/isdirtest/../", 1},

	// variations on a UTF-8 folder
	{BASE + "/isdirtest/testǼ", 1},
	{BASE + "/isdirtest/testǼ/", 1},
	{BASE + "/isdirtest/testǼ/.", 1},
	{BASE + "/isdirtest/testǼ/..", 1},
	{BASE + "/isdirtest/testǼ/./", 1},
	{BASE + "/isdirtest/testǼ/../", 1},

	// variations on a different UTF-8 folder [utf-8 value larger than 255]
	{BASE + "/isdirtest/test♥", 1},
	{BASE + "/isdirtest/test♥/", 1},
	{BASE + "/isdirtest/test♥/.", 1},
	{BASE + "/isdirtest/test♥/..", 1},
	{BASE + "/isdirtest/test♥/./", 1},
	{BASE + "/isdirtest/test♥/../", 1},

	// files within an ASCII folder
	{BASE + "/isdirtest/file.txt", 0},
	{BASE + "/isdirtest/file♥.txt", 0},

	// files within a UTF-8 folder.
	{BASE + "/isdirtest/test♥/file.txt", 0},
	{BASE + "/isdirtest/test♥/file♥.txt", 0},
	};

	void testOne(const char* name, int expect)
	{
	if (isdirORIG(name) != expect)
	printf("isdirORIG fail: %s\n", name);
	if (isdirGFAW(name) != expect)
	printf("isdirGFAW fail: %s\n", name);
	if (isdirWStat(name) != expect)
	printf("isdirWStat fail: %s\n", name);
	}

	void testBothSlashes(std::string name, int expect)
	{
	testOne(name.c_str(), expect);
	const char* name2 = name.c_str();

	if (name.find('/', 0) != std::string::npos)
	{
	std::replace(name.begin(), name.end(), '/', '\\');
	testOne(name.c_str(), expect);
	}
	}

	int main()
	{
	SetConsoleOutputCP(65001); // make sure we can display unicode

	SetCurrentDirectory((BASE + "/").c_str());
	char buf[2048];
	GetCurrentDirectory(2048, buf);
	printf("cwd = %s\n", buf);

	int count = sizeof(testVals) / sizeof(testVals[0]);
	for (int i = 0; i < count; i++)
	{
	auto name = testVals[i].first;
	int expect = testVals[i].second;
	testBothSlashes(name, expect);
	}
	}