leduyquang753 · December 27, 2021 14:34
diff --git a/identifierValidator.cpp b/identifierValidator.cpp
 /*
 	Unicode identifier validator.
 	Needs two data files generated by processDatabase.cpp.
 */

 #include <array>
 #include <clocale>
 #include <cstdlib>
 #include <fstream>
 #include <ios>
 #include <iostream>
 #include <locale>
 #include <memory>
 #include <string>

 // Unicode-enabling code adapted from https://stackoverflow.com/a/48180107

 // Boilerplate feature-test macros.
 #if _WIN32 || _WIN64
 	#define _WIN32_WINNT 0x0A00 // _WIN32_WINNT_WIN10
 	#define NTDDI_VERSION 0x0A000002 // NTDDI_WIN10_RS1
 	#include <sdkddkver.h>
 #else
 	#define _XOPEN_SOURCE 700
 	#define _POSIX_C_SOURCE 200809L
 #endif
 #ifndef MS_STDLIB_BUGS // Allow overriding the autodetection.
 	/*
 		The Microsoft C and C++ runtime libraries that ship with Visual Studio,
 		as of 2017, have a bug that neither stdio, iostreams or wide iostreams
 		can handle Unicode input or output.  Windows needs some non-standard
 		magic to work around that.  This includes programs compiled with MinGW
 		and Clang for the win32 and win64 targets. NOTE TO USERS OF TDM-GCC:
 		This code is known to break on tdm-gcc 4.9.2. As a workaround,
 		`-D MS_STDLIB_BUGS=0` will at least get it to compile, but Unicode
 		output will still not work.
 	*/
 	#if (_MSC_VER || __MINGW32__ || __MSVCRT__)
 		/*
 			This code is being compiled either on MS Visual C++, or MinGW, or
 			clang++ in compatibility mode for either, or is being linked to the
 			msvcrt (Microsoft Visual C RunTime) library.
 		 */
 		#define MS_STDLIB_BUGS 1
 	#else
 		#define MS_STDLIB_BUGS 0
 	#endif
 #endif

 #if MS_STDLIB_BUGS
 	#include <io.h>
 	#include <fcntl.h>
 #endif

 using namespace std::string_literals;

 /*
 	Configures the locale and the standard wide streams so that wcin and
 	wcout can carry Unicode text. Takes no arguments and returns nothing.
 */
 void initializeUnicodeIO() {
 	#if MS_STDLIB_BUGS
 		// Microsoft runtimes: select code page 1200 and switch both
 		// standard streams to wide-text mode.
 		setlocale(LC_ALL, ".1200");
 		_setmode(_fileno(stdout), _O_WTEXT);
 		_setmode(_fileno(stdin), _O_WTEXT);
 	#else
 		// The empty name asks for the locale configured in the environment;
 		// the concrete name differs between systems (e.g. "en_US.utf8").
 		const char *const environmentLocale = "";
 		setlocale(LC_ALL, environmentLocale);
 		std::locale::global(std::locale(environmentLocale));
 		// A default-constructed locale is a copy of the global one set above.
 		const std::locale active;
 		std::wcout.imbue(active);
 		std::wcin.imbue(active);
 	#endif
 }

 // Bitmap holding one bit for each code point from U+0000 through U+10FFFF.
 using DataArray = std::array<unsigned char, 0x110000/8>;
 // Bitmap consulted for the first character of an identifier.
 DataArray startData;
 // Bitmap consulted for every character after the first.
 DataArray continueData;

 /*
 	Reads a bitmap file into `data`.
 	fileName: path of the binary file to read.
 	data: destination table; its first data.size() bytes are overwritten with
 	the start of the file.
 	Throws std::ios_base::failure when the file cannot be opened or holds
 	fewer bytes than the table. Without this check a missing file would
 	leave the table zeroed and every identifier would be reported invalid.
 */
 void loadData(const std::string &fileName, DataArray &data) {
 	std::ifstream file(fileName, std::ios::in | std::ios::binary);
 	if (!file) {
 		throw std::ios_base::failure("Cannot open data file: " + fileName);
 	}
 	file.read(
 		reinterpret_cast<char*>(data.data()),
 		static_cast<std::streamsize>(data.size())
 	);
 	// A short read sets failbit, so this also catches truncated files.
 	if (!file) {
 		throw std::ios_base::failure("Data file is too short: " + fileName);
 	}
 }

 bool isValidName(const std::wstring &name) {
 	bool isFirst = true, waiting = false;
 	unsigned long codePoint;
 	// The string is encoded as UTF-16.
 	for (const wchar_t c : name) {
 		if (waiting) { // Low surrogate.
 			if ((c & 0xDC00) != 0xDC00) return false;
 			codePoint = ((codePoint << 10) | (c & 0x03FF)) + 0x10000;
 			waiting = false;
 		} else {
 			if ((c & 0xDC00) == 0xD800) { // High surrogate.
 				codePoint = c & 0x03FF;
 				waiting = true;
 			} else { // Low code point.
 				codePoint = c;
 			}
 		}
 		if (!waiting) {
 			if ((
 				(isFirst ? startData : continueData)[codePoint/8]
 				& (1 << (7-codePoint%8))
 			) == 0) return false;
 			isFirst = false;
 		}
 	}
 	return !waiting;
 }

 // Writes "Valid." or "Invalid." to wcout according to isValidName(name).
 void validate(const std::wstring &name) {
 	if (isValidName(name)) {
 		std::wcout << L"Valid.\n";
 	} else {
 		std::wcout << L"Invalid.\n";
 	}
 }

 /*
 	Sets up Unicode I/O, loads both bitmaps, then validates each line read
 	from wcin until an empty line or the end of input is reached.
 */
 int main() {
 	initializeUnicodeIO();
 	loadData("XID_Start.dat", startData);
 	loadData("XID_Continue.dat", continueData);
 	std::wcout <<
 		L"Enter any string to see whether it is a valid identifier. "
 		L"Enter nothing to exit.\n";
 	// A failed read leaves nothing to validate, so it ends the loop just
 	// like an empty line does.
 	for (
 		std::wstring line;
 		std::getline(std::wcin, line) && !line.empty();
 	) {
 		validate(line);
 	}
 }
diff --git a/processDatabase.cpp b/processDatabase.cpp
 /*
 	Process data from the Unicode character database into data files for the
 	identifier validator.
 	Get the database from
 	https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
 	then copy the data lines for the property XID_Start into XID_Start.txt and
 	do similarly for XID_Continue, then run this program and the identifier
 	validator can be run.
 */

 #include <array>
 #include <cstring>
 #include <fstream>
 #include <ios>
 #include <limits>
 #include <string>

 /*
 	Parses an unprefixed hexadecimal number such as "1F600".
 	input: the digits; upper- and lowercase letters are both accepted.
 	Returns the parsed value. Parsing stops at the first character that is
 	not a hexadecimal digit, so an empty or non-numeric string yields 0.
 */
 unsigned long parseCodePoint(const std::string &input) {
 	unsigned long res = 0;
 	for (const char c : input) {
 		unsigned long digit;
 		if (c >= '0' && c <= '9') {
 			digit = static_cast<unsigned long>(c - '0');
 		} else if (c >= 'A' && c <= 'F') {
 			digit = static_cast<unsigned long>(c - 'A' + 10);
 		} else if (c >= 'a' && c <= 'f') {
 			digit = static_cast<unsigned long>(c - 'a' + 10);
 		} else {
 			break; // Not a hex digit: keep what has been parsed so far.
 		}
 		res = res<<4 | digit;
 	}
 	return res;
 }

 // Working bitmap filled and written out by process(): one bit for each
 // code point from U+0000 through U+10FFFF.
 std::array<unsigned char, 0x110000/8> data = {};

 void process(
 	const std::string &inputFileName, const std::string &outputFileName
 ) {
 	std::ifstream inputFile(inputFileName, std::ios::in);
 	std::memset(data.data(), 0, data.size());
 	while (true) {
 		std::string input;
 		char c;
 		inputFile >> c;
 		if (!inputFile) break;
 		while (c != ' ' && c != '.') {
 			input += c;
 			inputFile.get(c);
 		}
 		if (c == ' ') { // Single codepoint.
 			const unsigned long pos = parseCodePoint(input);
 			data[pos/8] |= 1u << (7 - pos%8);
 		} else { // Codepoint range.
 			const unsigned long
 				start = parseCodePoint(input),
 				startByte = start/8;
 			inputFile.ignore();
 			std::getline(inputFile, input, ' ');
 			const unsigned long
 				end = parseCodePoint(input),
 				endByte = end/8;
 			if (startByte == endByte) {
 				data[startByte] |= (1u<<(8-start%8))-1 & ~((1u<<(8-end%8))-1);
 			} else {
 				data[startByte] |= (1u << (8-start%8)) - 1;
 				if (endByte-startByte != 1) {
 					std::memset(
 						data.data() + (startByte+1), 0xFF,
 						endByte - startByte - 1
 					);
 				}
 				data[endByte] |= 0xFFu & ~((1u << (8-end%8)) - 1);
 			}
 		}
 		inputFile.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
 	}
 	inputFile.close();
 	std::ofstream outputFile(
 		outputFileName, std::ios::out | std::ios::binary
 	);
 	outputFile.write(reinterpret_cast<char*>(data.data()), data.size());
 	outputFile.close();
 }

 // Generates the two bitmap files, one per property list.
 int main() {
 	// Each row pairs an input text file with the bitmap file built from it.
 	const char *const jobs[][2] = {
 		{"XID_Start.txt", "XID_Start.dat"},
 		{"XID_Continue.txt", "XID_Continue.dat"},
 	};
 	for (const auto &job : jobs) {
 		process(job[0], job[1]);
 	}
 }
	/*
	Unicode identifier validator.
	Needs two data files generated by processDatabase.cpp.
	*/

	#include <array>
	#include <clocale>
	#include <cstdlib>
	#include <fstream>
	#include <ios>
	#include <iostream>
	#include <locale>
	#include <memory>
	#include <string>

	// Unicode-enabling code adapted from https://stackoverflow.com/a/48180107

	// Boilerplate feature-test macros.
	// NOTE(review): these follow the standard #include lines above; feature-test
	// macros normally have to precede the first system header to have any
	// effect - confirm whether they should be moved up.
	#if _WIN32 || _WIN64
		#define _WIN32_WINNT 0x0A00 // _WIN32_WINNT_WIN10
		#define NTDDI_VERSION 0x0A000002 // NTDDI_WIN10_RS1
		#include <sdkddkver.h>
	#else
		#define _XOPEN_SOURCE 700
		#define _POSIX_C_SOURCE 200809L
	#endif
	#ifndef MS_STDLIB_BUGS // Allow overriding the autodetection.
		/*
			The Microsoft C and C++ runtime libraries that ship with Visual Studio,
			as of 2017, have a bug that neither stdio, iostreams or wide iostreams
			can handle Unicode input or output. Windows needs some non-standard
			magic to work around that. This includes programs compiled with MinGW
			and Clang for the win32 and win64 targets. NOTE TO USERS OF TDM-GCC:
			This code is known to break on tdm-gcc 4.9.2. As a workaround,
			`-D MS_STDLIB_BUGS=0` will at least get it to compile, but Unicode
			output will still not work.
		*/
		#if (_MSC_VER || __MINGW32__ || __MSVCRT__)
			/*
				This code is being compiled either on MS Visual C++, or MinGW, or
				clang++ in compatibility mode for either, or is being linked to the
				msvcrt (Microsoft Visual C RunTime) library.
			*/
			#define MS_STDLIB_BUGS 1
		#else
			#define MS_STDLIB_BUGS 0
		#endif
	#endif

	// Headers that declare _setmode, _fileno and _O_WTEXT.
	#if MS_STDLIB_BUGS
		#include <io.h>
		#include <fcntl.h>
	#endif

	using namespace std::string_literals;

	/*
		Configures the locale and the standard wide streams so that wcin and
		wcout can carry Unicode text. Takes no arguments and returns nothing.
	*/
	void initializeUnicodeIO() {
		#if MS_STDLIB_BUGS
			// Microsoft runtimes: select code page 1200 and switch both
			// standard streams to wide-text mode.
			setlocale(LC_ALL, ".1200");
			_setmode(_fileno(stdout), _O_WTEXT);
			_setmode(_fileno(stdin), _O_WTEXT);
		#else
			// The empty name asks for the locale configured in the environment;
			// the concrete name differs between systems (e.g. "en_US.utf8").
			const char *const environmentLocale = "";
			setlocale(LC_ALL, environmentLocale);
			std::locale::global(std::locale(environmentLocale));
			// A default-constructed locale is a copy of the global one set above.
			const std::locale active;
			std::wcout.imbue(active);
			std::wcin.imbue(active);
		#endif
	}

	// Bitmap holding one bit for each code point from U+0000 through U+10FFFF.
	using DataArray = std::array<unsigned char, 0x110000/8>;
	// Bitmap consulted for the first character of an identifier.
	DataArray startData;
	// Bitmap consulted for every character after the first.
	DataArray continueData;

	/*
		Reads a bitmap file into `data`.
		fileName: path of the binary file to read.
		data: destination table; its first data.size() bytes are overwritten with
		the start of the file.
		Throws std::ios_base::failure when the file cannot be opened or holds
		fewer bytes than the table. Without this check a missing file would
		leave the table zeroed and every identifier would be reported invalid.
	*/
	void loadData(const std::string &fileName, DataArray &data) {
		std::ifstream file(fileName, std::ios::in | std::ios::binary);
		if (!file) {
			throw std::ios_base::failure("Cannot open data file: " + fileName);
		}
		file.read(
			reinterpret_cast<char*>(data.data()),
			static_cast<std::streamsize>(data.size())
		);
		// A short read sets failbit, so this also catches truncated files.
		if (!file) {
			throw std::ios_base::failure("Data file is too short: " + fileName);
		}
	}

	bool isValidName(const std::wstring &name) {
	bool isFirst = true, waiting = false;
	unsigned long codePoint;
	// The string is encoded as UTF-16.
	for (const wchar_t c : name) {
	if (waiting) { // Low surrogate.
	if ((c & 0xDC00) != 0xDC00) return false;
	codePoint = ((codePoint << 10) \| (c & 0x03FF)) + 0x10000;
	waiting = false;
	} else {
	if ((c & 0xDC00) == 0xD800) { // High surrogate.
	codePoint = c & 0x03FF;
	waiting = true;
	} else { // Low code point.
	codePoint = c;
	}
	}
	if (!waiting) {
	if ((
	(isFirst ? startData : continueData)[codePoint/8]
	& (1 << (7-codePoint%8))
	) == 0) return false;
	isFirst = false;
	}
	}
	return !waiting;
	}

	// Writes "Valid." or "Invalid." to wcout according to isValidName(name).
	void validate(const std::wstring &name) {
		if (isValidName(name)) {
			std::wcout << L"Valid.\n";
		} else {
			std::wcout << L"Invalid.\n";
		}
	}

	/*
		Sets up Unicode I/O, loads both bitmaps, then validates each line read
		from wcin until an empty line or the end of input is reached.
	*/
	int main() {
		initializeUnicodeIO();
		loadData("XID_Start.dat", startData);
		loadData("XID_Continue.dat", continueData);
		std::wcout <<
			L"Enter any string to see whether it is a valid identifier. "
			L"Enter nothing to exit.\n";
		// A failed read leaves nothing to validate, so it ends the loop just
		// like an empty line does.
		for (
			std::wstring line;
			std::getline(std::wcin, line) && !line.empty();
		) {
			validate(line);
		}
	}
	/*
	Process data from the Unicode character database into data files for the
	identifier validator.
	Get the database from
	https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
	then copy the data lines for the property XID_Start into XID_Start.txt and
	do similarly for XID_Continue, then run this program and the identifier
	validator can be run.
	*/

	#include <array>
	#include <cstring>
	#include <fstream>
	#include <ios>
	#include <limits>
	#include <string>

	/*
		Parses an unprefixed hexadecimal number such as "1F600".
		input: the digits; upper- and lowercase letters are both accepted.
		Returns the parsed value. Parsing stops at the first character that is
		not a hexadecimal digit, so an empty or non-numeric string yields 0.
	*/
	unsigned long parseCodePoint(const std::string &input) {
		unsigned long res = 0;
		for (const char c : input) {
			unsigned long digit;
			if (c >= '0' && c <= '9') {
				digit = static_cast<unsigned long>(c - '0');
			} else if (c >= 'A' && c <= 'F') {
				digit = static_cast<unsigned long>(c - 'A' + 10);
			} else if (c >= 'a' && c <= 'f') {
				digit = static_cast<unsigned long>(c - 'a' + 10);
			} else {
				break; // Not a hex digit: keep what has been parsed so far.
			}
			res = res<<4 | digit;
		}
		return res;
	}

	// Working bitmap filled and written out by process(): one bit for each
	// code point from U+0000 through U+10FFFF.
	std::array<unsigned char, 0x110000/8> data = {};

	void process(
	const std::string &inputFileName, const std::string &outputFileName
	) {
	std::ifstream inputFile(inputFileName, std::ios::in);
	std::memset(data.data(), 0, data.size());
	while (true) {
	std::string input;
	char c;
	inputFile >> c;
	if (!inputFile) break;
	while (c != ' ' && c != '.') {
	input += c;
	inputFile.get(c);
	}
	if (c == ' ') { // Single codepoint.
	const unsigned long pos = parseCodePoint(input);
	data[pos/8] \|= 1u << (7 - pos%8);
	} else { // Codepoint range.
	const unsigned long
	start = parseCodePoint(input),
	startByte = start/8;
	inputFile.ignore();
	std::getline(inputFile, input, ' ');
	const unsigned long
	end = parseCodePoint(input),
	endByte = end/8;
	if (startByte == endByte) {
	data[startByte] \|= (1u<<(8-start%8))-1 & ~((1u<<(8-end%8))-1);
	} else {
	data[startByte] \|= (1u << (8-start%8)) - 1;
	if (endByte-startByte != 1) {
	std::memset(
	data.data() + (startByte+1), 0xFF,
	endByte - startByte - 1
	);
	}
	data[endByte] \|= 0xFFu & ~((1u << (8-end%8)) - 1);
	}
	}
	inputFile.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
	}
	inputFile.close();
	std::ofstream outputFile(
	outputFileName, std::ios::out \| std::ios::binary
	);
	outputFile.write(reinterpret_cast<char*>(data.data()), data.size());
	outputFile.close();
	}

	// Generates the two bitmap files, one per property list.
	int main() {
		// Each row pairs an input text file with the bitmap file built from it.
		const char *const jobs[][2] = {
			{"XID_Start.txt", "XID_Start.dat"},
			{"XID_Continue.txt", "XID_Continue.dat"},
		};
		for (const auto &job : jobs) {
			process(job[0], job[1]);
		}
	}