Last active
December 27, 2021 14:34
-
-
Save leduyquang753/a6d82d3ce0f2a06833493e3da5f2ccb5 to your computer and use it in GitHub Desktop.
Unicode identifier validator.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Unicode identifier validator. | |
Needs two data files generated by processDatabase.cpp. | |
*/ | |
#include <array>
#include <clocale>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <iostream>
#include <locale>
#include <memory>
#include <stdexcept>
#include <string>
// Unicode-enabling code adapted from https://stackoverflow.com/a/48180107

// Feature-test macros: request the Windows 10 API level on Windows, and
// XSI issue 7 / POSIX.1-2008 everywhere else.
// NOTE(review): these are defined after the standard headers above have
// already been included; feature-test macros usually have to precede the
// first system header to take effect -- confirm the ordering is intended.
#if _WIN32 || _WIN64
#   define _WIN32_WINNT 0x0A00 // _WIN32_WINNT_WIN10
#   define NTDDI_VERSION 0x0A000002 // NTDDI_WIN10_RS1
#   include <sdkddkver.h>
#else
#   define _XOPEN_SOURCE 700
#   define _POSIX_C_SOURCE 200809L
#endif

/*
    MS_STDLIB_BUGS chooses between the two console set-up paths used by
    initializeUnicodeIO(). It may be predefined on the command line to
    override the autodetection below.

    Background: the Microsoft C and C++ runtime libraries shipped with
    Visual Studio (as of 2017) cannot handle Unicode input or output through
    stdio, iostreams or wide iostreams without some non-standard set-up.
    The same applies to programs built with MinGW or Clang for the win32 and
    win64 targets.

    NOTE TO USERS OF TDM-GCC: this code is known to break on tdm-gcc 4.9.2.
    As a workaround, `-D MS_STDLIB_BUGS=0` will at least get it to compile,
    but Unicode output will still not work.
*/
#if !defined(MS_STDLIB_BUGS)
#   if (_MSC_VER || __MINGW32__ || __MSVCRT__)
        // Built with MS Visual C++ or MinGW (or clang++ in compatibility
        // mode for either), or linked against msvcrt, the Microsoft Visual
        // C runtime library.
#       define MS_STDLIB_BUGS 1
#   else
#       define MS_STDLIB_BUGS 0
#   endif
#endif

// Headers providing _setmode() and _O_WTEXT for the Windows path.
#if MS_STDLIB_BUGS
#   include <io.h>
#   include <fcntl.h>
#endif
using namespace std::string_literals; | |
// Prepares the standard wide streams so that std::wcin and std::wcout can
// carry Unicode text.
//
// When MS_STDLIB_BUGS is non-zero, stdin and stdout are switched to the
// runtime's wide-text mode. Otherwise the locale configured in the environment
// is installed globally and imbued into both streams. If the environment names
// a locale that is not available, constructing std::locale throws
// std::runtime_error; instead of letting that abort the program, a few common
// UTF-8 locale names are tried, and if none exists the default "C" locale is
// left in place.
void initializeUnicodeIO() {
#if MS_STDLIB_BUGS
    // Windows needs a little non-standard magic.
    constexpr char cp_utf16le[] = ".1200";
    std::setlocale(LC_ALL, cp_utf16le);
    _setmode(_fileno(stdout), _O_WTEXT);
    _setmode(_fileno(stdin), _O_WTEXT);
#else
    // "" means "whatever the environment asks for". The remaining names are
    // fallbacks, since the correct UTF-8 locale name varies by OS.
    constexpr const char *candidates[] = {"", "C.UTF-8", "en_US.UTF-8"};
    for (const char *const localeName : candidates) {
        try {
            const std::locale chosen(localeName);
            std::setlocale(LC_ALL, localeName);
            std::locale::global(chosen);
            break;
        } catch (const std::runtime_error &) {
            // Unknown locale name: try the next candidate.
        }
    }
    // Pick up whichever global locale ended up being installed.
    std::wcout.imbue(std::locale());
    std::wcin.imbue(std::locale());
#endif
}
// Bit table with one bit per Unicode code point (U+0000..U+10FFFF), packed
// eight code points to a byte.
using DataArray = std::array<unsigned char, 0x110000/8>;

// Tables consulted by isValidName(): startData for the first code point of a
// name, continueData for all following ones. Zero-initialised until loaded.
DataArray startData, continueData;

// Fills `data` with exactly data.size() bytes read from the binary file
// `fileName`.
//
// Throws std::ios_base::failure when the file cannot be opened or holds fewer
// bytes than the table needs. Previously both conditions were ignored, which
// left the table (partly) zeroed and made every identifier look invalid.
void loadData(const std::string &fileName, DataArray &data) {
    std::ifstream file(fileName, std::ios::in | std::ios::binary);
    if (!file) {
        throw std::ios_base::failure(
            "Cannot open data file \"" + fileName + "\"."
        );
    }
    file.read(
        reinterpret_cast<char*>(data.data()),
        static_cast<std::streamsize>(data.size())
    );
    // read() sets failbit when it hits end-of-file before the full count.
    if (!file) {
        throw std::ios_base::failure(
            "Data file \"" + fileName + "\" is truncated or unreadable."
        );
    }
}
bool isValidName(const std::wstring &name) { | |
bool isFirst = true, waiting = false; | |
unsigned long codePoint; | |
// The string is encoded as UTF-16. | |
for (const wchar_t c : name) { | |
if (waiting) { // Low surrogate. | |
if ((c & 0xDC00) != 0xDC00) return false; | |
codePoint = ((codePoint << 10) | (c & 0x03FF)) + 0x10000; | |
waiting = false; | |
} else { | |
if ((c & 0xDC00) == 0xD800) { // High surrogate. | |
codePoint = c & 0x03FF; | |
waiting = true; | |
} else { // Low code point. | |
codePoint = c; | |
} | |
} | |
if (!waiting) { | |
if (( | |
(isFirst ? startData : continueData)[codePoint/8] | |
& (1 << (7-codePoint%8)) | |
) == 0) return false; | |
isFirst = false; | |
} | |
} | |
return !waiting; | |
} | |
// Writes the verdict for `name` to std::wcout: "Valid." when isValidName()
// accepts it, "Invalid." otherwise, each followed by a newline.
void validate(const std::wstring &name) {
    const wchar_t *const verdict =
        isValidName(name) ? L"Valid.\n" : L"Invalid.\n";
    std::wcout << verdict;
}
int main() { | |
initializeUnicodeIO(); | |
loadData("XID_Start.dat"s, startData); | |
loadData("XID_Continue.dat"s, continueData); | |
std::wcout << | |
L"Enter any string to see whether it is a valid identifier. " | |
L"Enter nothing to exit.\n"s; | |
while (true) { | |
std::wstring input; | |
std::getline(std::wcin, input); | |
if (input.empty()) break; | |
validate(input); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Process data from the Unicode character database into the data files used by
the identifier validator.
Download the database from
https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
then copy the data lines for the property XID_Start into XID_Start.txt and
the data lines for the property XID_Continue into XID_Continue.txt. Running
this program then writes XID_Start.dat and XID_Continue.dat, which are the
two files the identifier validator loads.
*/ | |
#include <array> | |
#include <cstring> | |
#include <fstream> | |
#include <ios> | |
#include <limits> | |
#include <string> | |
// Parses `input` as a hexadecimal code point (upper- or lower-case digits, no
// prefix).
//
// Returns the value, which is at most 0x10FFFF, on success. When `input` is
// empty, contains a non-hexadecimal character, or denotes a value above
// 0x10FFFF, returns std::numeric_limits<unsigned long>::max(), so callers can
// detect failure with a plain `> 0x10FFFF` range check.
unsigned long parseCodePoint(const std::string &input) {
    constexpr unsigned long
        invalid = std::numeric_limits<unsigned long>::max(),
        lastCodePoint = 0x10FFFF;
    if (input.empty()) return invalid;
    unsigned long res = 0;
    for (const char c : input) {
        unsigned long digit;
        if (c >= '0' && c <= '9') {
            digit = static_cast<unsigned long>(c - '0');
        } else if (c >= 'A' && c <= 'F') {
            digit = static_cast<unsigned long>(c - 'A') + 10;
        } else if (c >= 'a' && c <= 'f') {
            digit = static_cast<unsigned long>(c - 'a') + 10;
        } else {
            return invalid;
        }
        res = res<<4 | digit;
        // Checking after every digit also keeps `res` from ever overflowing.
        if (res > lastCodePoint) return invalid;
    }
    return res;
}

// Scratch bit table, one bit per code point: the bit for code point p is bit
// (7 - p%8) of byte p/8. Rebuilt from scratch by every call to process().
std::array<unsigned char, 0x110000/8> data;

// Converts a list of code points into a packed bit table.
//
// Each line of `inputFileName` is expected to start with either one
// hexadecimal code point ("00AA") or an inclusive range ("0041..005A"); the
// rest of the line (from the first whitespace, ';' or '#') is ignored. Blank
// lines, lines starting with '#', and lines whose code points cannot be
// parsed, are out of range, or form a reversed range are skipped.
//
// The resulting table (data.size() bytes) is written to `outputFileName`.
// Throws std::ios_base::failure when the input cannot be opened or the output
// cannot be written.
void process(
    const std::string &inputFileName, const std::string &outputFileName
) {
    constexpr unsigned long lastCodePoint = 0x10FFFF;

    std::ifstream inputFile(inputFileName, std::ios::in);
    if (!inputFile) {
        throw std::ios_base::failure(
            "Cannot open input file \"" + inputFileName + "\"."
        );
    }

    data.fill(0);
    for (std::string line; std::getline(inputFile, line); ) {
        // Isolate the leading field; '\r' is a delimiter so that files with
        // CRLF line endings work too.
        const auto fieldStart = line.find_first_not_of(" \t\r");
        if (fieldStart == std::string::npos || line[fieldStart] == '#')
            continue;
        const auto fieldEnd = line.find_first_of(" \t\r;#", fieldStart);
        const std::string field = line.substr(
            fieldStart,
            fieldEnd == std::string::npos
                ? std::string::npos : fieldEnd - fieldStart
        );

        unsigned long first, last;
        const auto dots = field.find("..");
        if (dots == std::string::npos) { // Single code point.
            first = last = parseCodePoint(field);
        } else { // Code point range, both ends included.
            first = parseCodePoint(field.substr(0, dots));
            last = parseCodePoint(field.substr(dots + 2));
        }
        // Rejects parse failures (reported as a huge value) and reversed
        // ranges, so the table is never written out of bounds.
        if (first > lastCodePoint || last > lastCodePoint || first > last)
            continue;

        for (unsigned long codePoint = first; codePoint <= last; ++codePoint) {
            data[codePoint/8] |=
                static_cast<unsigned char>(1u << (7 - codePoint%8));
        }
    }
    inputFile.close();

    std::ofstream outputFile(
        outputFileName, std::ios::out | std::ios::binary
    );
    outputFile.write(
        reinterpret_cast<const char*>(data.data()),
        static_cast<std::streamsize>(data.size())
    );
    outputFile.close();
    if (!outputFile) {
        throw std::ios_base::failure(
            "Cannot write output file \"" + outputFileName + "\"."
        );
    }
}
// Entry point of the converter: builds the XID_Start table first, then the
// XID_Continue table, each from its text file in the working directory.
int main() {
    const std::string startText("XID_Start.txt");
    const std::string startTable("XID_Start.dat");
    const std::string continueText("XID_Continue.txt");
    const std::string continueTable("XID_Continue.dat");

    process(startText, startTable);
    process(continueText, continueTable);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment