Skip to content

Instantly share code, notes, and snippets.

@leduyquang753
Last active December 27, 2021 14:34
Show Gist options
  • Save leduyquang753/a6d82d3ce0f2a06833493e3da5f2ccb5 to your computer and use it in GitHub Desktop.
Save leduyquang753/a6d82d3ce0f2a06833493e3da5f2ccb5 to your computer and use it in GitHub Desktop.
Unicode identifier validator.
/*
Unicode identifier validator.
Needs two data files generated by processDatabase.cpp.
*/
#include <array>
#include <clocale>
#include <cstdlib>
#include <fstream>
#include <ios>
#include <iostream>
#include <locale>
#include <memory>
#include <string>
// Unicode-enabling code adapted from https://stackoverflow.com/a/48180107
// Boilerplate feature-test macros.
#if _WIN32 || _WIN64
#define _WIN32_WINNT 0x0A00 // _WIN32_WINNT_WIN10
#define NTDDI_VERSION 0x0A000002 // NTDDI_WIN10_RS1
#include <sdkddkver.h>
#else
#define _XOPEN_SOURCE 700
#define _POSIX_C_SOURCE 200809L
#endif
#ifndef MS_STDLIB_BUGS // Allow overriding the autodetection.
/*
The Microsoft C and C++ runtime libraries that ship with Visual Studio,
as of 2017, have a bug: neither stdio, iostreams, nor wide iostreams
can handle Unicode input or output. Windows needs some non-standard
magic to work around that. This includes programs compiled with MinGW
and Clang for the win32 and win64 targets. NOTE TO USERS OF TDM-GCC:
This code is known to break on tdm-gcc 4.9.2. As a workaround,
`-D MS_STDLIB_BUGS=0` will at least get it to compile, but Unicode
output will still not work.
*/
#if (_MSC_VER || __MINGW32__ || __MSVCRT__)
/*
This code is being compiled either on MS Visual C++, or MinGW, or
clang++ in compatibility mode for either, or is being linked to the
msvcrt (Microsoft Visual C RunTime) library.
*/
#define MS_STDLIB_BUGS 1
#else
#define MS_STDLIB_BUGS 0
#endif
#endif
#if MS_STDLIB_BUGS
#include <io.h>
#include <fcntl.h>
#endif
using namespace std::string_literals;
// Configures the process locale and the wide standard streams so that
// std::wcout and std::wcin can carry Unicode text.
void initializeUnicodeIO() {
#if MS_STDLIB_BUGS
    // Microsoft runtime: select the ".1200" locale (the UTF-16LE code page)
    // and put stdout and stdin into wide-text mode.
    std::setlocale(LC_ALL, ".1200");
    _setmode(_fileno(stdout), _O_WTEXT);
    _setmode(_fileno(stdin), _O_WTEXT);
#else
    // The empty name requests the environment's locale; the concrete name it
    // resolves to may vary by OS, e.g., "en_US.utf8".
    std::setlocale(LC_ALL, "");
    const std::locale environmentLocale("");
    std::locale::global(environmentLocale);
    std::wcout.imbue(environmentLocale);
    std::wcin.imbue(environmentLocale);
#endif
}
// Bitmap with one bit per Unicode code point (U+0000..U+10FFFF). Code point
// cp lives in byte cp/8, at bit 7 - cp%8 (most significant bit first).
using DataArray = std::array<unsigned char, 0x110000/8>;

// Property tables: startData is consulted for the first code point of a name,
// continueData for every following one.
DataArray startData, continueData;

// Fills `data` with the raw contents of the binary file `fileName`.
// A missing or short file is reported on std::cerr; whatever could be read is
// kept and the rest of `data` is left as it was.
void loadData(const std::string &fileName, DataArray &data) {
    std::ifstream file(fileName, std::ios::in | std::ios::binary);
    if (!file) {
        std::cerr << "Cannot open data file: " << fileName << '\n';
        return;
    }
    const std::streamsize wanted = static_cast<std::streamsize>(data.size());
    file.read(reinterpret_cast<char*>(data.data()), wanted);
    if (file.gcount() != wanted) {
        std::cerr << "Data file is too short: " << fileName << '\n';
    }
}

// Returns true when `name` is non-empty, its first code point is flagged in
// startData and every following code point is flagged in continueData.
//
// Each element of `name` is either a UTF-16 code unit (surrogate pairs are
// combined) or, where wchar_t is wide enough, a whole code point. Unpaired
// surrogates and values outside U+0000..U+10FFFF make the name invalid.
bool isValidName(const std::wstring &name) {
    constexpr unsigned long codePointLimit = 0x110000;
    bool isFirst = true, waiting = false;
    unsigned long codePoint = 0;
    for (const wchar_t c : name) {
        // A negative wchar_t becomes a huge value here and fails the range
        // check below instead of indexing out of bounds.
        const unsigned long unit = static_cast<unsigned long>(c);
        const bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
        const bool isLowSurrogate = unit >= 0xDC00 && unit <= 0xDFFF;
        if (waiting) {
            // The previous unit was a high surrogate: only a low one may follow.
            if (!isLowSurrogate) return false;
            codePoint = ((codePoint << 10) | (unit & 0x03FF)) + 0x10000;
            waiting = false;
        } else if (isHighSurrogate) {
            // Keep the upper ten bits and wait for the low surrogate.
            codePoint = unit & 0x03FF;
            waiting = true;
            continue;
        } else {
            // A stray low surrogate or an out-of-range value is never valid.
            if (isLowSurrogate || unit >= codePointLimit) return false;
            codePoint = unit;
        }
        const DataArray &table = isFirst ? startData : continueData;
        if ((table[codePoint/8] & (1u << (7 - codePoint%8))) == 0) return false;
        isFirst = false;
    }
    // Reject a dangling high surrogate and the empty string.
    return !waiting && !isFirst;
}
// Prints the verdict of isValidName(name) to std::wcout, one line per call.
void validate(const std::wstring &name) {
    const bool accepted = isValidName(name);
    if (accepted) {
        std::wcout << L"Valid.\n";
    } else {
        std::wcout << L"Invalid.\n";
    }
}
// Interactive driver: sets up Unicode I/O, loads both property tables, then
// validates one input line at a time until an empty line (or end of input,
// which also yields an empty line) is read.
int main() {
    initializeUnicodeIO();
    loadData("XID_Start.dat"s, startData);
    loadData("XID_Continue.dat"s, continueData);
    std::wcout <<
        L"Enter any string to see whether it is a valid identifier. "
        L"Enter nothing to exit.\n"s;

    // Reads a fresh line each time; a failed read leaves the result empty.
    const auto readLine = [] {
        std::wstring line;
        std::getline(std::wcin, line);
        return line;
    };
    for (std::wstring line = readLine(); !line.empty(); line = readLine()) {
        validate(line);
    }
}
/*
Process data from the Unicode character database into data files for the
identifier validator.
Get the database from
https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
then copy the data lines for the property XID_Start into XID_Start.txt and
the data lines for XID_Continue into XID_Continue.txt. Run this program to
produce XID_Start.dat and XID_Continue.dat, which the identifier validator
loads.
*/
#include <array>
#include <cstring>
#include <fstream>
#include <ios>
#include <limits>
#include <string>
// Converts the leading run of hexadecimal digits in `input` (upper or lower
// case) to its numeric value. Parsing stops at the first character that is
// not a hex digit; an input without leading hex digits yields 0.
unsigned long parseCodePoint(const std::string &input) {
    unsigned long res = 0;
    for (const char c : input) {
        unsigned long digit;
        if (c >= '0' && c <= '9') digit = static_cast<unsigned long>(c - '0');
        else if (c >= 'A' && c <= 'F') digit = static_cast<unsigned long>(c - 'A' + 10);
        else if (c >= 'a' && c <= 'f') digit = static_cast<unsigned long>(c - 'a' + 10);
        else break;
        res = res<<4 | digit;
    }
    return res;
}

// Bitmap built by process(): one bit per code point U+0000..U+10FFFF.
// Code point cp lives in byte cp/8, at bit 7 - cp%8 (most significant first).
std::array<unsigned char, 0x110000/8> data;

// Reads property data lines from `inputFileName` and writes the resulting
// bitmap to `outputFileName`.
//
// Each data line starts with either a single code point ("00AA") or an
// inclusive range ("0041..005A"); anything after that token is ignored.
// Blank lines, lines starting with '#', and lines whose code points are
// malformed, reversed or beyond U+10FFFF are skipped. If the input file cannot
// be opened, an all-zero bitmap is written.
void process(
    const std::string &inputFileName, const std::string &outputFileName
) {
    constexpr unsigned long codePointLimit = 0x110000;
    static const char hexDigits[] = "0123456789ABCDEFabcdef";
    const std::string::size_type npos = std::string::npos;

    std::memset(data.data(), 0, data.size());

    std::ifstream inputFile(inputFileName, std::ios::in);
    std::string line;
    while (std::getline(inputFile, line)) {
        const std::string::size_type first = line.find_first_not_of(" \t\r");
        if (first == npos || line[first] == '#') continue;

        // First token: the start (or only) code point.
        const std::string::size_type startStop =
            line.find_first_not_of(hexDigits, first);
        const std::string startToken = line.substr(
            first, startStop == npos ? npos : startStop - first
        );
        // More than 8 hex digits could overflow a 32-bit unsigned long.
        if (startToken.empty() || startToken.size() > 8) continue;
        const unsigned long start = parseCodePoint(startToken);

        // Optional "..END" part turns the line into an inclusive range.
        unsigned long end = start;
        if (startStop != npos && line.compare(startStop, 2, "..") == 0) {
            const std::string::size_type endBegin = startStop + 2;
            const std::string::size_type endStop =
                line.find_first_not_of(hexDigits, endBegin);
            const std::string endToken = line.substr(
                endBegin, endStop == npos ? npos : endStop - endBegin
            );
            if (endToken.empty() || endToken.size() > 8) continue;
            end = parseCodePoint(endToken);
        }
        if (start > end || end >= codePointLimit) continue;

        const unsigned long startByte = start/8, endByte = end/8;
        // Bits for `start` up to the last code point of its byte.
        const unsigned headMask = (1u << (8 - start%8)) - 1;
        // Bits for the first code point of the byte up to and including `end`.
        const unsigned tailMask = 0xFFu & ~((1u << (7 - end%8)) - 1);
        if (startByte == endByte) {
            data[startByte] |= static_cast<unsigned char>(headMask & tailMask);
        } else {
            data[startByte] |= static_cast<unsigned char>(headMask);
            // Whole bytes strictly between the two partial ones.
            std::memset(
                data.data() + (startByte+1), 0xFF,
                endByte - startByte - 1
            );
            data[endByte] |= static_cast<unsigned char>(tailMask);
        }
    }
    inputFile.close();

    std::ofstream outputFile(
        outputFileName, std::ios::out | std::ios::binary
    );
    outputFile.write(
        reinterpret_cast<const char*>(data.data()),
        static_cast<std::streamsize>(data.size())
    );
    outputFile.close();
}
// Builds both bitmap files: first XID_Start, then XID_Continue.
int main() {
    const std::string startSource = "XID_Start.txt";
    const std::string startOutput = "XID_Start.dat";
    const std::string continueSource = "XID_Continue.txt";
    const std::string continueOutput = "XID_Continue.dat";

    process(startSource, startOutput);
    process(continueSource, continueOutput);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment