Last active
August 29, 2015 14:02
-
-
Save RavuAlHemio/0babc5f719d20cf48b0e to your computer and use it in GitHub Desktop.
HTML and URL encoding outgoing messages for the vBulletin chatbox, for Qt.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iomanip> | |
#include <iostream> | |
#include <QByteArray> | |
#include <QSet> | |
#include <QString> | |
#include <QTextCodec> | |
/// Characters that are copied into the encoded output verbatim.
///
/// These are ASCII letters, ASCII digits and the punctuation '-', '_' and '.';
/// every one of them is a single byte that never needs percent-encoding in a
/// URL-encoded request body. Anything not listed here is either percent-encoded
/// or turned into an HTML character reference by encodeChatboxOutgoingText().
static const QSet<QChar> urlSafeCharacters {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    // digits are unreserved too; without them every digit was expanded to "%3x"
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '-', '_', '.'
};
static QByteArray encodeChatboxOutgoingText(const QString &inputString) | |
{ | |
QByteArray ret; | |
bool awaitingLowSurrogate = false; | |
uint32_t highUnicodeChar = 0; | |
QTextEncoder *encode1252 = QTextCodec::codecForName("windows-1252")->makeEncoder(QTextCodec::ConvertInvalidToNull); | |
for (const QChar &c : inputString) | |
{ | |
if (awaitingLowSurrogate) | |
{ | |
if (c.isLowSurrogate()) | |
{ | |
highUnicodeChar |= (c.unicode() - 0xDC00); | |
highUnicodeChar += 0x010000; | |
// append the XML character entity (Ӓ) | |
// with URL-escaping | |
ret.append("%26%23"); | |
ret.append(QString::number(highUnicodeChar).toUtf8()); | |
ret.append("%3B"); | |
// reset the surrogate pair state | |
highUnicodeChar = 0; | |
awaitingLowSurrogate = false; | |
} | |
else | |
{ | |
// leading surrogate not followed by trailing surrogate | |
// FIXME: handle this error | |
} | |
} | |
else if (c.unicode() == 0x0000) | |
{ | |
// special case to not interfere with Win1252 detection | |
// URL-encode � | |
ret.append("%26%230%3B"); | |
} | |
else if (urlSafeCharacters.contains(c)) | |
{ | |
// include this character verbatim | |
ret.append(static_cast<char>(c.unicode())); | |
} | |
else if (c.isHighSurrogate()) | |
{ | |
highUnicodeChar = (c.unicode() - 0xD800) << 10; | |
awaitingLowSurrogate = true; | |
} | |
else if (c.isLowSurrogate()) | |
{ | |
// trailing surrogate following something that isn't a leading surrogate | |
// FIXME: handle this error | |
} | |
else | |
{ | |
// try encoding using Windows-1252 | |
QByteArray as1252 = encode1252->fromUnicode(&c, 1); | |
if (as1252.contains('\0')) | |
{ | |
// Unicode BMP char not in Windows-1252 | |
// -> HTML-escape with URL-escaping | |
ret.append("%26%23"); | |
ret.append(QString::number(c.unicode()).toUtf8()); | |
ret.append("%3B"); | |
} | |
else | |
{ | |
// Windows-1252; URL-escape it | |
for (const char &b : as1252) | |
{ | |
QString escaped = QString::number((static_cast<unsigned int>(b) & 0xFF), 16).toUpper(); | |
while (escaped.size() < 2) | |
{ | |
escaped.prepend('0'); | |
} | |
ret.append('%'); | |
ret.append(escaped.toUtf8()); | |
} | |
} | |
} | |
} | |
return ret; | |
} | |
int main(void) | |
{ | |
QString str = QString::fromUtf8( | |
"P\305\231\303\255li\305\241 " | |
"\305\276lu\305\245ou\304\215k\303\275 " | |
"k\305\257\305\210 " | |
"\360\237\220\216 " | |
"\303\272p\304\233l " | |
"\304\217\303\241belsk\303\251 " | |
"\303\263dy. " | |
"\360\237\222\251" | |
); | |
#define URL_HTML(num) "%26%23" #num "%3B" | |
QByteArray compare = | |
"P" URL_HTML(345) "%EDli%9A%20" | |
"%9Elu" URL_HTML(357) "ou" URL_HTML(269) "k%FD%20" | |
"k" URL_HTML(367) URL_HTML(328) "%20" | |
URL_HTML(128014) "%20" | |
"%FAp" URL_HTML(283) "l%20" | |
URL_HTML(271) "%E1belsk%E9%20" | |
"%F3dy.%20" | |
URL_HTML(128169) | |
; | |
#undef URL_HTML | |
QByteArray returned = encodeChatboxOutgoingText(str); | |
int size = (compare.size() < returned.size()) ? compare.size() : returned.size(); | |
if (compare.size() != returned.size()) | |
{ | |
std::cout << "sizes differ (compare " << compare.size() << ", returned " << returned.size() << ")" << std::endl; | |
} | |
for (int i = 0; i < size; ++i) | |
{ | |
char c = compare.at(i); | |
char r = returned.at(i); | |
if (c == r) | |
{ | |
continue; | |
} | |
std::cout << "byte " << std::setw(3) << i << " compare " << std::setw(3) << static_cast<int>(c) << " "; | |
if (c >= 0x20 && c <= 0x7E) | |
{ | |
std::cout << "(" << c << ") "; | |
} | |
else | |
{ | |
std::cout << " "; | |
} | |
std::cout << "returned " << std::setw(3) << static_cast<int>(r) << " "; | |
if (r >= 0x20 && r <= 0x7E) | |
{ | |
std::cout << "(" << r << ") "; | |
} | |
else | |
{ | |
std::cout << " "; | |
} | |
std::cout << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment