Skip to content

Instantly share code, notes, and snippets.

@RavuAlHemio
Last active August 29, 2015 14:02
Show Gist options
  • Save RavuAlHemio/0babc5f719d20cf48b0e to your computer and use it in GitHub Desktop.
Save RavuAlHemio/0babc5f719d20cf48b0e to your computer and use it in GitHub Desktop.
HTML and URL encoding outgoing messages for the vBulletin chatbox, for Qt.
#include <cstdint>
#include <iomanip>
#include <iostream>
#include <memory>

#include <QByteArray>
#include <QSet>
#include <QString>
#include <QTextCodec>
// Characters that encodeChatboxOutgoingText() copies to its output verbatim;
// every other character is percent-escaped or turned into a numeric entity.
// The decimal digits are included because the encoder already emits them
// unescaped inside numeric character references, so escaping them in message
// text (as "%30".."%39") would only lengthen the output.
static const QSet<QChar> urlSafeCharacters {
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    '-', '_', '.'
};
/**
 * Encodes a message for submission to the chatbox.
 *
 * Each UTF-16 code unit of the input is handled as follows:
 *  - characters in urlSafeCharacters are copied verbatim;
 *  - characters representable in Windows-1252 are converted to that
 *    encoding and percent-escaped ("%XX", upper-case hex);
 *  - every other character (including U+0000 and characters outside the
 *    BMP, which arrive as surrogate pairs) is written as a decimal numeric
 *    character reference "&#N;" whose '&', '#' and ';' are percent-escaped
 *    ("%26%23N%3B");
 *  - an unpaired surrogate is replaced by U+FFFD (REPLACEMENT CHARACTER)
 *    instead of corrupting or swallowing the text that follows it.
 *
 * @param inputString  the text to encode
 * @return the encoded bytes (pure ASCII)
 */
static QByteArray encodeChatboxOutgoingText(const QString &inputString)
{
    static const char hexDigits[] = "0123456789ABCDEF";
    static const uint32_t replacementCharacter = 0xFFFD;

    QByteArray ret;
    bool awaitingLowSurrogate = false;
    uint32_t highUnicodeChar = 0;

    // codecForName() may return a null pointer, and makeEncoder() hands
    // ownership of the encoder to the caller; hold it in a unique_ptr so
    // that it is released on every exit path.
    QTextCodec *codec1252 = QTextCodec::codecForName("windows-1252");
    const std::unique_ptr<QTextEncoder> encode1252(
        codec1252 ? codec1252->makeEncoder(QTextCodec::ConvertInvalidToNull) : nullptr
    );

    // appends the XML character entity (&#1234;) with URL-escaping
    const auto appendEntity = [&ret](uint32_t codePoint)
    {
        ret.append("%26%23");
        ret.append(QString::number(codePoint).toUtf8());
        ret.append("%3B");
    };

    for (const QChar &c : inputString)
    {
        if (awaitingLowSurrogate)
        {
            // whatever happens, the pending leading surrogate is consumed now
            awaitingLowSurrogate = false;

            if (c.isLowSurrogate())
            {
                highUnicodeChar |= (c.unicode() - 0xDC00);
                highUnicodeChar += 0x010000;
                appendEntity(highUnicodeChar);
                highUnicodeChar = 0;
                continue;
            }

            // leading surrogate not followed by trailing surrogate:
            // substitute it and fall through to handle c normally
            highUnicodeChar = 0;
            appendEntity(replacementCharacter);
        }

        if (c.unicode() == 0x0000)
        {
            // special case to not interfere with Win1252 detection
            // (the encoder signals "not representable" with a NUL byte)
            appendEntity(0);
        }
        else if (urlSafeCharacters.contains(c))
        {
            // include this character verbatim
            ret.append(static_cast<char>(c.unicode()));
        }
        else if (c.isHighSurrogate())
        {
            // remember the upper ten bits and wait for the trailing surrogate
            highUnicodeChar = (c.unicode() - 0xD800) << 10;
            awaitingLowSurrogate = true;
        }
        else if (c.isLowSurrogate())
        {
            // trailing surrogate following something that isn't a leading surrogate
            appendEntity(replacementCharacter);
        }
        else
        {
            // try encoding using Windows-1252
            QByteArray as1252;
            if (encode1252)
            {
                as1252 = encode1252->fromUnicode(&c, 1);
            }
            else if (c.unicode() < 0x80)
            {
                // codec unavailable: ASCII is identical in Windows-1252
                as1252.append(static_cast<char>(c.unicode()));
            }

            if (as1252.isEmpty() || as1252.contains('\0'))
            {
                // Unicode BMP char not in Windows-1252
                // -> HTML-escape with URL-escaping
                appendEntity(c.unicode());
            }
            else
            {
                // Windows-1252; URL-escape it
                for (const char &b : as1252)
                {
                    const unsigned char byte = static_cast<unsigned char>(b);
                    ret.append('%');
                    ret.append(hexDigits[byte >> 4]);
                    ret.append(hexDigits[byte & 0x0F]);
                }
            }
        }
    }

    if (awaitingLowSurrogate)
    {
        // input ended in the middle of a surrogate pair
        appendEntity(replacementCharacter);
    }

    return ret;
}
int main(void)
{
QString str = QString::fromUtf8(
"P\305\231\303\255li\305\241 "
"\305\276lu\305\245ou\304\215k\303\275 "
"k\305\257\305\210 "
"\360\237\220\216 "
"\303\272p\304\233l "
"\304\217\303\241belsk\303\251 "
"\303\263dy. "
"\360\237\222\251"
);
#define URL_HTML(num) "%26%23" #num "%3B"
QByteArray compare =
"P" URL_HTML(345) "%EDli%9A%20"
"%9Elu" URL_HTML(357) "ou" URL_HTML(269) "k%FD%20"
"k" URL_HTML(367) URL_HTML(328) "%20"
URL_HTML(128014) "%20"
"%FAp" URL_HTML(283) "l%20"
URL_HTML(271) "%E1belsk%E9%20"
"%F3dy.%20"
URL_HTML(128169)
;
#undef URL_HTML
QByteArray returned = encodeChatboxOutgoingText(str);
int size = (compare.size() < returned.size()) ? compare.size() : returned.size();
if (compare.size() != returned.size())
{
std::cout << "sizes differ (compare " << compare.size() << ", returned " << returned.size() << ")" << std::endl;
}
for (int i = 0; i < size; ++i)
{
char c = compare.at(i);
char r = returned.at(i);
if (c == r)
{
continue;
}
std::cout << "byte " << std::setw(3) << i << " compare " << std::setw(3) << static_cast<int>(c) << " ";
if (c >= 0x20 && c <= 0x7E)
{
std::cout << "(" << c << ") ";
}
else
{
std::cout << " ";
}
std::cout << "returned " << std::setw(3) << static_cast<int>(r) << " ";
if (r >= 0x20 && r <= 0x7E)
{
std::cout << "(" << r << ") ";
}
else
{
std::cout << " ";
}
std::cout << std::endl;
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment