Skip to content

Instantly share code, notes, and snippets.

@rjw57
Created October 25, 2013 11:49
Show Gist options
  • Save rjw57/7153452 to your computer and use it in GitHub Desktop.
Save rjw57/7153452 to your computer and use it in GitHub Desktop.
#include "encoding.hpp"
#include <deque>
#include <string>
using namespace std;
namespace rcc {
/// @brief Advance the decoder state machine by one UTF-8 code unit.
///
/// Negative values are forwarded unchanged to the next stage. Values on [0, 255] are treated as
/// UTF-8 octets; once a complete sequence has been seen the decoded code point is sent to the next
/// stage. A continuation octet seen while waiting for the start of a sequence is skipped silently
/// so that decoding can re-synchronise part-way through a stream.
///
/// Following RFC 3629, overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values above
/// U+10FFFF are reported through send_error(), which throws std::runtime_error. Whenever a
/// multi-octet sequence ends, successfully or not, the state is reset to START *before* the error
/// is reported or the code point is sent, so an exception cannot leave the decoder stuck half way
/// through a sequence.
///
/// @param code_unit The octet to decode, or a negative "special" value to pass straight through.
void UTF8Decoder::operator () (int code_unit)
{
    // pass -ve code units to next stage without advancing location
    if(code_unit < 0) {
        current_code_point_ = code_unit;
        send_current();
        return;
    }

    assert(code_unit >= 0x00);
    assert(code_unit <= 0xff);

    // every octet after the first in a sequence has the form 10xxxxxx
    const bool is_continuation = (0x80 == (code_unit & 0xc0));

    switch(state_)
    {
    case START:
        current_code_point_ = 0;

        // if the code unit has no high-bit set and we're in the start
        // state, simply output the low order bits as the code point
        if(0 == (code_unit & 0x80)) {
            current_code_point_ = code_unit & 0x7f;
            send_current();
            return;
        }

        // otherwise the high order bit is set. If the two high order
        // bits are 10 we've arrived in the middle of a UTF stream and so should
        // just skip to re-synchronise
        if(is_continuation) {
            return;
        }

        // 110xxxxx => one octet follows
        if(0xc0 == (code_unit & 0xe0)) {
            // 0xc0 and 0xc1 can only start an overlong encoding of a value below 0x80
            if(code_unit < 0xc2) {
                send_error();
                return;
            }
            state_ = ONE_OCTET_TO_GO;
            current_code_point_ |= (code_unit & 0x1f) << 6;
            return;
        }

        // 1110xxxx => two octets follow
        if(0xe0 == (code_unit & 0xf0)) {
            state_ = TWO_OCTETS_TO_GO;
            current_code_point_ |= (code_unit & 0x0f) << 12;
            return;
        }

        // 11110xxx => three octets follow
        if(0xf0 == (code_unit & 0xf8)) {
            // lead octets above 0xf4 can only encode values beyond U+10FFFF
            if(code_unit > 0xf4) {
                send_error();
                return;
            }
            state_ = THREE_OCTETS_TO_GO;
            current_code_point_ |= (code_unit & 0x07) << 18;
            return;
        }

        // should not get here. If we do, we have a malformed stream
        send_error();
        break;

    case ONE_OCTET_TO_GO:
        // This octet ends the sequence whatever happens. Reset first so that the state is
        // consistent even if send_error() or the next stage throws.
        state_ = START;

        // if the code unit is not 10xxxxxx, we have malformed UTF8
        if(!is_continuation) {
            send_error();
            return;
        }

        // set lower 6 bits
        current_code_point_ |= (code_unit & 0x3f);

        // check we don't have a surrogate pair from UTF-16
        if((current_code_point_ >= 0xd800) && (current_code_point_ <= 0xdfff)) {
            send_error();
            return;
        }

        send_current();
        break;

    case TWO_OCTETS_TO_GO:
        // if the code unit is not 10xxxxxx, we have malformed UTF8, go back to
        // start state reporting the malformed character
        if(!is_continuation) {
            state_ = START;
            send_error();
            return;
        }

        // set middle 6 bits
        current_code_point_ |= (code_unit & 0x3f) << 6;

        // A three octet sequence must encode at least 0x800, anything smaller is overlong.
        // (Four octet sequences arrive here already holding a value of at least 0x10000.)
        if(current_code_point_ < 0x800) {
            state_ = START;
            send_error();
            return;
        }

        state_ = ONE_OCTET_TO_GO;
        break;

    case THREE_OCTETS_TO_GO:
        // if the code unit is not 10xxxxxx, we have malformed UTF8, go back to
        // start state reporting the malformed character
        if(!is_continuation) {
            state_ = START;
            send_error();
            return;
        }

        // set high 6 bits
        current_code_point_ |= (code_unit & 0x3f) << 12;

        // A four octet sequence must encode a value on [0x10000, 0x10ffff]: smaller values are
        // overlong and larger ones are not Unicode code points.
        if((current_code_point_ < 0x10000) || (current_code_point_ > 0x10ffff)) {
            state_ = START;
            send_error();
            return;
        }

        state_ = TWO_OCTETS_TO_GO;
        break;

    default:
        // state is invalid
        std::cerr << __FILE__ << ": impossible state encountered: " << state_ << std::endl;
        assert(false && "Unreachable code.");
        break;
    }
}
/// @brief Encode one Unicode code point as one to four UTF-8 code units.
///
/// Each code unit is handed to the next stage functor in order. Nothing is passed to the next
/// stage for a code point which is rejected.
///
/// @param code_point The code point to encode.
///
/// @throws std::invalid_argument if @p code_point is negative.
/// @throws std::runtime_error if @p code_point is a UTF-16 surrogate (U+D800..U+DFFF) or is
/// greater than U+10FFFF; RFC 3629 prohibits encoding either in UTF-8. A diagnostic is also
/// written to std::cerr in this case.
void UTF8Encoder::operator () (int code_point)
{
    if(code_point < 0) {
        throw std::invalid_argument("Invalid Unicode code point passed to UTF8Encoder");
    }

    const bool is_surrogate = (code_point >= 0xd800) && (code_point <= 0xdfff);
    if(is_surrogate || (code_point > 0x10ffff)) {
        // invalid code point! std::hex is sticky, so restore the stream's formatting flags
        // afterwards rather than leaving std::cerr printing every later integer in hex.
        const std::ios_base::fmtflags saved_flags = std::cerr.flags();
        std::cerr << __FILE__ << ": invalid code point: 0x" << std::hex << code_point << std::endl;
        std::cerr.flags(saved_flags);
        throw std::runtime_error("invalid code point in input UTF-8 output stream.");
    }

    if(code_point <= 0x7f) {
        // single octet output
        next_stage_(code_point);
        return;
    }

    if(code_point <= 0x7ff) {
        // two octet output
        next_stage_(0xc0 | ((code_point >> 6) & 0x1f));
        next_stage_(0x80 | (code_point & 0x3f));
        return;
    }

    if(code_point <= 0xffff) {
        // three octet output
        next_stage_(0xe0 | ((code_point >> 12) & 0x0f));
        next_stage_(0x80 | ((code_point >> 6) & 0x3f));
        next_stage_(0x80 | (code_point & 0x3f));
        return;
    }

    // four octet output: the range check above guarantees code_point <= 0x10ffff
    next_stage_(0xf0 | ((code_point >> 18) & 0x07));
    next_stage_(0x80 | ((code_point >> 12) & 0x3f));
    next_stage_(0x80 | ((code_point >> 6) & 0x3f));
    next_stage_(0x80 | (code_point & 0x3f));
}
u32string utf8_to_u32(const string& utf8)
{
deque<char32_t> points;
UTF8Decoder decode([&] (int code_point, const Cursor&) {
if(code_point >= 0) {
points.push_back(static_cast<char32_t>(code_point));
}
});
for( char ch : utf8 ) { decode(static_cast<unsigned char>(ch)); }
return u32string(points.begin(), points.end());
}
string u32_to_utf8(const u32string& u32)
{
deque<char> units;
UTF8Encoder encode([&] (int code_unit) {
units.push_back(static_cast<char>(code_unit));
});
for( char32_t cp : u32 ) { encode(cp); }
return string(units.begin(), units.end());
}
wstring utf8_to_w(const string& utf8)
{
deque<char32_t> points;
UTF8Decoder decode([&] (int code_point, const Cursor&) {
if(code_point >= 0) {
points.push_back(static_cast<wchar_t>(code_point));
}
});
for( char ch : utf8 ) { decode(static_cast<unsigned char>(ch)); }
return wstring(points.begin(), points.end());
}
string w_to_utf8(const wstring& w)
{
deque<char> units;
UTF8Encoder encode([&] (int code_unit) {
units.push_back(static_cast<char>(code_unit));
});
for( wchar_t cp : w ) { encode(cp); }
return string(units.begin(), units.end());
}
/// @brief Convert a string of UTF-32 code points to UTF-16 code units.
///
/// Code points up to U+FFFF are copied as a single code unit (including any value on the
/// surrogate range, which is passed through untouched). Code points on [U+10000, U+10FFFF] become
/// a high/low surrogate pair. Values above U+10FFFF are not Unicode code points and cannot be
/// represented in UTF-16, so each one is replaced by U+FFFD REPLACEMENT CHARACTER rather than
/// producing meaningless code units.
///
/// @param u32 The code points to convert.
/// @return The UTF-16 code units, in output order.
std::u16string u32_to_u16(const std::u32string& u32)
{
    const char16_t replacement_character = 0xfffd;

    std::u16string code_units;
    code_units.reserve(u32.size()); // exact unless surrogate pairs are needed

    for(const char32_t cp : u32) {
        if(cp <= 0xffff) {
            code_units.push_back(static_cast<char16_t>(cp));
        } else if(cp > 0x10ffff) {
            code_units.push_back(replacement_character);
        } else {
            // 20 bits remain after removing the 0x10000 offset: ten go in each surrogate
            const char32_t offset = cp - 0x10000;
            code_units.push_back(static_cast<char16_t>(0xd800 + (offset >> 10)));
            code_units.push_back(static_cast<char16_t>(0xdc00 + (offset & 0x3ff)));
        }
    }

    return code_units;
}
u16string utf8_to_u16(const string& utf8)
{
// yuk!
return u32_to_u16(utf8_to_u32(utf8));
}
} // namespace rcc
/// @file
/// @brief Translation of source file characters to/from Unicode code points.
///
/// The first stage of processing is handled by code within this file. The UTF8Decoder class
/// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
/// code point to UTF-8 code unit encoding. Both classes work as wrappers around a std::function
/// which represents the remaining phases of translation.
///
/// The CodePoints scoped enum also defines some useful non-graphical code points along with some
/// 'special' code points which can be used to indicate various conditions to later pipeline stages.
///
/// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
///
#pragma once
#ifndef RCC_UTIL_ENCODINGS_HPP__
#define RCC_UTIL_ENCODINGS_HPP__
#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include "cursor.hpp"
#include "special_code_points.hpp"
namespace rcc {
/// @addtogroup util
/// @{
/// @brief Encode Unicode code points to output UTF-8 code units.
///
/// Instances are initialised with a function which is used to pass the encoded code units to the next
/// stage in the processing pipeline.
///
/// Encoding is performed by calling UTF8Encoder::operator () with a Unicode code point. As the code
/// points are encoded to code units, the passed functor will be called once for every output code
/// unit. "Special" code points, i.e. those which are \f$ \lt 0 \f$, will cause an instance of
/// std::invalid_argument to be thrown from UTF8Encoder::operator ().
///
class UTF8Encoder {
public:
    /// @brief Construct an encoder which hands its output to @p next_stage.
    ///
    /// @param next_stage Functor called once, with the code unit as an integer, for every UTF-8
    /// code unit produced.
    UTF8Encoder(std::function<void (int)> next_stage)
        : next_stage_(next_stage)
    { }

    /// @brief Encode a Unicode code point into one or more code units.
    ///
    /// The next stage functor is called once per code unit and is passed the encoded code unit
    /// as an integer.
    ///
    /// @param code_point The Unicode code point to encode. Values less than zero will throw a
    /// std::invalid_argument exception.
    void operator () (int code_point);

private:
    /// Signature of the functor which receives the encoded code units.
    using StageFunction = std::function<void (int)>;

    /// The functor each encoded code unit is passed to.
    StageFunction next_stage_;
};
/// @brief Decode Unicode code points from a stream of UTF-8 code units.
///
/// Instances are initialised with a function which is used to pass the decoded code points to the
/// next stage in the processing pipeline.
///
/// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for
/// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through
/// to the next stage functor as "special" code points. As the code units are decoded to code points,
/// the passed functor will be called once for every output code point.
///
/// In addition to the decoded code points, a Cursor detailing the code point's physical line,
/// column and offset from the start of the file is passed to the next stage functor.
class UTF8Decoder
{
private:
    /// Signature of the functor which receives each decoded code point and its Cursor.
    typedef std::function<void (int, const rcc::Cursor&)> StageFunction;

    /// How many more octets are needed to complete the sequence currently being decoded.
    enum State {
        START,
        ONE_OCTET_TO_GO,
        TWO_OCTETS_TO_GO,
        THREE_OCTETS_TO_GO,
    };

    StageFunction next_stage_;
    State state_;
    int current_code_point_;
    Location current_loc_;
    std::shared_ptr<const std::string> current_source_name_;

    /// Report malformed input by throwing std::runtime_error.
    void send_error() {
        throw std::runtime_error("Invalid UTF-8 encountered in input stream.");
    }

    /// Pass the current code point to the next stage, then advance the current location.
    ///
    /// The Cursor handed to the next stage is built from the location as it is before advancing.
    void send_current() {
        next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_));

        // Only negative ("special") values are passed through without advancing the location.
        // Code point 0 (NUL) is an ordinary decoded character and so must advance it like any
        // other; testing for "> 0" would leave every later location one place short.
        if(current_code_point_ >= 0) {
            ++current_loc_.index;
            ++current_loc_.column;
            if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) {
                ++current_loc_.line;
                current_loc_.column = 1;
            }
        }
    }

public:
    /// @brief Construct a UTF8Decoder.
    ///
    /// @param next_stage Functor called with each decoded code point and its Cursor.
    /// @param source_name The source name which is associated with Cursor-s passed to next stage.
    /// @param loc The Location to use as the first location in the input stream.
    UTF8Decoder(StageFunction next_stage,
                const std::string& source_name,
                const Location& loc)
        : next_stage_(next_stage)
        , state_(START)
        , current_code_point_(0)
        , current_loc_(loc)
        , current_source_name_(std::make_shared<const std::string>(source_name))
    { }

    /// @brief Convenience override which uses a default-constructed Location as the starting
    /// point.
    ///
    /// @param next_stage Functor called with each decoded code point and its Cursor.
    /// @param source_name The source name which is associated with Cursor-s passed to next stage.
    UTF8Decoder(StageFunction next_stage,
                const std::string& source_name)
        : UTF8Decoder(next_stage, source_name, Location())
    { }

    /// @brief Convenience override which uses a default-constructed Location as the starting
    /// point and "" as the source name.
    ///
    /// @param next_stage Functor called with each decoded code point and its Cursor.
    UTF8Decoder(StageFunction next_stage)
        : UTF8Decoder(next_stage, "")
    { }

    /// @brief Decode a UTF-8 code unit into zero or more Unicode code points.
    ///
    /// Call once per code unit. For each set of code units which form a valid Unicode code point
    /// in UTF-8, the next stage functor will be called with that code point and a Cursor as an
    /// argument.
    ///
    /// @param code_unit The Unicode code unit to decode. Values less than zero will be passed
    /// directly to the next stage functor.
    void operator () (int code_unit);

    /// @brief Update the source name and starting location used to generate Cursor-s.
    ///
    /// @param source_name The source name to associate with subsequent Cursor-s.
    /// @param loc The Location to continue from; a default-constructed Location if omitted.
    void set_source_location(const std::string& source_name, const Location& loc = Location()) {
        current_loc_ = loc;
        current_source_name_ = std::make_shared<const std::string>(source_name);
    }

    /// @brief Retrieve the current source name.
    ///
    /// @return A copy of the current source name, or "" if none is set.
    std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; }
};
std::u32string utf8_to_u32(const std::string& utf8);
std::string u32_to_utf8(const std::u32string& u32);
std::wstring utf8_to_w(const std::string& utf8);
std::string w_to_utf8(const std::wstring& w);
std::u16string u32_to_u16(const std::u32string& u32);
std::u16string utf8_to_u16(const std::string& utf8);
// TODO: u16_to_{utf8, u32}
/// @}
} // namespace rcc
#endif // RCC_UTIL_ENCODINGS_HPP__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment