Created
October 25, 2013 11:49
-
-
Save rjw57/7153452 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "encoding.hpp" | |
#include <deque> | |
#include <string> | |
using namespace std; | |
namespace rcc { | |
void UTF8Decoder::operator () (int code_unit) | |
{ | |
using namespace std; | |
// pass -ve code units to next stage without advancing location | |
if(code_unit < 0) { | |
current_code_point_ = code_unit; | |
send_current(); | |
return; | |
} | |
assert(code_unit >= 0x00); | |
assert(code_unit <= 0xff); | |
switch(state_) | |
{ | |
case START: | |
current_code_point_ = 0; | |
// if the code unit has no high-bit set and we're in the start | |
// state, simply output the low order bits as the code point | |
if(0 == (code_unit & 0x80)) { | |
current_code_point_ = code_unit & 0x7f; | |
send_current(); | |
return; | |
} | |
// otherwise the high order bit is set. If the two high order | |
// bits are 10 we've arrived in the middle of a UTF stream and so should | |
// just skip | |
if(0x80 == (code_unit & 0xc0)) { | |
// skip to re-synchronise stream | |
return; | |
} | |
// 110xxxxx => one octet follows | |
if(0xc0 == (code_unit & 0xe0)) { | |
state_ = ONE_OCTET_TO_GO; | |
current_code_point_ |= (code_unit & 0x1f) << 6; | |
return; | |
} | |
// 1110xxxx => two octets follow | |
if(0xe0 == (code_unit & 0xf0)) { | |
state_ = TWO_OCTETS_TO_GO; | |
current_code_point_ |= (code_unit & 0x0f) << 12; | |
return; | |
} | |
// 11110xxx => three octets follow | |
if(0xf0 == (code_unit & 0xf8)) { | |
state_ = THREE_OCTETS_TO_GO; | |
current_code_point_ |= (code_unit & 0x07) << 18; | |
return; | |
} | |
// should not get here. If we do, we have a malformed stream | |
send_error(); | |
break; | |
case ONE_OCTET_TO_GO: | |
// if the code unit if not 10xxxxxx, we have malformed UTF8, go back to | |
// start state outputting malformed character | |
if(0x80 != (code_unit & 0xc0)) { | |
state_ = START; | |
send_error(); | |
} | |
// set lower 6 bits | |
current_code_point_ |= (code_unit & 0x3f); | |
if((current_code_point_ >= 0xd800) && (current_code_point_ <= 0xdfff)) { | |
// check we don't have a surrogate pair from UTF-16 | |
send_error(); | |
} else { | |
send_current(); | |
} | |
// back to start state | |
state_ = START; | |
break; | |
case TWO_OCTETS_TO_GO: | |
// if the code unit if not 10xxxxxx, we have malformed UTF8, go back to | |
// start state outputting malformed character | |
if(0x80 != (code_unit & 0xc0)) { | |
state_ = START; | |
send_error(); | |
} | |
// set middle 6 bits and advance state | |
current_code_point_ |= (code_unit & 0x3f) << 6; | |
state_ = ONE_OCTET_TO_GO; | |
break; | |
case THREE_OCTETS_TO_GO: | |
// if the code unit if not 10xxxxxx, we have malformed UTF8, go back to | |
// start state outputting malformed character | |
if(0x80 != (code_unit & 0xc0)) { | |
state_ = START; | |
send_error(); | |
} | |
// set high 6 bits and advance state | |
current_code_point_ |= (code_unit & 0x3f) << 12; | |
state_ = TWO_OCTETS_TO_GO; | |
break; | |
default: | |
// state is invalid | |
cerr << __FILE__ << ": impossible state encountered: " << state_ << endl; | |
assert(false && "Unreachable code."); | |
break; | |
} | |
} | |
void UTF8Encoder::operator () (int code_point) | |
{ | |
using namespace std; | |
if(code_point < 0) { | |
throw invalid_argument("Invalid Unicode code point passed to UTF8Encoder"); | |
return; | |
} | |
if(code_point <= 0x7f) { | |
// single octet output | |
next_stage_(code_point); | |
return; | |
} | |
if(code_point <= 0x7ff) { | |
// two octet output | |
next_stage_(0xc0 | ((code_point >> 6) & 0x1f)); | |
next_stage_(0x80 | (code_point & 0x3f)); | |
return; | |
} | |
if(code_point <= 0xffff) { | |
// three octet output | |
next_stage_(0xe0 | ((code_point >> 12) & 0x0f)); | |
next_stage_(0x80 | ((code_point >> 6) & 0x3f)); | |
next_stage_(0x80 | (code_point & 0x3f)); | |
return; | |
} | |
if(code_point <= 0x1fffff) { | |
// four octet output | |
next_stage_(0xf0 | ((code_point >> 18) & 0x07)); | |
next_stage_(0x80 | ((code_point >> 12) & 0x3f)); | |
next_stage_(0x80 | ((code_point >> 6) & 0x3f)); | |
next_stage_(0x80 | (code_point & 0x3f)); | |
return; | |
} | |
// invalid code point! | |
cerr << __FILE__ << ": invalid code point: 0x" << hex << code_point << endl; | |
throw runtime_error("invalid code point in input UTF-8 output stream."); | |
} | |
u32string utf8_to_u32(const string& utf8) | |
{ | |
deque<char32_t> points; | |
UTF8Decoder decode([&] (int code_point, const Cursor&) { | |
if(code_point >= 0) { | |
points.push_back(static_cast<char32_t>(code_point)); | |
} | |
}); | |
for( char ch : utf8 ) { decode(static_cast<unsigned char>(ch)); } | |
return u32string(points.begin(), points.end()); | |
} | |
string u32_to_utf8(const u32string& u32) | |
{ | |
deque<char> units; | |
UTF8Encoder encode([&] (int code_unit) { | |
units.push_back(static_cast<char>(code_unit)); | |
}); | |
for( char32_t cp : u32 ) { encode(cp); } | |
return string(units.begin(), units.end()); | |
} | |
wstring utf8_to_w(const string& utf8) | |
{ | |
deque<char32_t> points; | |
UTF8Decoder decode([&] (int code_point, const Cursor&) { | |
if(code_point >= 0) { | |
points.push_back(static_cast<wchar_t>(code_point)); | |
} | |
}); | |
for( char ch : utf8 ) { decode(static_cast<unsigned char>(ch)); } | |
return wstring(points.begin(), points.end()); | |
} | |
string w_to_utf8(const wstring& w) | |
{ | |
deque<char> units; | |
UTF8Encoder encode([&] (int code_unit) { | |
units.push_back(static_cast<char>(code_unit)); | |
}); | |
for( wchar_t cp : w ) { encode(cp); } | |
return string(units.begin(), units.end()); | |
} | |
// Convert a string of Unicode code points to UTF-16.
//
// Code points up to U+FFFF are copied as a single code unit (this includes any surrogate values
// present in the input, which are passed through untouched). Code points in U+10000..U+10FFFF are
// written as a high/low surrogate pair. Values above U+10FFFF are not Unicode code points and have
// no UTF-16 representation; they are replaced by U+FFFD REPLACEMENT CHARACTER rather than being
// allowed to overflow the surrogate arithmetic.
std::u16string u32_to_u16(const std::u32string& u32)
{
    const char32_t replacement_character = 0xfffd;
    const char32_t last_code_point = 0x10ffff;

    std::u16string code_units;
    code_units.reserve(u32.size()); // exact unless the input contains supplementary code points

    for( char32_t cp : u32 ) {
        if(cp > last_code_point) {
            cp = replacement_character;
        }

        if(cp <= 0xffff) {
            code_units.push_back(static_cast<char16_t>(cp));
        } else {
            const char32_t offset = cp - 0x10000;
            code_units.push_back(static_cast<char16_t>(0xd800 + (offset >> 10)));
            code_units.push_back(static_cast<char16_t>(0xdc00 + (offset & 0x3ff)));
        }
    }

    return code_units;
}
u16string utf8_to_u16(const string& utf8) | |
{ | |
// yuk! | |
return u32_to_u16(utf8_to_u32(utf8)); | |
} | |
} // namespace rcc |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// @file | |
/// @brief Translation of source file characters to/from Unicode code points. | |
/// | |
/// The first stage of processing is handled by code within this file. The UTF8Decoder class | |
/// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
/// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
/// which represents the remaining phases of translation. | |
/// | |
/// The CodePoints scoped enum also defines some useful non-graphical code points along with some | |
/// 'special' code points which can be used to indicate various conditions to later pipeline stages. | |
/// | |
/// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4 | |
/// | |
#pragma once | |
#ifndef RCC_UTIL_ENCODINGS_HPP__ | |
#define RCC_UTIL_ENCODINGS_HPP__ | |
#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include "cursor.hpp"
#include "special_code_points.hpp"
namespace rcc { | |
/// @addtogroup util | |
/// @{ | |
/// @brief Turn Unicode code points into a stream of UTF-8 code units.
///
/// An encoder is constructed around a functor representing the next stage of the processing
/// pipeline. Each call to UTF8Encoder::operator () takes one Unicode code point and invokes that
/// functor once for every UTF-8 code unit making up the encoded form of the code point.
///
/// "Special" code points, i.e. those which are \f$ \lt 0 \f$, cannot be encoded: passing one to
/// UTF8Encoder::operator () results in a std::invalid_argument exception.
///
class UTF8Encoder {
private:
    /// Signature of the next pipeline stage: receives one code unit per call.
    using StageFunction = std::function<void (int)>;

public:
    /// @brief Construct a UTF8Encoder.
    ///
    /// @param next_stage Functor which is handed every code unit produced by the encoder.
    UTF8Encoder(StageFunction next_stage)
        : next_stage_(next_stage)
    { }

    /// @brief Encode one Unicode code point as one or more UTF-8 code units.
    ///
    /// The next stage functor is called once per code unit, receiving the code unit as an integer.
    ///
    /// @param code_point The Unicode code point to encode. A std::invalid_argument exception is
    /// thrown for values less than zero.
    void operator () (int code_point);

private:
    /// The stage which consumes the encoded code units.
    StageFunction next_stage_;
};
/// @brief Decode Unicode code points from a stream of UTF-8 code units. | |
/// | |
/// Instances are initialised with a function which is used to pass the decoded code points to the | |
/// next stage in the processing pipeline. | |
/// | |
/// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for | |
/// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through | |
/// to the next stage functor as "special" code points. As the code units are decoded to code points, | |
/// the passed functor will be called once for every output code point. | |
/// | |
/// In addition to the decoded code points, a Cursor detailing the code point physical line, column | |
/// and offset from the start of the file is passed to the next stage functor. | |
class UTF8Decoder | |
{ | |
private: | |
typedef std::function<void (int, const rcc::Cursor&)> StageFunction; | |
enum State { | |
START, | |
ONE_OCTET_TO_GO, | |
TWO_OCTETS_TO_GO, | |
THREE_OCTETS_TO_GO, | |
}; | |
StageFunction next_stage_; | |
State state_; | |
int current_code_point_; | |
Location current_loc_; | |
std::shared_ptr<const std::string> current_source_name_; | |
void send_error() { | |
throw std::runtime_error("Invalid UTF-8 encountered in input stream."); | |
} | |
void send_current() { | |
next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_)); | |
if(current_code_point_ > 0) { | |
++current_loc_.index; | |
++current_loc_.column; | |
if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) { | |
++current_loc_.line; | |
current_loc_.column = 1; | |
} | |
} | |
} | |
public: | |
/// @brief Construct a UTF8Decoder. | |
/// | |
/// @param next_stage | |
/// @param source_name The source name which is associated with Cursor-s passed to next stage. | |
/// @param loc The Location to use as the first location in the input stream. | |
UTF8Decoder(StageFunction next_stage, | |
const std::string& source_name, | |
const Location& loc) | |
: next_stage_(next_stage) | |
, state_(START) | |
, current_code_point_(0) | |
, current_loc_(loc) | |
, current_source_name_(std::make_shared<std::string>(source_name)) | |
{ } | |
/// @brief Convenience override which uses line 1, column 1 as the starting point. | |
/// | |
/// @param next_stage | |
/// @param source_name | |
UTF8Decoder(StageFunction next_stage, | |
const std::string& source_name) | |
: UTF8Decoder(next_stage, source_name, Location()) | |
{ } | |
/// @brief Convenience override which uses line 1, column 1 as the starting point and "" as | |
/// the source name. | |
/// | |
/// @param next_stage | |
UTF8Decoder(StageFunction next_stage) | |
: UTF8Decoder(next_stage, "") | |
{ } | |
/// @brief Decode a UTF-8 code unit into zero or more Unicode code points. | |
/// | |
/// Call once per code unit. For each set of code units which form a valid Unicode code point | |
/// in UTF-8, the next stage functor will be called with that code point and a Cursor as an | |
/// argument. | |
/// | |
/// @param code_unit The Unicode code unit to decode. Values less than zero will be passed | |
/// directly to the next stage functor. | |
void operator () (int code_unit); | |
/// @brief Update the source name and starting location used to generator Cursor-s. | |
/// | |
/// @param source_name | |
/// @param loc | |
void set_source_location(const std::string& source_name, const Location& loc = Location()) { | |
current_loc_ = loc; | |
current_source_name_ = std::make_shared<const std::string>(source_name); | |
} | |
/// @brief Retrieve the current source name. | |
/// | |
/// @return | |
std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; } | |
}; | |
/// @name Conversions from UTF-8
/// Each of these runs the input through a UTF8Decoder; "special" (negative) code points produced by
/// the decoder are discarded.
/// @{

/// @brief Decode a UTF-8 string into a string of Unicode code points.
std::u32string utf8_to_u32(const std::string& utf8);

/// @brief Decode a UTF-8 string into a wide string.
std::wstring utf8_to_w(const std::string& utf8);

/// @brief Convert a UTF-8 string to UTF-16. Equivalent to u32_to_u16(utf8_to_u32(utf8)).
std::u16string utf8_to_u16(const std::string& utf8);

/// @}

/// @name Conversions to UTF-8
/// Each of these runs every element of the input through a UTF8Encoder.
/// @{

/// @brief Encode a string of Unicode code points as UTF-8.
std::string u32_to_utf8(const std::u32string& u32);

/// @brief Encode a wide string as UTF-8.
std::string w_to_utf8(const std::wstring& w);

/// @}

/// @brief Convert a string of Unicode code points to UTF-16. Code points above U+FFFF are written
/// as a surrogate pair.
std::u16string u32_to_u16(const std::u32string& u32);

// TODO: u16_to_{utf8, u32}
/// @} | |
} // namespace rcc | |
#endif // RCC_UTIL_ENCODINGS_HPP__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment