rjw57 · October 25, 2013 11:49
diff --git a/encoding.cpp b/encoding.cpp
 #include "encoding.hpp"

 #include <deque>
 #include <string>

 using namespace std;

 namespace rcc {

 /// @brief Feed one code unit to the decoder.
 ///
 /// Negative code units bypass decoding and are handed to the next stage unchanged. Octets on
 /// [0, 255] drive the decoding state machine; the next stage is called whenever an octet completes
 /// a code point. A continuation octet seen while no sequence is in progress is skipped so that
 /// decoding can start in the middle of a stream.
 ///
 /// @param code_unit A UTF-8 octet on [0, 255] or a negative "special" value.
 ///
 /// @throws std::runtime_error (via send_error()) on an invalid lead octet, a missing continuation
 /// octet, an overlong encoding, a UTF-16 surrogate or a code point above U+10FFFF. The decoder is
 /// always back in the START state when the exception leaves this function.
 void UTF8Decoder::operator () (int code_unit)
 {
 	// pass -ve code units to next stage without advancing location. send_current() reads
 	// current_code_point_, so park any partially decoded code point while the special value
 	// goes out and put it back afterwards; otherwise a special value arriving in the middle
 	// of a multi-octet sequence would corrupt that sequence.
 	if(code_unit < 0) {
 		const int partial = current_code_point_;
 		current_code_point_ = code_unit;
 		send_current();
 		current_code_point_ = partial;
 		return;
 	}

 	assert(code_unit <= 0xff);

 	// every octet after the lead octet must look like 10xxxxxx
 	const bool is_continuation = (0x80 == (code_unit & 0xc0));

 	switch(state_)
 	{
 	case START:
 		current_code_point_ = 0;

 		// 0xxxxxxx => the octet is the code point
 		if(0 == (code_unit & 0x80)) {
 			current_code_point_ = code_unit & 0x7f;
 			send_current();
 			return;
 		}

 		// 10xxxxxx => we've arrived in the middle of a sequence; skip to re-synchronise
 		if(is_continuation) {
 			return;
 		}

 		// 110xxxxx => one octet follows
 		if(0xc0 == (code_unit & 0xe0)) {
 			current_code_point_ = (code_unit & 0x1f) << 6;
 			if(current_code_point_ < 0x80) {
 				// lead octets 0xc0 and 0xc1 can only begin an overlong form of
 				// U+0000..U+007F
 				send_error();
 			}
 			state_ = ONE_OCTET_TO_GO;
 			return;
 		}

 		// 1110xxxx => two octets follow
 		if(0xe0 == (code_unit & 0xf0)) {
 			current_code_point_ = (code_unit & 0x0f) << 12;
 			state_ = TWO_OCTETS_TO_GO;
 			return;
 		}

 		// 11110xxx => three octets follow
 		if(0xf0 == (code_unit & 0xf8)) {
 			current_code_point_ = (code_unit & 0x07) << 18;
 			state_ = THREE_OCTETS_TO_GO;
 			return;
 		}

 		// 11111xxx is never valid in UTF-8
 		send_error();
 		break;

 	case ONE_OCTET_TO_GO:
 		// the sequence ends with this octet whatever happens, so go back to the start
 		// state first: an exception thrown below (or by the next stage) must not leave
 		// the decoder waiting for a continuation octet.
 		state_ = START;

 		if(!is_continuation) {
 			send_error();
 		}

 		// set lower 6 bits
 		current_code_point_ |= (code_unit & 0x3f);

 		if((current_code_point_ >= 0xd800) && (current_code_point_ <= 0xdfff)) {
 			// UTF-16 surrogates are not valid code points in UTF-8
 			send_error();
 		}

 		send_current();
 		break;

 	case TWO_OCTETS_TO_GO:
 		if(!is_continuation) {
 			state_ = START;
 			send_error();
 		}

 		// set middle 6 bits
 		current_code_point_ |= (code_unit & 0x3f) << 6;

 		// A three octet sequence must encode at least U+0800. Four octet sequences
 		// have already been checked to be >= U+10000 so never trip this test.
 		if(current_code_point_ < 0x800) {
 			state_ = START;
 			send_error();
 		}

 		state_ = ONE_OCTET_TO_GO;
 		break;

 	case THREE_OCTETS_TO_GO:
 		if(!is_continuation) {
 			state_ = START;
 			send_error();
 		}

 		// set high 6 bits
 		current_code_point_ |= (code_unit & 0x3f) << 12;

 		// A four octet sequence must encode U+10000..U+10FFFF. The remaining 12 bits
 		// cannot move the value across either bound so the range is decided here.
 		if((current_code_point_ < 0x10000) || (current_code_point_ > 0x10ffff)) {
 			state_ = START;
 			send_error();
 		}

 		state_ = TWO_OCTETS_TO_GO;
 		break;

 	default:
 		// state is invalid
 		std::cerr << __FILE__ << ": impossible state encountered: " << state_ << std::endl;
 		assert(false && "Unreachable code.");
 		break;
 	}
 }

 /// @brief Encode one Unicode code point as one to four UTF-8 octets.
 ///
 /// The next stage functor is called once per output octet, in stream order.
 ///
 /// @param code_point The code point to encode, on [0, 0x10FFFF].
 ///
 /// @throws std::invalid_argument if code_point is negative.
 /// @throws std::runtime_error if code_point is above U+10FFFF, the largest value UTF-8 may encode.
 void UTF8Encoder::operator () (int code_point)
 {
 	if(code_point < 0) {
 		throw std::invalid_argument("Invalid Unicode code point passed to UTF8Encoder");
 	}

 	if(code_point <= 0x7f) {
 		// single octet output
 		next_stage_(code_point);
 		return;
 	}

 	if(code_point <= 0x7ff) {
 		// two octet output
 		next_stage_(0xc0 | ((code_point >> 6) & 0x1f));
 		next_stage_(0x80 | (code_point & 0x3f));
 		return;
 	}

 	if(code_point <= 0xffff) {
 		// three octet output
 		next_stage_(0xe0 | ((code_point >> 12) & 0x0f));
 		next_stage_(0x80 | ((code_point >> 6) & 0x3f));
 		next_stage_(0x80 | (code_point & 0x3f));
 		return;
 	}

 	if(code_point <= 0x10ffff) {
 		// four octet output
 		next_stage_(0xf0 | ((code_point >> 18) & 0x07));
 		next_stage_(0x80 | ((code_point >> 12) & 0x3f));
 		next_stage_(0x80 | ((code_point >> 6) & 0x3f));
 		next_stage_(0x80 | (code_point & 0x3f));
 		return;
 	}

 	// invalid code point! Log it in hex, then restore the stream's formatting flags so that
 	// later output on cerr is not silently switched to hexadecimal.
 	const std::ios_base::fmtflags saved_flags = std::cerr.flags();
 	std::cerr << __FILE__ << ": invalid code point: 0x" << std::hex << code_point << std::endl;
 	std::cerr.flags(saved_flags);
 	throw std::runtime_error("invalid code point in input UTF-8 output stream.");
 }

 /// Decode a UTF-8 byte string into a string of code points. Negative ("special") values
 /// reported by the decoder are left out of the result.
 u32string utf8_to_u32(const string& utf8)
 {
 	u32string decoded;
 	UTF8Decoder decoder([&decoded] (int cp, const Cursor&) {
 		if(cp < 0) {
 			return;
 		}
 		decoded.push_back(static_cast<char32_t>(cp));
 	});

 	// go through unsigned char so that octets >= 0x80 are not seen as negative values
 	for(string::const_iterator it = utf8.begin(); it != utf8.end(); ++it) {
 		decoder(static_cast<unsigned char>(*it));
 	}

 	return decoded;
 }

 /// Encode a string of code points as a UTF-8 byte string, one UTF8Encoder call per code point.
 string u32_to_utf8(const u32string& u32)
 {
 	string encoded;
 	UTF8Encoder encoder([&encoded] (int octet) {
 		encoded.push_back(static_cast<char>(octet));
 	});

 	for(u32string::const_iterator it = u32.begin(); it != u32.end(); ++it) {
 		encoder(*it);
 	}

 	return encoded;
 }

 /// Decode a UTF-8 byte string into a wide string.
 ///
 /// Where wchar_t is at least 32 bits wide every code point becomes one wchar_t. Where it is
 /// narrower, code points above U+FFFF are written as a UTF-16 surrogate pair instead of being
 /// truncated. Negative ("special") values reported by the decoder are left out of the result.
 wstring utf8_to_w(const string& utf8)
 {
 	wstring wide;
 	UTF8Decoder decode([&wide] (int code_point, const Cursor&) {
 		if(code_point < 0) {
 			return;
 		}

 		if((sizeof(wchar_t) < 4) && (code_point > 0xffff)) {
 			// does not fit in one wchar_t: split into high and low surrogate
 			const int offset = code_point - 0x10000;
 			wide.push_back(static_cast<wchar_t>(0xd800 + (offset >> 10)));
 			wide.push_back(static_cast<wchar_t>(0xdc00 + (offset & 0x3ff)));
 		} else {
 			wide.push_back(static_cast<wchar_t>(code_point));
 		}
 	});

 	// go through unsigned char so that octets >= 0x80 are not seen as negative values
 	for( char ch : utf8 ) { decode(static_cast<unsigned char>(ch)); }

 	return wide;
 }

 /// Encode a wide string as a UTF-8 byte string.
 ///
 /// Where wchar_t is narrower than 32 bits the input is treated as UTF-16: a high surrogate
 /// directly followed by a low surrogate is combined into one code point before encoding, so the
 /// pair becomes a single four octet sequence. Unpaired surrogates are passed to the encoder as
 /// they are. Where wchar_t is at least 32 bits wide every element is encoded on its own.
 string w_to_utf8(const wstring& w)
 {
 	string utf8;
 	UTF8Encoder encode([&utf8] (int code_unit) {
 		utf8.push_back(static_cast<char>(code_unit));
 	});

 	for(wstring::size_type i = 0; i < w.size(); ++i) {
 		int code_point = static_cast<int>(w[i]);

 		const bool is_high_surrogate = (code_point >= 0xd800) && (code_point <= 0xdbff);
 		if((sizeof(wchar_t) < 4) && is_high_surrogate && (i + 1 < w.size())) {
 			const int low = static_cast<int>(w[i + 1]);
 			if((low >= 0xdc00) && (low <= 0xdfff)) {
 				code_point = 0x10000 + ((code_point - 0xd800) << 10) + (low - 0xdc00);
 				++i; // the low surrogate has been consumed as well
 			}
 		}

 		encode(code_point);
 	}

 	return utf8;
 }

 /// Convert a string of code points to UTF-16 code units.
 ///
 /// Code points up to U+FFFF are copied as a single code unit (surrogate values in the input are
 /// therefore passed through untouched); code points up to U+10FFFF become a surrogate pair.
 ///
 /// @throws std::runtime_error if a code point is above U+10FFFF, which UTF-16 cannot represent.
 std::u16string u32_to_u16(const std::u32string& u32)
 {
 	std::u16string utf16;
 	utf16.reserve(u32.size()); // exact unless surrogate pairs are needed

 	for( const char32_t cp : u32 ) {
 		if(cp <= 0xffff) {
 			utf16.push_back(static_cast<char16_t>(cp));
 		} else if(cp <= 0x10ffff) {
 			const char32_t offset = cp - 0x10000;
 			utf16.push_back(static_cast<char16_t>(0xd800 + (offset >> 10)));
 			utf16.push_back(static_cast<char16_t>(0xdc00 + (offset & 0x3ff)));
 		} else {
 			// the surrogate arithmetic above would overflow into the low surrogate range
 			throw std::runtime_error("invalid code point in input UTF-32 string.");
 		}
 	}

 	return utf16;
 }

 /// Convert UTF-8 to UTF-16 by way of an intermediate string of code points.
 u16string utf8_to_u16(const string& utf8)
 {
 	// two passes rather than a direct transcoder
 	const u32string code_points = utf8_to_u32(utf8);
 	return u32_to_u16(code_points);
 }

 } // namespace rcc
diff --git a/encoding.hpp b/encoding.hpp
 /// @file
 /// @brief Translation of source file characters to/from Unicode code points.
 ///
 /// The first stage of processing is handled by code within this file. The UTF8Decoder class
 /// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
 /// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
 /// which represents the remaining phases of translation.
 ///
 /// The CodePoints scoped enum also defines some useful non-graphical code points along with some
 /// 'special' code points which can be used to indicate various conditions to later pipeline stages.
 ///
 /// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
 ///
 #pragma once
 #ifndef RCC_UTIL_ENCODINGS_HPP__
 #define RCC_UTIL_ENCODINGS_HPP__

 #include <cassert>
 #include <functional>
 #include <iostream>
 #include <stdexcept>
 #include <string>

 #include "cursor.hpp"
 #include "special_code_points.hpp"

 namespace rcc {

 /// @addtogroup util
 /// @{

 /// @brief Pipeline stage turning Unicode code points into UTF-8 code units.
 ///
 /// An encoder is built around a functor representing the next stage of the processing pipeline.
 /// Each call to UTF8Encoder::operator () takes one code point and forwards the resulting code
 /// units to that functor, one call per code unit.
 ///
 /// "Special" code points, i.e. those which are \f$ \lt 0 \f$, cannot be encoded and make
 /// UTF8Encoder::operator () throw std::invalid_argument.
 ///
 class UTF8Encoder {
 	private:

 	/// Signature of the next stage: receives one code unit per call.
 	using StageFunction = std::function<void (int)>;

 	/// Receiver of the encoded code units.
 	StageFunction next_stage_;

 	public:

 	/// @brief Construct an encoder which writes to @p next_stage.
 	///
 	/// @param next_stage Functor called once for every code unit produced.
 	UTF8Encoder(StageFunction next_stage)
 		: next_stage_(next_stage)
 	{ }

 	/// @brief Encode one Unicode code point.
 	///
 	/// The next stage functor is called once per resulting code unit, each passed as an
 	/// integer.
 	///
 	/// @param code_point The Unicode code point to encode. Negative values throw
 	/// std::invalid_argument.
 	void operator () (int code_point);
 };

 /// @brief Decode Unicode code points from a stream of UTF-8 code units.
 ///
 /// Instances are initialised with a function which is used to pass the decoded code points to the
 /// next stage in the processing pipeline.
 ///
 /// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for
 /// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through
 /// to the next stage functor as "special" code points. As the code units are decoded to code points,
 /// the passed functor will be called once for every output code point.
 ///
 /// In addition to the decoded code points, a Cursor detailing the code point physical line, column
 /// and offset from the start of the file is passed to the next stage functor. Every decoded code
 /// point, U+0000 included, advances that location; "special" code points do not.
 class UTF8Decoder
 {
 	private:

 	typedef std::function<void (int, const rcc::Cursor&)> StageFunction;

 	/// How many further octets are needed to finish the code point being assembled.
 	enum State {
 		START,
 		ONE_OCTET_TO_GO,
 		TWO_OCTETS_TO_GO,
 		THREE_OCTETS_TO_GO,
 	};

 	StageFunction 	 next_stage_;          ///< receiver of decoded code points
 	State            state_;               ///< position within the current sequence
 	int		 current_code_point_;  ///< code point being assembled or passed on
 	Location         current_loc_;         ///< location of the next code point to be sent
 	std::shared_ptr<const std::string> current_source_name_; ///< name attached to Cursor-s

 	/// Report malformed input by throwing std::runtime_error.
 	void send_error() {
 		throw std::runtime_error("Invalid UTF-8 encountered in input stream.");
 	}

 	/// Pass current_code_point_ to the next stage together with a Cursor for the current
 	/// location, then move the location past the code point.
 	void send_current() {
 		next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_));

 		// Only "special" (negative) code points leave the location alone. U+0000 is an
 		// ordinary code point which occupies a place in the input and so must advance it.
 		if(current_code_point_ >= 0) {
 			++current_loc_.index;
 			++current_loc_.column;

 			if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) {
 				++current_loc_.line;
 				current_loc_.column = 1;
 			}
 		}
 	}

 	public:

 	/// @brief Construct a UTF8Decoder.
 	///
 	/// @param next_stage Functor called with each decoded code point and its Cursor.
 	/// @param source_name The source name which is associated with Cursor-s passed to next stage.
 	/// @param loc The Location to use as the first location in the input stream.
 	UTF8Decoder(StageFunction next_stage,
 		    const std::string& source_name,
 		    const Location& loc)
 		: next_stage_(next_stage)
 		, state_(START)
 		, current_code_point_(0)
 		, current_loc_(loc)
 		, current_source_name_(std::make_shared<const std::string>(source_name))
 	{ }

 	/// @brief Convenience override which uses a default-constructed Location as the starting
 	/// point.
 	///
 	/// @param next_stage Functor called with each decoded code point and its Cursor.
 	/// @param source_name The source name which is associated with Cursor-s passed to next stage.
 	UTF8Decoder(StageFunction next_stage,
 		    const std::string& source_name)
 		: UTF8Decoder(next_stage, source_name, Location())
 	{ }

 	/// @brief Convenience override which uses a default-constructed Location as the starting
 	/// point and "" as the source name.
 	///
 	/// @param next_stage Functor called with each decoded code point and its Cursor.
 	UTF8Decoder(StageFunction next_stage)
 		: UTF8Decoder(next_stage, "")
 	{ }

 	/// @brief Decode a UTF-8 code unit into zero or more Unicode code points.
 	///
 	/// Call once per code unit. For each set of code units which form a valid Unicode code point
 	/// in UTF-8, the next stage functor will be called with that code point and a Cursor as an
 	/// argument.
 	///
 	/// @param code_unit The Unicode code unit to decode. Values less than zero will be passed
 	/// directly to the next stage functor.
 	void operator () (int code_unit);

 	/// @brief Update the source name and starting location used to generate Cursor-s.
 	///
 	/// @param source_name The new source name.
 	/// @param loc The Location given to the next code point sent to the next stage.
 	void set_source_location(const std::string& source_name, const Location& loc = Location()) {
 		current_loc_ = loc;
 		current_source_name_ = std::make_shared<const std::string>(source_name);
 	}

 	/// @brief Retrieve the current source name.
 	///
 	/// @return A copy of the source name, or "" if none is set.
 	std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; }
 };

 /// @brief Decode a UTF-8 byte string into a string of Unicode code points using UTF8Decoder.
 std::u32string utf8_to_u32(const std::string& utf8);
 /// @brief Encode a string of Unicode code points as a UTF-8 byte string using UTF8Encoder.
 std::string u32_to_utf8(const std::u32string& u32);

 /// @brief Decode a UTF-8 byte string into a wide string using UTF8Decoder.
 std::wstring utf8_to_w(const std::string& utf8);
 /// @brief Encode a wide string as a UTF-8 byte string using UTF8Encoder.
 std::string w_to_utf8(const std::wstring& w);

 /// @brief Convert code points to UTF-16 code units; code points above U+FFFF become a
 /// surrogate pair.
 std::u16string u32_to_u16(const std::u32string& u32);
 /// @brief Convert UTF-8 to UTF-16 by chaining utf8_to_u32() and u32_to_u16().
 std::u16string utf8_to_u16(const std::string& utf8);

 // TODO: u16_to_{utf8, u32}

 /// @}

 } // namespace rcc

 #endif // RCC_UTIL_ENCODINGS_HPP__
	#include "encoding.hpp"

	#include <deque>
	#include <string>

	using namespace std;

	namespace rcc {

	/// @brief Feed one code unit to the decoder.
	///
	/// Negative code units bypass decoding and are handed to the next stage unchanged. Octets on
	/// [0, 255] drive the decoding state machine; the next stage is called whenever an octet completes
	/// a code point. A continuation octet seen while no sequence is in progress is skipped so that
	/// decoding can start in the middle of a stream.
	///
	/// @param code_unit A UTF-8 octet on [0, 255] or a negative "special" value.
	///
	/// @throws std::runtime_error (via send_error()) on an invalid lead octet, a missing continuation
	/// octet, an overlong encoding, a UTF-16 surrogate or a code point above U+10FFFF. The decoder is
	/// always back in the START state when the exception leaves this function.
	void UTF8Decoder::operator () (int code_unit)
	{
		// pass -ve code units to next stage without advancing location. send_current() reads
		// current_code_point_, so park any partially decoded code point while the special value
		// goes out and put it back afterwards; otherwise a special value arriving in the middle
		// of a multi-octet sequence would corrupt that sequence.
		if(code_unit < 0) {
			const int partial = current_code_point_;
			current_code_point_ = code_unit;
			send_current();
			current_code_point_ = partial;
			return;
		}

		assert(code_unit <= 0xff);

		// every octet after the lead octet must look like 10xxxxxx
		const bool is_continuation = (0x80 == (code_unit & 0xc0));

		switch(state_)
		{
		case START:
			current_code_point_ = 0;

			// 0xxxxxxx => the octet is the code point
			if(0 == (code_unit & 0x80)) {
				current_code_point_ = code_unit & 0x7f;
				send_current();
				return;
			}

			// 10xxxxxx => we've arrived in the middle of a sequence; skip to re-synchronise
			if(is_continuation) {
				return;
			}

			// 110xxxxx => one octet follows
			if(0xc0 == (code_unit & 0xe0)) {
				current_code_point_ = (code_unit & 0x1f) << 6;
				if(current_code_point_ < 0x80) {
					// lead octets 0xc0 and 0xc1 can only begin an overlong form of
					// U+0000..U+007F
					send_error();
				}
				state_ = ONE_OCTET_TO_GO;
				return;
			}

			// 1110xxxx => two octets follow
			if(0xe0 == (code_unit & 0xf0)) {
				current_code_point_ = (code_unit & 0x0f) << 12;
				state_ = TWO_OCTETS_TO_GO;
				return;
			}

			// 11110xxx => three octets follow
			if(0xf0 == (code_unit & 0xf8)) {
				current_code_point_ = (code_unit & 0x07) << 18;
				state_ = THREE_OCTETS_TO_GO;
				return;
			}

			// 11111xxx is never valid in UTF-8
			send_error();
			break;

		case ONE_OCTET_TO_GO:
			// the sequence ends with this octet whatever happens, so go back to the start
			// state first: an exception thrown below (or by the next stage) must not leave
			// the decoder waiting for a continuation octet.
			state_ = START;

			if(!is_continuation) {
				send_error();
			}

			// set lower 6 bits
			current_code_point_ |= (code_unit & 0x3f);

			if((current_code_point_ >= 0xd800) && (current_code_point_ <= 0xdfff)) {
				// UTF-16 surrogates are not valid code points in UTF-8
				send_error();
			}

			send_current();
			break;

		case TWO_OCTETS_TO_GO:
			if(!is_continuation) {
				state_ = START;
				send_error();
			}

			// set middle 6 bits
			current_code_point_ |= (code_unit & 0x3f) << 6;

			// A three octet sequence must encode at least U+0800. Four octet sequences
			// have already been checked to be >= U+10000 so never trip this test.
			if(current_code_point_ < 0x800) {
				state_ = START;
				send_error();
			}

			state_ = ONE_OCTET_TO_GO;
			break;

		case THREE_OCTETS_TO_GO:
			if(!is_continuation) {
				state_ = START;
				send_error();
			}

			// set high 6 bits
			current_code_point_ |= (code_unit & 0x3f) << 12;

			// A four octet sequence must encode U+10000..U+10FFFF. The remaining 12 bits
			// cannot move the value across either bound so the range is decided here.
			if((current_code_point_ < 0x10000) || (current_code_point_ > 0x10ffff)) {
				state_ = START;
				send_error();
			}

			state_ = TWO_OCTETS_TO_GO;
			break;

		default:
			// state is invalid
			std::cerr << __FILE__ << ": impossible state encountered: " << state_ << std::endl;
			assert(false && "Unreachable code.");
			break;
		}
	}

	/// @brief Encode one Unicode code point as one to four UTF-8 octets.
	///
	/// The next stage functor is called once per output octet, in stream order.
	///
	/// @param code_point The code point to encode, on [0, 0x10FFFF].
	///
	/// @throws std::invalid_argument if code_point is negative.
	/// @throws std::runtime_error if code_point is above U+10FFFF, the largest value UTF-8 may encode.
	void UTF8Encoder::operator () (int code_point)
	{
		if(code_point < 0) {
			throw std::invalid_argument("Invalid Unicode code point passed to UTF8Encoder");
		}

		if(code_point <= 0x7f) {
			// single octet output
			next_stage_(code_point);
			return;
		}

		if(code_point <= 0x7ff) {
			// two octet output
			next_stage_(0xc0 | ((code_point >> 6) & 0x1f));
			next_stage_(0x80 | (code_point & 0x3f));
			return;
		}

		if(code_point <= 0xffff) {
			// three octet output
			next_stage_(0xe0 | ((code_point >> 12) & 0x0f));
			next_stage_(0x80 | ((code_point >> 6) & 0x3f));
			next_stage_(0x80 | (code_point & 0x3f));
			return;
		}

		if(code_point <= 0x10ffff) {
			// four octet output
			next_stage_(0xf0 | ((code_point >> 18) & 0x07));
			next_stage_(0x80 | ((code_point >> 12) & 0x3f));
			next_stage_(0x80 | ((code_point >> 6) & 0x3f));
			next_stage_(0x80 | (code_point & 0x3f));
			return;
		}

		// invalid code point! Log it in hex, then restore the stream's formatting flags so that
		// later output on cerr is not silently switched to hexadecimal.
		const std::ios_base::fmtflags saved_flags = std::cerr.flags();
		std::cerr << __FILE__ << ": invalid code point: 0x" << std::hex << code_point << std::endl;
		std::cerr.flags(saved_flags);
		throw std::runtime_error("invalid code point in input UTF-8 output stream.");
	}

	/// Decode a UTF-8 byte string into a string of code points. Negative ("special") values
	/// reported by the decoder are left out of the result.
	u32string utf8_to_u32(const string& utf8)
	{
		u32string decoded;
		UTF8Decoder decoder([&decoded] (int cp, const Cursor&) {
			if(cp < 0) {
				return;
			}
			decoded.push_back(static_cast<char32_t>(cp));
		});

		// go through unsigned char so that octets >= 0x80 are not seen as negative values
		for(string::const_iterator it = utf8.begin(); it != utf8.end(); ++it) {
			decoder(static_cast<unsigned char>(*it));
		}

		return decoded;
	}

	/// Encode a string of code points as a UTF-8 byte string, one UTF8Encoder call per code point.
	string u32_to_utf8(const u32string& u32)
	{
		string encoded;
		UTF8Encoder encoder([&encoded] (int octet) {
			encoded.push_back(static_cast<char>(octet));
		});

		for(u32string::const_iterator it = u32.begin(); it != u32.end(); ++it) {
			encoder(*it);
		}

		return encoded;
	}

	/// Decode a UTF-8 byte string into a wide string.
	///
	/// Where wchar_t is at least 32 bits wide every code point becomes one wchar_t. Where it is
	/// narrower, code points above U+FFFF are written as a UTF-16 surrogate pair instead of being
	/// truncated. Negative ("special") values reported by the decoder are left out of the result.
	wstring utf8_to_w(const string& utf8)
	{
		wstring wide;
		UTF8Decoder decode([&wide] (int code_point, const Cursor&) {
			if(code_point < 0) {
				return;
			}

			if((sizeof(wchar_t) < 4) && (code_point > 0xffff)) {
				// does not fit in one wchar_t: split into high and low surrogate
				const int offset = code_point - 0x10000;
				wide.push_back(static_cast<wchar_t>(0xd800 + (offset >> 10)));
				wide.push_back(static_cast<wchar_t>(0xdc00 + (offset & 0x3ff)));
			} else {
				wide.push_back(static_cast<wchar_t>(code_point));
			}
		});

		// go through unsigned char so that octets >= 0x80 are not seen as negative values
		for( char ch : utf8 ) { decode(static_cast<unsigned char>(ch)); }

		return wide;
	}

	/// Encode a wide string as a UTF-8 byte string.
	///
	/// Where wchar_t is narrower than 32 bits the input is treated as UTF-16: a high surrogate
	/// directly followed by a low surrogate is combined into one code point before encoding, so the
	/// pair becomes a single four octet sequence. Unpaired surrogates are passed to the encoder as
	/// they are. Where wchar_t is at least 32 bits wide every element is encoded on its own.
	string w_to_utf8(const wstring& w)
	{
		string utf8;
		UTF8Encoder encode([&utf8] (int code_unit) {
			utf8.push_back(static_cast<char>(code_unit));
		});

		for(wstring::size_type i = 0; i < w.size(); ++i) {
			int code_point = static_cast<int>(w[i]);

			const bool is_high_surrogate = (code_point >= 0xd800) && (code_point <= 0xdbff);
			if((sizeof(wchar_t) < 4) && is_high_surrogate && (i + 1 < w.size())) {
				const int low = static_cast<int>(w[i + 1]);
				if((low >= 0xdc00) && (low <= 0xdfff)) {
					code_point = 0x10000 + ((code_point - 0xd800) << 10) + (low - 0xdc00);
					++i; // the low surrogate has been consumed as well
				}
			}

			encode(code_point);
		}

		return utf8;
	}

	/// Convert a string of code points to UTF-16 code units.
	///
	/// Code points up to U+FFFF are copied as a single code unit (surrogate values in the input are
	/// therefore passed through untouched); code points up to U+10FFFF become a surrogate pair.
	///
	/// @throws std::runtime_error if a code point is above U+10FFFF, which UTF-16 cannot represent.
	std::u16string u32_to_u16(const std::u32string& u32)
	{
		std::u16string utf16;
		utf16.reserve(u32.size()); // exact unless surrogate pairs are needed

		for( const char32_t cp : u32 ) {
			if(cp <= 0xffff) {
				utf16.push_back(static_cast<char16_t>(cp));
			} else if(cp <= 0x10ffff) {
				const char32_t offset = cp - 0x10000;
				utf16.push_back(static_cast<char16_t>(0xd800 + (offset >> 10)));
				utf16.push_back(static_cast<char16_t>(0xdc00 + (offset & 0x3ff)));
			} else {
				// the surrogate arithmetic above would overflow into the low surrogate range
				throw std::runtime_error("invalid code point in input UTF-32 string.");
			}
		}

		return utf16;
	}

	/// Convert UTF-8 to UTF-16 by way of an intermediate string of code points.
	u16string utf8_to_u16(const string& utf8)
	{
		// two passes rather than a direct transcoder
		const u32string code_points = utf8_to_u32(utf8);
		return u32_to_u16(code_points);
	}

	} // namespace rcc
	/// @file
	/// @brief Translation of source file characters to/from Unicode code points.
	///
	/// The first stage of processing is handled by code within this file. The UTF8Decoder class
	/// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
	/// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
	/// which represents the remaining phases of translation.
	///
	/// The CodePoints scoped enum also defines some useful non-graphical code points along with some
	/// 'special' code points which can be used to indicate various conditions to later pipeline stages.
	///
	/// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
	///
	#pragma once
	#ifndef RCC_UTIL_ENCODINGS_HPP__
	#define RCC_UTIL_ENCODINGS_HPP__

	#include <cassert>
	#include <functional>
	#include <iostream>
	#include <stdexcept>
	#include <string>

	#include "cursor.hpp"
	#include "special_code_points.hpp"

	namespace rcc {

	/// @addtogroup util
	/// @{

	/// @brief Pipeline stage turning Unicode code points into UTF-8 code units.
	///
	/// An encoder is built around a functor representing the next stage of the processing pipeline.
	/// Each call to UTF8Encoder::operator () takes one code point and forwards the resulting code
	/// units to that functor, one call per code unit.
	///
	/// "Special" code points, i.e. those which are \f$ \lt 0 \f$, cannot be encoded and make
	/// UTF8Encoder::operator () throw std::invalid_argument.
	///
	class UTF8Encoder {
		private:

		/// Signature of the next stage: receives one code unit per call.
		using StageFunction = std::function<void (int)>;

		/// Receiver of the encoded code units.
		StageFunction next_stage_;

		public:

		/// @brief Construct an encoder which writes to @p next_stage.
		///
		/// @param next_stage Functor called once for every code unit produced.
		UTF8Encoder(StageFunction next_stage)
			: next_stage_(next_stage)
		{ }

		/// @brief Encode one Unicode code point.
		///
		/// The next stage functor is called once per resulting code unit, each passed as an
		/// integer.
		///
		/// @param code_point The Unicode code point to encode. Negative values throw
		/// std::invalid_argument.
		void operator () (int code_point);
	};

	/// @brief Decode Unicode code points from a stream of UTF-8 code units.
	///
	/// Instances are initialised with a function which is used to pass the decoded code points to the
	/// next stage in the processing pipeline.
	///
	/// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for
	/// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through
	/// to the next stage functor as "special" code points. As the code units are decoded to code points,
	/// the passed functor will be called once for every output code point.
	///
	/// In addition to the decoded code points, a Cursor detailing the code point physical line, column
	/// and offset from the start of the file is passed to the next stage functor. Every decoded code
	/// point, U+0000 included, advances that location; "special" code points do not.
	class UTF8Decoder
	{
		private:

		typedef std::function<void (int, const rcc::Cursor&)> StageFunction;

		/// How many further octets are needed to finish the code point being assembled.
		enum State {
			START,
			ONE_OCTET_TO_GO,
			TWO_OCTETS_TO_GO,
			THREE_OCTETS_TO_GO,
		};

		StageFunction next_stage_;   ///< receiver of decoded code points
		State state_;                ///< position within the current sequence
		int current_code_point_;     ///< code point being assembled or passed on
		Location current_loc_;       ///< location of the next code point to be sent
		std::shared_ptr<const std::string> current_source_name_; ///< name attached to Cursor-s

		/// Report malformed input by throwing std::runtime_error.
		void send_error() {
			throw std::runtime_error("Invalid UTF-8 encountered in input stream.");
		}

		/// Pass current_code_point_ to the next stage together with a Cursor for the current
		/// location, then move the location past the code point.
		void send_current() {
			next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_));

			// Only "special" (negative) code points leave the location alone. U+0000 is an
			// ordinary code point which occupies a place in the input and so must advance it.
			if(current_code_point_ >= 0) {
				++current_loc_.index;
				++current_loc_.column;

				if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) {
					++current_loc_.line;
					current_loc_.column = 1;
				}
			}
		}

		public:

		/// @brief Construct a UTF8Decoder.
		///
		/// @param next_stage Functor called with each decoded code point and its Cursor.
		/// @param source_name The source name which is associated with Cursor-s passed to next stage.
		/// @param loc The Location to use as the first location in the input stream.
		UTF8Decoder(StageFunction next_stage,
			    const std::string& source_name,
			    const Location& loc)
			: next_stage_(next_stage)
			, state_(START)
			, current_code_point_(0)
			, current_loc_(loc)
			, current_source_name_(std::make_shared<const std::string>(source_name))
		{ }

		/// @brief Convenience override which uses a default-constructed Location as the starting
		/// point.
		///
		/// @param next_stage Functor called with each decoded code point and its Cursor.
		/// @param source_name The source name which is associated with Cursor-s passed to next stage.
		UTF8Decoder(StageFunction next_stage,
			    const std::string& source_name)
			: UTF8Decoder(next_stage, source_name, Location())
		{ }

		/// @brief Convenience override which uses a default-constructed Location as the starting
		/// point and "" as the source name.
		///
		/// @param next_stage Functor called with each decoded code point and its Cursor.
		UTF8Decoder(StageFunction next_stage)
			: UTF8Decoder(next_stage, "")
		{ }

		/// @brief Decode a UTF-8 code unit into zero or more Unicode code points.
		///
		/// Call once per code unit. For each set of code units which form a valid Unicode code point
		/// in UTF-8, the next stage functor will be called with that code point and a Cursor as an
		/// argument.
		///
		/// @param code_unit The Unicode code unit to decode. Values less than zero will be passed
		/// directly to the next stage functor.
		void operator () (int code_unit);

		/// @brief Update the source name and starting location used to generate Cursor-s.
		///
		/// @param source_name The new source name.
		/// @param loc The Location given to the next code point sent to the next stage.
		void set_source_location(const std::string& source_name, const Location& loc = Location()) {
			current_loc_ = loc;
			current_source_name_ = std::make_shared<const std::string>(source_name);
		}

		/// @brief Retrieve the current source name.
		///
		/// @return A copy of the source name, or "" if none is set.
		std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; }
	};

	/// @brief Decode a UTF-8 byte string into a string of Unicode code points using UTF8Decoder.
	std::u32string utf8_to_u32(const std::string& utf8);
	/// @brief Encode a string of Unicode code points as a UTF-8 byte string using UTF8Encoder.
	std::string u32_to_utf8(const std::u32string& u32);

	/// @brief Decode a UTF-8 byte string into a wide string using UTF8Decoder.
	std::wstring utf8_to_w(const std::string& utf8);
	/// @brief Encode a wide string as a UTF-8 byte string using UTF8Encoder.
	std::string w_to_utf8(const std::wstring& w);

	/// @brief Convert code points to UTF-16 code units; code points above U+FFFF become a
	/// surrogate pair.
	std::u16string u32_to_u16(const std::u32string& u32);
	/// @brief Convert UTF-8 to UTF-16 by chaining utf8_to_u32() and u32_to_u16().
	std::u16string utf8_to_u16(const std::string& utf8);

	// TODO: u16_to_{utf8, u32}

	/// @}

	} // namespace rcc

	#endif // RCC_UTIL_ENCODINGS_HPP__