rjw57 · October 25, 2013 11:49
diff --git a/encoding.hpp b/encoding.hpp
 /// @file
 /// @brief Translation of source file characters to/from Unicode code points.
 ///
 /// The first stage of processing is handled by code within this file. The UTF8Decoder class
 /// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
 /// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
 /// which represents the remaining phases of translation.
 ///
 /// The CodePoints scoped enum also defines some useful non-graphical code points along with some
 /// 'special' code points which can be used to indicate various conditions to later pipeline stages.
 ///
 /// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
 ///
 #pragma once
 #ifndef RCC_UTIL_ENCODINGS_HPP__
 #define RCC_UTIL_ENCODINGS_HPP__

 #include <cassert>
 #include <functional>
 #include <iostream>
 #include <stdexcept>
 #include <string>

 #include "cursor.hpp"
 #include "special_code_points.hpp"

 namespace rcc {

 /// @addtogroup util
 /// @{

 /// @brief Encode Unicode code points to output UTF-8 code units.
 ///
 /// Instances are initialised with a function which is used to pass the encoded code units to the next
 /// stage in the processing pipeline.
 ///
 /// Encoding is performed by calling UTF8Encoder::operator () with a Unicode code point. As the code
 /// points are encoded to code units, the passed functor will be called once for every output code
 /// unit. "Special" code points, i.e. those which are \f$ \lt 0 \f$, will cause an instance of
 /// std::invalid_argument to be thrown from UTF8Encoder::operator ().
 ///
 class UTF8Encoder {
 	private:

 	/// Shape of the downstream stage: it is handed one code unit per call.
 	using StageFunction = std::function<void (int)>;

 	public:

 	/// @brief Create an encoder which hands its output to @p next_stage.
 	///
 	/// @param next_stage Functor receiving the encoded code units; the encoder keeps its own copy.
 	UTF8Encoder(StageFunction next_stage) : next_stage_(next_stage) { }

 	/// @brief Turn a single Unicode code point into its code unit(s).
 	///
 	/// Each resulting code unit is delivered, as an integer, through a separate call to the next
 	/// stage functor. Only the declaration lives in this header.
 	///
 	/// @param code_point The Unicode code point to encode. The documented contract is that values
 	/// less than zero cause a std::invalid_argument exception to be thrown.
 	void operator () (int code_point);

 	private:

 	/// Downstream stage supplied at construction time.
 	StageFunction next_stage_;
 };

 /// @brief Decode Unicode code points from a stream of UTF-8 code units.
 ///
 /// Instances are initialised with a function which is used to pass the decoded code points to the
 /// next stage in the processing pipeline.
 ///
 /// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for
 /// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through
 /// to the next stage functor as "special" code points. As the code units are decoded to code points,
 /// the passed functor will be called once for every output code point.
 ///
 /// In addition to the decoded code points, a Cursor detailing the code point physical line, column
 /// and offset from the start of the file is passed to the next stage functor.
 class UTF8Decoder
 {
 	private:

 	typedef std::function<void (int, const rcc::Cursor&)> StageFunction;

 	enum State {
 		START,
 		ONE_OCTET_TO_GO,
 		TWO_OCTETS_TO_GO,
 		THREE_OCTETS_TO_GO,
 	};

 	StageFunction 	 next_stage_;
 	State            state_;
 	int		 current_code_point_;
 	Location         current_loc_;
 	std::shared_ptr<const std::string> current_source_name_;

 	void send_error() {
 		throw std::runtime_error("Invalid UTF-8 encountered in input stream.");
 	}

 	void send_current() {
 		next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_));
 		if(current_code_point_ > 0) {
 			++current_loc_.index;
 			++current_loc_.column;

 			if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) {
 				++current_loc_.line;
 				current_loc_.column = 1;
 			}
 		}
 	}

       	public:

 	/// @brief Construct a UTF8Decoder.
 	///
 	/// @param next_stage
 	/// @param source_name The source name which is associated with Cursor-s passed to next stage.
 	/// @param loc The Location to use as the first location in the input stream.
 	UTF8Decoder(StageFunction next_stage,
 		    const std::string& source_name,
 		    const Location& loc)
 		: next_stage_(next_stage)
 		, state_(START)
 		, current_code_point_(0)
 		, current_loc_(loc)
 		, current_source_name_(std::make_shared<std::string>(source_name))
 	{ }

 	/// @brief Convenience override which uses line 1, column 1 as the starting point.
 	///
 	/// @param next_stage
 	/// @param source_name
 	UTF8Decoder(StageFunction next_stage,
 		    const std::string& source_name)
 		: UTF8Decoder(next_stage, source_name, Location())
 	{ }
 	
 	/// @brief Convenience override which uses line 1, column 1 as the starting point and "" as
 	/// the source name.
 	///
 	/// @param next_stage
 	UTF8Decoder(StageFunction next_stage)
 		: UTF8Decoder(next_stage, "")
 	{ }

 	/// @brief Decode a UTF-8 code unit into zero or more Unicode code points.
 	///
 	/// Call once per code unit. For each set of code units which form a valid Unicode code point
 	/// in UTF-8, the next stage functor will be called with that code point and a Cursor as an
 	/// argument.
 	///
 	/// @param code_unit The Unicode code unit to decode. Values less than zero will be passed
 	/// directly to the next stage functor.
 	void operator () (int code_unit);

 	/// @brief Update the source name and starting location used to generator Cursor-s.
 	///
 	/// @param source_name
 	/// @param loc
 	void set_source_location(const std::string& source_name, const Location& loc = Location()) {
 		current_loc_ = loc;
 		current_source_name_ = std::make_shared<const std::string>(source_name);
 	}

 	/// @brief Retrieve the current source name.
 	///
 	/// @return 
 	std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; }
 };

 // Whole-string transcoding helpers. Only the declarations are visible in this header; the
 // definitions live elsewhere.
 // NOTE(review): going by the names and signatures, each function converts its argument from the
 // encoding named before "_to_" into the encoding named after it. How malformed input is handled
 // cannot be seen from here -- confirm against the implementations.

 // UTF-8 <-> UTF-32 (std::u32string).
 std::u32string utf8_to_u32(const std::string& utf8);
 std::string u32_to_utf8(const std::u32string& u32);

 // UTF-8 <-> wide string (std::wstring).
 std::wstring utf8_to_w(const std::string& utf8);
 std::string w_to_utf8(const std::wstring& w);

 // UTF-32 / UTF-8 -> UTF-16 (std::u16string).
 std::u16string u32_to_u16(const std::u32string& u32);
 std::u16string utf8_to_u16(const std::string& utf8);

 // TODO: u16_to_{utf8, u32}

 /// @}

 } // namespace rcc

 #endif // RCC_UTIL_ENCODINGS_HPP__
	/// @file
	/// @brief Translation of source file characters to/from Unicode code points.
	///
	/// The first stage of processing is handled by code within this file. The UTF8Decoder class
	/// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
	/// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
	/// which represents the remaining phases of translation.
	///
	/// The CodePoints scoped enum also defines some useful non-graphical code points along with some
	/// 'special' code points which can be used to indicate various conditions to later pipeline stages.
	///
	/// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
	///
	#pragma once
	#ifndef RCC_UTIL_ENCODINGS_HPP__
	#define RCC_UTIL_ENCODINGS_HPP__

	#include <cassert>
	#include <functional>
	#include <iostream>
	#include <stdexcept>
	#include <string>

	#include "cursor.hpp"
	#include "special_code_points.hpp"

	namespace rcc {

	/// @addtogroup util
	/// @{

	/// @brief Encode Unicode code points to output UTF-8 code units.
	///
	/// Instances are initialised with a function which is used to pass the encoded code units to the next
	/// stage in the processing pipeline.
	///
	/// Encoding is performed by calling UTF8Encoder::operator () with a Unicode code point. As the code
	/// points are encoded to code units, the passed functor will be called once for every output code
	/// unit. "Special" code points, i.e. those which are \f$ \lt 0 \f$, will cause an instance of
	/// std::invalid_argument to be thrown from UTF8Encoder::operator ().
	///
	class UTF8Encoder {
	private:

		/// Shape of the downstream stage: it is handed one code unit per call.
		using StageFunction = std::function<void (int)>;

	public:

		/// @brief Create an encoder which hands its output to @p next_stage.
		///
		/// @param next_stage Functor receiving the encoded code units; the encoder keeps its own copy.
		UTF8Encoder(StageFunction next_stage) : next_stage_(next_stage) { }

		/// @brief Turn a single Unicode code point into its code unit(s).
		///
		/// Each resulting code unit is delivered, as an integer, through a separate call to the next
		/// stage functor. Only the declaration lives in this header.
		///
		/// @param code_point The Unicode code point to encode. The documented contract is that values
		/// less than zero cause a std::invalid_argument exception to be thrown.
		void operator () (int code_point);

	private:

		/// Downstream stage supplied at construction time.
		StageFunction next_stage_;
	};

	/// @brief Decode Unicode code points from a stream of UTF-8 code units.
	///
	/// Instances are initialised with a function which is used to pass the decoded code points to the
	/// next stage in the processing pipeline.
	///
	/// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for
	/// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through
	/// to the next stage functor as "special" code points. As the code units are decoded to code points,
	/// the passed functor will be called once for every output code point.
	///
	/// In addition to the decoded code points, a Cursor detailing the code point physical line, column
	/// and offset from the start of the file is passed to the next stage functor.
	class UTF8Decoder
	{
	private:

	typedef std::function<void (int, const rcc::Cursor&)> StageFunction;

	enum State {
	START,
	ONE_OCTET_TO_GO,
	TWO_OCTETS_TO_GO,
	THREE_OCTETS_TO_GO,
	};

	StageFunction next_stage_;
	State state_;
	int current_code_point_;
	Location current_loc_;
	std::shared_ptr<const std::string> current_source_name_;

	void send_error() {
	throw std::runtime_error("Invalid UTF-8 encountered in input stream.");
	}

	void send_current() {
	next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_));
	if(current_code_point_ > 0) {
	++current_loc_.index;
	++current_loc_.column;

	if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) {
	++current_loc_.line;
	current_loc_.column = 1;
	}
	}
	}

	public:

	/// @brief Construct a UTF8Decoder.
	///
	/// @param next_stage
	/// @param source_name The source name which is associated with Cursor-s passed to next stage.
	/// @param loc The Location to use as the first location in the input stream.
	UTF8Decoder(StageFunction next_stage,
	const std::string& source_name,
	const Location& loc)
	: next_stage_(next_stage)
	, state_(START)
	, current_code_point_(0)
	, current_loc_(loc)
	, current_source_name_(std::make_shared<std::string>(source_name))
	{ }

	/// @brief Convenience override which uses line 1, column 1 as the starting point.
	///
	/// @param next_stage
	/// @param source_name
	UTF8Decoder(StageFunction next_stage,
	const std::string& source_name)
	: UTF8Decoder(next_stage, source_name, Location())
	{ }

	/// @brief Convenience override which uses line 1, column 1 as the starting point and "" as
	/// the source name.
	///
	/// @param next_stage
	UTF8Decoder(StageFunction next_stage)
	: UTF8Decoder(next_stage, "")
	{ }

	/// @brief Decode a UTF-8 code unit into zero or more Unicode code points.
	///
	/// Call once per code unit. For each set of code units which form a valid Unicode code point
	/// in UTF-8, the next stage functor will be called with that code point and a Cursor as an
	/// argument.
	///
	/// @param code_unit The Unicode code unit to decode. Values less than zero will be passed
	/// directly to the next stage functor.
	void operator () (int code_unit);

	/// @brief Update the source name and starting location used to generator Cursor-s.
	///
	/// @param source_name
	/// @param loc
	void set_source_location(const std::string& source_name, const Location& loc = Location()) {
	current_loc_ = loc;
	current_source_name_ = std::make_shared<const std::string>(source_name);
	}

	/// @brief Retrieve the current source name.
	///
	/// @return
	std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; }
	};

	// Whole-string transcoding helpers. Only the declarations are visible in this header; the
	// definitions live elsewhere.
	// NOTE(review): going by the names and signatures, each function converts its argument from the
	// encoding named before "_to_" into the encoding named after it. How malformed input is handled
	// cannot be seen from here -- confirm against the implementations.

	// UTF-8 <-> UTF-32 (std::u32string).
	std::u32string utf8_to_u32(const std::string& utf8);
	std::string u32_to_utf8(const std::u32string& u32);

	// UTF-8 <-> wide string (std::wstring).
	std::wstring utf8_to_w(const std::string& utf8);
	std::string w_to_utf8(const std::wstring& w);

	// UTF-32 / UTF-8 -> UTF-16 (std::u16string).
	std::u16string u32_to_u16(const std::u32string& u32);
	std::u16string utf8_to_u16(const std::string& utf8);

	// TODO: u16_to_{utf8, u32}

	/// @}

	} // namespace rcc

	#endif // RCC_UTIL_ENCODINGS_HPP__