Created
October 25, 2013 11:49
-
-
Save rjw57/7153446 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// @file
/// @brief Translation of source file characters to/from Unicode code points.
///
/// The first stage of processing is handled by code within this file. The UTF8Decoder class
/// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
/// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
/// which represents the remaining phases of translation.
///
/// The CodePoints scoped enum also defines some useful non-graphical code points along with some
/// 'special' code points which can be used to indicate various conditions to later pipeline stages.
///
/// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
///
#pragma once | |
#ifndef RCC_UTIL_ENCODINGS_HPP__ | |
#define RCC_UTIL_ENCODINGS_HPP__ | |
#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>

#include "cursor.hpp"
#include "special_code_points.hpp"
namespace rcc { | |
/// @addtogroup util | |
/// @{ | |
/// @brief Encode Unicode code points to output UTF-8 code units.
///
/// An instance is constructed with a functor which receives each encoded code unit and so forms
/// the next stage in the processing pipeline.
///
/// Encoding is performed by calling UTF8Encoder::operator () with a Unicode code point. The
/// functor is called once for every code unit produced. "Special" code points, i.e. those which
/// are \f$ \lt 0 \f$, cause an instance of std::invalid_argument to be thrown from
/// UTF8Encoder::operator ().
///
class UTF8Encoder {
private:
    /// Signature of the next pipeline stage: it is handed one encoded code unit per call.
    using StageFunction = std::function<void (int)>;

    /// Functor which receives the encoded code units.
    StageFunction next_stage_;

public:
    /// @brief Construct a UTF8Encoder.
    ///
    /// The constructor is intentionally not marked explicit so that call sites which rely on an
    /// implicit conversion from a callable continue to compile.
    ///
    /// @param next_stage Functor called once per encoded code unit.
    UTF8Encoder(StageFunction next_stage)
        // The parameter is already a private copy; move it rather than copying a second time.
        : next_stage_(std::move(next_stage))
    { }

    /// @brief Encode a Unicode code point into one or more code units.
    ///
    /// Call the next stage functor once per code unit passing it the encoded code unit as an
    /// integer.
    ///
    /// @param code_point The Unicode code point to encode. Values less than zero will throw a
    /// std::invalid_argument exception.
    void operator () (int code_point);
};
/// @brief Decode Unicode code points from a stream of UTF-8 code units. | |
/// | |
/// Instances are initialised with a function which is used to pass the decoded code points to the | |
/// next stage in the processing pipeline. | |
/// | |
/// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for | |
/// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through | |
/// to the next stage functor as "special" code points. As the code units are decoded to code points, | |
/// the passed functor will be called once for every output code point. | |
/// | |
/// In addition to the decoded code points, a Cursor detailing the code point physical line, column | |
/// and offset from the start of the file is passed to the next stage functor. | |
class UTF8Decoder | |
{ | |
private: | |
typedef std::function<void (int, const rcc::Cursor&)> StageFunction; | |
enum State { | |
START, | |
ONE_OCTET_TO_GO, | |
TWO_OCTETS_TO_GO, | |
THREE_OCTETS_TO_GO, | |
}; | |
StageFunction next_stage_; | |
State state_; | |
int current_code_point_; | |
Location current_loc_; | |
std::shared_ptr<const std::string> current_source_name_; | |
void send_error() { | |
throw std::runtime_error("Invalid UTF-8 encountered in input stream."); | |
} | |
void send_current() { | |
next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_)); | |
if(current_code_point_ > 0) { | |
++current_loc_.index; | |
++current_loc_.column; | |
if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) { | |
++current_loc_.line; | |
current_loc_.column = 1; | |
} | |
} | |
} | |
public: | |
/// @brief Construct a UTF8Decoder. | |
/// | |
/// @param next_stage | |
/// @param source_name The source name which is associated with Cursor-s passed to next stage. | |
/// @param loc The Location to use as the first location in the input stream. | |
UTF8Decoder(StageFunction next_stage, | |
const std::string& source_name, | |
const Location& loc) | |
: next_stage_(next_stage) | |
, state_(START) | |
, current_code_point_(0) | |
, current_loc_(loc) | |
, current_source_name_(std::make_shared<std::string>(source_name)) | |
{ } | |
/// @brief Convenience override which uses line 1, column 1 as the starting point. | |
/// | |
/// @param next_stage | |
/// @param source_name | |
UTF8Decoder(StageFunction next_stage, | |
const std::string& source_name) | |
: UTF8Decoder(next_stage, source_name, Location()) | |
{ } | |
/// @brief Convenience override which uses line 1, column 1 as the starting point and "" as | |
/// the source name. | |
/// | |
/// @param next_stage | |
UTF8Decoder(StageFunction next_stage) | |
: UTF8Decoder(next_stage, "") | |
{ } | |
/// @brief Decode a UTF-8 code unit into zero or more Unicode code points. | |
/// | |
/// Call once per code unit. For each set of code units which form a valid Unicode code point | |
/// in UTF-8, the next stage functor will be called with that code point and a Cursor as an | |
/// argument. | |
/// | |
/// @param code_unit The Unicode code unit to decode. Values less than zero will be passed | |
/// directly to the next stage functor. | |
void operator () (int code_unit); | |
/// @brief Update the source name and starting location used to generator Cursor-s. | |
/// | |
/// @param source_name | |
/// @param loc | |
void set_source_location(const std::string& source_name, const Location& loc = Location()) { | |
current_loc_ = loc; | |
current_source_name_ = std::make_shared<const std::string>(source_name); | |
} | |
/// @brief Retrieve the current source name. | |
/// | |
/// @return | |
std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; } | |
}; | |
// Whole-string conversion helpers. Only the declarations are visible in this header; the
// descriptions below are inferred from the names and signatures and should be confirmed against
// the definitions.

/// @brief Convert @p utf8 (presumably UTF-8 code units) to a std::u32string.
std::u32string utf8_to_u32(const std::string& utf8);

/// @brief Convert @p u32 to a std::string (presumably UTF-8 code units).
std::string u32_to_utf8(const std::u32string& u32);

/// @brief Convert @p utf8 (presumably UTF-8 code units) to a std::wstring.
std::wstring utf8_to_w(const std::string& utf8);

/// @brief Convert @p w to a std::string (presumably UTF-8 code units).
std::string w_to_utf8(const std::wstring& w);

/// @brief Convert @p u32 to a std::u16string.
std::u16string u32_to_u16(const std::u32string& u32);

/// @brief Convert @p utf8 (presumably UTF-8 code units) to a std::u16string.
std::u16string utf8_to_u16(const std::string& utf8);

// TODO: u16_to_{utf8, u32}
/// @} | |
} // namespace rcc | |
#endif // RCC_UTIL_ENCODINGS_HPP__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment