Skip to content

Instantly share code, notes, and snippets.

@rjw57
Created October 25, 2013 11:49
Show Gist options
  • Save rjw57/7153446 to your computer and use it in GitHub Desktop.
Save rjw57/7153446 to your computer and use it in GitHub Desktop.
/// @file
/// @brief Translation of source file characters to/from Unicode code points.
///
/// The first stage of processing is handled by code within this file. The UTF8Decoder class
/// provides a decoder for the UTF-8 encoding scheme. The UTF8Encoder class provides the equivalent
/// code point to UTF-8 code unit encoding. Both classes work as a wrapper around a std::function
/// which represents the remaining phases of translation.
///
/// The CodePoints scoped enum also defines some useful non-graphical code points along with some
/// 'special' code points which can be used to indicate various conditions to later pipeline stages.
///
/// @sa UTF-8 definition: http://tools.ietf.org/html/rfc3629#page-4
///
#pragma once
#ifndef RCC_UTIL_ENCODINGS_HPP__
#define RCC_UTIL_ENCODINGS_HPP__
#include <cassert>
#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include "cursor.hpp"
#include "special_code_points.hpp"
namespace rcc {
/// @addtogroup util
/// @{
/// @brief Turn Unicode code points into a stream of UTF-8 code units.
///
/// An encoder is built around a functor which stands for the next stage of the processing
/// pipeline; every code unit produced by the encoder is handed to that functor.
///
/// Feed code points to the encoder through UTF8Encoder::operator (). The wrapped functor is
/// invoked once per output code unit. Code points which are \f$ \lt 0 \f$ are "special" and are
/// documented to make UTF8Encoder::operator () throw std::invalid_argument.
///
class UTF8Encoder
{
    /// Signature of the next pipeline stage: it receives a single code unit as an integer.
    using StageFunction = std::function<void (int)>;

    /// Receiver of every encoded code unit.
    StageFunction next_stage_;

public:
    /// @brief Construct an encoder which forwards its output to @p next_stage.
    ///
    /// @param next_stage Functor called with each encoded code unit.
    UTF8Encoder(StageFunction next_stage) : next_stage_(next_stage) { }

    /// @brief Encode one Unicode code point as one or more UTF-8 code units.
    ///
    /// The next stage functor is called once for each resulting code unit, with the code unit
    /// passed as an integer. Only the declaration lives here; the definition is provided
    /// elsewhere.
    ///
    /// @param code_point The Unicode code point to encode. Values less than zero will throw a
    /// std::invalid_argument exception.
    void operator () (int code_point);
};
/// @brief Decode Unicode code points from a stream of UTF-8 code units.
///
/// Instances are initialised with a function which is used to pass the decoded code points to the
/// next stage in the processing pipeline.
///
/// Decoding is performed by calling UTF8Decoder::operator () with an input code unit which, for
/// UTF-8, should be on the interval \f$ [0, 255] \f$. Negative code units are passed directly through
/// to the next stage functor as "special" code points. As the code units are decoded to code points,
/// the passed functor will be called once for every output code point.
///
/// In addition to the decoded code points, a Cursor detailing the code point physical line, column
/// and offset from the start of the file is passed to the next stage functor.
class UTF8Decoder
{
private:
    /// Signature of the next pipeline stage: (code point, Cursor locating that code point).
    typedef std::function<void (int, const rcc::Cursor&)> StageFunction;

    /// Decoder state. Consumed by operator (), whose definition is not in this header; the names
    /// presumably give the number of continuation octets still expected -- confirm there.
    enum State {
        START,
        ONE_OCTET_TO_GO,
        TWO_OCTETS_TO_GO,
        THREE_OCTETS_TO_GO,
    };

    StageFunction next_stage_;      ///< Receiver of decoded code points.
    State state_;                   ///< Current decoder state.
    int current_code_point_;        ///< Code point forwarded by send_current().
    Location current_loc_;          ///< Location attached to the next code point sent.
    std::shared_ptr<const std::string> current_source_name_; ///< Source name shared with Cursor-s.

    /// @brief Report malformed input.
    ///
    /// Always throws std::runtime_error.
    void send_error() {
        throw std::runtime_error("Invalid UTF-8 encountered in input stream.");
    }

    /// @brief Pass the current code point to the next stage, then advance the location.
    ///
    /// The next stage sees the location of the code point itself; the location is only moved on
    /// afterwards. "Special" code points (those less than zero) do not occupy a position in the
    /// source and leave the location untouched.
    void send_current() {
        next_stage_(current_code_point_, Cursor(current_source_name_, current_loc_));

        // Every real code point advances the location. U+0000 is a real code point, so the test
        // must be ">= 0": with "> 0" a NUL in the input left every later Cursor off by one.
        if(current_code_point_ >= 0) {
            ++current_loc_.index;
            ++current_loc_.column;

            // A line feed starts a new line and resets the column to 1.
            if(current_code_point_ == static_cast<int>(CodePoints::LineFeed)) {
                ++current_loc_.line;
                current_loc_.column = 1;
            }
        }
    }

public:
    /// @brief Construct a UTF8Decoder.
    ///
    /// @param next_stage Functor called with each decoded code point and its Cursor.
    /// @param source_name The source name which is associated with Cursor-s passed to next stage.
    /// @param loc The Location to use as the first location in the input stream.
    UTF8Decoder(StageFunction next_stage,
                const std::string& source_name,
                const Location& loc)
        : next_stage_(std::move(next_stage)) // sink parameter: move rather than copy the functor
        , state_(START)
        , current_code_point_(0)
        , current_loc_(loc)
        , current_source_name_(std::make_shared<const std::string>(source_name))
    { }

    /// @brief Convenience override which uses a default-constructed Location (line 1, column 1)
    /// as the starting point.
    ///
    /// @param next_stage Functor called with each decoded code point and its Cursor.
    /// @param source_name The source name which is associated with Cursor-s passed to next stage.
    UTF8Decoder(StageFunction next_stage,
                const std::string& source_name)
        : UTF8Decoder(std::move(next_stage), source_name, Location())
    { }

    /// @brief Convenience override which uses a default-constructed Location (line 1, column 1)
    /// as the starting point and "" as the source name.
    ///
    /// @param next_stage Functor called with each decoded code point and its Cursor.
    UTF8Decoder(StageFunction next_stage)
        : UTF8Decoder(std::move(next_stage), "")
    { }

    /// @brief Decode a UTF-8 code unit into zero or more Unicode code points.
    ///
    /// Call once per code unit. For each set of code units which form a valid Unicode code point
    /// in UTF-8, the next stage functor will be called with that code point and a Cursor as an
    /// argument. Only the declaration lives here; the definition is provided elsewhere.
    ///
    /// @param code_unit The Unicode code unit to decode. Values less than zero will be passed
    /// directly to the next stage functor.
    void operator () (int code_unit);

    /// @brief Update the source name and starting location used to generate Cursor-s.
    ///
    /// Code points sent after this call carry the new source name and are located starting from
    /// @p loc.
    ///
    /// @param source_name The new source name.
    /// @param loc The Location of the next code point; defaults to a default-constructed Location.
    void set_source_location(const std::string& source_name, const Location& loc = Location()) {
        current_loc_ = loc;
        current_source_name_ = std::make_shared<const std::string>(source_name);
    }

    /// @brief Retrieve the current source name.
    ///
    /// @return A copy of the current source name, or an empty string if none is set.
    std::string source_name() const { return current_source_name_ ? *current_source_name_ : ""; }
};
// String conversion helpers. Only the declarations are visible in this header; the descriptions
// below are inferred from the names and signatures -- NOTE(review): confirm against the
// definitions, in particular how malformed input is reported.

/// @brief Presumably converts a UTF-8 encoded std::string into a std::u32string.
std::u32string utf8_to_u32(const std::string& utf8);
/// @brief Presumably the inverse of utf8_to_u32(): std::u32string to a UTF-8 std::string.
std::string u32_to_utf8(const std::u32string& u32);
/// @brief Presumably converts a UTF-8 encoded std::string into a std::wstring.
/// NOTE(review): the encoding used for wchar_t is not visible from here -- confirm.
std::wstring utf8_to_w(const std::string& utf8);
/// @brief Presumably the inverse of utf8_to_w(): std::wstring to a UTF-8 std::string.
std::string w_to_utf8(const std::wstring& w);
/// @brief Presumably converts a std::u32string into a std::u16string.
std::u16string u32_to_u16(const std::u32string& u32);
/// @brief Presumably converts a UTF-8 encoded std::string into a std::u16string.
std::u16string utf8_to_u16(const std::string& utf8);
// TODO: u16_to_{utf8, u32}
/// @}
} // namespace rcc
#endif // RCC_UTIL_ENCODINGS_HPP__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment