Skip to content

Instantly share code, notes, and snippets.

@Alexhuszagh
Created September 15, 2016 05:42
Show Gist options
  • Save Alexhuszagh/8c06851db7af40e45ab4709e5e7c1d48 to your computer and use it in GitHub Desktop.
Save Alexhuszagh/8c06851db7af40e45ab4709e5e7c1d48 to your computer and use it in GitHub Desktop.
Characterset-Conversion Stream Buffer Wrapper
// :copyright: (c) 2015-2016 The Regents of the University of California.
// :license: Boost, see licenses/boost-v1.0.md for more details.
/** @ingroup Utils
*
* @brief Stream buffer filter to encode or decode data from one
* encoding to another, using iconv as the back-end.
*/
#include "encoding.hpp"
#include <iconv.h>
#include <algorithm>
#include <cerrno>
#include <cstring>
#include <ios>
#include <string>
namespace boost
{
namespace iostreams
{
namespace detail
{
// CONSTANTS
// ---------
// Number of bytes of free output space convert() insists on before it
// attempts another iconv call.
// NOTE(review): presumably the widest encoded character, in bytes - confirm
// for target encodings that can emit more than this for a single character.
// NOTE(review): encoding.hpp declares an `extern` constant of the same name
// in namespace boost::iostreams, whereas this definition lives in
// boost::iostreams::detail, so the two are different entities - confirm
// which one is intended.
const size_t maxUnicodeWidth = 4;
// DETAILS
// -------
/** @brief Initialize the iconv converter with the source and
 * destination encoding.
 *
 * When the two encoding names compare equal no converter is opened and
 * process() falls back to a plain byte copy.
 *
 * @param params Source (`input`) and destination (`output`) encoding names.
 * @throws std::ios_base::failure if iconv_open() cannot provide the
 *         requested conversion.
 */
encoded_base::encoded_base(const encoded_params &params):
    conv((iconv_t) -1),
    differentCharset(params.output != params.input)
{
    if (!differentCharset) {
        // Identical encodings: `conv` stays at the invalid sentinel and is
        // never passed to iconv() or iconv_close().
        return;
    }

    // iconv_open(tocode, fromcode) expects NUL-terminated names; c_str()
    // guarantees the terminator.
    conv = iconv_open(params.output.c_str(), params.input.c_str());
    if (conv == (iconv_t) -1) {
        // Fail loudly instead of leaving an unusable descriptor behind that
        // every later iconv() call would reject.
        throw std::ios_base::failure(
            std::string("iconv_open failed: cannot convert from \"")
            + params.input + "\" to \"" + params.output + "\"");
    }
}
/** @brief Cleanup the iconv converter.
 *
 * A descriptor is only released when one was requested (the encodings
 * differed) and iconv_open() actually returned a valid one.
 */
encoded_base::~encoded_base()
{
    // iconv_open() reports failure as (iconv_t) -1; that value must not be
    // handed to iconv_close().
    if (differentCharset && conv != (iconv_t) -1) {
        iconv_close(conv);
    }
}
/** @brief C-style stream converter, which converts the source
 * character array to the destination character array, calling iconv
 * repeatedly and skipping input it cannot convert.
 *
 * Conversion continues while input remains and more than maxUnicodeWidth
 * bytes of output space are free.
 *
 * @param src_begin  Start of the unread input; advanced past every byte
 *                   that was converted or skipped.
 * @param src_end    One past the last input byte.
 * @param dest_begin Start of the free output space; advanced by iconv past
 *                   every byte it wrote.
 * @param dest_end   One past the last writable output byte.
 * @return Always 0.
 */
int encoded_base::convert(const char * & src_begin,
                          const char * & src_end,
                          char * & dest_begin,
                          char * & dest_end)
{
    // Compare the remaining size instead of forming
    // `dest_end - maxUnicodeWidth`: that pointer would lie before the buffer
    // (undefined behaviour) when the destination is shorter than
    // maxUnicodeWidth.
    while (src_begin < src_end
           && dest_begin < dest_end
           && static_cast<size_t>(dest_end - dest_begin) > maxUnicodeWidth) {
        size_t srclen = src_end - src_begin;
        size_t dstlen = dest_end - dest_begin;
        // NOTE(review): presumably the iconv build used with MSVC declares
        // the input argument as `const char **` - confirm.
#ifdef _MSC_VER
        const char *pIn = src_begin;
#else
        char *pIn = const_cast<char *>(src_begin);
#endif
        errno = 0;
        const size_t status = iconv(conv, &pIn, &srclen, &dest_begin, &dstlen);
        if (src_begin != pIn) {
            // iconv made progress: continue after the consumed input.
            src_begin = pIn;
        } else if (status == static_cast<size_t>(-1) && errno == E2BIG) {
            // The output space is too small for the next character. Leave
            // the input untouched so it is retried with a fresh buffer
            // rather than discarding a valid byte.
            break;
        } else {
            // Nothing was consumed: drop one byte and try again.
            // NOTE(review): this also applies to an incomplete multibyte
            // sequence at the end of the input (EINVAL); handling that
            // properly would need carry-over state between calls.
            ++src_begin;
        }
    }
    return 0;
}
/** @brief Pass-through used when no conversion is required: moves as many
 * bytes as fit from the source range into the destination range.
 *
 * @param src_begin  Start of the unread input; advanced by the number of
 *                   bytes moved.
 * @param src_end    One past the last input byte.
 * @param dest_begin Start of the free output space; advanced by the number
 *                   of bytes moved.
 * @param dest_end   One past the last writable output byte.
 * @return Always 0.
 */
int encoded_base::copy(const char * & src_begin,
                       const char * & src_end,
                       char * & dest_begin,
                       char * & dest_end)
{
    const size_t available = src_end - src_begin;
    const size_t capacity = dest_end - dest_begin;
    // Move the smaller of "what is there" and "what fits".
    const size_t count = (capacity < available) ? capacity : available;
    memmove(dest_begin, src_begin, count);
    dest_begin += count;
    src_begin += count;
    return 0;
}
/** @brief Runs one chunk of input through the filter.
 *
 * Dispatches to copy() when the source and destination encodings are the
 * same and to convert() otherwise. The flush level argument is accepted
 * but not used.
 *
 * @return The result of the selected helper.
 */
int encoded_base::process(const char * & src_begin,
                          const char * & src_end,
                          char * & dest_begin,
                          char * & dest_end,
                          int /* flushLevel */)
{
    if (!differentCharset) {
        return copy(src_begin, src_end, dest_begin, dest_end);
    }
    return convert(src_begin, src_end, dest_begin, dest_end);
}
} /* detail */
} /* iostreams */
} /* boost */
#pragma once
#include <iostream>
#if defined(_MSC_VER) && (_MSC_VER >= 1020)
# pragma once
#endif
#include <cassert>
#include <iosfwd> // streamsize.
#include <memory> // allocator, bad_alloc.
#include <new>
#include <string>
#include <boost/config.hpp>
#include <boost/cstdint.hpp>
#include <boost/detail/workaround.hpp>
#include <boost/iostreams/constants.hpp>
#include <boost/iostreams/detail/config/auto_link.hpp>
#include <boost/iostreams/detail/config/dyn_link.hpp>
#include <boost/iostreams/detail/config/wide_streams.hpp>
#include <boost/iostreams/detail/config/zlib.hpp>
#include <boost/iostreams/detail/ios.hpp>
#include <boost/iostreams/filter/symmetric.hpp>
#include <boost/iostreams/pipeline.hpp>
#include <boost/type_traits/is_same.hpp>
#include <boost/iostreams/filter/zlib.hpp>
#include <iconv.h>
// Must come last.
#ifdef BOOST_MSVC
# pragma warning(push)
# pragma warning(disable:4251 4231 4660) // Dependencies not exported.
#endif
#include <boost/config/abi_prefix.hpp>
#undef small
namespace boost
{
namespace iostreams
{
// CONSTANTS
// ---------
// NOTE(review): this declares boost::iostreams::maxUnicodeWidth, but the only
// definition visible in encoding.cpp is in namespace boost::iostreams::detail,
// which is a different entity. Referring to this declaration from outside
// `detail` would therefore be unresolved unless it is defined elsewhere -
// confirm.
extern const size_t maxUnicodeWidth;
// OBJECTS
// -------
/** @brief Pair of encoding names handed to iconv.
 *
 * Both names default to "UTF-8".
 */
struct encoded_params {
    // Encoding of the data fed into the filter (iconv's "from" code).
    std::string input;
    // Encoding of the data produced by the filter (iconv's "to" code).
    std::string output;

    encoded_params(const std::string &input = "UTF-8",
                   const std::string &output = "UTF-8")
        : input(input)
        , output(output)
    {
    }
};
namespace detail
{
// DETAILS
// -------
/** @brief Base class for the character set conversion filter.
* Contains a core process function which converts the source
* encoding to the destination encoding.
*/
class BOOST_IOSTREAMS_DECL encoded_base {
public:
typedef char char_type;
protected:
encoded_base(const encoded_params & params = encoded_params());
~encoded_base();
int convert(const char * & src_begin,
const char * & src_end,
char * & dest_begin,
char * & dest_end);
int copy(const char * & src_begin,
const char * & src_end,
char * & dest_begin,
char * & dest_end);
int process(const char * & src_begin,
const char * & src_end,
char * & dest_begin,
char * & dest_end,
int /* flushLevel */);
public:
int total_in();
int total_out();
private:
iconv_t conv;
bool differentCharset;
};
/** @brief Writer-side filter implementation built on encoded_base.
 *
 * Provides the filter()/close() pair that symmetric_filter expects from
 * its implementation type, performing character set conversion via iconv.
 * The Alloc parameter is not referenced by the members declared here.
 */
template<typename Alloc = std::allocator<char> >
class encoded_writer_impl : public encoded_base {
public:
    encoded_writer_impl(const encoded_params &params = encoded_params());
    ~encoded_writer_impl();

    // Feeds [src_begin, src_end) through the converter into
    // [dest_begin, dest_end).
    bool filter(const char*& src_begin, const char* src_end,
                char*& dest_begin, char* dest_end, bool flush);

    void close();
};
/** @brief Reader-side filter implementation built on encoded_base.
 *
 * Provides the filter()/close() pair that symmetric_filter expects from
 * its implementation type, performing character set conversion via iconv.
 * The Alloc parameter is not referenced by the members declared here.
 */
template<typename Alloc = std::allocator<char> >
class encoded_reader_impl : public encoded_base {
public:
    encoded_reader_impl(const encoded_params &params = encoded_params());
    ~encoded_reader_impl();

    // Feeds [begin_in, end_in) through the converter into
    // [begin_out, end_out).
    bool filter(const char*& begin_in, const char* end_in,
                char*& begin_out, char* end_out, bool flush);

    void close();

    // Reports the stored end-of-file flag.
    bool eof() const
    {
        return eof_;
    }

private:
    // End-of-file flag returned by eof().
    bool eof_;
};
} /* detail */
// FILTERS
// -------
/** @brief Character set conversion filter (writer flavour).
 *
 * A symmetric_filter wrapped around detail::encoded_writer_impl, which
 * performs the conversion via iconv.
 */
template<typename Alloc = std::allocator<char> >
struct basic_encoded_writer
    : symmetric_filter<detail::encoded_writer_impl<Alloc>, Alloc>
{
private:
    typedef detail::encoded_writer_impl<Alloc> impl_type;
    typedef symmetric_filter<impl_type, Alloc> base_type;

public:
    typedef typename base_type::char_type char_type;
    typedef typename base_type::category category;

    basic_encoded_writer(const encoded_params &params = encoded_params(),
                         int buffer_size = default_device_buffer_size);

    // Forwards to the underlying implementation object.
    int total_in() { return this->filter().total_in(); }
};
BOOST_IOSTREAMS_PIPABLE(basic_encoded_writer, 1)

// Convenience name for the default-allocator instantiation.
typedef basic_encoded_writer<> encoded_writer;
/** @brief Character set conversion filter (reader flavour).
 *
 * A symmetric_filter wrapped around detail::encoded_reader_impl, which
 * performs the conversion via iconv.
 */
template<typename Alloc = std::allocator<char> >
struct basic_encoded_reader
    : symmetric_filter<detail::encoded_reader_impl<Alloc>, Alloc>
{
private:
    typedef detail::encoded_reader_impl<Alloc> impl_type;
    typedef symmetric_filter<impl_type, Alloc> base_type;

public:
    typedef typename base_type::char_type char_type;
    typedef typename base_type::category category;

    basic_encoded_reader(const encoded_params &params = encoded_params(),
                         int buffer_size = default_device_buffer_size);

    // Both accessors forward to the underlying implementation object.
    int total_out() { return this->filter().total_out(); }
    bool eof() { return this->filter().eof(); }
};
BOOST_IOSTREAMS_PIPABLE(basic_encoded_reader, 1)

// Convenience name for the default-allocator instantiation.
typedef basic_encoded_reader<> encoded_reader;
namespace detail
{
// IMPLEMENTATION
// --------------
/** @brief Construct the writer implementation, handing the encoding
 * parameters to the encoded_base constructor.
 */
template<typename Alloc>
encoded_writer_impl<Alloc>::encoded_writer_impl(const encoded_params& p)
    : encoded_base(p)
{
}
/** @brief Destroy the writer implementation.
 *
 * Nothing is released here; the base class destructor runs afterwards.
 */
template<typename Alloc>
encoded_writer_impl<Alloc>::~encoded_writer_impl()
{
}
/** @brief Filter step of the symmetric character set encoding filter
 * for the writer.
 *
 * Passes the buffers on to process(); the flush flag is forwarded as the
 * flush level argument.
 *
 * @return true only when process() reports -1.
 */
template<typename Alloc>
bool encoded_writer_impl<Alloc>::filter
    (const char*& src_begin, const char* src_end,
     char*& dest_begin, char* dest_end, bool flush)
{
    return process(src_begin, src_end, dest_begin, dest_end, flush) == -1;
}
/** @brief Close hook for the encoded writer; intentionally does nothing.
 */
template<typename Alloc>
void encoded_writer_impl<Alloc>::close()
{
}
/** @brief Destroy the reader implementation.
 *
 * Nothing is released here; the base class destructor runs afterwards.
 */
template<typename Alloc>
encoded_reader_impl<Alloc>::~encoded_reader_impl()
{
}
/** @brief Construct the reader implementation, handing the encoding
 * parameters to the encoded_base constructor and clearing the
 * end-of-file flag.
 */
template<typename Alloc>
encoded_reader_impl<Alloc>::encoded_reader_impl(const encoded_params& p)
    : encoded_base(p)
    , eof_(false)
{
}
/** @brief Filter step of the symmetric character set encoding filter
 * for the reader.
 *
 * Passes the buffers on to process(). The caller's flush flag is ignored;
 * `true` is always passed as the flush level.
 *
 * @return true when process() reports a non-zero value, false otherwise.
 */
template<typename Alloc>
bool encoded_reader_impl<Alloc>::filter
    (const char*& src_begin, const char* src_end,
     char*& dest_begin, char* dest_end, bool /* flush */)
{
    return process(src_begin, src_end, dest_begin, dest_end, true) != 0;
}
/** @brief Close hook for the encoded reader; intentionally does nothing.
 *
 * The reader cannot be re-opened (it is not a true stream), so the state
 * reset that used to live here remains disabled:
 *     eof_ = false;
 *     reset(false, true);
 */
template<typename Alloc>
void encoded_reader_impl<Alloc>::close()
{
}
} /* detail */
/** @brief Construct the symmetric write filter.
 *
 * Forwards the buffer size and the encoding parameters to the
 * symmetric_filter base.
 */
template<typename Alloc>
basic_encoded_writer<Alloc>::basic_encoded_writer(const encoded_params& p,
                                                  int buffer_size)
    : base_type(buffer_size, p)
{
}
/** @brief Construct the symmetric read filter.
 *
 * Forwards the buffer size and the encoding parameters to the
 * symmetric_filter base.
 */
template<typename Alloc>
basic_encoded_reader<Alloc>::basic_encoded_reader(const encoded_params &p,
                                                  int buffer_size)
    : base_type(buffer_size, p)
{
}
} /* iostreams */
} /* boost */
#include <boost/config/abi_suffix.hpp> // Pops abi_suffix.hpp pragmas.
#ifdef BOOST_MSVC
# pragma warning(pop)
#endif

Encoding

This file contains a stream-buffer wrapper that automatically converts character sets from one encoding to another. If the source and destination encodings are the same, it calls memmove to move the bytes from the source to the destination.

Dependencies

Installation

First, install Boost and GNU Iconv. Afterwards, download the encoding.hpp and encoding.cpp files and either add them to your project or build a static library from encoding.cpp and include encoding.hpp in your source.

Use

Using the encoded filestream is the same as using Boost's filtering streambufs, such as the GZip decompressor. For example, decoding a source file from UTF-16 to UTF-8 is as simple as:

#include "encoding.hpp"

#include <boost/iostreams/filtering_streambuf.hpp>
#include <fstream>
#include <string>


int main()
{
    std::ifstream fin("utf16.csv", std::ios::binary);
    std::ofstream fout("utf8.csv", std::ios::binary);
    if (!fin || !fout) {
        // Nothing can be converted if either file failed to open.
        return 1;
    }

    // encoding: the reader filter decodes the UTF-16 input into UTF-8
    boost::iostreams::filtering_streambuf<boost::iostreams::input> streambuf;
    streambuf.push(boost::iostreams::encoded_reader({"UTF-16", "UTF-8"}));
    streambuf.push(fin);
    std::istream stream(&streambuf);

    std::string line;
    while (std::getline(stream, line)) {
        // '\n' rather than std::endl, which would flush the file on every line
        fout << line << '\n';
    }
    fout.close();
}

Rationale

Using filtered streambufs allows integration of character set encoding with the addition of only 4 lines of code during stream creation, and can be wrapped into a class or function, simplifying debugging and removing ugly code from your source files.

GNU IConv was chosen due to its large format support, simple API, small size, and endian-specific encoding (supporting both big and little UTF-16 and UTF-32), making it an ideal choice for the conversion library. ICU, although more permissively licensed, is much larger, and does not support non-native endian encodings. Since IConv is distributed under the GNU LGPL, it can be safely included in commercial applications as long as it is dynamically linked.

License

See included license file.

Boost Software License - Version 1.0 - August 17th, 2003

Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following:

The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment