Created
August 28, 2012 23:13
-
-
Save takscape/3505196 to your computer and use it in GitHub Desktop.
Encoding Converter
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
** The author disclaims copyright to this source code. | |
** In place of a legal notice, here is a blessing: | |
** | |
** May you do good and not evil. | |
** May you find forgiveness for yourself and forgive others. | |
** May you share freely, never taking more than you give. | |
*/ | |
/* | |
** Any feedback would be appreciated. | |
** mailto:[email protected] | |
*/ | |
#ifndef ___PORTPP_ENCCONV_H___ | |
#define ___PORTPP_ENCCONV_H___ | |
#if defined(_WIN32) && !defined(PORTPP_USE_LIBICONV)
# include <windows.h>
# include <objbase.h>
# include <mlang.h>
#else
# include <iconv.h>
#endif
#include <cctype>
#include <climits>
#include <cstdlib>
#include <cstring>
#include <string>
namespace portpp { | |
class EncodingConverter | |
{ | |
public: | |
enum OPTION | |
{ | |
CONVERT_NONE = 0, // No options. | |
CONVERT_TRANSLITERATE = 1, // Transliterate characters which do not exist in destination charset. | |
CONVERT_DISCARD_ILSEQ = 2, // Discard invalid byte sequences. | |
}; | |
protected: | |
std::string fromEnc_; | |
std::string toEnc_; | |
OPTION opt_; | |
#if defined(_WIN32) && !defined(PORTPP_USE_LIBICONV) | |
DWORD toCodePage_; | |
DWORD fromCodePage_; | |
IMultiLanguage2* ml_; | |
IMLangConvertCharset* conv_; | |
DWORD encNameToCodePage(const char* encName); | |
#else | |
iconv_t cd_; | |
#endif | |
public: | |
/** | |
* Constructor. | |
* @param fromEnc Source encoding. | |
* @param toEnc Destination encoding. | |
* @param opt Options. Bit-wise ORed combination of CONVERT_*. | |
*/ | |
EncodingConverter(const char* fromEnc, const char* toEnc, OPTION opt); | |
virtual ~EncodingConverter(); | |
/** | |
* Returns the source encoding. | |
* @return Source encoding. | |
*/ | |
std::string fromEncoding() const { return fromEnc_; } | |
/** | |
* Returns the destination encoding. | |
* @return Destination encoding. | |
*/ | |
std::string toEncoding() const { return toEnc_; } | |
/** | |
* Returns true if the converter was successfully initialized. | |
* @return true/false. | |
*/ | |
bool valid() const; | |
/** | |
* Converts input and stores result into output. | |
* @param input [in] Input byte sequence. | |
* @param inputBytesLeft [in/out] Size of input in bytes. | |
* It will be subtracted by the number of bytes consumed when the method returns. | |
* @param output [out] A buffer to be stored with output byte sequence. | |
* @param outputBytesLeft [in/out] Size of output in bytes. | |
* It will be subtracted by the number of bytes stored when the method returns. | |
* @return true if succeeded. | |
*/ | |
bool convert(const void* input, size_t& inputBytesLeft, | |
void* output, size_t& outputBytesLeft); | |
/** | |
* Flushes any shift string. | |
* This method is meaningful only when the destination encoding is a stateful encoding such as ISO-2022-JP. | |
* @param output [out] A buffer to be stored with output byte sequence. | |
* @param outputBytesLeft [in/out] Size of output in bytes. | |
* It will be subtracted by the number of bytes stored when the method returns. | |
* @return true if succeeded. | |
*/ | |
bool flush(void* output, size_t& outputBytesLeft); | |
/** | |
* Reinitializes the internal state of the converter. | |
*/ | |
void reset(); | |
/////////////////////////////////////////////////// | |
// Convenience methods | |
/** | |
* Converts input and returns result as std::string. | |
*/ | |
std::string convert(const void* input, size_t& inputBytesLeft) | |
{ | |
std::string ret; | |
char buf[1024]; | |
size_t buflen, prevlen; | |
do { | |
prevlen = inputBytesLeft; | |
buflen = sizeof(buf); | |
convert(input, inputBytesLeft, buf, buflen); | |
ret.append(buf, sizeof(buf)-buflen); | |
if (inputBytesLeft == prevlen) | |
break; | |
} while (inputBytesLeft > 0); | |
return ret; | |
} | |
/** | |
* Flushes any shift string and returns it as std::string. | |
*/ | |
std::string flush() | |
{ | |
char buf[1024]; | |
size_t buflen = sizeof(buf); | |
flush(buf, buflen); | |
return std::string(buf, sizeof(buf)-buflen); | |
} | |
}; | |
#if defined(_WIN32) && !defined(PORTPP_USE_LIBICONV) | |
/**
 * Constructor (MLang backend).
 *
 * Creates the MLang object, resolves both encoding names to code pages and
 * creates the conversion object. On any failure the instance is left in a
 * state for which valid() returns false.
 *
 * CONVERT_DISCARD_ILSEQ is not consulted by this backend; only
 * CONVERT_TRANSLITERATE influences the conversion flags.
 *
 * NOTE(review): CoCreateInstance presumably requires COM to be initialized
 * on the calling thread - confirm that callers do so.
 *
 * @param fromEnc Source encoding name (a null pointer is treated as "").
 * @param toEnc   Destination encoding name (a null pointer is treated as "").
 * @param opt     Options. Bit-wise ORed combination of CONVERT_*.
 */
inline EncodingConverter::EncodingConverter(const char* fromEnc, const char* toEnc, OPTION opt)
    : fromEnc_(fromEnc ? fromEnc : ""),
      toEnc_(toEnc ? toEnc : ""),
      opt_(opt),
      toCodePage_(0),
      fromCodePage_(0),
      ml_(0),
      conv_(0)
{
    if (FAILED(CoCreateInstance(CLSID_CMultiLanguage, NULL,
            CLSCTX_INPROC_SERVER, IID_IMultiLanguage2, (void**)&ml_)))
    {
        // Without MLang the names cannot be resolved (encNameToCodePage
        // queries ml_), so stop here and stay invalid.
        ml_ = 0;
        return;
    }

    toCodePage_ = encNameToCodePage(toEnc_.c_str());
    fromCodePage_ = encNameToCodePage(fromEnc_.c_str());

    // Best-fit substitution is disabled unless transliteration was requested.
    const DWORD prop = (opt_ & CONVERT_TRANSLITERATE)
        ? static_cast<DWORD>(0)
        : static_cast<DWORD>(MLCONVCHARF_NOBESTFITCHARS);

    if (FAILED(ml_->CreateConvertCharset(
            fromCodePage_, toCodePage_, prop, &conv_)))
    {
        conv_ = 0;
        ml_->Release();
        ml_ = 0;
    }

    //////////////////////////////////////
    // Debugging aid (disabled): enumerate the code pages known to MLang.
    // IMultiLanguage* ml1 = 0;
    // IEnumCodePage* cp = 0;
    // ml_->QueryInterface(IID_IMultiLanguage, (void**)&ml1);
    // ml1->EnumCodePages(MIMECONTF_MIME_IE4, &cp);
    // PMIMECPINFO cpinfo = (PMIMECPINFO)CoTaskMemAlloc(sizeof(MIMECPINFO));
    // ULONG fetched = 0;
    // while (S_OK == cp->Next(1, cpinfo, &fetched)) {
    //     if (fetched) {
    //         printf("CodePage = %u\n", cpinfo->uiCodePage);
    //         wprintf(L"WebName = %s\n", cpinfo->wszWebCharset);
    //         wprintf(L"HdrName = %s\n", cpinfo->wszHeaderCharset);
    //         wprintf(L"BdyName = %s\n", cpinfo->wszBodyCharset);
    //     }
    //     cpinfo = (PMIMECPINFO)CoTaskMemRealloc(cpinfo, sizeof(MIMECPINFO));
    //     fetched = 0;
    // }
    // CoTaskMemFree(cpinfo);
    // cp->Release();
    // ml1->Release();
    ///////////////////////////////////////
}
/**
 * Destructor (MLang backend).
 * Releases the conversion object first, then the MLang object, and clears
 * both pointers.
 */
inline EncodingConverter::~EncodingConverter()
{
    if (conv_ != 0) {
        conv_->Release();
    }
    conv_ = 0;

    if (ml_ != 0) {
        ml_->Release();
    }
    ml_ = 0;
}
inline bool EncodingConverter::valid() const | |
{ | |
return ((ml_!=0) && (conv_!=0) && (toCodePage_!=0) && (fromCodePage_!=0)); | |
} | |
inline bool EncodingConverter::convert(const void* input, size_t& inputBytesLeft, | |
void* output, size_t& outputBytesLeft) | |
{ | |
BYTE* inbuf = (BYTE*)input; | |
BYTE* outbuf = (BYTE*)output; | |
UINT srcsize = (inputBytesLeft > (size_t)UINT_MAX) ? UINT_MAX : (UINT)inputBytesLeft; | |
UINT dstsize = (outputBytesLeft > (size_t)UINT_MAX) ? UINT_MAX : (UINT)outputBytesLeft; | |
HRESULT hr = conv_->DoConversion(inbuf, &srcsize, outbuf, &dstsize); | |
if (FAILED(hr)) return false; | |
inputBytesLeft -= srcsize; | |
outputBytesLeft -= dstsize; | |
return true; | |
} | |
inline bool EncodingConverter::flush(void* output, size_t& outputBytesLeft) | |
{ | |
reset(); | |
return true; | |
} | |
inline void EncodingConverter::reset() | |
{ | |
DWORD prop = MLCONVCHARF_NOBESTFITCHARS; | |
if (opt_ & CONVERT_TRANSLITERATE) { | |
prop &= ~(DWORD)MLCONVCHARF_NOBESTFITCHARS; | |
} | |
conv_->Initialize(fromCodePage_, toCodePage_, prop); | |
} | |
/**
 * Maps an encoding name to a code page number.
 *
 * Resolution order:
 *  - iconv-style UTF-16/UCS-2 aliases are rewritten to the names
 *    "unicodeFFFE" (big-endian / unspecified) or "unicode" (little-endian)
 *    before lookup;
 *  - names starting with "CP" (case-insensitive) return the number that
 *    follows the prefix, without consulting MLang;
 *  - everything else is looked up through IMultiLanguage2::GetCharsetInfo.
 *
 * Defined inline because this header may be included by several
 * translation units.
 *
 * @param encName Encoding name; a null pointer yields 0.
 * @return The code page, or 0 if the name could not be resolved.
 */
inline DWORD EncodingConverter::encNameToCodePage(const char* encName)
{
    if (!encName) {
        return 0;
    }

    if (!_stricmp(encName, "UTF-16") ||
        !_stricmp(encName, "UTF-16BE") ||
        !_stricmp(encName, "UCS-2") ||
        !_stricmp(encName, "UCS-2BE") ||
        !_stricmp(encName, "UNICODEBIG"))
    {
        encName = "unicodeFFFE";
    }
    else if (!_stricmp(encName, "UTF-16LE") ||
             !_stricmp(encName, "UCS-2LE") ||
             !_stricmp(encName, "UNICODELITTLE"))
    {
        encName = "unicode";
    }
    else if (toupper((unsigned char)encName[0]) == 'C' &&
             toupper((unsigned char)encName[1]) == 'P')
    {
        // "CPnnn": the digits after the prefix are the code page itself.
        return (DWORD)std::atoi(encName + 2);
    }

    // The remaining names need MLang for the lookup.
    if (!ml_) {
        return 0;
    }

    // First call asks for the required length (including the terminator).
    const int widelen = MultiByteToWideChar(CP_ACP, 0, encName, -1, NULL, 0);
    if (widelen <= 0) {
        return 0;
    }
    std::wstring wideName(static_cast<size_t>(widelen), L'\0');
    if (MultiByteToWideChar(CP_ACP, 0, encName, -1, &wideName[0], widelen) == 0) {
        return 0;
    }

    BSTR name = SysAllocString(wideName.c_str());
    if (!name) {
        return 0;
    }

    DWORD codepage = 0;
    MIMECSETINFO charsetInfo;
    if (SUCCEEDED(ml_->GetCharsetInfo(name, &charsetInfo))) {
        codepage = charsetInfo.uiInternetEncoding;
    }

    SysFreeString(name);
    return codepage;
}
#else | |
/**
 * Constructor (iconv backend).
 *
 * Opens an iconv descriptor converting from fromEnc to toEnc. The option
 * flags are expressed as suffixes appended to the destination name:
 * "//TRANSLIT" for CONVERT_TRANSLITERATE and "//IGNORE" for
 * CONVERT_DISCARD_ILSEQ. If iconv_open fails, or if either name is a null
 * pointer, the descriptor stays (iconv_t)(-1) and valid() returns false.
 *
 * @param fromEnc Source encoding.
 * @param toEnc   Destination encoding.
 * @param opt     Options. Bit-wise ORed combination of CONVERT_*.
 */
inline EncodingConverter::EncodingConverter(const char* fromEnc, const char* toEnc, OPTION opt)
    : fromEnc_(fromEnc ? fromEnc : ""),
      toEnc_(toEnc ? toEnc : ""),
      opt_(opt),
      cd_((iconv_t)(-1))
{
    // A null name cannot be opened; do not substitute "" for it, since
    // iconv implementations may give the empty name a meaning of its own.
    if (!fromEnc || !toEnc) {
        return;
    }

    std::string tocode = toEnc_;
    if (opt & CONVERT_TRANSLITERATE) {
        tocode += "//TRANSLIT";
    }
    if (opt & CONVERT_DISCARD_ILSEQ) {
        tocode += "//IGNORE";
    }
    cd_ = iconv_open(tocode.c_str(), fromEnc_.c_str());
}
/**
 * Destructor (iconv backend).
 * Closes the descriptor if one was opened and marks it as invalid.
 */
inline EncodingConverter::~EncodingConverter()
{
    const iconv_t invalid = (iconv_t)(-1);
    if (cd_ == invalid) {
        return;
    }
    iconv_close(cd_);
    cd_ = invalid;
}
inline bool EncodingConverter::valid() const | |
{ | |
return (cd_ != (iconv_t)(-1)); | |
} | |
inline bool EncodingConverter::convert(const void* input, size_t& inputBytesLeft, | |
void* output, size_t& outputBytesLeft) | |
{ | |
#ifdef _WIN32 | |
const char** inbuf = (const char**)(&input); | |
#else | |
char** inbuf = (char**)(&input); | |
#endif | |
char** outbuf = (char**)(&output); | |
size_t res = iconv(cd_, inbuf, &inputBytesLeft, outbuf, &outputBytesLeft); | |
return (res != (size_t)(-1)); | |
} | |
inline bool EncodingConverter::flush(void* output, size_t& outputBytesLeft) | |
{ | |
char** outbuf = (char**)(&output); | |
size_t res = iconv(cd_, NULL, NULL, outbuf, &outputBytesLeft); | |
return (res != (size_t)(-1)); | |
} | |
inline void EncodingConverter::reset() | |
{ | |
iconv(cd_, NULL, NULL, NULL, NULL); | |
} | |
#endif | |
}; // end of namespace portpp | |
#endif |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment