Created
March 13, 2022 10:07
-
-
Save cjheath/a148f710be0007057061eaa5fa8013d2 to your computer and use it in GitHub Desktop.
A C++ class that encapsulates a pointer to UTF-8 data so that "normal" char-pointer operations work on multi-byte UTF-8 characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// See https://github.com/cjheath/strpp/blob/main/include/char_encoding.h
#include <char_encoding.h> | |
// This MOSTLY works. I still need a way to make both of these work:
//     UCS4 ch = *ptr++;
//     *ptr++ = ch;        // <<<< This is the tricky one.
class UTF8P | |
{ | |
private: | |
UTF8* data; | |
public: | |
UTF8P(UTF8* s) : data(s) {} // Normal constructor | |
UTF8P(UTF8P& c) : data(c.data) {} // Copy constructor | |
UTF8P(const UTF8P& c) : data(c.data) {} // Copy constructor | |
~UTF8P() {}; | |
UTF8P& operator=(UTF8* s) // Assignment | |
{ data = s; return *this; } | |
operator UTF8*() { return data; } // Access the UTF8 bytes | |
UCS4 operator*() // Dereference to char under the pointer | |
{ const UTF8* s = data; return UTF8Get(s); } | |
static int len(UCS4 ch) // Length in bytes of this UCS4 character | |
{ return UTF8Len(ch); } | |
int len() // length in bytes of character under the pointer | |
{ return UTF8Len(data); } | |
static bool is1st(UTF8* s) // Is this looking at the start of a UTF8 character? | |
{ return UTF8Is1st(*s); } | |
bool is1st() // Are we looking at the start of a UTF8 character? | |
{ return UTF8Is1st(*data); } | |
// Add and subtract integers: | |
UTF8P& operator+=(int i) | |
{ const UTF8* s = data; | |
while (i > 0) { UTF8Get(s); i--;} // Advance | |
while (i < 0) { s = UTF8Backup(s); i++;} // Or backup | |
data = (UTF8*)s; return *this; | |
} | |
UTF8P operator+(int i) { UTF8P t(*this); t += i; return t; } | |
UTF8P operator-=(int i) { return *this += -i; } | |
UTF8P operator-(int i) { UTF8P t(*this); t += -i; return t; } | |
// incr/decr functions: | |
UTF8P& preincr() { const UTF8* s = data; UTF8Get(s); data = (UTF8*)s; return *this; } | |
UTF8P postincr() { UTF8P save(*this); ++*this; return save; } | |
UTF8P& predecr() { data = (UTF8*)UTF8Backup(data); return *this; } | |
UTF8P postdecr() { UTF8P save(*this); --*this; return save; } | |
// incr/decr operators: | |
UTF8P& operator++() { return preincr(); } | |
UTF8P operator++(int) { return postincr(); } | |
UTF8P& operator--() { return predecr(); } | |
UTF8P operator--(int) { return postdecr(); } | |
// Store a character, advancing the pointer (like *ptr++ = ch) | |
UTF8P& put(UCS4 ch) { UTF8Put(data, ch); return *this; } | |
// This is not what I need, I want "*ptr++ = ch;" to do the right thing. I think that means the operator* and operator-> must be special | |
UTF8P& operator=(UCS4 ch) { return put(ch); } | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment