Created
February 7, 2024 14:49
-
-
Save vmg/11625faa79574a4d389fb3c04bdd0582 to your computer and use it in GitHub Desktop.
Collation Dumper for MySQL 8+
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Copyright (c) 2023, The Vitess Authors
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License, version 2.0,
as published by the Free Software Foundation.
This program is also distributed with certain software (including
but not limited to OpenSSL) that is licensed under separate terms,
as designated in a particular file or component or in included license
documentation. The authors of MySQL hereby grant you an additional
permission to link the program and your derivative works with the
separately licensed software that they have included with MySQL.
Without limiting anything contained in the foregoing, this file,
which is part of C Driver for MySQL (Connector/C), is also subject to the
Universal FOSS Exception, version 1.0, a copy of which can be found at
http://oss.oracle.com/licenses/universal-foss-exception.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License, version 2.0, for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
#include <assert.h> | |
#include <fcntl.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <sys/types.h> | |
#include "m_ctype.h" | |
#ifdef HAVE_UNISTD_H | |
#include <unistd.h> | |
#endif | |
#include "my_sys.h" | |
#include "my_config.h" | |
#include "my_compiler.h" | |
#include "my_inttypes.h" | |
#include "my_io.h" | |
#include "my_loglevel.h" | |
#include "my_macros.h" | |
#include "str_uca_type.h" | |
#include "rapidjson/rapidjson.h" | |
#include "rapidjson/filewritestream.h" | |
#include "rapidjson/writer.h" | |
template <typename J> | |
static void print_contractions_1(J &json, my_wc_t *path, size_t depth, bool contextual, const MY_CONTRACTION &contraction) | |
{ | |
path[depth] = contraction.ch; | |
if (contraction.is_contraction_tail) | |
{ | |
json.StartObject(); | |
json.Key("Path"); | |
json.StartArray(); | |
for (size_t i = 0; i <= depth; i++) | |
{ | |
json.Uint((unsigned int)path[i]); | |
} | |
json.EndArray(); | |
json.Key("Weights"); | |
json.StartArray(); | |
for (size_t i = 0; i < MY_UCA_MAX_WEIGHT_SIZE; i++) | |
{ | |
json.Uint(contraction.weight[i]); | |
} | |
json.EndArray(); | |
if (contextual) | |
{ | |
json.Key("Contextual"); | |
json.Bool(true); | |
} | |
json.EndObject(); | |
} | |
for (const MY_CONTRACTION &ctr : contraction.child_nodes) | |
{ | |
print_contractions_1(json, path, depth + 1, false, ctr); | |
} | |
for (const MY_CONTRACTION &ctr : contraction.child_nodes_context) | |
{ | |
print_contractions_1(json, path, depth + 1, true, ctr); | |
} | |
} | |
template <typename J> | |
static void print_contractions(J &json, std::vector<MY_CONTRACTION> *contractions) | |
{ | |
my_wc_t path[256]; | |
json.StartArray(); | |
for (const MY_CONTRACTION &ctr : *contractions) | |
{ | |
print_contractions_1(json, path, 0, false, ctr); | |
} | |
json.EndArray(); | |
} | |
/**
 * Writes the reorder records as a JSON array of 4-element arrays:
 * [old begin, old end, new begin, new end].
 *
 * @param json     writer receiving the output
 * @param reorder  source of the records; the first wt_rec_num entries of
 *                 wt_rec are written
 */
template <typename J>
static void print_reorder_params(J &json, struct Reorder_param *reorder)
{
  json.StartArray();

  int index = 0;
  while (index < reorder->wt_rec_num)
  {
    struct Reorder_wt_rec &rec = reorder->wt_rec[index];

    json.StartArray();
    json.Uint(rec.old_wt_bdy.begin);
    json.Uint(rec.old_wt_bdy.end);
    json.Uint(rec.new_wt_bdy.begin);
    json.Uint(rec.new_wt_bdy.end);
    json.EndArray();

    ++index;
  }

  json.EndArray();
}
template <typename J> | |
static void print_unipages(J &json, const MY_UNI_IDX *unicodeidx) | |
{ | |
json.StartArray(); | |
for (const MY_UNI_IDX *idx = unicodeidx; idx->tab != NULL; idx++) | |
{ | |
json.StartObject(); | |
json.Key("From"); | |
json.Uint(idx->from); | |
json.Key("To"); | |
json.Uint(idx->to); | |
json.Key("Tab"); | |
json.StartArray(); | |
const size_t entries = idx->to - idx->from; | |
for (size_t i = 0; i <= entries; i++) | |
{ | |
json.Uint(idx->tab[i]); | |
} | |
json.EndArray(); | |
json.EndObject(); | |
} | |
json.EndArray(); | |
} | |
template <typename J> | |
static void print_uca_weights_900(J &json, int codepoint, uint16 **weights) | |
{ | |
uint16 *page = weights[codepoint >> 8]; | |
if (page == NULL) | |
return; | |
int offset = codepoint & 0xFF; | |
int cecount = page[offset]; | |
char key[32]; | |
snprintf(key, sizeof(key), "U+%04X", codepoint); | |
json.Key(key); | |
json.StartArray(); | |
for (int ce = 0; ce < cecount; ce++) | |
{ | |
json.Uint(page[256 + (ce * 3 + 0) * 256 + offset]); | |
json.Uint(page[256 + (ce * 3 + 1) * 256 + offset]); | |
json.Uint(page[256 + (ce * 3 + 2) * 256 + offset]); | |
} | |
json.EndArray(); | |
} | |
template <typename J> | |
static void print_uca_weights_legacy(J &json, int codepoint, uint16 **weights, uchar *lengths) | |
{ | |
uint16 *page = weights[codepoint >> 8]; | |
if (page == NULL) | |
return; | |
int offset = codepoint & 0xFF; | |
uint16 *w = page + offset * lengths[codepoint >> 8]; | |
if (!w[0]) | |
return; | |
char key[32]; | |
snprintf(key, sizeof(key), "U+%04X", codepoint); | |
json.Key(key); | |
json.StartArray(); | |
for (; w[0]; w++) | |
{ | |
json.Uint(w[0]); | |
} | |
json.EndArray(); | |
} | |
template <typename J> | |
static void print_array_uchar(J &json, const uchar *arr, size_t len) | |
{ | |
json.StartArray(); | |
for (size_t i = 0; i < len; ++i) | |
{ | |
json.Uint(arr[i]); | |
} | |
json.EndArray(); | |
} | |
template <typename J> | |
static void print_array_uint16(J &json, const uint16 *arr, size_t len) | |
{ | |
json.StartArray(); | |
for (size_t i = 0; i < len; ++i) | |
{ | |
json.Uint(arr[i]); | |
} | |
json.EndArray(); | |
} | |
/**
 * Looks up a collation by name through my_collation_get_by_name().
 *
 * @param name  collation name
 * @return whatever my_collation_get_by_name() returns for `name`
 */
static CHARSET_INFO *init_collation(const char *name)
{
  // NOTE(review): the loader is handed over without being initialized here;
  // presumably my_collation_get_by_name() sets it up itself -- confirm.
  MY_CHARSET_LOADER charset_loader;
  CHARSET_INFO *const collation = my_collation_get_by_name(&charset_loader, name, MYF(0));
  return collation;
}
// Exclusive upper bound used when iterating over every code point of a
// UCA 9.0.0 weight table.
#define MY_UCA_MAXCHAR (0x10FFFF + 1)
// Number of code points per weight page.
#define MY_UCA_CHARS_PER_PAGE 256

// Collation handlers defined elsewhere; declared here so their addresses can
// be compared against CHARSET_INFO::coll.
extern MY_COLLATION_HANDLER my_collation_any_uca_handler;
extern MY_COLLATION_HANDLER my_collation_uca_900_handler;
extern MY_COLLATION_HANDLER my_collation_utf16_uca_handler;
extern MY_COLLATION_HANDLER my_collation_utf32_uca_handler;
extern MY_COLLATION_HANDLER my_collation_ucs2_uca_handler;
struct KNOWN_HANDLER | |
{ | |
const char *name; | |
const MY_COLLATION_HANDLER *h; | |
}; | |
// Collation handlers that can be identified by name in the dump. A collation
// whose handler is not listed here gets no "CollationImpl" member.
static KNOWN_HANDLER known_handlers[] = {
    // 8-bit handlers
    {"8bit_bin", &my_collation_8bit_bin_handler},
    {"8bit_simple_ci", &my_collation_8bit_simple_ci_handler},
    // UCA based handlers
    {"any_uca", &my_collation_any_uca_handler},
    {"uca_900", &my_collation_uca_900_handler},
    {"utf16_uca", &my_collation_utf16_uca_handler},
    {"utf32_uca", &my_collation_utf32_uca_handler},
    {"ucs2_uca", &my_collation_ucs2_uca_handler},
};
static int dumpall(const char *dumppath) | |
{ | |
char pathbuf[4096]; | |
char jsonbuf[4096 * 4]; | |
// bootstrap the `all_charsets` collation array | |
init_collation("utf8mb4_0900_ai_ci"); | |
for (const CHARSET_INFO *charset : all_charsets) | |
{ | |
if (!charset || (charset->state & MY_CS_AVAILABLE) == 0) | |
continue; | |
charset = init_collation(charset->m_coll_name); | |
snprintf(pathbuf, sizeof(pathbuf), "%s/%s.json", dumppath, charset->m_coll_name); | |
FILE *jsonfile = fopen(pathbuf, "w"); | |
if (jsonfile == NULL) | |
{ | |
fprintf(stderr, "failed to create '%s'\n", pathbuf); | |
return 1; | |
} | |
rapidjson::FileWriteStream os(jsonfile, jsonbuf, sizeof(jsonbuf)); | |
rapidjson::Writer<rapidjson::FileWriteStream, rapidjson::UTF8<>, rapidjson::ASCII<>> json(os); | |
json.StartObject(); | |
json.Key("Name"); | |
json.String(charset->m_coll_name); | |
json.Key("Charset"); | |
json.String(charset->csname); | |
json.Key("Number"); | |
json.Uint(charset->number); | |
json.Key("Flags"); | |
json.StartObject(); | |
json.Key("Binary"); | |
json.Bool((charset->state & MY_CS_BINSORT) != 0); | |
json.Key("ASCII"); | |
json.Bool((charset->state & MY_CS_PUREASCII) != 0); | |
json.Key("Default"); | |
json.Bool((charset->state & MY_CS_PRIMARY) != 0); | |
json.EndObject(); | |
for (const KNOWN_HANDLER &handler : known_handlers) | |
{ | |
if (charset->coll == handler.h) | |
{ | |
json.Key("CollationImpl"); | |
json.String(handler.name); | |
break; | |
} | |
} | |
if (charset->ctype != NULL) | |
{ | |
json.Key("CType"); | |
print_array_uchar(json, charset->ctype, 256); | |
} | |
if (charset->to_lower != NULL) | |
{ | |
json.Key("ToLower"); | |
print_array_uchar(json, charset->to_lower, 256); | |
} | |
if (charset->to_upper != NULL) | |
{ | |
json.Key("ToUpper"); | |
print_array_uchar(json, charset->to_upper, 256); | |
} | |
if (charset->tab_to_uni != NULL) | |
{ | |
json.Key("TabToUni"); | |
print_array_uint16(json, charset->tab_to_uni, 256); | |
} | |
if (charset->tab_from_uni != NULL) | |
{ | |
json.Key("TabFromUni"); | |
print_unipages(json, charset->tab_from_uni); | |
} | |
if (charset->sort_order != NULL) | |
{ | |
json.Key("SortOrder"); | |
print_array_uchar(json, charset->sort_order, 256); | |
} | |
if (charset->uca != NULL) | |
{ | |
MY_UCA_INFO *uca = charset->uca; | |
json.Key("UCAVersion"); | |
switch (uca->version) | |
{ | |
case UCA_V400: | |
json.Uint(400); | |
break; | |
case UCA_V520: | |
json.Uint(520); | |
break; | |
case UCA_V900: | |
json.Uint(900); | |
break; | |
default: | |
json.Uint(0); | |
break; | |
} | |
json.Key("Weights"); | |
json.StartObject(); | |
if (uca->version == UCA_V900) | |
{ | |
for (my_wc_t cp = 0; cp < MY_UCA_MAXCHAR; cp++) | |
{ | |
print_uca_weights_900(json, cp, uca->weights); | |
} | |
} | |
else | |
{ | |
for (my_wc_t cp = 0; cp < uca->maxchar; cp++) | |
{ | |
print_uca_weights_legacy(json, cp, uca->weights, uca->lengths); | |
} | |
} | |
json.EndObject(); | |
if (uca->have_contractions) | |
{ | |
json.Key("Contractions"); | |
print_contractions(json, uca->contraction_nodes); | |
} | |
} | |
if (charset->coll_param != NULL) | |
{ | |
json.Key("UppercaseFirst"); | |
json.Bool(charset->coll_param->case_first == CASE_FIRST_UPPER); | |
if (charset->coll_param->reorder_param != NULL) | |
{ | |
json.Key("Reorder"); | |
print_reorder_params(json, charset->coll_param->reorder_param); | |
} | |
} | |
json.EndObject(); | |
os.Flush(); | |
fclose(jsonfile); | |
} | |
return 0; | |
} | |
int main(int argc, char **argv) | |
{ | |
if (argc < 2) | |
{ | |
fprintf(stderr, "usage: %s <destination_path>\n", argv[0]); | |
return 1; | |
} | |
return dumpall(argv[1]); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment