Skip to content

Instantly share code, notes, and snippets.

@flavorjones
Last active August 11, 2023 21:10
Show Gist options
  • Save flavorjones/29179057f57ea85e84b5cca3bbfdb894 to your computer and use it in GitHub Desktop.
Save flavorjones/29179057f57ea85e84b5cca3bbfdb894 to your computer and use it in GitHub Desktop.
2023-08-11 reproduction of libxml2 encoding behavior changes

see sparklemotion/nokogiri#2947

output:

using libxml2 version 21200-GITv2.11.0-70-gd38e73f9
vanilla.html:
  htmlReadFile encoding: UTF-8
  htmlReadMemory encoding: UTF-8
bad-charset.html:
  htmlReadFile encoding: UTF-8
  htmlReadMemory encoding: UTF-8

using libxml2 version 21200-GITv2.11.0-71-gec7be506
vanilla.html:
  htmlReadFile encoding: UTF-8
  htmlReadMemory encoding: (null)
bad-charset.html:
  htmlReadFile encoding: XXXXX
  htmlReadMemory encoding: XXXXX
<!-- Fixture (presumably bad-charset.html): the meta tag below declares the charset "XXXXX". -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=XXXXX">
</head>
</html>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libxml/HTMLparser.h>
/*
 * Return a printable name for the encoding recorded on `doc`.
 *
 * `doc->encoding` may be NULL; passing NULL to a "%s" conversion is undefined
 * behaviour, so substitute the literal "(null)" in that case.
 */
static const char *encoding_label(htmlDocPtr doc)
{
    return doc->encoding != NULL ? (const char *)doc->encoding : "(null)";
}

/*
 * Read the whole of `path` into a freshly allocated, NUL-terminated buffer.
 *
 * On success returns the buffer (caller must free()) and stores the number of
 * bytes actually read in `*length_out`. On failure prints a diagnostic via
 * perror() and returns NULL.
 */
static char *read_whole_file(const char *path, size_t *length_out)
{
    char *buffer = NULL;
    long size;
    size_t got;

    /* Binary mode so the size from ftell() matches what fread() delivers. */
    FILE *file = fopen(path, "rb");
    if (file == NULL) {
        perror(path);
        return NULL;
    }

    if (fseek(file, 0, SEEK_END) != 0) {
        goto fail;
    }
    size = ftell(file);
    if (size < 0 || fseek(file, 0, SEEK_SET) != 0) {
        goto fail;
    }

    buffer = malloc((size_t)size + 1);
    if (buffer == NULL) {
        goto fail;
    }

    got = fread(buffer, 1, (size_t)size, file);
    if (ferror(file)) {
        goto fail;
    }
    fclose(file);

    /* Terminate at the number of bytes really read, not the expected size. */
    buffer[got] = '\0';
    *length_out = got;
    return buffer;

fail:
    perror(path);
    free(buffer);
    fclose(file);
    return NULL;
}

/*
 * For each file named on the command line, parse it once with htmlReadFile()
 * and once with htmlReadMemory() (both with "UTF-8" passed as the encoding
 * argument) and print the encoding recorded on the resulting document.
 *
 * Returns 0 when every file was parsed both ways, 1 on usage error or when
 * any file could not be read or parsed.
 */
int main(int argc, char **argv)
{
    int status = 0;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <documentfile> [<documentfile2> ...]\n", argv[0]);
        return 1;
    }

    fprintf(stderr, "using libxml2 version %s\n", xmlParserVersion);

    for (int j = 1; j < argc; j++) {
        const char *documentfile = argv[j];
        htmlDocPtr document;
        char *buffer;
        size_t length = 0;

        fprintf(stderr, "%s:\n", documentfile);

        document = htmlReadFile(documentfile, "UTF-8", 0);
        if (document == NULL) {
            fprintf(stderr, " htmlReadFile failed\n");
            status = 1;
        } else {
            fprintf(stderr, " htmlReadFile encoding: %s\n", encoding_label(document));
            xmlFreeDoc(document);
        }

        /* read the file into a C string and parse with htmlReadMemory */
        buffer = read_whole_file(documentfile, &length);
        if (buffer == NULL) {
            status = 1;
            continue;
        }

        /* htmlReadMemory() takes the size as an int. */
        if (length > (size_t)INT_MAX) {
            fprintf(stderr, " %s: file too large for htmlReadMemory\n", documentfile);
            free(buffer);
            status = 1;
            continue;
        }

        document = htmlReadMemory(buffer, (int)length, NULL, "UTF-8", 0);
        if (document == NULL) {
            fprintf(stderr, " htmlReadMemory failed\n");
            status = 1;
        } else {
            fprintf(stderr, " htmlReadMemory encoding: %s\n", encoding_label(document));
            xmlFreeDoc(document);
        }

        free(buffer);
    }

    return status;
}
# Build the reproduction program against the libxml2 installed under ./local.
# ":=" runs xml2-config once when the Makefile is read rather than on every
# expansion of the variable.
CFLAGS := $(shell ./local/bin/xml2-config --cflags)
LDFLAGS := $(shell ./local/bin/xml2-config --libs)

# "clean" names an action, not a file.
.PHONY: clean

# Link step: the libraries follow the object file on the command line.
foo: foo.o
	$(CC) -o foo foo.o $(LDFLAGS)

foo.o: foo.c
	$(CC) $(CFLAGS) -c -o foo.o foo.c

clean:
	rm -f foo.o foo
<!-- Fixture (presumably vanilla.html): the meta tag below sets a Content-Type with no charset parameter. -->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html">
</head>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment