Created
July 9, 2019 21:41
-
-
Save ScumCoder/4a0ed5a90cd5c95e5df8174e6a1f0184 to your computer and use it in GitHub Desktop.
SSCCE for Gumbo parsing issue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cassert> | |
#include <gumbo.h> | |
int main() | |
{ | |
// Result is the same if there is no doctype, or if some of the nodes are missing | |
const char *data = "<!DOCTYPE html>\n<html>\n<head>\n</head>\n<body>\n</body>\n</html>"; | |
GumboOutput *output = gumbo_parse(data); | |
// Following is just getting to the problematic node: | |
assert(output->root->type == GUMBO_NODE_ELEMENT); | |
const GumboElement &htmlNode = output->root->v.element; | |
assert(htmlNode.tag == GUMBO_TAG_HTML); | |
std::cout << "Root element is HTML and has " << htmlNode.children.length << " children" << std::endl; | |
assert(htmlNode.children.length > 2); | |
GumboNode *bodyNodeCont = static_cast<GumboNode*>(htmlNode.children.data[2]); | |
assert(bodyNodeCont->type == GUMBO_NODE_ELEMENT); | |
const GumboElement &bodyNode = bodyNodeCont->v.element; | |
assert(bodyNode.tag == GUMBO_TAG_BODY); | |
std::cout << "3rd of them is BODY which has " << bodyNode.children.length << " children" << std::endl; | |
assert(bodyNode.children.length > 0); | |
GumboNode *whitespaceCont = static_cast<GumboNode*>(bodyNode.children.data[0]); | |
assert(whitespaceCont->type == GUMBO_NODE_WHITESPACE); | |
const GumboText &whitespace = whitespaceCont->v.text; | |
// ...and now the problem itself: | |
std::cout << "1st of them is WHITESPACE which looks like this: \"" | |
<< whitespace.text << "\", and original is " | |
<< whitespace.original_text.length << " bytes long and looks like this: \"" | |
<< std::string(whitespace.original_text.data, whitespace.original_text.length) | |
<< "\"" << std::endl; | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Root element is HTML and has 3 children | |
3rd of them is BODY which has 1 children | |
1st of them is WHITESPACE which looks like this: " | |
", and original is 16 bytes long and looks like this: " | |
</body> | |
</html>" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment