Skip to content

Instantly share code, notes, and snippets.

@d-tux
Created October 17, 2009 14:19
Show Gist options
  • Save d-tux/212349 to your computer and use it in GitHub Desktop.
Save d-tux/212349 to your computer and use it in GitHub Desktop.
#include <libxml/tree.h>
#include <libxml/xpath.h>
#include <libxml/HTMLparser.h>
#include <stdio.h>
/* Path of the HTML file this program reads and scans for links. */
#define FILENAME "index.html"
/*
 * Initializes the libxml2 parser by calling xmlInitParser().
 * Called once at program start, before any parsing is done.
 */
static inline
void initialize(void)
{
    xmlInitParser() ;
}
/*
 * Releases libxml2 parser resources by calling xmlCleanupParser().
 * Called once at program exit, after every document and XPath object has
 * been freed.
 */
static inline
void shutdown(void)
{
    xmlCleanupParser() ;
}
/*
 * Parses the HTML file at `filename` into a libxml2 document tree.
 *
 * Parser warnings and errors are suppressed and the parser is asked to
 * recover from malformed markup.
 *
 * filename: path of the HTML file to read.
 *
 * Returns the parsed document, or NULL if `filename` is NULL, the parser
 * context cannot be created, or the file cannot be read. The caller owns the
 * returned document and releases it with xmlFreeDoc().
 */
static
htmlDocPtr parseHtmlDocument(const char *filename)
{
    const int options = HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR | HTML_PARSE_RECOVER ;
    htmlParserCtxtPtr parser_context ;
    htmlDocPtr document ;

    if (filename == NULL) {
        return NULL ;
    }

    parser_context = htmlNewParserCtxt() ;
    if (parser_context == NULL) {
        return NULL ;
    }

    /* Read the file named by the caller rather than the FILENAME macro. */
    document = htmlCtxtReadFile(parser_context, filename, NULL, options) ;
    htmlFreeParserCtxt(parser_context) ;
    return document ;
}
/*
 * Evaluates the XPath expression `xpath_query` against `document`.
 *
 * document:    document tree to search.
 * xpath_query: NUL-terminated XPath expression.
 *
 * Returns the XPath result object, or NULL if `xpath_query` is NULL, the
 * XPath context cannot be created, or evaluation fails. The caller owns the
 * result and releases it with xmlXPathFreeObject().
 */
static
xmlXPathObjectPtr findNodes(htmlDocPtr document, const char *xpath_query)
{
    xmlXPathContextPtr xpath_ctx ;
    xmlXPathObjectPtr nodes ;

    if (xpath_query == NULL) {
        return NULL ;
    }

    xpath_ctx = xmlXPathNewContext(document) ;
    if (xpath_ctx == NULL) {
        return NULL ;
    }

    /* libxml2 takes strings as xmlChar (unsigned char), so cast explicitly. */
    nodes = xmlXPathEvalExpression((const xmlChar *)xpath_query, xpath_ctx) ;
    xmlXPathFreeContext(xpath_ctx) ;
    return nodes ;
}
/*
 * Callback invoked by eachNode() for every node of an XPath node set.
 * `node` is the current node; `data` is the pointer handed to eachNode(),
 * passed through unchanged.
 */
typedef void (*node_function_t)(xmlNodePtr node, void *data) ;
/*
 * Prints the value of the "href" attribute of `node` to stdout.
 *
 * Nothing is printed when `node` is NULL, is not an element node, or has no
 * "href" attribute.
 *
 * node: node to inspect.
 * data: unused; present to match node_function_t.
 */
void printLinkNode(xmlNodePtr node, void *data)
{
    xmlChar *href ;

    (void)data ;

    if (node == NULL || node->type != XML_ELEMENT_NODE) {
        return ;
    }

    /* xmlGetProp() returns NULL when the attribute is absent, so a separate
     * xmlHasProp() lookup is not needed. */
    href = xmlGetProp(node, (const xmlChar *)"href") ;
    if (href != NULL) {
        printf("-> Link to '%s'\n", (const char *)href) ;
        /* The returned string is a copy owned by the caller. */
        xmlFree(href) ;
    }
}
/*
 * Calls `f(node, data)` for every node in the node set of `nodes`, in
 * node-set order.
 *
 * Does nothing when `nodes` or `f` is NULL, or when the result carries no
 * node set or an empty one.
 *
 * nodes: XPath result object to iterate over.
 * f:     callback invoked once per node.
 * data:  opaque pointer passed through to `f` unchanged.
 */
static
void eachNode(xmlXPathObjectPtr nodes, node_function_t f, void *data)
{
    xmlNodeSetPtr nodeset ;
    int i, size ;

    if (nodes == NULL || f == NULL) {
        return ;
    }

    nodeset = nodes->nodesetval ;
    /* The macro also covers a NULL node set, which would otherwise be
     * dereferenced below. */
    if (xmlXPathNodeSetIsEmpty(nodeset)) {
        return ;
    }

    size = nodeset->nodeNr ;
    for (i = 0 ; i < size ; i++) {
        f(nodeset->nodeTab[i], data) ;
    }
}
int main(int argc, char *argv[])
{
initialize() ;
htmlDocPtr document = parseHtmlDocument(FILENAME) ;
xmlXPathObjectPtr nodes = findNodes(document, "/html/body//a") ;
eachNode(nodes, printLinkNode, NULL) ;
xmlXPathFreeObject(nodes) ;
xmlFreeDoc(document) ;
shutdown() ;
return 0 ;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment