Created
November 11, 2017 11:00
-
-
Save Minoru/3e6c56d73f61b140af3d63546de0b2e7 to your computer and use it in GitHub Desktop.
Extract <link rel="alternate"> feeds from a webpage (by @noctux; https://paste.xinu.at/RRug/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <curl/curl.h> | |
#include <cstddef> | |
#include <iostream> | |
#include <vector> | |
#include <cstring> | |
#include <libxml/HTMLparser.h> | |
#include <libxml/tree.h> | |
#include <libxml/parser.h> | |
#include <libxml/xpath.h> | |
#include <libxml/xpathInternals.h> | |
// libcurl write callback: appends the received chunk to the std::string
// that `userp` points at, and returns the number of bytes consumed
// (size * nmemb, as libcurl expects on success).
static size_t my_write_data(void *buffer, size_t size, size_t nmemb, void *userp) {
	const size_t total = size * nmemb;
	auto &out = *static_cast<std::string *>(userp);
	out.append(static_cast<const char *>(buffer), total);
	return total;
}
std::string retrieve_url( | |
const std::string& url, | |
void * cfgcont, | |
const std::string& authinfo, | |
const std::string* postdata) | |
{ | |
std::string buf; | |
CURL * easyhandle = curl_easy_init(); | |
curl_easy_setopt(easyhandle, CURLOPT_URL, url.c_str()); | |
curl_easy_setopt(easyhandle, CURLOPT_WRITEFUNCTION, my_write_data); | |
curl_easy_setopt(easyhandle, CURLOPT_WRITEDATA, &buf); | |
if (postdata != nullptr) { | |
curl_easy_setopt(easyhandle, CURLOPT_POST, 1); | |
curl_easy_setopt(easyhandle, CURLOPT_POSTFIELDS, postdata->c_str()); | |
} | |
curl_easy_perform(easyhandle); | |
curl_easy_cleanup(easyhandle); | |
return buf; | |
} | |
// match for: <link rel="alternate" type="application/rss+xml" title="..." href="https://..." /> | |
std::vector<std::string> extract_rss_urls(std::string& url, std::string& html) { | |
// The options will make the extractor quite permissive | |
htmlDocPtr doc = htmlReadMemory(html.c_str(), html.size(), url.c_str(), NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
std::vector<std::string> feeds; | |
if (!doc) { | |
// TODO: Better error handling, use LOGGER, ... | |
std::cerr << "Failed to parse" << std::endl; | |
return feeds; | |
} | |
/* Create xpath evaluation context */ | |
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); | |
if(xpathCtx == NULL) { | |
std::cerr << "Error: unable to create new XPath context" << std::endl; | |
xmlFreeDoc(doc); | |
return feeds; | |
} | |
const char *xpathExpr = "/html/head/link[@rel='alternate' and (@type='application/atom+xml' or @type='application/rss+xml')]"; | |
xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((const xmlChar *) xpathExpr, xpathCtx); | |
if(xpathObj == NULL) { | |
std::cerr << "Error: unable to evaluate xpath expression \"" << xpathExpr << "\"" << std::endl; | |
xmlXPathFreeContext(xpathCtx); | |
xmlFreeDoc(doc); | |
return feeds; | |
} | |
xmlNodeSetPtr nodes = xpathObj->nodesetval; | |
for(unsigned int i = 0; i < nodes->nodeNr; i++) { | |
xmlNode *cur = nodes->nodeTab[i]; | |
xmlChar *href = xmlGetProp(cur, (const xmlChar *) "href"); | |
if (href) { | |
feeds.push_back(std::string((char *) href)); | |
xmlFree(href); | |
} | |
} | |
xmlXPathFreeObject(xpathObj); | |
xmlXPathFreeContext(xpathCtx); | |
xmlFreeDoc(doc); // free document | |
xmlCleanupParser(); // Free globals | |
return feeds; | |
} | |
int main(int argc, char const* argv[]) | |
{ | |
if(argc < 2) { | |
std::cerr << "usage: extract-url <url>" << std::endl; | |
return 1; | |
} | |
std::string url = argv[1]; | |
std::string html = retrieve_url(url, NULL, "", NULL); | |
for (auto rel : extract_rss_urls(url, html)) { | |
std::cout << "Rel: " << rel << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment