Created
November 11, 2017 11:00
-
-
Save Minoru/3e6c56d73f61b140af3d63546de0b2e7 to your computer and use it in GitHub Desktop.
Extract <link rel="alternate"> feeds from a webpage (by @noctux; https://paste.xinu.at/RRug/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <curl/curl.h> | |
#include <cstddef> | |
#include <iostream> | |
#include <vector> | |
#include <cstring> | |
#include <libxml/HTMLparser.h> | |
#include <libxml/tree.h> | |
#include <libxml/parser.h> | |
#include <libxml/xpath.h> | |
#include <libxml/xpathInternals.h> | |
// libcurl write callback: appends the received chunk to the std::string
// that `userp` points at, and returns the number of bytes consumed
// (size * nmemb, as libcurl expects on success).
static size_t my_write_data(void *buffer, size_t size, size_t nmemb, void *userp) {
	const size_t total = size * nmemb;
	auto &out = *static_cast<std::string *>(userp);
	out.append(static_cast<const char *>(buffer), total);
	return total;
}
std::string retrieve_url( | |
const std::string& url, | |
void * cfgcont, | |
const std::string& authinfo, | |
const std::string* postdata) | |
{ | |
std::string buf; | |
CURL * easyhandle = curl_easy_init(); | |
curl_easy_setopt(easyhandle, CURLOPT_URL, url.c_str()); | |
curl_easy_setopt(easyhandle, CURLOPT_WRITEFUNCTION, my_write_data); | |
curl_easy_setopt(easyhandle, CURLOPT_WRITEDATA, &buf); | |
if (postdata != nullptr) { | |
curl_easy_setopt(easyhandle, CURLOPT_POST, 1); | |
curl_easy_setopt(easyhandle, CURLOPT_POSTFIELDS, postdata->c_str()); | |
} | |
curl_easy_perform(easyhandle); | |
curl_easy_cleanup(easyhandle); | |
return buf; | |
} | |
// match for: <link rel="alternate" type="application/rss+xml" title="..." href="https://..." /> | |
std::vector<std::string> extract_rss_urls(std::string& url, std::string& html) { | |
// The options will make the extractor quite permissive | |
htmlDocPtr doc = htmlReadMemory(html.c_str(), html.size(), url.c_str(), NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET); | |
std::vector<std::string> feeds; | |
if (!doc) { | |
// TODO: Better error handling, use LOGGER, ... | |
std::cerr << "Failed to parse" << std::endl; | |
return feeds; | |
} | |
/* Create xpath evaluation context */ | |
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); | |
if(xpathCtx == NULL) { | |
std::cerr << "Error: unable to create new XPath context" << std::endl; | |
xmlFreeDoc(doc); | |
return feeds; | |
} | |
const char *xpathExpr = "/html/head/link[@rel='alternate' and (@type='application/atom+xml' or @type='application/rss+xml')]"; | |
xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((const xmlChar *) xpathExpr, xpathCtx); | |
if(xpathObj == NULL) { | |
std::cerr << "Error: unable to evaluate xpath expression \"" << xpathExpr << "\"" << std::endl; | |
xmlXPathFreeContext(xpathCtx); | |
xmlFreeDoc(doc); | |
return feeds; | |
} | |
xmlNodeSetPtr nodes = xpathObj->nodesetval; | |
for(unsigned int i = 0; i < nodes->nodeNr; i++) { | |
xmlNode *cur = nodes->nodeTab[i]; | |
xmlChar *href = xmlGetProp(cur, (const xmlChar *) "href"); | |
if (href) { | |
feeds.push_back(std::string((char *) href)); | |
xmlFree(href); | |
} | |
} | |
xmlXPathFreeObject(xpathObj); | |
xmlXPathFreeContext(xpathCtx); | |
xmlFreeDoc(doc); // free document | |
xmlCleanupParser(); // Free globals | |
return feeds; | |
} | |
int main(int argc, char const* argv[]) | |
{ | |
if(argc < 2) { | |
std::cerr << "usage: extract-url <url>" << std::endl; | |
return 1; | |
} | |
std::string url = argv[1]; | |
std::string html = retrieve_url(url, NULL, "", NULL); | |
for (auto rel : extract_rss_urls(url, html)) { | |
std::cout << "Rel: " << rel << std::endl; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment