A simple XML sitemap crawler using the POCO C++ Libraries.
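The crawler downloads the sitemap from the configured host, collects every <loc> URL it contains, splits the URL set into one chunk per thread, and then fetches each chunk from its own std::thread, printing the HTTP status of every request.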
#include "Poco/DOM/DOMParser.h" | |
#include "Poco/DOM/Document.h" | |
#include "Poco/DOM/AutoPtr.h" | |
#include "Poco/SAX/InputSource.h" | |
#include "Poco/Net/HTTPClientSession.h" | |
#include "Poco/Net/HTTPRequest.h" | |
#include "Poco/Net/HTTPResponse.h" | |
#include "Poco/Net/DNS.h" | |
#include "Poco/DOM/ElementsByTagNameList.h" | |
#include <set> | |
#include <string> | |
#include <iostream> | |
#include <algorithm> | |
#include <tuple> | |
#include <cassert> | |
#include <thread> | |
#include <chrono> | |
#include <unistd.h> | |
using namespace std; | |
using namespace Poco::Net; | |
using namespace Poco::XML; | |
typedef vector<string> ChunkUrl;

const string DomainName = "www.domain.com";
const string SitemapPath = "/sitemap.xml";
const int Threads = 48;
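// The sitemap is expected to follow the standard sitemaps.org protocol;
// only the <loc> elements are read, everything else is ignored. A minimal
// example of what parseSiteMap() consumes:
//
//   <?xml version="1.0" encoding="UTF-8"?>
//   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
//     <url>
//       <loc>http://www.domain.com/page.html</loc>
//     </url>
//   </urlset>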
set<string> parseSiteMap() {
    set<string> urls;
    HTTPClientSession session { DomainName };
    HTTPRequest request(HTTPRequest::HTTP_GET, SitemapPath);
    session.sendRequest(request);
    HTTPResponse response;
    auto &stream = session.receiveResponse(response);
    InputSource src { stream };
    DOMParser parser;
    AutoPtr<Document> dom = parser.parse(&src);
    // getElementsByTagName returns a NodeList that must be released;
    // holding it in an AutoPtr takes care of that.
    AutoPtr<NodeList> list = dom->getElementsByTagName("loc");
    const string http { "http://" };
    for (unsigned long index = 0; index < list->length(); ++index) {
        auto url = list->item(index)->innerText();
        // POCO's HTTPClientSession expects a bare host, not a URL with a
        // scheme, so we strip the leading http:// if present.
        if (!url.compare(0, http.size(), http)) {
            url.erase(0, http.size());
        }
        urls.insert(move(url));
    }
    return urls;
}
// Splits a full URL into a (host, resource) pair. For example,
// www.mydomain.com/index.html splits into
// (www.mydomain.com, /index.html). The scheme has already been
// stripped by parseSiteMap, so the first '/' separates host and path.
pair<string, string> extractUrlDomain(const string &url) {
    auto sepOffset = url.find('/');
    if (sepOffset == string::npos) {
        return make_pair(url, string { '/' });
    } else {
        auto domain = url.substr(0, sepOffset);
        auto page = url.substr(sepOffset);
        return make_pair(domain, page);
    }
}
void doRequests(ChunkUrl &&chunk) {
    for (const auto &url: chunk) {
        string domain, page;
        tie(domain, page) = extractUrlDomain(url);
        try {
            HTTPClientSession session(domain);
            HTTPRequest request(HTTPRequest::HTTP_GET, page);
            session.sendRequest(request);
            HTTPResponse response;
            session.receiveResponse(response);
            auto status = response.getStatus();
            if (status == HTTPResponse::HTTP_OK) {
                printf("Fetched URL '%s' [HTTP_OK]\n",
                       url.c_str());
            } else {
                printf("Fetched URL '%s' FAIL [%d]\n",
                       url.c_str(),
                       static_cast<int>(status));
            }
        } catch (const exception &e) {
            fprintf(stderr, "Failed to fetch URL '%s' [%s]\n",
                    url.c_str(), e.what());
        }
    }
}
void launchThreads(vector<ChunkUrl> &chunks) {
    // We do not want fewer chunks than
    // threads, otherwise boom.
    assert(chunks.size() >= static_cast<size_t>(Threads));
    vector<thread> threads;
    threads.reserve(Threads);
    // Let's create the threads. The chunks are moved into
    // their threads, not copied, which is why this function
    // takes a non-const reference.
    for (auto &chunk: chunks) {
        threads.push_back(thread(doRequests, move(chunk)));
    }
    // And let's now wait for all threads to finish
    for (auto &thr: threads) {
        thr.join();
    }
}
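// Splits the URL set into exactly `Threads` chunks: the first Threads - 1
// chunks each hold urls.size() / Threads URLs and the last chunk takes
// whatever remains. For example (illustrative numbers), 100 URLs across
// 48 threads gives 47 chunks of 2 URLs plus a final chunk of 6.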
vector<ChunkUrl> makeChunks(const set<string> &urls) {
    const size_t chunksSize = urls.size() / Threads;
    vector<ChunkUrl> chunks;
    chunks.reserve(Threads);
    auto chunkIt = begin(urls);
    // Let's first handle the equal-sized
    // chunks
    for (size_t i = 0; i < Threads - 1; ++i) {
        ChunkUrl chunk;
        chunk.reserve(chunksSize);
        copy_n(chunkIt, chunksSize, back_inserter(chunk));
        chunks.push_back(move(chunk));
        advance(chunkIt, chunksSize);
    }
    // Let's now handle the last chunk, which picks up the remainder
    ChunkUrl lastChunk;
    copy(chunkIt, end(urls), back_inserter(lastChunk));
    chunks.push_back(move(lastChunk));
    return chunks;
}
int main() {
    auto urls = parseSiteMap();
    auto chunks = makeChunks(urls);
    launchThreads(chunks);
    return 0;
}
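To build and run, something along these lines should work (a sketch, assuming the POCO Net, XML, and Foundation libraries are installed system-wide and the file is saved as sitemap_crawler.cpp, a hypothetical name; adjust paths and library names for your setup):

g++ -std=c++11 -o sitemap_crawler sitemap_crawler.cpp -lPocoNet -lPocoXML -lPocoFoundation -pthread

Remember to point DomainName and SitemapPath at the site you actually want to crawl before compiling.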