berlinbrown · January 21, 2013 05:05 · gauravsinha7 · May 1, 2016 · Dvdhyh · May 29, 2016
diff --git a/gistfile1.cpp b/gistfile1.cpp
 //============================================================================
 // Name        : OctaneCrawler.cpp
 // Author      : Berlin Brown (berlin dot brown at gmail.com)
 // Version     :
 // Copyright   : Copyright Berlin Brown 2012-2013
 // License     : BSD
 // Description : This is the simplest possible web crawler in C++
 //               Uses boost_regex and boost_algorithm
 //============================================================================

 #include <cctype>
 #include <cstdarg>
 #include <cstdio>
 #include <cstring>
 #include <exception>
 #include <iostream>
 #include <set>
 #include <string>
 #include <typeinfo>
 #include <vector>
 #include <iostream>
 #include <fstream>

 #include <boost/regex.hpp>
 #include <boost/algorithm/string.hpp>

 #include <sys/types.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>

 #include <errno.h>
 #include <fcntl.h>
 #include <netdb.h>
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <time.h>

 using namespace std;
 using namespace boost;

 const int DELAY = 12;
 const int MAXRECV = 140 * 1024;
 const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store";

 // Splits an href found in a crawled document into the host to contact and the
 // page to request from it.
 //
 // Only absolute URLs using the "http://" scheme (matched case-insensitively)
 // carry their own host; every other href is treated as a page on the host that
 // served the current document.
 class WebPage {
 public:
    // Host to contact: the lower-cased host of an absolute URL, otherwise the
    // originating host handed to parse()/parseHref().
    std::string hostname;
    // Resource to request. For absolute URLs this is the text after the first
    // '/' that follows the host (no leading slash); parse() stores "/" when
    // nothing else is available.
    std::string page;

    WebPage() {
    }

    // Returns the lower-cased host of an absolute "http://" URL, i.e. the text
    // between the scheme and the first following '/' (or the end of 'str').
    // Returns "" when 'str' does not start with "http://" or names no host.
    std::string parseHttp(const std::string str) {
        const std::string::size_type start = hostStart(str);
        if (start == std::string::npos) {
            return "";
        }
        const std::string::size_type slash = str.find('/', start);
        const std::string::size_type length =
                (slash == std::string::npos) ? std::string::npos : slash - start;
        return toLower(str.substr(start, length));
    } // End of method //

    // Fills hostname/page from an absolute URL of the form "http://host/page".
    // The host ends at the FIRST '/' after the scheme, so multi-segment pages
    // such as "a/b.html" stay intact. When 'str' is not of that form (no http
    // prefix, or no '/' after the host) hostname falls back to 'orig_host' and
    // page is left empty.
    void parseHref(const std::string orig_host, const std::string str) {
        const std::string::size_type start = hostStart(str);
        const std::string::size_type slash =
                (start == std::string::npos) ? std::string::npos : str.find('/', start);
        if (slash == std::string::npos) {
            // We could not find the 'page' but we can build the hostname
            hostname = orig_host;
            page = "";
            return;
        }
        hostname = toLower(str.substr(start, slash - start));
        page = str.substr(slash + 1);
    } // End of method //

    // Populates hostname and page for the href 'hrf' found on a document that
    // was served by 'orig_host'. Absolute http URLs supply their own host;
    // anything else is kept verbatim as the page on 'orig_host'. An empty page
    // is replaced by "/".
    void parse(const std::string orig_host, const std::string hrf) {
        const std::string hst = parseHttp(hrf);
        if (!hst.empty()) {
            parseHref(hst, hrf);
        } else {
            hostname = orig_host;
            page = hrf;
        }
        if (page.empty()) {
            page = "/";
        }
    } // End of the method

 private:
    // Returns the index just past a leading "http://" (any letter case), or
    // npos when 'url' does not start with that scheme.
    static std::string::size_type hostStart(const std::string &url) {
        static const char scheme[] = "http://";
        const std::string::size_type scheme_len = sizeof(scheme) - 1;
        if (url.size() < scheme_len) {
            return std::string::npos;
        }
        for (std::string::size_type i = 0; i < scheme_len; ++i) {
            // Cast through unsigned char: tolower() is undefined for negative values.
            if (std::tolower(static_cast<unsigned char>(url[i])) != scheme[i]) {
                return std::string::npos;
            }
        }
        return scheme_len;
    }

    // Returns a copy of 'text' with every character lower-cased via tolower().
    static std::string toLower(std::string text) {
        for (std::string::size_type i = 0; i < text.size(); ++i) {
            text[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(text[i])));
        }
        return text;
    }
 }; // End of the class

 // printf-style formatting into a std::string.
 //
 // 'fmt' is a printf format string and the variadic arguments must match its
 // conversion specifiers (pass C strings, not std::string, for %s).
 // Returns the formatted text, or an empty string when vsnprintf reports an
 // error.
 //
 // 'fmt' is taken by value on purpose: calling va_start() with a parameter of
 // reference type is undefined behaviour.
 std::string string_format(const std::string fmt, ...) {
    va_list ap;

    // First pass: a zero-sized buffer makes vsnprintf report the length needed.
    va_start(ap, fmt);
    const int needed = vsnprintf(NULL, 0, fmt.c_str(), ap);
    va_end(ap);
    if (needed < 0) {
        return std::string();
    }

    // Second pass: format into writable storage (one extra byte for the
    // terminating NUL that vsnprintf always writes).
    std::string str(static_cast<std::string::size_type>(needed) + 1, '\0');
    va_start(ap, fmt);
    const int written = vsnprintf(&str[0], str.size(), fmt.c_str(), ap);
    va_end(ap);
    if (written < 0) {
        return std::string();
    }
    str.resize(static_cast<std::string::size_type>(written));
    return str;
 } // End of the function //

 // Builds the text of an HTTP/1.1 GET request for 'path' on 'host'.
 // The result carries Host, Accept and User-Agent headers, asks the server to
 // close the connection after responding, and ends with the blank line that
 // terminates the header section.
 std::string request(std::string host, std::string path) {
    // Request line and the Host header, which depend on the arguments.
    std::string message("GET ");
    message += path;
    message += " HTTP/1.1\r\n";
    message += "Host: ";
    message += host;
    message += "\r\n";

    // Fixed headers shared by every request.
    message += "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81\r\n";
    message += "User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)\r\n";
    message += "Connection: close\r\n";

    // Empty line: end of headers.
    message += "\r\n";
    return message;
 } // End of the function //

 std::string clean_href(const std::string host, const std::string path) {
    // Clean the href to save to file //
    std::string full_url = host;
    full_url.append("/");
    full_url.append(path);
    const boost::regex rmv_all("[^a-zA-Z0-9]");
    const std::string s2 = boost::regex_replace(full_url, rmv_all, "_");
    cout << s2 << endl;
    return s2;
 }

 // Returns the set of "host + path" keys that connect() has already requested
 // in this process, so that pages linking to each other are not crawled forever.
 static std::set<std::string> &visited_urls() {
    static std::set<std::string> seen;
    return seen;
 }

 // Resolves 'host' (a name or a numeric address) and opens a TCP connection to
 // 'port'. Returns the connected socket descriptor, or -1 when resolution or
 // every connection attempt fails.
 static int open_http_socket(const std::string &host, const char *port) {
    addrinfo hints;
    memset(&hints, 0, sizeof(hints));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;

    addrinfo *candidates = NULL;
    if (getaddrinfo(host.c_str(), port, &hints, &candidates) != 0) {
        return -1;
    }
    int sock = -1;
    for (addrinfo *it = candidates; it != NULL; it = it->ai_next) {
        sock = socket(it->ai_family, it->ai_socktype, it->ai_protocol);
        if (sock == -1) {
            continue;
        }
        if (::connect(sock, it->ai_addr, it->ai_addrlen) == 0) {
            break; // Connected; keep this descriptor.
        }
        ::close(sock);
        sock = -1;
    }
    freeaddrinfo(candidates);
    return sock;
 }

 // Sends all of 'data' on 'sock', retrying after partial writes and EINTR.
 // Returns false when send() fails.
 static bool send_all(int sock, const std::string &data) {
    std::string::size_type sent = 0;
    while (sent < data.size()) {
        const ssize_t n = ::send(sock, data.data() + sent, data.size() - sent, MSG_NOSIGNAL);
        if (n > 0) {
            sent += static_cast<std::string::size_type>(n);
        } else if (!(n == -1 && errno == EINTR)) {
            return false;
        }
    }
    return true;
 }

 // Appends everything the peer sends on 'sock' to 'response' until the peer
 // closes the connection. Returns false when recv() fails.
 static bool receive_all(int sock, std::string &response) {
    // Heap buffer: connect() recurses, so a large stack array is avoided.
    std::vector<char> chunk(MAXRECV);
    for (;;) {
        const ssize_t n = ::recv(sock, &chunk[0], chunk.size(), 0);
        if (n == 0) {
            return true;
        }
        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            return false;
        }
        // Append exactly the bytes received; the data is not NUL-terminated.
        response.append(&chunk[0], static_cast<std::string::size_type>(n));
    }
 }

 // Returns the href values of the anchor tags in 'html', in document order.
 // Matches <a ... href="..."> and <a ... href='...'> case-insensitively.
 // Empty hrefs, fragment-only hrefs ("#...") and hrefs containing whitespace
 // (which cannot be placed on an HTTP request line) are skipped.
 static std::vector<std::string> extract_hrefs(const std::string &html) {
    const boost::regex anchor("(?i)<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')");
    std::vector<std::string> hrefs;
    const boost::sregex_iterator end;
    for (boost::sregex_iterator it(html.begin(), html.end(), anchor); it != end; ++it) {
        const boost::smatch &m = *it;
        // Group 1 holds a double-quoted value, group 2 a single-quoted one.
        const std::string href = m[1].matched ? m[1].str() : m[2].str();
        if (href.empty() || href[0] == '#') {
            continue;
        }
        if (href.find_first_of(" \t\r\n") != std::string::npos) {
            continue;
        }
        hrefs.push_back(href);
    }
    return hrefs;
 }

 // Fetches http://<host><path> on port 80, stores the raw response (headers
 // included) in a file under WRITE_DIR_PATH, then recursively fetches every
 // anchor href found in the response, sleeping DELAY seconds before each one.
 //
 // Each host/path pair is requested at most once per process.
 // Relative hrefs are requested relative to the server root, not to the
 // directory of the current page.
 //
 // Returns 1 when the page was fetched (or had already been fetched), 0 when
 // the connection or the transfer failed.
 int connect(const std::string host, const std::string path) {
    if (!visited_urls().insert(host + path).second) {
        return 1; // Already requested during this run.
    }

    const int m_sock = open_http_socket(host, "80");
    if (m_sock == -1) {
        std::cout << "Error: could not connect to " << host << std::endl;
        return 0;
    }

    // HTTP/1.1 defines the "close" connection option for
    // the sender to signal that the connection will be closed
    // after completion of the response.
    const std::string req = request(host, path);
    std::cout << "Request: " << req << std::endl;
    std::cout << "=========================" << std::endl;

    std::string response;
    const bool transferred = send_all(m_sock, req) && receive_all(m_sock, response);
    ::close(m_sock);
    if (!transferred) {
        std::cout << "Error: transfer with " << host << " failed" << std::endl;
        return 0;
    }
    std::cout << "Response:" << response << std::endl;
    std::cout << "---------------------------" << std::endl;

    // Attempt to write to file //
    const std::string html_file_write = string_format("%s/%s", WRITE_DIR_PATH.c_str(), clean_href(host, path).c_str());
    std::cout << "Writing to file : " << html_file_write << std::endl;
    std::ofstream outfile(html_file_write.c_str());
    outfile << response << std::endl;
    outfile.close();
    if (!outfile) {
        // Keep crawling: a failed save should not stop link discovery.
        std::cout << "Error: could not write " << html_file_write << std::endl;
    }

    // Parse the data //
    std::vector<std::string> hrefs;
    try {
        hrefs = extract_hrefs(response);
    } catch (const std::exception &e) {
        std::cout << "Error: " << e.what() << "\n";
    } // End of the try - catch //
    // Release the page before recursing so nested calls do not pile up bodies.
    std::string().swap(response);

    for (std::vector<std::string>::size_type i = 0; i < hrefs.size(); ++i) {
        WebPage page;
        page.parse(host, hrefs[i]);
        // Request paths need exactly one leading slash.
        std::string next_path = page.page;
        if (next_path.empty() || next_path[0] != '/') {
            next_path.insert(0, "/");
        }
        if (visited_urls().count(page.hostname + next_path) != 0) {
            continue; // Skip before sleeping: nothing would be fetched.
        }
        std::cout << "Connecting to HTTP server with : " << page.hostname << " page=" << page.page << std::endl;
        sleep(DELAY);
        connect(page.hostname, next_path);
    } // End of the for //
    return 1;
 } // End of the function //

 // Program entry point: crawls starting from the given host and path.
 //
 // Usage: program [host [path]]
 // Without arguments the crawl starts at "/" on "localhost".
 // Exits with status 0 when the starting page was fetched, 1 otherwise.
 int main(int argc, char *argv[]) {
    const std::string host = (argc > 1) ? argv[1] : "localhost";
    const std::string path = (argc > 2) ? argv[2] : "/";
    cout << "Launching program" << endl;
    const int fetched = connect(host, path);
    cout << "Done" << endl;
    return fetched ? 0 : 1;
 } // End of the function //
	//============================================================================
	// Name : OctaneCrawler.cpp
	// Author : Berlin Brown (berlin dot brown at gmail.com)
	// Version :
	// Copyright : Copyright Berlin Brown 2012-2013
	// License : BSD
	// Description : This is the simplest possible web crawler in C++
	// Uses boost_regex and boost_algorithm
	//============================================================================

	#include <iostream>
	#include <string>
	#include <typeinfo>
	#include <cstdarg>
	#include <iostream>
	#include <fstream>

	#include <boost/regex.hpp>
	#include <boost/algorithm/string.hpp>

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	#include <errno.h>
	#include <fcntl.h>
	#include <netdb.h>
	#include <unistd.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <time.h>

	using namespace std;
	using namespace boost;

	const int DELAY = 12;
	const int MAXRECV = 140 * 1024;
	const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store";

	// Breaks an href from a crawled document into the host to contact and the
	// page to request. Only absolute "http://" URLs (scheme matched without
	// regard to case) name their own host; any other href is treated as a page
	// on the host that served the document.
	class WebPage {
	public:
	    std::string hostname; // Lower-cased URL host, or the originating host.
	    std::string page;     // Text after the host's first '/', or "/" by default.

	    WebPage() {
	    }

	    // Returns the lower-cased host of "http://host[/page]", or "" when 'str'
	    // has no http prefix or an empty host.
	    std::string parseHttp(const std::string str) {
	        std::string host;
	        std::string rest;
	        bool has_slash = false;
	        if (!splitHttpUrl(str, host, rest, has_slash)) {
	            return "";
	        }
	        return host;
	    } // End of method //

	    // Sets hostname/page from "http://host/page", splitting at the first '/'
	    // after the scheme. Falls back to hostname = 'orig_host' and an empty page
	    // when 'str' lacks the http prefix or has no '/' after the host.
	    void parseHref(const std::string orig_host, const std::string str) {
	        std::string host;
	        std::string rest;
	        bool has_slash = false;
	        if (splitHttpUrl(str, host, rest, has_slash) && has_slash) {
	            hostname = host;
	            page = rest;
	        } else {
	            // We could not find the 'page' but we can build the hostname
	            hostname = orig_host;
	            page = "";
	        } // End of the if - else //
	    } // End of method //

	    // Fills hostname and page for href 'hrf' found on a page served by
	    // 'orig_host'. An empty page becomes "/".
	    void parse(const std::string orig_host, const std::string hrf) {
	        const std::string hst = parseHttp(hrf);
	        if (!hst.empty()) {
	            // Absolute http URL: it carries its own host (and maybe a page).
	            parseHref(hst, hrf);
	        } else {
	            hostname = orig_host;
	            page = hrf;
	        }
	        if (page.empty()) {
	            page = "/";
	        } // End of the if //
	    } // End of the method

	private:
	    // Splits 'url' after a leading "http://" (any letter case) at the first
	    // '/'. On success stores the lower-cased host in 'host', the text after
	    // that '/' in 'rest', and whether such a '/' exists in 'has_slash'.
	    // Returns false when 'url' does not start with "http://".
	    static bool splitHttpUrl(const std::string &url, std::string &host,
	                             std::string &rest, bool &has_slash) {
	        static const char scheme[] = "http://";
	        const std::string::size_type scheme_len = sizeof(scheme) - 1;
	        if (url.size() < scheme_len) {
	            return false;
	        }
	        for (std::string::size_type i = 0; i < scheme_len; ++i) {
	            // tolower() needs a non-negative value, hence unsigned char.
	            if (std::tolower(static_cast<unsigned char>(url[i])) != scheme[i]) {
	                return false;
	            }
	        }
	        const std::string::size_type slash = url.find('/', scheme_len);
	        has_slash = (slash != std::string::npos);
	        host = has_slash ? url.substr(scheme_len, slash - scheme_len) : url.substr(scheme_len);
	        rest = has_slash ? url.substr(slash + 1) : std::string();
	        for (std::string::size_type i = 0; i < host.size(); ++i) {
	            host[i] = static_cast<char>(std::tolower(static_cast<unsigned char>(host[i])));
	        }
	        return true;
	    }
	}; // End of the class

	// Formats the variadic arguments according to the printf format 'fmt' and
	// returns the result as a std::string (empty when vsnprintf reports an error).
	// Arguments must match the conversion specifiers; pass C strings for %s.
	//
	// 'fmt' is a by-value parameter because va_start() has undefined behaviour
	// when its named parameter is a reference.
	std::string string_format(const std::string fmt, ...) {
	    va_list ap;

	    // Measure: with a zero-sized buffer vsnprintf returns the length required.
	    va_start(ap, fmt);
	    const int length = vsnprintf(NULL, 0, fmt.c_str(), ap);
	    va_end(ap);
	    if (length < 0) {
	        return std::string();
	    }

	    // Format into writable storage; +1 leaves room for the terminating NUL.
	    std::string str;
	    str.resize(static_cast<std::string::size_type>(length) + 1);
	    va_start(ap, fmt);
	    const int written = vsnprintf(&str[0], str.size(), fmt.c_str(), ap);
	    va_end(ap);
	    if (written < 0) {
	        return std::string();
	    }
	    str.resize(static_cast<std::string::size_type>(written));
	    return str;
	} // End of the function //

	// Builds the text of an HTTP/1.1 GET request for 'path' on 'host'.
	// An empty 'path' is requested as "/" so the request line stays well formed.
	// The request asks the server to close the connection after the response
	// and ends with the blank line that terminates the headers.
	std::string request(std::string host, std::string path) {
	    if (path.empty()) {
	        path = "/";
	    }
	    std::string request = "GET ";
	    request.append(path);
	    request.append(" HTTP/1.1\r\n");
	    request.append("Host: ");
	    request.append(host);
	    request.append("\r\n");
	    // The final media range is the "*/*" wildcard (any type, lower preference).
	    request.append("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81\r\n");
	    request.append("User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)\r\n");
	    request.append("Connection: close\r\n");
	    request.append("\r\n");
	    return request;
	} // End of the function //

	// Derives a file-name-safe token from a host and path: the two are joined
	// with '/', then each character that is not in a-z, A-Z or 0-9 is replaced
	// by '_'. The token is printed to standard output and returned.
	std::string clean_href(const std::string host, const std::string path) {
	    std::string joined(host);
	    joined += "/";
	    joined += path;
	    const std::string cleaned = boost::regex_replace(joined, boost::regex("[^a-zA-Z0-9]"), "_");
	    cout << cleaned << endl;
	    return cleaned;
	}

	// Fetches http://<host><path> on port 80, saves the raw response (headers
	// included) to a file under WRITE_DIR_PATH, then recursively fetches every
	// anchor href found in it, sleeping DELAY seconds before each request.
	//
	// Every host/path pair is requested at most once per process, which keeps
	// pages that link to each other from recursing forever. Relative hrefs are
	// requested relative to the server root.
	//
	// Returns 1 when the page was fetched (or was fetched earlier), 0 when the
	// connection or the transfer failed.
	int connect(const std::string host, const std::string path) {
	    // Keys ("host + path") of everything requested so far in this process.
	    static std::set<std::string> visited;
	    if (!visited.insert(host + path).second) {
	        return 1;
	    }

	    // Resolve the host (name or numeric address) and connect to port 80.
	    addrinfo hints;
	    memset(&hints, 0, sizeof(hints));
	    hints.ai_family = AF_UNSPEC;
	    hints.ai_socktype = SOCK_STREAM;
	    addrinfo *resolved = NULL;
	    if (getaddrinfo(host.c_str(), "80", &hints, &resolved) != 0) {
	        cout << "Error: could not resolve " << host << endl;
	        return 0;
	    }
	    int m_sock = -1;
	    for (addrinfo *ai = resolved; ai != NULL && m_sock == -1; ai = ai->ai_next) {
	        const int candidate = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	        if (candidate == -1) {
	            continue;
	        }
	        if (::connect(candidate, ai->ai_addr, ai->ai_addrlen) == 0) {
	            m_sock = candidate;
	        } else {
	            ::close(candidate);
	        }
	    }
	    freeaddrinfo(resolved);
	    if (m_sock == -1) {
	        cout << "Error: could not connect to " << host << endl;
	        return 0;
	    }

	    // HTTP/1.1 defines the "close" connection option for
	    // the sender to signal that the connection will be closed
	    // after completion of the response.
	    const std::string req = request(host, path);
	    cout << "Request: " << req << endl;
	    cout << "=========================" << endl;

	    // Send the whole request, retrying after partial writes and EINTR.
	    bool ok = true;
	    std::string::size_type sent = 0;
	    while (ok && sent < req.size()) {
	        const ssize_t n = ::send(m_sock, req.data() + sent, req.size() - sent, MSG_NOSIGNAL);
	        if (n > 0) {
	            sent += static_cast<std::string::size_type>(n);
	        } else if (!(n == -1 && errno == EINTR)) {
	            ok = false;
	        }
	    }

	    // Read until the peer closes the connection. The buffer lives on the heap
	    // because this function recurses, and exactly the received byte count is
	    // appended because the data is not NUL-terminated.
	    std::string response;
	    std::vector<char> buf(MAXRECV);
	    while (ok) {
	        const ssize_t n = ::recv(m_sock, &buf[0], buf.size(), 0);
	        if (n > 0) {
	            response.append(&buf[0], static_cast<std::string::size_type>(n));
	        } else if (n == 0) {
	            break;
	        } else if (errno != EINTR) {
	            ok = false;
	        }
	    } // End of the while //
	    ::close(m_sock);
	    if (!ok) {
	        cout << "Error: transfer with " << host << " failed" << endl;
	        return 0;
	    }
	    cout << "Response:" << response << endl;
	    cout << "---------------------------" << endl;

	    // Attempt to write to file //
	    const std::string html_file_write = string_format("%s/%s", WRITE_DIR_PATH.c_str(), clean_href(host, path).c_str());
	    cout << "Writing to file : " << html_file_write << endl;
	    ofstream outfile(html_file_write.c_str());
	    outfile << response << endl;
	    outfile.close();
	    if (!outfile) {
	        // A failed save is reported but does not stop link discovery.
	        cout << "Error: could not write " << html_file_write << endl;
	    }

	    // Parse the data //
	    std::vector<std::string> hrefs;
	    try {
	        // Case-insensitive anchor match; group 1 holds a double-quoted href,
	        // group 2 a single-quoted one.
	        const boost::regex re("(?i)<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')");
	        const boost::sregex_iterator j;
	        for (boost::sregex_iterator i(response.begin(), response.end(), re); i != j; ++i) {
	            const boost::smatch &m = *i;
	            const std::string href = m[1].matched ? m[1].str() : m[2].str();
	            // Skip empty and fragment-only hrefs, and hrefs with whitespace
	            // (they cannot be placed on an HTTP request line).
	            if (href.empty() || href[0] == '#') {
	                continue;
	            }
	            if (href.find_first_of(" \t\r\n") != std::string::npos) {
	                continue;
	            }
	            hrefs.push_back(href);
	        } // End of the for //
	    } catch (const std::exception &e) {
	        cout << "Error: " << e.what() << "\n";
	    } // End of the try - catch //
	    // Free the page and the buffer before recursing.
	    std::string().swap(response);
	    std::vector<char>().swap(buf);

	    // Iterate through the listed HREFs and move to next request //
	    for (std::vector<std::string>::size_type k = 0; k < hrefs.size(); ++k) {
	        WebPage page;
	        page.parse(host, hrefs[k]);
	        // Request paths need exactly one leading slash.
	        std::string next_path = page.page;
	        if (next_path.empty() || next_path[0] != '/') {
	            next_path.insert(0, "/");
	        }
	        if (visited.count(page.hostname + next_path) != 0) {
	            continue; // Checked before sleeping: nothing would be fetched.
	        }
	        cout << "Connecting to HTTP server with : " << page.hostname << " page=" << page.page << endl;
	        sleep(DELAY);
	        connect(page.hostname, next_path);
	    } // End of the for //
	    return 1;
	} // End of the function //

	// Program entry point: announces start-up, crawls starting at "/" on
	// "localhost", reports completion and exits with status 0.
	int main() {
	    const char *const start_host = "localhost";
	    const char *const start_path = "/";

	    std::cout << "Launching program" << std::endl;
	    connect(start_host, start_path);
	    std::cout << "Done" << std::endl;
	    return 0;
	} // End of the function //