-
-
Save berlinbrown/4583728 to your computer and use it in GitHub Desktop.
//============================================================================ | |
// Name : OctaneCrawler.cpp | |
// Author : Berlin Brown (berlin dot brown at gmail.com) | |
// Version : | |
// Copyright : Copyright Berlin Brown 2012-2013 | |
// License : BSD | |
// Description : This is the simplest possible web crawler in C++ | |
// Uses boost_regex and boost_algorithm | |
//============================================================================ | |
#include <iostream> | |
#include <string> | |
#include <typeinfo> | |
#include <cstdarg> | |
#include <iostream> | |
#include <fstream> | |
#include <boost/regex.hpp> | |
#include <boost/algorithm/string.hpp> | |
#include <sys/types.h> | |
#include <sys/socket.h> | |
#include <netinet/in.h> | |
#include <arpa/inet.h> | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <netdb.h> | |
#include <unistd.h> | |
#include <errno.h> | |
#include <fcntl.h> | |
#include <time.h> | |
using namespace std; | |
using namespace boost; | |
const int DELAY = 12; | |
const int MAXRECV = 140 * 1024; | |
const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store"; | |
// Splits an href found on a crawled page into the host to contact and the
// page to request from that host.
//
// Only absolute "http://" URLs (scheme matched case-insensitively) carry
// their own host. Every other href is treated as a page on the host the
// href was found on.
class WebPage {
public:
    std::string hostname; // Host to contact; lower-cased when taken from a URL.
    std::string page;     // Page on that host; "/" when the href names none.

    WebPage() {}

    // Returns the lower-cased host of an absolute "http://" URL, or an empty
    // string when str is not such a URL. The host ends at the first '/', so
    // path segments never leak into it.
    std::string parseHttp(const std::string str) {
        std::string host;
        std::string rest;
        if (!splitHttpUrl(str, host, rest)) {
            return "";
        }
        return host;
    } // End of method //

    // Fills hostname/page from an absolute URL. When str is not an absolute
    // "http://" URL, hostname falls back to orig_host and page is left empty.
    void parseHref(const std::string orig_host, const std::string str) {
        std::string host;
        std::string rest;
        if (splitHttpUrl(str, host, rest)) {
            hostname = host;
            page = rest;
        } else {
            hostname = orig_host;
            page = "";
        } // End of the if - else //
    } // End of method //

    // Entry point: resolves hrf relative to orig_host.
    //   absolute URL -> hostname from the URL, page = text after the first '/'
    //   anything else -> hostname = orig_host, page = hrf unchanged
    // An empty page is normalised to "/".
    void parse(const std::string orig_host, const std::string hrf) {
        std::string host;
        std::string rest;
        if (splitHttpUrl(hrf, host, rest)) {
            hostname = host;
            page = rest;
        } else {
            hostname = orig_host;
            page = hrf;
        }
        if (page.empty()) {
            page = "/";
        } // End of the if //
    } // End of the method

private:
    // ASCII-only lower-casing: host names are ASCII, and this keeps the
    // result independent of the process locale.
    static char asciiLower(char c) {
        return (c >= 'A' && c <= 'Z') ? static_cast<char>(c - 'A' + 'a') : c;
    }

    // Splits "http://host/rest" into host (lower-cased) and rest (without
    // the separating '/'). Returns false when url does not start with
    // "http://" or has an empty host.
    static bool splitHttpUrl(const std::string &url, std::string &host, std::string &rest) {
        static const char scheme[] = "http://";
        const std::string::size_type scheme_len = sizeof(scheme) - 1;
        if (url.size() <= scheme_len) {
            return false;
        }
        for (std::string::size_type i = 0; i < scheme_len; ++i) {
            if (asciiLower(url[i]) != scheme[i]) {
                return false;
            }
        }
        const std::string::size_type slash = url.find('/', scheme_len);
        if (slash == std::string::npos) {
            host = url.substr(scheme_len);
            rest = "";
        } else {
            host = url.substr(scheme_len, slash - scheme_len);
            rest = url.substr(slash + 1);
        }
        if (host.empty()) {
            return false;
        }
        for (std::string::size_type i = 0; i < host.size(); ++i) {
            host[i] = asciiLower(host[i]);
        }
        return true;
    }
}; // End of the class
// Formats its arguments printf-style and returns the result as a string.
//
// fmt is a printf format string; the variadic arguments must match its
// conversion specifiers. Returns an empty string when vsnprintf reports an
// error.
//
// fmt is taken by value, not by reference: handing a reference-typed
// parameter to va_start is undefined behaviour.
std::string string_format(const std::string fmt, ...) {
    va_list args;
    va_start(args, fmt);

    // First pass: measure the output without writing anything.
    va_list probe;
    va_copy(probe, args);
    const int length = vsnprintf(NULL, 0, fmt.c_str(), probe);
    va_end(probe);

    std::string text;
    if (length > 0) {
        // Second pass: one spare byte for the NUL terminator vsnprintf adds.
        text.resize(static_cast<std::string::size_type>(length) + 1);
        vsnprintf(&text[0], text.size(), fmt.c_str(), args);
        text.resize(static_cast<std::string::size_type>(length));
    }
    va_end(args);
    return text;
} // End of the function //
// Builds the HTTP/1.1 GET request sent for one page.
//
// host goes into the Host header; path is the request target. An empty path
// is sent as "/" because a request line without a target is malformed.
// Returns the complete request text, terminated by the blank line that ends
// the header section.
std::string request(std::string host, std::string path) {
    if (path.empty()) {
        path = "/";
    }
    std::string message;
    message += "GET " + path + " HTTP/1.1\r\n";
    message += "Host: " + host + "\r\n";
    message += "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81\r\n";
    message += "User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)\r\n";
    message += "Connection: close\r\n";
    message += "\r\n";
    return message;
} // End of the function //
// Derives the file name a fetched page is stored under.
//
// Joins host and path with '/', then replaces every character that is not
// an ASCII letter or digit with '_'. The result is also echoed to stdout.
// A plain scan replaces the regular expression that was recompiled on every
// call.
std::string clean_href(const std::string host, const std::string path) {
    std::string name = host;
    name.append("/");
    name.append(path);
    for (std::string::size_type i = 0; i < name.size(); ++i) {
        const char c = name[i];
        const bool keep = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9');
        if (!keep) {
            name[i] = '_';
        }
    }
    std::cout << name << std::endl;
    return name;
}
int connect(const std::string host, const std::string path) { | |
const int port = 80; | |
// Setup the msock | |
int m_sock; | |
sockaddr_in m_addr; | |
memset(&m_addr, 0, sizeof(m_addr)); | |
m_sock = socket(AF_INET, SOCK_STREAM, 0); | |
int on = 1; | |
if (setsockopt(m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*) &on, sizeof(on)) == -1) { | |
return false; | |
} | |
// Connect // | |
m_addr.sin_family = AF_INET; | |
m_addr.sin_port = htons(port); | |
int status = inet_pton(AF_INET, host.c_str(), &m_addr.sin_addr); | |
if (errno == EAFNOSUPPORT) { | |
return false; | |
} | |
status = ::connect(m_sock, (sockaddr *) &m_addr, sizeof(m_addr)); | |
// HTTP/1.1 defines the "close" connection option for | |
// the sender to signal that the connection will be closed | |
// after completion of the response. | |
std::string req = request(host, path); | |
// End of building the request // | |
status = ::send(m_sock, req.c_str(), req.size(), MSG_NOSIGNAL); | |
char buf[MAXRECV]; | |
cout << "Request: " << req << endl; | |
cout << "=========================" << endl; | |
std::string recv = ""; | |
while (status != 0) { | |
memset(buf, 0, MAXRECV); | |
status = ::recv(m_sock, buf, MAXRECV, 0); | |
recv.append(buf); | |
} // End of the while // | |
cout << "Response:" << recv << endl; | |
cout << "---------------------------" << endl; | |
// Attempt to write to file // | |
const std::string html_file_write = string_format("%s/%s", WRITE_DIR_PATH.c_str(), clean_href(host, path).c_str()); | |
cout << "Writing to file : " << html_file_write << endl; | |
ofstream outfile(html_file_write.c_str()); | |
outfile << recv << endl; | |
outfile.close(); | |
// Parse the data // | |
try { | |
const boost::regex rmv_all("[\\r|\\n]"); | |
const std::string s2 = boost::regex_replace(recv, rmv_all, ""); | |
const std::string s = s2; | |
// Use this regex expression, allow for mixed-case | |
// Search for the anchor tag but not the '>' | |
// Where (.+?) match anything | |
//const boost::regex re("<a([^>]+) href='(.+?)'>"); | |
const boost::regex re("<a\\s+href\\s*=\\s*(\"([^\"]*)\")|('([^']*)')\\s*>"); | |
boost::cmatch matches; | |
// Using token iterator with sub-matches | |
const int subs[] = { 2, 4 }; | |
boost::sregex_token_iterator i(s.begin(), s.end(), re, subs); | |
boost::sregex_token_iterator j; | |
for (; i != j; i++) { | |
// Iterate through the listed HREFs and | |
// move to next request // | |
const std::string href = *i; | |
if (href.length() != 0) { | |
WebPage* page = new WebPage(); | |
page->parse(host, href); | |
const char* hrefc = page->page.c_str(); | |
cout << "Connecting to HTTP server with : " << page->hostname << " page=" << hrefc << endl; | |
sleep(DELAY); | |
connect(page->hostname, string_format("/%s", hrefc)); | |
delete page; | |
} // End of the if /// | |
} // End of the for // | |
} catch (boost::regex_error& e) { | |
cout << "Error: " << e.what() << "\n"; | |
} // End of the try - catch // | |
return 1; | |
} // End of the function // | |
// Program entry point: crawls starting at http://localhost/.
// Exits with status 0 when the start page was fetched and 1 otherwise.
int main() {
    std::cout << "Launching program" << std::endl;
    const int fetched = connect("localhost", "/");
    std::cout << "Done" << std::endl;
    return fetched ? 0 : 1;
} // End of the function //
I found a way to compile it:
You need Boost installed (on Linux Mint 17: `sudo apt-get install libboost-all-dev`). You don't need every package, but installing them all doesn't hurt. Then run `g++ main.cpp -lboost_regex`.
The connection is not getting established. What should I do?
For Windows users, add the following:
#include <winsock2.h>
#include <WS2tcpip.h>
#include <conio.h> // Sleep
#pragma comment(lib, "ws2_32.lib")
A small note:
`int connect(const std::string host, const std::string path)` is one of the program's own functions.
Inside that function, the author also calls the built-in socket function `connect()` (written as `::connect`).
consider using boost::asio
remove boost
//============================================================================
// Name : OctaneCrawler.cpp
// Author : Berlin Brown (berlin dot brown at gmail.com)
// Version :
// Copyright : Copyright Berlin Brown 2012-2013
// License : BSD
// Description : This is the simplest possible web crawler in C++
// Uses boost_regex and boost_algorithm
//============================================================================
#include <iostream>
#include <string>
#include <typeinfo>
#include <cstdarg>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <regex>
// #include <boost/regex.hpp>
// #include <boost/algorithm/string.hpp>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
using namespace std;
// using namespace boost;
const int DELAY = 12;
const int MAXRECV = 140 * 1024;
const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store";
// Splits an href found on a crawled page into the host to contact and the
// page to request from that host.
//
// Only absolute "http://" URLs (scheme matched case-insensitively) carry
// their own host. Every other href is treated as a page on the host the
// href was found on.
class WebPage {
public:
    std::string hostname; // Host to contact; lower-cased when taken from a URL.
    std::string page;     // Page on that host; "/" when the href names none.

    // Returns the lower-cased host of an absolute "http://" URL, or an empty
    // string when str is not such a URL.
    std::string parseHttp(const std::string str) {
        std::smatch what;
        if (!std::regex_match(str, what, urlPattern())) {
            return "";
        }
        return lowered(what[1].str());
    } // End of method //

    // Fills hostname/page from an absolute URL. When str is not an absolute
    // "http://" URL, hostname falls back to orig_host and page is left empty.
    void parseHref(const std::string orig_host, const std::string str) {
        std::smatch what;
        if (std::regex_match(str, what, urlPattern())) {
            hostname = lowered(what[1].str());
            page = what[2].str();
        } else {
            hostname = orig_host;
            page = "";
        } // End of the if - else //
    } // End of method //

    // Entry point: resolves hrf relative to orig_host.
    //   absolute URL -> hostname from the URL, page = text after the first '/'
    //   anything else -> hostname = orig_host, page = hrf unchanged
    // An empty page is normalised to "/".
    void parse(const std::string orig_host, const std::string hrf) {
        std::smatch what;
        if (std::regex_match(hrf, what, urlPattern())) {
            hostname = lowered(what[1].str());
            page = what[2].str();
        } else {
            hostname = orig_host;
            page = hrf;
        }
        if (page.empty()) {
            page = "/";
        } // End of the if //
    } // End of the method

private:
    // "http://host[/rest]". The host stops at the first '/', so path segments
    // never end up in it. Case-insensitivity comes from the icase flag: the
    // ECMAScript grammar used by std::regex has no inline "(?i)" modifier and
    // rejects it with std::regex_error.
    static const std::regex &urlPattern() {
        static const std::regex pattern("http://([^/]+)/?(.*)", std::regex::icase);
        return pattern;
    }

    // Lower-cases text. The cast to unsigned char keeps std::tolower defined
    // for bytes outside the ASCII range.
    static std::string lowered(std::string text) {
        std::for_each(text.begin(), text.end(), [](char &c) {
            c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        });
        return text;
    }
}; // End of the class
// Formats its arguments printf-style and returns the result as a string.
//
// fmt is a printf format string; the variadic arguments must match its
// conversion specifiers. Returns an empty string when vsnprintf reports an
// error.
//
// fmt is taken by value, not by reference: handing a reference-typed
// parameter to va_start is undefined behaviour.
std::string string_format(const std::string fmt, ...) {
    va_list args;
    va_start(args, fmt);

    // First pass: measure the output without writing anything.
    va_list probe;
    va_copy(probe, args);
    const int length = vsnprintf(nullptr, 0, fmt.c_str(), probe);
    va_end(probe);

    std::string text;
    if (length > 0) {
        // Second pass: one spare byte for the NUL terminator vsnprintf adds.
        text.resize(static_cast<std::string::size_type>(length) + 1);
        vsnprintf(&text[0], text.size(), fmt.c_str(), args);
        text.resize(static_cast<std::string::size_type>(length));
    }
    va_end(args);
    return text;
} // End of the function //
// Builds the HTTP/1.1 GET request sent for one page: the request line for
// path, a Host header for host, the crawler's fixed Accept / User-Agent /
// Connection headers, and the blank line that ends the header section.
std::string request(std::string host, std::string path) {
    const std::string request_line = "GET " + path + " HTTP/1.1\r\n";
    const std::string host_header = "Host: " + host + "\r\n";
    return request_line + host_header +
           "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81\r\n"
           "User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)\r\n"
           "Connection: close\r\n"
           "\r\n";
} // End of the function //
// Derives the file name a fetched page is stored under: host and path joined
// by '/', with every character that is not an ASCII letter or digit replaced
// by '_'. The result is also echoed to stdout.
std::string clean_href(const std::string host, const std::string path) {
    std::string name = host + "/" + path;
    for (char &ch : name) {
        const bool is_lower = ch >= 'a' && ch <= 'z';
        const bool is_upper = ch >= 'A' && ch <= 'Z';
        const bool is_digit = ch >= '0' && ch <= '9';
        if (!(is_lower || is_upper || is_digit)) {
            ch = '_';
        }
    }
    cout << name << endl;
    return name;
}
int connect(const std::string host, const std::string path) {
const int port = 80;
// Setup the msock
int m_sock;
sockaddr_in m_addr;
memset(&m_addr, 0, sizeof(m_addr));
m_sock = socket(AF_INET, SOCK_STREAM, 0);
int on = 1;
if (setsockopt(m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*) &on, sizeof(on)) == -1) {
return false;
}
// Connect //
m_addr.sin_family = AF_INET;
m_addr.sin_port = htons(port);
int status = inet_pton(AF_INET, host.c_str(), &m_addr.sin_addr);
if (errno == EAFNOSUPPORT) {
return false;
}
status = ::connect(m_sock, (sockaddr *) &m_addr, sizeof(m_addr));
// HTTP/1.1 defines the "close" connection option for
// the sender to signal that the connection will be closed
// after completion of the response.
std::string req = request(host, path);
// End of building the request //
status = ::send(m_sock, req.c_str(), req.size(), MSG_NOSIGNAL);
char buf[MAXRECV];
cout << "Request: " << req << endl;
cout << "=========================" << endl;
std::string recv = "";
while (status != 0) {
memset(buf, 0, MAXRECV);
status = ::recv(m_sock, buf, MAXRECV, 0);
recv.append(buf);
} // End of the while //
cout << "Response:" << recv << endl;
cout << "---------------------------" << endl;
// Attempt to write to file //
const std::string html_file_write = string_format("%s/%s", WRITE_DIR_PATH.c_str(), clean_href(host, path).c_str());
cout << "Writing to file : " << html_file_write << endl;
ofstream outfile(html_file_write.c_str());
outfile << recv << endl;
outfile.close();
// Parse the data //
try {
const regex rmv_all("[\\r|\\n]");
const std::string s2 = regex_replace(recv, rmv_all, "");
const std::string s = s2;
// Use this regex expression, allow for mixed-case
// Search for the anchor tag but not the '>'
// Where (.+?) match anything
//const regex re("<a([^>]+) href='(.+?)'>");
const regex re("<a\\s+href\\s*=\\s*(\"([^\"]*)\")|('([^']*)')\\s*>");
cmatch matches;
// Using token iterator with sub-matches
const int subs[] = { 2, 4 };
sregex_token_iterator i(s.begin(), s.end(), re, subs);
sregex_token_iterator j;
for (; i != j; i++) {
// Iterate through the listed HREFs and
// move to next request //
const std::string href = *i;
if (href.length() != 0) {
WebPage* page = new WebPage();
page->parse(host, href);
const char* hrefc = page->page.c_str();
cout << "Connecting to HTTP server with : " << page->hostname << " page=" << hrefc << endl;
sleep(DELAY);
connect(page->hostname, string_format("/%s", hrefc));
delete page;
} // End of the if ///
} // End of the for //
} catch (regex_error& e) {
cout << "Error: " << e.what() << "\n";
} // End of the try - catch //
return 1;
} // End of the function //
// Program entry point: announces the start, crawls from http://localhost/
// and announces completion. The crawl result is not inspected; the exit
// status is always 0.
int main() {
    std::cout << "Launching program" << std::endl;
    const std::string start_host("localhost");
    const std::string start_path("/");
    connect(start_host, start_path);
    std::cout << "Done" << std::endl;
    return 0;
} // End of the function //
remove boost
//============================================================================
// Name : OctaneCrawler.cpp
// Author : Berlin Brown (berlin dot brown at gmail.com)
// Version :
// Copyright : Copyright Berlin Brown 2012-2013
// License : BSD
// Description : This is the simplest possible web crawler in C++
// Uses std::regex (the Boost includes are commented out)
//============================================================================
#include <iostream>
#include <string>
#include <typeinfo>
#include <cstdarg>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <regex>
// #include <boost/regex.hpp>
// #include <boost/algorithm/string.hpp>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
using namespace std;
// using namespace boost;
const int DELAY = 12;
const int MAXRECV = 140 * 1024;
const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store";

// Splits an href found on a crawled page into the host to contact and the
// page to request from that host. Only absolute "http://" URLs carry their
// own host; every other href is a page on the host it was found on.
class WebPage {
public:
    std::string hostname; // Host to contact; lower-cased when taken from a URL.
    std::string page;     // Page on that host; "/" when the href names none.

    // Returns the lower-cased host of an absolute "http://" URL, or an empty
    // string when str is not such a URL.
    std::string parseHttp(const std::string str) {
        std::smatch what;
        if (!std::regex_match(str, what, urlPattern())) {
            return "";
        }
        return lowered(what[1].str());
    } // End of method //

    // Fills hostname/page from an absolute URL; otherwise hostname falls
    // back to orig_host and page is left empty.
    void parseHref(const std::string orig_host, const std::string str) {
        std::smatch what;
        if (std::regex_match(str, what, urlPattern())) {
            hostname = lowered(what[1].str());
            page = what[2].str();
        } else {
            hostname = orig_host;
            page = "";
        } // End of the if - else //
    } // End of method //

    // Resolves hrf relative to orig_host; an empty page becomes "/".
    void parse(const std::string orig_host, const std::string hrf) {
        std::smatch what;
        if (std::regex_match(hrf, what, urlPattern())) {
            hostname = lowered(what[1].str());
            page = what[2].str();
        } else {
            hostname = orig_host;
            page = hrf;
        }
        if (page.empty()) {
            page = "/";
        } // End of the if //
    } // End of the method

private:
    // "http://host[/rest]"; the host stops at the first '/'. The icase flag
    // replaces "(?i)", which the ECMAScript grammar of std::regex rejects.
    static const std::regex &urlPattern() {
        static const std::regex pattern("http://([^/]+)/?(.*)", std::regex::icase);
        return pattern;
    }

    // Lower-cases text; the unsigned char cast keeps std::tolower defined
    // for bytes outside the ASCII range.
    static std::string lowered(std::string text) {
        std::for_each(text.begin(), text.end(), [](char &c) {
            c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        });
        return text;
    }
}; // End of the class

// Formats its arguments printf-style and returns the result as a string;
// returns an empty string when vsnprintf reports an error. fmt is taken by
// value because va_start on a reference parameter is undefined behaviour.
std::string string_format(const std::string fmt, ...) {
    va_list args;
    va_start(args, fmt);
    // First pass: measure the output without writing anything.
    va_list probe;
    va_copy(probe, args);
    const int length = vsnprintf(nullptr, 0, fmt.c_str(), probe);
    va_end(probe);

    std::string text;
    if (length > 0) {
        // Second pass: one spare byte for the NUL terminator vsnprintf adds.
        text.resize(static_cast<std::string::size_type>(length) + 1);
        vsnprintf(&text[0], text.size(), fmt.c_str(), args);
        text.resize(static_cast<std::string::size_type>(length));
    }
    va_end(args);
    return text;
} // End of the function //

// Builds the HTTP/1.1 GET request for path on host, including the blank
// line that ends the header section.
std::string request(std::string host, std::string path) {
    std::string request = "GET ";
    request.append(path);
    request.append(" HTTP/1.1\r\n");
    request.append("Host: ");
    request.append(host);
    request.append("\r\n");
    request.append("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81\r\n");
    request.append("User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)\r\n");
    request.append("Connection: close\r\n");
    request.append("\r\n");
    return request;
} // End of the function //

// Derives the file name a fetched page is stored under: host + "/" + path
// with every non-alphanumeric character replaced by '_'. Echoes the name.
std::string clean_href(const std::string host, const std::string path) {
    std::string full_url = host;
    full_url.append("/");
    full_url.append(path);
    const std::regex rmv_all("[^a-zA-Z0-9]");
    const std::string s2 = std::regex_replace(full_url, rmv_all, "_");
    std::cout << s2 << std::endl;
    return s2;
}

// Resolves host through DNS and opens a TCP connection to its HTTP port.
// Returns the connected socket descriptor, or -1 on failure.
static int open_http_socket(const std::string &host) {
    addrinfo hints = addrinfo();
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;

    addrinfo *candidates = nullptr;
    if (getaddrinfo(host.c_str(), "80", &hints, &candidates) != 0) {
        return -1;
    }
    int sock = -1;
    for (const addrinfo *ai = candidates; ai != nullptr; ai = ai->ai_next) {
        sock = ::socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
        if (sock == -1) {
            continue;
        }
        if (::connect(sock, ai->ai_addr, ai->ai_addrlen) == 0) {
            break;
        }
        ::close(sock);
        sock = -1;
    }
    freeaddrinfo(candidates);
    return sock;
}

// Writes all of data to sock, continuing after partial writes.
static bool send_all(int sock, const std::string &data) {
    std::string::size_type sent = 0;
    while (sent < data.size()) {
        const ssize_t n = ::send(sock, data.data() + sent, data.size() - sent, MSG_NOSIGNAL);
        if (n <= 0) {
            return false;
        }
        sent += static_cast<std::string::size_type>(n);
    }
    return true;
}

// Reads from sock until the peer closes the connection or recv() fails.
// The chunk buffer is heap-allocated because connect() recurses.
static std::string receive_all(int sock) {
    std::string response;
    std::string chunk(MAXRECV, '\0');
    for (;;) {
        const ssize_t n = ::recv(sock, &chunk[0], chunk.size(), 0);
        if (n <= 0) {
            break;
        }
        // Append exactly the bytes received; the data is not NUL-terminated.
        response.append(chunk, 0, static_cast<std::string::size_type>(n));
    }
    return response;
}

// Fetches http://host/path, stores the raw response under WRITE_DIR_PATH and
// crawls every <a href="..."> link in it, waiting DELAY seconds before each
// follow-up request. Returns 1 when the page was fetched (or had already
// been fetched in this run) and 0 when connecting or sending failed.
int connect(const std::string host, const std::string path) {
    // URLs already requested, as "\n"-separated "host+path" entries, so that
    // pages linking to each other do not recurse forever.
    static std::string visited("\n");
    const std::string visit_key = host + path + "\n";
    if (visited.find("\n" + visit_key) != std::string::npos) {
        return 1;
    }
    visited += visit_key;

    const int sock = open_http_socket(host);
    if (sock == -1) {
        std::cout << "Error: unable to connect to " << host << std::endl;
        return 0;
    }

    // "Connection: close" makes the server end the response by closing the
    // connection, which is what stops receive_all().
    const std::string req = request(host, path);
    std::cout << "Request: " << req << std::endl;
    std::cout << "=========================" << std::endl;
    const bool sent = send_all(sock, req);
    const std::string response = sent ? receive_all(sock) : std::string();
    ::close(sock);
    if (!sent) {
        std::cout << "Error: unable to send request to " << host << std::endl;
        return 0;
    }
    std::cout << "Response:" << response << std::endl;
    std::cout << "---------------------------" << std::endl;

    // Attempt to write to file //
    const std::string html_file_write = WRITE_DIR_PATH + "/" + clean_href(host, path);
    std::cout << "Writing to file : " << html_file_write << std::endl;
    std::ofstream outfile(html_file_write.c_str());
    if (outfile) {
        outfile << response << std::endl;
    } else {
        std::cout << "Error: unable to write " << html_file_write << std::endl;
    }
    outfile.close();

    // Parse the data //
    try {
        // Turn line breaks into spaces so a tag split across lines still
        // matches and no href can carry a CR/LF into a later request.
        std::string flat = response;
        for (char &c : flat) {
            if (c == '\r' || c == '\n') {
                c = ' ';
            }
        }
        // Case-insensitive anchor tag whose href is double-quoted (group 1)
        // or single-quoted (group 2); other attributes may precede href.
        const std::regex link_re(
            "<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')",
            std::regex::icase);
        const int subs[] = { 1, 2 };
        const std::sregex_token_iterator last;
        for (std::sregex_token_iterator it(flat.begin(), flat.end(), link_re, subs); it != last; ++it) {
            // The group that did not take part in a match is empty; skip it.
            const std::string href = it->str();
            if (href.empty()) {
                continue;
            }
            WebPage link;
            link.parse(host, href);
            // Relative hrefs are kept verbatim, so the page may or may not
            // already start with '/'.
            const std::string next_path =
                (!link.page.empty() && link.page[0] == '/') ? link.page : "/" + link.page;
            std::cout << "Connecting to HTTP server with : " << link.hostname
                      << " page=" << link.page << std::endl;
            sleep(DELAY);
            connect(link.hostname, next_path);
        } // End of the for //
    } catch (const std::regex_error &e) {
        std::cout << "Error: " << e.what() << "\n";
    } // End of the try - catch //
    return 1;
} // End of the function //

// Program entry point: crawls starting at http://localhost/.
int main() {
    std::cout << "Launching program" << std::endl;
    connect("localhost", "/");
    std::cout << "Done" << std::endl;
    return 0;
} // End of the function //
+1
Hello, could someone please help with the
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
headers? I added the glibc-2.33 library, but the error count went from 8 to 4400. What am I doing wrong?
consider using boost::asio
I use boost::asio for that
For Windows:
#include <winsock2.h>
#include <WS2tcpip.h>
#include <conio.h> //Sleep
#pragma comment(lib, "ws2_32.lib")
#include <iostream>
#include <string>
#include <typeinfo>
#include <cstdarg>
#include <iostream>
#include <fstream>
#include <algorithm>
#include <regex>
// #include <boost/regex.hpp>
// #include <boost/algorithm/string.hpp>
#include <sys/types.h>
//#include <sys/socket.h>
//#include <netinet/in.h>
//#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
//#include <netdb.h>
//#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <time.h>
using namespace std;
// using namespace boost;
const int DELAY = 12;
const int MAXRECV = 140 * 1024;
const std::string WRITE_DIR_PATH = "/home/bbrown/public/example.com/public/octane_bot_store";
// Splits an href found on a crawled page into the host to contact and the
// page to request from that host.
//
// Only absolute "http://" URLs (scheme matched case-insensitively) carry
// their own host. Every other href is treated as a page on the host the
// href was found on.
class WebPage {
public:
    std::string hostname; // Host to contact; lower-cased when taken from a URL.
    std::string page;     // Page on that host; "/" when the href names none.

    // Returns the lower-cased host of an absolute "http://" URL, or an empty
    // string when str is not such a URL.
    std::string parseHttp(const std::string str) {
        std::smatch what;
        if (!std::regex_match(str, what, urlPattern())) {
            return "";
        }
        return lowered(what[1].str());
    } // End of method //

    // Fills hostname/page from an absolute URL. When str is not an absolute
    // "http://" URL, hostname falls back to orig_host and page is left empty.
    void parseHref(const std::string orig_host, const std::string str) {
        std::smatch what;
        if (std::regex_match(str, what, urlPattern())) {
            hostname = lowered(what[1].str());
            page = what[2].str();
        }
        else {
            hostname = orig_host;
            page = "";
        } // End of the if - else //
    } // End of method //

    // Entry point: resolves hrf relative to orig_host.
    //   absolute URL -> hostname from the URL, page = text after the first '/'
    //   anything else -> hostname = orig_host, page = hrf unchanged
    // An empty page is normalised to "/".
    void parse(const std::string orig_host, const std::string hrf) {
        std::smatch what;
        if (std::regex_match(hrf, what, urlPattern())) {
            hostname = lowered(what[1].str());
            page = what[2].str();
        }
        else {
            hostname = orig_host;
            page = hrf;
        }
        if (page.empty()) {
            page = "/";
        } // End of the if //
    } // End of the method

private:
    // "http://host[/rest]". The host stops at the first '/', so path segments
    // never end up in it. Case-insensitivity comes from the icase flag: the
    // ECMAScript grammar used by std::regex has no inline "(?i)" modifier and
    // rejects it with std::regex_error.
    static const std::regex& urlPattern() {
        static const std::regex pattern("http://([^/]+)/?(.*)", std::regex::icase);
        return pattern;
    }

    // Lower-cases text. The cast to unsigned char keeps std::tolower defined
    // for bytes outside the ASCII range.
    static std::string lowered(std::string text) {
        std::for_each(text.begin(), text.end(), [](char& c) {
            c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
        });
        return text;
    }
}; // End of the class
// Formats its arguments printf-style and returns the result as a string.
//
// fmt is a printf format string; the variadic arguments must match its
// conversion specifiers. Returns an empty string when vsnprintf reports an
// error.
//
// fmt is taken by value, not by reference: handing a reference-typed
// parameter to va_start is undefined behaviour.
std::string string_format(const std::string fmt, ...) {
    va_list args;
    va_start(args, fmt);

    // First pass: measure the output without writing anything.
    va_list probe;
    va_copy(probe, args);
    const int length = vsnprintf(nullptr, 0, fmt.c_str(), probe);
    va_end(probe);

    std::string text;
    if (length > 0) {
        // Second pass: one spare byte for the NUL terminator vsnprintf adds.
        text.resize(static_cast<std::string::size_type>(length) + 1);
        vsnprintf(&text[0], text.size(), fmt.c_str(), args);
        text.resize(static_cast<std::string::size_type>(length));
    }
    va_end(args);
    return text;
} // End of the function //
// Builds the HTTP/1.1 GET request sent for one page: the request line for
// path, a Host header for host, then the crawler's fixed headers and the
// blank line that ends the header section.
std::string request(std::string host, std::string path) {
    // Headers that are the same for every request, without their CRLF.
    static const char* const fixed_headers[] = {
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.81",
        "User-Agent: Mozilla/5.0 (compatible; octanebot/1.0; http://code.google.com/p/octane-crawler/)",
        "Connection: close",
    };
    std::string out("GET ");
    out += path;
    out += " HTTP/1.1\r\n";
    out += "Host: ";
    out += host;
    out += "\r\n";
    for (const char* header : fixed_headers) {
        out += header;
        out += "\r\n";
    }
    out += "\r\n";
    return out;
} // End of the function //
// Derives the file name a fetched page is stored under: host and path joined
// by '/', with every character outside [a-zA-Z0-9] replaced by '_'.
// The result is also echoed to stdout.
std::string clean_href(const std::string host, const std::string path) {
    const regex non_alnum("[^a-zA-Z0-9]");
    const std::string cleaned = regex_replace(host + "/" + path, non_alnum, "_");
    cout << cleaned << endl;
    return cleaned;
}
int connect(const std::string host, const std::string path) {
const int port = 80;
WSADATA WSAData;
WSAStartup(MAKEWORD(2, 0), &WSAData);
// Setup the msock
int m_sock;
sockaddr_in m_addr;
memset(&m_addr, 0, sizeof(m_addr));
m_sock = socket(AF_INET, SOCK_STREAM, 0);
int on = 1;
if (setsockopt(m_sock, SOL_SOCKET, SO_REUSEADDR, (const char*)&on, sizeof(on)) == -1) {
return false;
}
// Connect //
m_addr.sin_family = AF_INET;
m_addr.sin_port = htons(port);
int status = inet_pton(AF_INET, host.c_str(), &m_addr.sin_addr);
if (errno == EAFNOSUPPORT) {
return false;
}
status = ::connect(m_sock, (sockaddr*)&m_addr, sizeof(m_addr));
// HTTP/1.1 defines the "close" connection option for
// the sender to signal that the connection will be closed
// after completion of the response.
std::string req = request(host, path);
// End of building the request //
status = ::send(m_sock, req.c_str(), req.size(), 0);
char buf[MAXRECV];
cout << "Request: " << req << endl;
cout << "=========================" << endl;
std::string recv = "";
while (status != 0) {
memset(buf, 0, MAXRECV);
status = ::recv(m_sock, buf, MAXRECV, 0);
recv.append(buf);
} // End of the while //
cout << "Response:" << recv << endl;
cout << "---------------------------" << endl;
// Attempt to write to file //
const std::string html_file_write = string_format("%s/%s", WRITE_DIR_PATH.c_str(), clean_href(host, path).c_str());
cout << "Writing to file : " << html_file_write << endl;
ofstream outfile(html_file_write.c_str());
outfile << recv << endl;
outfile.close();
// Parse the data //
try {
const regex rmv_all("[\\r|\\n]");
const std::string s2 = regex_replace(recv, rmv_all, "");
const std::string s = s2;
// Use this regex expression, allow for mixed-case
// Search for the anchor tag but not the '>'
// Where (.+?) match anything
//const regex re("<a([^>]+) href='(.+?)'>");
const regex re("<a\\s+href\\s*=\\s*(\"([^\"]*)\")|('([^']*)')\\s*>");
cmatch matches;
// Using token iterator with sub-matches
const int subs[] = { 2, 4 };
sregex_token_iterator i(s.begin(), s.end(), re, subs);
sregex_token_iterator j;
for (; i != j; i++) {
// Iterate through the listed HREFs and
// move to next request //
const std::string href = *i;
if (href.length() != 0) {
WebPage* page = new WebPage();
page->parse(host, href);
const char* hrefc = page->page.c_str();
cout << "Connecting to HTTP server with : " << page->hostname << " page=" << hrefc << endl;
// sleep(DELAY);
connect(page->hostname, string_format("/%s", hrefc));
delete page;
} // End of the if ///
} // End of the for //
}
catch (regex_error& e) {
cout << "Error: " << e.what() << "\n";
} // End of the try - catch //
WSACleanup();
return 1;
} // End of the function //
Could you please also mention how to compile this code?