Last active
April 22, 2023 16:14
-
-
Save uintdev/e846618677a5c7646660cd28bcbdee8d to your computer and use it in GitHub Desktop.
Email extractor but in C++ (test port)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Last modified: 2nd April 2019
 * This was a test port from Python to see if C++ would trivially
 * improve performance. In this case, the implementation resulted
 * in far worse file extraction times in comparison to
 * the Python counterpart.
 * Of course, the Rust port, which came out long afterwards, easily beat both.
 *
 * Presented is the snapshot of the last modification,
 * with comments and what was commented out.
 */
#include "stdafx.h"

#include <stdio.h>

#include <algorithm>
#include <fstream>
#include <iostream>
#include <locale>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
/// Addresses gathered by extract() and later written out by dump().
std::vector<std::string> emailList;

/**
 * Loads `inputFile` into memory, scans it line by line for e-mail addresses
 * and appends every match to the global `emailList`, which is then sorted
 * and stripped of duplicates.
 *
 * Progress and error messages are written to std::cout.
 *
 * @param inputFile Path of the file to scan; it is opened in binary mode.
 * @return true when `emailList` holds at least one address afterwards;
 *         false when the file cannot be opened or read, the regex engine
 *         reports an error, or `emailList` is still empty after the scan.
 */
bool extract(const std::string& inputFile) {
    std::cout << "Reading emails from '" << inputFile << "'..." << std::endl;

    std::ifstream file(inputFile, std::ios_base::binary);
    if (!file.is_open()) {
        std::cout << "Unable to read file." << std::endl;
        return false;
    }
    std::cout << "File found." << std::endl;

    // Measure the file by seeking to its end; tellg() yields -1 on failure.
    file.seekg(0, std::ifstream::end);
    const std::streamoff fileSize = file.tellg();
    file.seekg(0, std::ifstream::beg);
    if (fileSize < 0 || !file) {
        std::cout << "Unable to store file contents. Halting." << std::endl;
        return false;
    }

    // A std::string owns the buffer, so it is released on every return path,
    // and it carries its own length, so nothing below relies on a
    // terminating NUL being present in the file data.
    std::string content(static_cast<std::size_t>(fileSize), '\0');
    std::cout << "Allocated " << fileSize << " bytes.\n"
              << "Storing file content in buffer..."
              << std::endl;
    if (!file.read(&content[0], static_cast<std::streamsize>(fileSize))) {
        std::cout << "Unable to store file contents. Halting." << std::endl;
        return false;
    }
    file.close();

    std::cout << "Stored :: " << content.size() << "\n"
              << "BUFFER DATA :: ";
    if (!content.empty()) {
        // Only peek at the first byte when there is one (empty files are legal).
        std::cout << content.front();
    }
    std::cout << std::endl;

    unsigned int emailsFound = 0;
    try {
        // Compiled once up front: building a std::regex is expensive and the
        // pattern is the same for every line.
        const std::regex pattern("[a-zA-Z0-9][a-zA-Z0-9_.]+@[a-zA-Z0-9_]+\\.[a-zA-Z0-9_.]+");
        const std::sregex_iterator noMoreMatches;

        std::istringstream lines(content);
        std::string line;
        while (std::getline(lines, line)) {
            for (std::sregex_iterator match(line.begin(), line.end(), pattern);
                 match != noMoreMatches; ++match) {
                emailList.push_back(match->str());
                ++emailsFound;
                // '\r' returns to the start of the console line so the
                // running counter overwrites itself.
                std::cout << emailsFound << " emails found\r";
                std::cout.flush();
            }
        }
    }
    catch (const std::regex_error&) {
        std::cout << "Error: malformed regex pattern. Halting." << std::endl;
        return false;
    }

    if (emailList.empty()) {
        std::cout << "No emails found. Halting." << std::endl;
        return false;
    }

    std::cout << "Changing email order..." << std::endl;
    std::sort(emailList.begin(), emailList.end());

    // std::unique only collapses adjacent equals, hence the sort above.
    std::cout << "Removing duplicate emails (if any)..." << std::endl;
    emailList.erase(std::unique(emailList.begin(), emailList.end()), emailList.end());

    std::cout << emailList.size() << " email(s) loaded." << std::endl;
    return true;
}
bool dump(std::string outputFile, bool forceoverwritef) { | |
std::ifstream existf(outputFile); | |
if (existf && forceoverwritef == false) { | |
std::string owf; | |
std::cout << "'" << outputFile << "' already exists. Do you want to overwrite? (y/N) "; | |
std::cin >> owf; | |
// convert input to lowercase | |
std::locale loc; | |
for (auto elem : owf) { | |
owf = tolower(elem, loc); | |
} | |
if (owf != "y") { | |
std::cout << "Halted." << std::endl; | |
return false; | |
} | |
} | |
else { | |
existf.close(); | |
} | |
std::ofstream outputf(outputFile); | |
if (!outputf) { | |
std::cout << "Unable to write emails to file." << std::endl; | |
return false; | |
} | |
std::cout << "Writing email(s) to file..." << std::endl; | |
unsigned __int64 remainingemails = 0; | |
for (auto & emails : emailList) { | |
outputf << emails << std::endl; | |
++remainingemails; | |
std::cout << remainingemails << " out of " << emailList.size() << " written to file. " << int((unsigned __int64)remainingemails * 100 / emailList.size()) << "%\r"; | |
std::cout.flush(); | |
} | |
std::cout << std::endl; | |
outputf.close(); | |
std::cout << remainingemails << " email(s) written to '" << outputFile << "'." << std::endl; | |
return true; | |
} | |
int main(int argc, char* argv[]) { | |
if (argc < 2) { | |
std::cout << "No arguments had been provided." << std::endl; | |
} | |
else if (argc < 3) { | |
std::cout << "No output file was provided." << std::endl; | |
} | |
if (argc < 3) { | |
std::cout << "\nSyntax: " << argv[0] << " {input file} {output file} {option}" | |
<< "\nOptions" | |
<< "\n-------------------------------------------" | |
<< "\n| -f | Force overwritting of output file |" | |
<< "\n-------------------------------------------" | |
<< std::endl; | |
return 1; | |
} | |
// defaults | |
bool forceoverwrite = false; | |
// options | |
if (argc > 3) { | |
std::string optionarg1 = argv[3]; | |
if (optionarg1 == "-f") { | |
forceoverwrite = true; // force overwrite | |
} | |
} | |
std::cout << "EMAIL EXTRACTION TOOL" << std::endl | |
<< "\n\nInput file :: " << argv[1] | |
<< "\nOutput file :: " << argv[2] | |
<< std::endl; | |
if (forceoverwrite) { | |
std::cout << "Force overwrite enabled." << std::endl; | |
} | |
std::cout << "\n\n"; | |
if (!extract(argv[1])) return 1; | |
dump(argv[2], forceoverwrite); | |
std::cout << std::endl; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment