-
-
Save r-lyeh-archived/4f9bda5415e2f92780b8fcc109865291 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
  Measures file-read speed of the three read functions described at
  http://cpp.indi.frih.net/blog/2014/09/how-to-read-an-entire-file-into-memory-in-cpp/
  Compile with -std=c++11.
*/
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <deque>
#include <exception>
#include <ios>
#include <istream>
#include <iterator>
#include <limits>
#include <memory>
#include <ostream>
#include <sstream>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>
/* Go-to method for small files (<100K):
     auto ss = std::ostringstream{};
     ss << in.rdbuf();
     auto s = ss.str();
   Problem: the data has to be copied from the ostringstream into a string, so
   for large inputs two copies of the data are held in memory at the same time.
*/
/* Reads everything that is left in `in` into a string by streaming the
   stream's buffer into an ostringstream.

   in     input stream to drain.
   alloc  allocator used for the string that seeds the ostringstream.

   Returns the characters read; an empty string when a healthy stream simply
   has nothing left to read.
   Throws std::ios_base::failure when the transfer fails for any other reason.
*/
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename Allocator = std::allocator<CharT> >
std::basic_string<CharT, Traits, Allocator> read_stream_into_string(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
  using string_type = std::basic_string<CharT, Traits, Allocator>;

  string_type seed(std::move(alloc));
  std::basic_ostringstream<CharT, Traits, Allocator> ss(seed);

  ss << in.rdbuf();
  if (!ss) {
    // operator<<(streambuf*) also sets failbit when it inserted zero
    // characters, so an empty source looks like an error. Accept that one
    // case: the input stream itself is not failed and its buffer is at EOF.
    auto* const source = in.rdbuf();
    const bool merely_empty =
        !ss.bad() && !in.fail() && source != nullptr &&
        Traits::eq_int_type(source->sgetc(), Traits::eof());
    if (!merely_empty)
      throw std::ios_base::failure{ "error" };
  }
  return ss.str();
}
/* Reading straight into a container:
   When dealing with files it can be faster to count all the characters first,
   then do one big allocation and one big read:
     auto const start_pos = in.tellg();
     in.ignore(std::numeric_limits<std::streamsize>::max());
     auto const char_count = in.gcount();
     in.seekg(start_pos);
     auto s = std::string(char_count, char{});
     in.read(&s[0], s.size());
   in.ignore() is a safe way to count the bytes in a file, but it means this
   method reads the file twice: once to count the bytes and once to read them.
*/
/* Reads the rest of a seekable stream into a contiguous container using one
   allocation: the remaining characters are counted with ignore(), the stream
   is rewound to where it started, and the data is read in a single call.

   Container  std::basic_string<CharT, Traits, A>, or a std::vector of CharT or
              of its signed/unsigned counterpart.
   in         input stream; it must support tellg()/seekg().
   alloc      allocator handed to the container.

   Returns the container holding the characters read.
   Throws std::ios_base::failure if the position cannot be queried, or if
   counting, seeking back, or the final read fails.
*/
template <typename Container = std::string, typename CharT = char,
          typename Traits = std::char_traits<CharT> >
Container read_stream_into_container(
    std::basic_istream<CharT, Traits>& in,
    typename Container::allocator_type alloc = {}) {
  using allocator_type = typename Container::allocator_type;
  using pos_type = typename std::basic_istream<CharT, Traits>::pos_type;
  // The ::type members are required here: std::make_unsigned<CharT> by itself
  // names the trait struct, not the unsigned character type.
  using unsigned_char_type = typename std::make_unsigned<CharT>::type;
  using signed_char_type = typename std::make_signed<CharT>::type;

  static_assert(
      // Allow only strings...
      std::is_same<Container,
                   std::basic_string<CharT, Traits, allocator_type> >::value ||
          // ... and vectors of the plain, signed, and unsigned flavours of
          // CharT.
          std::is_same<Container,
                       std::vector<CharT, allocator_type> >::value ||
          std::is_same<Container, std::vector<unsigned_char_type,
                                              allocator_type> >::value ||
          std::is_same<Container, std::vector<signed_char_type,
                                              allocator_type> >::value,
      "only strings and vectors of ((un)signed) CharT allowed");

  auto const start_pos = in.tellg();
  if (start_pos == pos_type(-1))
    throw std::ios_base::failure{ "error" };

  // First pass: skip to the end so gcount() reports how much is left.
  if (!in.ignore(std::numeric_limits<std::streamsize>::max()))
    throw std::ios_base::failure{ "error" };
  std::streamsize const char_count = in.gcount();

  // Rewind to the starting position for the real read.
  if (!in.seekg(start_pos))
    throw std::ios_base::failure{ "error" };

  Container container(std::move(alloc));
  container.resize(static_cast<typename Container::size_type>(char_count));

  if (!container.empty()) {
    // The static_assert above restricts the element type to CharT or its
    // signed/unsigned counterpart, so the storage can be filled as CharT.
    if (!in.read(reinterpret_cast<CharT*>(&container[0]), char_count))
      throw std::ios_base::failure{ "error" };
  }
  return container;
}
/* Reading chunks into a deque:
   If you expect enormous files (several hundred megabytes or more on average)
   and you do not want to seek on the stream, read the file in chunks into a
   deque. The advantage is that no copy is made, unless you cannot work with
   the deque and end up copying the data out of it.
*/
/* Reads the rest of a stream into a std::deque, BUFSIZ characters at a time,
   without seeking on the stream.

   CharO  element type of the deque: CharT or its signed/unsigned counterpart.
   in     input stream to drain.
   alloc  allocator handed to the deque.

   Returns the deque holding every character that could be read. Reading
   stops as soon as a read delivers no characters; no exception is thrown by
   this function itself.
*/
template <typename CharT, typename Traits = std::char_traits<CharT>,
          typename CharO = CharT, typename Allocator = std::allocator<CharO> >
std::deque<CharO, Allocator> read_file_into_deque(
    std::basic_istream<CharT, Traits>& in, Allocator alloc = {}) {
  // The ::type members are required: comparing against the trait struct
  // itself would reject every signed/unsigned variant.
  static_assert(
      std::is_same<CharT, CharO>::value ||
          std::is_same<typename std::make_unsigned<CharT>::type,
                       CharO>::value ||
          std::is_same<typename std::make_signed<CharT>::type, CharO>::value,
      "char type of deque must be same "
      "as stream char type "
      "(possibly signed or unsigned)");

  constexpr std::size_t chunk_size = BUFSIZ;
  std::deque<CharO, Allocator> container(std::move(alloc));
  std::array<CharO, chunk_size> chunk{};

  for (;;) {
    in.read(reinterpret_cast<CharT*>(chunk.data()),
            static_cast<std::streamsize>(chunk.size()));
    // A short final read sets failbit but still delivers gcount() characters,
    // so the loop is driven by the count rather than by the stream state.
    std::streamsize const received = in.gcount();
    if (received <= 0)
      break;
    container.insert(container.end(), chunk.begin(), chunk.begin() + received);
  }
  return container;
}
/* | |
Testing section | |
*/ | |
#include <chrono> | |
#include <iostream> | |
#include <fstream> | |
#include <stdio.h> | |
#include <cassert> | |
#include <cctype> | |
/* | |
humanize from | |
https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libtcplay/humanize.c | |
*/ | |
/* Unit letters for successive powers of 1024; the leading blank stands for
   "no unit". */
static const char prefixes[] = " KMGTPE";

/* Formats a count as a short human-readable string in units of 1024, e.g.
   1536 -> "1.5K", 1048576 -> "1M", 500 -> "500".

   num  value to format; it is converted to an unsigned 64-bit integer first.

   Returns the integral part, then '.' and one decimal digit (truncated, and
   omitted when it is zero), then the unit letter (omitted below 1024).
*/
template <typename T> std::string humanize(T num) {
  const char* prefixp = prefixes;
  std::uint64_t whole = static_cast<std::uint64_t>(num);
  std::uint64_t remainder = 0;

  // Scale down while a larger unit exists; stopping at the last letter keeps
  // prefixp from ever reaching the terminating NUL.
  while (whole >= 1024 && prefixp[1] != '\0') {
    remainder = whole % 1024;
    whole /= 1024;
    ++prefixp;
  }

  std::string text = std::to_string(whole);

  // remainder is in [0, 1023]; scaling by 10 before dividing yields a real
  // tenths digit in [0, 9].
  const std::uint64_t tenths = remainder * 10 / 1024;
  if (tenths > 0) {
    text += '.';
    text += std::to_string(tenths);
  }
  if (*prefixp != ' ')
    text += *prefixp;
  return text;
}
/* | |
dehumanize_number from | |
http://cvsweb.netbsd.org/bsdweb.cgi/~checkout~/src/lib/libc/gen/dehumanize_number.c | |
*/ | |
/* Parses a size such as "512", "10b", "500k", "1M" or "2g" into a byte count.

   str  decimal digits, optionally followed by one unit letter (either case):
        b = 1, k = 2^10, m = 2^20, g = 2^30, t = 2^40, p = 2^50, e = 2^60.

   Returns the value multiplied by the unit, or 0 when the string is empty,
   has no digits, contains anything other than digits before the optional
   unit letter, uses an unknown unit letter, or does not fit in 64 bits.
   Never throws.
*/
std::uint64_t dehumanize_number(const std::string& str) {
  if (str.empty())
    return 0;

  std::uint64_t multiplier = 1;
  std::size_t digits_end = str.size();

  const unsigned char last = static_cast<unsigned char>(str.back());
  if (std::isalpha(last)) {
    switch (std::tolower(last)) {
      case 'b': multiplier = 1; break;
      case 'k': multiplier = std::uint64_t{ 1 } << 10; break;
      case 'm': multiplier = std::uint64_t{ 1 } << 20; break;
      case 'g': multiplier = std::uint64_t{ 1 } << 30; break;
      case 't': multiplier = std::uint64_t{ 1 } << 40; break;
      case 'p': multiplier = std::uint64_t{ 1 } << 50; break;
      case 'e': multiplier = std::uint64_t{ 1 } << 60; break;
      default:
        return 0; /* Invalid suffix. */
    }
    --digits_end;
  }

  if (digits_end == 0)
    return 0; /* A unit letter with no number in front of it. */

  const std::uint64_t limit = std::numeric_limits<std::uint64_t>::max();
  std::uint64_t value = 0;
  for (std::size_t i = 0; i < digits_end; ++i) {
    const char c = str[i];
    if (c < '0' || c > '9')
      return 0; /* Not a number. */
    const std::uint64_t digit = static_cast<std::uint64_t>(c - '0');
    // Check before multiplying so the accumulator can never wrap around.
    if (value > (limit - digit) / 10)
      return 0; /* Out of range. */
    value = value * 10 + digit;
  }

  if (value > limit / multiplier)
    return 0; /* Out of range. */
  return value * multiplier;
}
/* Creates a temporary file of exactly `size` bytes and returns its name.

   For a non-zero size the file is produced by seeking to the last byte and
   writing a single 'X' there; the bytes before it are never written by this
   function. For size 0 the file is created and left empty.

   Throws std::ios_base::failure if no temporary name can be generated or if
   the file cannot be opened, positioned, or written.
   The caller is responsible for removing the file.
*/
std::string create_empty_file(std::size_t size) {
  // Despite the linker warning, tmpnam is portable and the race it is known
  // for is of no concern for this test program.
  const char* const generated = std::tmpnam(nullptr);
  if (generated == nullptr)
    throw std::ios_base::failure{ "error" };
  std::string name = generated;

  std::ofstream out(name, std::ofstream::binary);
  if (!out)
    throw std::ios_base::failure{ "error" };

  if (size > 0) {
    // Guarded by size > 0 so that size - 1 cannot wrap around.
    if (!out.seekp(static_cast<std::streamoff>(size - 1)))
      throw std::ios_base::failure{ "error" };
    out.put('X');
  }

  // Close explicitly so that a failed flush is reported instead of being
  // swallowed by the destructor.
  out.close();
  if (!out) {
    std::remove(name.c_str());
    throw std::ios_base::failure{ "error" };
  }
  return name;
}
template <typename F> | |
bool test(const std::string& test_name, F fn, size_t test_size, | |
bool skip_long_verify) { | |
std::string temp_filename = create_empty_file(test_size); | |
std::ifstream in(temp_filename, std::ifstream::binary); | |
auto start = std::chrono::high_resolution_clock::now(); | |
auto s = fn(in, {}); | |
auto end = std::chrono::high_resolution_clock::now(); | |
remove(temp_filename.c_str()); | |
std::cout << test_name << ":" | |
<< std::chrono::duration_cast<std::chrono::milliseconds>( | |
end - start).count() << "ms\n"; | |
// content tests | |
if (s.size() != test_size) { | |
std::cout << "FAILED:wrong size " << s.size() << '\n'; | |
return false; | |
} else if (s[s.size() - 1] != 'X') { | |
std::cout << "FAILED:last byte is wrong\n"; | |
return false; | |
} | |
if (skip_long_verify) { return true; } | |
size_t i = 1; | |
for (; i < s.size() - 1; ++i) { | |
if (s[i]) { | |
std::cout << "FAILED:[" << i << "]!=0\n"; | |
break; | |
} | |
} | |
return (i == s.size() - 1); | |
} | |
bool test_all(size_t test_size, bool skip_long_verify = false) { | |
std::cout << "test_size:" << humanize(test_size) << ", BUFSIZ:" << BUFSIZ | |
<< '\n'; | |
bool t1 = test("read_stream_into_string", &read_stream_into_string<char>, | |
test_size, skip_long_verify); | |
bool t2 = test("read_stream_into_container", | |
&read_stream_into_container<std::string>, test_size, | |
skip_long_verify); | |
bool t3 = test("read_file_into_deque", read_file_into_deque<char>, test_size, | |
skip_long_verify); | |
std::cout << std::endl; | |
return (t1 && t2 && t3); | |
} | |
/* Prints the command-line help to standard output.

   prog_name  name of the executable as it should appear in the help text.
*/
void usage(const std::string& prog_name) {
  std::ostream& out = std::cout;

  out << "Usage: " << prog_name;
  out << " [test_file_size] [max_size step]\n\n";

  out << " Example: " << prog_name;
  out << " 1m 100m 500k\n";

  out << " Times read algorithms with files of 1M to 100M in steps of 500K\n";
  out << "Minimum size for files is 2 and using step 0 is troublemaking.\n";
}
int main(int argc, char* argv[]) { | |
size_t test_size = dehumanize_number("1M"); | |
if (argc >= 2) { | |
test_size = dehumanize_number(argv[1]); | |
} | |
size_t test_size_max = test_size; | |
size_t step = 1; | |
if (argc == 4) { | |
test_size_max = dehumanize_number(argv[2]); | |
step = dehumanize_number(argv[3]); | |
} | |
if ((test_size <= 1) || (test_size > test_size_max) || (test_size_max <= 1) || | |
(step <= 0)) { | |
usage(argv[0]); | |
return 1; | |
} | |
for (; test_size <= test_size_max && test_all(test_size); test_size += step) { | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment