Last active
March 20, 2022 15:32
-
-
Save andyleejordan/d120be76ba8ebd66cf50 to your computer and use it in GitHub Desktop.
Algorithm for Efficient Chunked File Reading in C++
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* Algorithm for Efficient Chunked File Reading in C++
*
* The MIT License (MIT)
*
* Copyright 2014 Andrew Schwartzmeyer
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy,
* modify, merge, publish, distribute, sublicense, and/or sell copies
* of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#include <cerrno>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

using namespace std;
/* Read a file in fixed-size chunks and echo every chunk to stdout.
 *
 * Usage: input_file [chunk_size]
 *
 * argv[1] is the path of the file to read. argv[2], when given, is the
 * chunk size in bytes; a missing, non-numeric, non-positive or
 * out-of-range value falls back to 16 KiB.
 *
 * Returns 0 on success, and 1 when the file argument is missing, the
 * file cannot be opened or stat'ed, or a chunk could not be read in
 * full.
 */
int main(int argc, char* argv[])
{
    /* basic CLI interface */
    if (argc < 2)
    {
        std::cerr << "usage: input_file [chunk_size]" << std::endl;
        return 1;
    }

    std::ifstream file(argv[1], std::ifstream::binary);

    /* basic sanity check */
    if (not file)
    {
        std::cerr << "file: " << argv[1] << " failed to open" << std::endl;
        return 1;
    }

    /* *NIX way to get file size without seeking to the end and back;
       on failure filestatus is not filled in, so bail out instead of
       reading an indeterminate st_size */
    struct stat filestatus;
    if (stat(argv[1], &filestatus) != 0 || filestatus.st_size < 0)
    {
        std::cerr << "file: " << argv[1] << " failed to stat" << std::endl;
        return 1;
    }
    const std::size_t total_size = static_cast<std::size_t>(filestatus.st_size);

    /* default used whenever no usable chunk size was supplied */
    std::size_t chunk_size = 16 * 1024;

    /* C-string necessitates strtol to get chunk size; unlike atoi it
       reports overflow through errno instead of being undefined */
    if (argc >= 3)
    {
        errno = 0;
        const long requested = std::strtol(argv[2], nullptr, 10);

        /* reject overflow, garbage (parsed as 0) and negative values,
           which would otherwise wrap to a huge unsigned size */
        if (errno == 0 && requested > 0)
        {
            chunk_size = static_cast<std::size_t>(requested);
        }
    }
    std::cout << "using chunk size: " << chunk_size << std::endl;

    /* on to the actual algorithm */
    std::size_t total_chunks = total_size / chunk_size;
    std::size_t last_chunk_size = total_size % chunk_size;
    if (last_chunk_size != 0) /* if the above division was uneven */
    {
        ++total_chunks; /* add an unfilled final chunk */
    }
    else /* if division was even, last chunk is full */
    {
        last_chunk_size = chunk_size;
    }

    /* one buffer reused by every iteration; no chunk is ever larger
       than the smaller of chunk_size and the file size */
    std::vector<char> chunk_data(chunk_size < total_size ? chunk_size
                                                         : total_size);

    /* the loop of chunking */
    for (std::size_t chunk = 0; chunk < total_chunks; ++chunk)
    {
        const std::size_t this_chunk_size =
            chunk == total_chunks - 1 /* if last chunk */
            ? last_chunk_size         /* then fill chunk with remaining bytes */
            : chunk_size;             /* else fill entire chunk */

        /* if needed, we also have the position of this chunk in the file
           size_t start_of_chunk = chunk * chunk_size; */

        /* adapt this portion as necessary, this is the fast C++ way */
        file.read(chunk_data.data(), /* address of buffer start */
                  static_cast<std::streamsize>(this_chunk_size));

        /* number of bytes the read above actually delivered */
        const std::streamsize bytes_read = file.gcount();

        /* do something with chunk_data before next iteration */
        std::cout << "chunk #" << chunk << '\n';
        std::cout.write(chunk_data.data(), bytes_read);
        std::cout << std::endl;

        /* a short read means the file shrank or an I/O error occurred;
           continuing would only emit stale buffer contents */
        if (static_cast<std::size_t>(bytes_read) != this_chunk_size)
        {
            std::cerr << "file: " << argv[1] << " short read in chunk #"
                      << chunk << std::endl;
            return 1;
        }
    }

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment