Skip to content

Instantly share code, notes, and snippets.

@nico
Last active June 3, 2019 13:14
Show Gist options
  • Save nico/b0cca071e5b1fd71f929 to your computer and use it in GitHub Desktop.
Save nico/b0cca071e5b1fd71f929 to your computer and use it in GitHub Desktop.
benchmarking different file stat()ing techniques on windows (parts based on https://github.com/cpizano/Kodefind/blob/master/src/engine_v1_win.cc)
// cl stattest.cc /Ox /GL /GR-
#include <algorithm>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>
using namespace std;
#include <direct.h>
#include <stdint.h>
#include <time.h>
#include <windows.h>
struct StringPiece {
StringPiece() : str_(NULL), len_(0) {}
StringPiece(const string& str) : str_(str.data()), len_(str.size()) {}
StringPiece(const char* str) : str_(str), len_(strlen(str)) {}
StringPiece(const char* str, size_t len) : str_(str), len_(len) {}
bool operator<(StringPiece RHS) const {
if (int Res = memcmp(str_, RHS.str_, min(len_, RHS.len_)))
return Res < 0;
if (len_ == RHS.len_)
return false;
return len_ < RHS.len_;
}
bool operator==(const StringPiece& other) const {
return len_ == other.len_ && memcmp(str_, other.str_, len_) == 0;
}
bool operator!=(const StringPiece& other) const {
return !(*this == other);
}
string AsString() const { return len_ ? string(str_, len_) : string(); }
const char* str_;
size_t len_;
};
// About 1s for stat()ing 55000 files.
uint64_t stat(const string& path) {
WIN32_FILE_ATTRIBUTE_DATA attrs;
if (!GetFileAttributesExA(path.c_str(), GetFileExInfoStandard, &attrs)) {
DWORD err = GetLastError();
if (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND)
return 0;
fprintf(stderr, "fail %s", path.c_str());
exit(-1);
}
const FILETIME& filetime = attrs.ftLastWriteTime;
// FILETIME is in 100-nanosecond increments since the Windows epoch.
// We don't much care about epoch correctness but we do want the
// resulting value to fit in an integer.
uint64_t mtime = ((uint64_t)filetime.dwHighDateTime << 32) |
((uint64_t)filetime.dwLowDateTime);
mtime /= 1000000000LL / 100; // 100ns -> s.
mtime -= 12622770400LL; // 1600 epoch -> 2000 epoch (subtract 400 years).
return mtime;
}
#if 1
uint64_t statsum(const string& d, const set<StringPiece>& s) {
HANDLE hFind = INVALID_HANDLE_VALUE;
WIN32_FIND_DATAA ffd;
#if 0
hFind = FindFirstFileA((d + "\\*").c_str(), &ffd);
#else
hFind = FindFirstFileExA((d + "\\*").c_str(),
//FindExInfoStandard,
FindExInfoBasic, // 30% faster than FindExInfoStandard!
&ffd,
FindExSearchNameMatch,
NULL,
0 ); // XXX: check FIND_FIRST_EX_LARGE_FETCH
#endif
if (hFind == INVALID_HANDLE_VALUE) {
fprintf(stderr, "fail %s", d.c_str());
exit(-1);
}
uint64_t t = 0;
do {
//fprintf(stderr, "%s %s\n", d.c_str(), ffd.cFileName);
if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
continue;
string lowername = ffd.cFileName;
std::transform(lowername.begin(), lowername.end(),
lowername.begin(), ::tolower);
if (s.count(lowername) == 0) {
//fprintf(stderr, "skipping %s %s\n", d.c_str(), ffd.cFileName);
continue;
}
const FILETIME& filetime = ffd.ftLastWriteTime;
// FILETIME is in 100-nanosecond increments since the Windows epoch.
// We don't much care about epoch correctness but we do want the
// resulting value to fit in an integer.
uint64_t mtime = ((uint64_t)filetime.dwHighDateTime << 32) |
((uint64_t)filetime.dwLowDateTime);
mtime /= 1000000000LL / 100; // 100ns -> s.
//if(mtime == 0)
//printf(" asdasdfadsfsadf\n");
mtime -= 12622770400LL; // 1600 epoch -> 2000 epoch (subtract 400 years).
t += mtime;
} while (FindNextFileA(hFind, &ffd));
FindClose(hFind);
return t;
}
#else
HANDLE OpenDirectory(const std::string& dir) {
DWORD share = FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE;
return CreateFileA(dir.c_str(), GENERIC_READ , share, NULL,
OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
}
uint64_t statsum(const string& d, const set<StringPiece>& s) {
#if 1
FILE_INFO_BY_HANDLE_CLASS handle_class = FileIdBothDirectoryInfo;
typedef FILE_ID_BOTH_DIR_INFO HandleClass;
#else
// Requires win8, likely faster
FILE_INFO_BY_HANDLE_CLASS handle_class = FileIdFullDirectoryInfo;
typedef FILE_ID_FULL_DIR_INFO HandleClass;
#endif
const size_t kBufSize = 512 * 1024;
char* buf = (char*)malloc(kBufSize);
HANDLE hDir = OpenDirectory(d.c_str());
if (hDir == INVALID_HANDLE_VALUE) {
fprintf(stderr, "fail %s", d.c_str());
exit(-1);
}
uint64_t t = 0;
while (GetFileInformationByHandleEx(hDir, handle_class, buf, kBufSize)) {
HandleClass* c = (HandleClass*)buf;
while (1) {
if ((c->FileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0) {
wstring wlowername(&c->FileName[0], c->FileNameLength / sizeof(wchar_t));
string lowername(wlowername.begin(), wlowername.end()); // HACK
std::transform(lowername.begin(), lowername.end(),
lowername.begin(), ::tolower);
//fprintf(stderr, "%s %s\n", d.c_str(), lowername.c_str());
if (s.count(lowername) == 0) {
//fprintf(stderr, "skipping %s %s\n", d.c_str(), c->FileName);
} else {
uint64_t mtime = ((uint64_t)c->LastWriteTime.QuadPart);
mtime /= 1000000000LL / 100; // 100ns -> s.
mtime -= 12622770400LL; // 1600 epoch -> 2000 epoch (subtract 400 years).
t += mtime;
}
}
if (c->NextEntryOffset == 0)
break;
c = (HandleClass*)(ULONG_PTR(c) + c->NextEntryOffset);
}
}
if (GetLastError() != ERROR_NO_MORE_FILES) {
fprintf(stderr, "fail %s", d.c_str());
exit(-1);
}
free(buf);
CloseHandle(hDir);
return t;
}
#endif
int main() {
_chdir("out\\Debug");
clock_t cs = clock(), ce;
vector<string> files;
string s;
while (getline(cin, s)) {
std::transform(s.begin(), s.end(), s.begin(), ::tolower);
files.push_back(s);
}
// 0.36s up to here
ce = clock();
printf("reading took %f\n", (ce - cs)/float(CLOCKS_PER_SEC));
cs = ce;
// takes about 0.1s
typedef set<StringPiece> Dirs;
typedef map<string, Dirs> DirMap;
DirMap m;
for (size_t i = 0; i < files.size(); ++i) {
string dir;
StringPiece base;
size_t p = files[i].rfind('\\');
if (p != string::npos) {
dir = files[i].substr(0, p);
base = StringPiece(files[i].data() + p + 1, files[i].size() - p - 1);
//printf("%s %s\n", dir.c_str(), base.c_str()); break;
} else {
dir = ".";
base = files[i];
}
//std::transform(base.begin(), base.end(), base.begin(), ::tolower);
m[dir].insert(base);
}
ce = clock();
printf("map building took %f\n", (ce - cs)/float(CLOCKS_PER_SEC));
cs = ce;
uint64_t sum;
sum = 0;
for (DirMap::iterator it = m.begin(), end = m.end();
it != end; ++it) {
sum += statsum(it->first, it->second);
}
printf("s %I64x\n", sum);
ce = clock();
printf("per-dir stating took %f\n", (ce - cs)/float(CLOCKS_PER_SEC));
cs = ce;
sum = 0;
//for (size_t i = 0; i < files.size(); ++i) {
// sum += stat(files[i]);
//}
for (DirMap::iterator it = m.begin(), end = m.end();
it != end; ++it) {
for (Dirs::iterator sit = it->second.begin(), send = it->second.end();
sit != send; ++sit)
sum += stat(it->first + "\\" + sit->AsString());
}
printf("s %I64x\n", sum);
ce = clock();
printf("direct stating took %f\n", (ce - cs)/float(CLOCKS_PER_SEC));
cs = ce;
return (int)sum;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment