forrestthewoods · December 30, 2024 19:29
diff --git a/main.cpp b/main.cpp
 // virtual_alloc.cpp : This file contains the 'main' function. Program execution begins and ends there.
 //

 #if _WIN32
 #define WIN32_LEAN_AND_MEAN
 #define NOMINMAX
 #include <Windows.h>
 #include <psapi.h>
 #else
 #include <sys/mman.h>
 #include <sys/sysinfo.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #endif

 #include <algorithm>
 #include <assert.h>
 #include <charconv>
 #include <chrono>
 #include <cstdint>
 #include <format>
 #include <iostream>
 #include <vector>
 #include <type_traits>

 #define ENABLE_SUPERLUMINAL 0
 #if ENABLE_SUPERLUMINAL
 #include <Superluminal/PerformanceAPI_capi.h>
 #else
 #define PerformanceAPI_BeginEvent(a,b,c)
 #define PerformanceAPI_EndEvent()
 #endif

 using namespace std::chrono;
 using Clock = high_resolution_clock;

 static const size_t ONE_KB = 1024;
 static const size_t ONE_MB = 1024 * 1024;
 static const size_t ONE_GB = 1024 * 1024 * 1024;
 static const size_t ONE_TB = 1024llu * 1024 * 1024 * 1024;

 static const size_t PAGE_SIZE = []() -> size_t {
 #if _WIN32
    SYSTEM_INFO systemInfo;
    GetSystemInfo(&systemInfo);
    return systemInfo.dwPageSize;
 #else
    return sysconf(_SC_PAGESIZE);
 #endif
 }();

 static const size_t RESERVE_LIMIT = []() -> size_t {
 #ifdef _WIN32
    MEMORYSTATUSEX memStatus;
    memStatus.dwLength = sizeof(memStatus);
    GlobalMemoryStatusEx(&memStatus);
    return memStatus.ullTotalVirtual;
 #else
    struct sysinfo si;
    if (sysinfo(&si) == 0) {
        // Total virtual memory = RAM + swap
        return (si.totalram + si.totalswap) * si.mem_unit;
    }
    return 0;
 #endif
 }();

 static const size_t COMMIT_LIMIT = []() -> size_t {
 #if _WIN32
    PERFORMANCE_INFORMATION perfInfo;
    perfInfo.cb = sizeof(perfInfo);

    GetPerformanceInfo(&perfInfo, sizeof(perfInfo));
    size_t commitLimit = perfInfo.CommitLimit * perfInfo.PageSize;
    return commitLimit;
 #else
    struct sysinfo si;
    if (sysinfo(&si) == 0) {
        // On Linux, commit limit is typically RAM + swap
        return (si.totalram + si.totalswap) * si.mem_unit;
    }
    return 0;
 #endif
 }();


 // Small Helpers
 size_t roundUpToPage(size_t size) { return (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); }
 size_t roundDownToPage(size_t size) { return size & ~(PAGE_SIZE - 1); }
 void* offsetPtr(void* ptr, size_t offset) { return (void*)((char*)ptr + offset); }

 std::string prettyTime(nanoseconds ns) {
    if (ns.count() < 1000) return std::format("{}ns", ns.count());
    else if (ns.count() < 1000 * 1000) return std::format("{}us", ns.count() / 1000);
    else if (ns.count() < 1000llu * 1000llu * 1000llu * 10llu) return std::format("{}ms", ns.count() / 1000 / 1000);
    else return std::format("{}s", ns.count() / 1000 / 1000 / 1000);
 }

 std::string prettyBytes(size_t bytes) {
    if (bytes < 1024llu) return std::format("{}bytes", bytes);
    else if (bytes < 1024llu * 1024) return std::format("{}KiB", bytes / 1024);
    else if (bytes < 1024llu * 1024 * 1024) return std::format("{}MiB", bytes / 1024 / 1024);
    else if (bytes < 1024llu * 1024 * 1024 * 1024) return std::format("{}GiB", bytes / 1024 / 1024 / 1024);
    else return std::format("{}TiB", bytes / 1024 / 1024 / 1024 / 1024);
 }

 enum class CommitStrategy { Page, Multiplier, All };
 const double CommitMultiplier = 1.5;

 template<typename T, CommitStrategy commitStrategy>
 struct VirtualVec {
    T* data = nullptr;
    size_t count = 0;
    size_t numVirtualBytes = 0;
    void* nextPage = nullptr;
    void* endPage = nullptr; // one byte past end of last page

    VirtualVec() = default;
    ~VirtualVec() {
        if (data) {
            if constexpr (!std::is_trivially_destructible_v<T>) {
                for (size_t i = 0; i < count; ++i) {
                    data[i].~T(); // destroy!
                }
            }
 #if _WIN32
            VirtualFree(data, 0, MEM_RELEASE);
 #else
            munmap(data, numVirtualBytes);  // Note: needs original size used in mmap
 #endif
        }
    }
    
    void init(size_t numElements) {
 #if !_WIN32
        // Linux only supports ALL
        assert(commitStrategy == CommitStrategy::All);
 #endif

        PerformanceAPI_BeginEvent("VirtualVec::init", nullptr, PERFORMANCEAPI_DEFAULT_COLOR);

        assert(data == nullptr); // don't handle re-init
        numVirtualBytes = roundUpToPage(numElements * sizeof(T)); // align to page size

 #if _WIN32
        if constexpr (commitStrategy == CommitStrategy::All) {
            // reserve AND commit
            void* raw = VirtualAlloc(nullptr, numVirtualBytes, MEM_COMMIT, PAGE_READWRITE);
            assert(raw != nullptr);
            data = static_cast<T*>(raw);
            endPage = offsetPtr(data, numVirtualBytes);
            nextPage = endPage;
        }
        else {
            // reserve but do NOT commit
            void* raw = VirtualAlloc(nullptr, numVirtualBytes, MEM_RESERVE, PAGE_READWRITE);
            assert(raw != nullptr);
            data = static_cast<T*>(raw);
            nextPage = data;
            endPage = offsetPtr(data, numVirtualBytes);
        }
 #else
        void* raw = mmap(nullptr, numVirtualBytes,
            PROT_READ | PROT_WRITE,          // Protection flags
            MAP_PRIVATE | MAP_ANONYMOUS,      // Mapping flags
            -1,                              // File descriptor (not used)
            0);                              // Offset (not used)
        assert(raw != nullptr);
        data = static_cast<T*>(raw);
        endPage = offsetPtr(data, numVirtualBytes);
        nextPage = endPage;
 #endif

        PerformanceAPI_EndEvent();
    }

    void push_back(T const& v) {
        assert(data != nullptr); // must call init first

        T* nextSlot = data + count;
        while (nextSlot >= nextPage) {
            reserveMore();
        }

        *nextSlot = v;
        count += 1;
    }

    void reserveMore() {
 #if _WIN32
        assert(nextPage < endPage);

        if constexpr (commitStrategy == CommitStrategy::Page) {
            VirtualAlloc(nextPage, PAGE_SIZE, MEM_COMMIT, PAGE_READWRITE); // commit first page
            nextPage = static_cast<void*>(static_cast<char*>(nextPage) + PAGE_SIZE);
        }
        else if constexpr (commitStrategy == CommitStrategy::Multiplier) {
            size_t currCommitedBytes = ((char*)nextPage - (char*)data);
            void* targetPage = nullptr;
            if (currCommitedBytes == 0) {
                targetPage = offsetPtr(data, PAGE_SIZE);
            }
            else {
                targetPage = offsetPtr(data, size_t(double(currCommitedBytes) * CommitMultiplier));
            }
            targetPage = std::min(targetPage, endPage);

            while (nextPage < targetPage && nextPage < endPage) {
                VirtualAlloc(nextPage, PAGE_SIZE, MEM_COMMIT, PAGE_READWRITE); // commit first page
                nextPage = static_cast<void*>(static_cast<char*>(nextPage) + PAGE_SIZE);
            }
        }
        else {
        }
 #else
        // Linux doesn't have to deal with this
        assert(false);
 #endif
    }
 };

 void run_subprocess(const char* exePath, size_t test, size_t subtest, size_t extra) {
    PerformanceAPI_BeginEvent("run_subprocess", nullptr, PERFORMANCEAPI_DEFAULT_COLOR);

 #if _WIN32
    HANDLE pipe_read, pipe_write;
    SECURITY_ATTRIBUTES sa = {
        .nLength = sizeof(SECURITY_ATTRIBUTES),
        .lpSecurityDescriptor = NULL,
        .bInheritHandle = TRUE,
    };

    // Create pipe for stdout
    if (!CreatePipe(&pipe_read, &pipe_write, &sa, 0)) {
        // Handle error
    }

    // Ensure the read handle is not inherited
    SetHandleInformation(pipe_read, HANDLE_FLAG_INHERIT, 0);

    STARTUPINFOA si = {
        .cb = sizeof(STARTUPINFO),
        .dwFlags = STARTF_USESTDHANDLES,
        .hStdOutput = pipe_write,
        .hStdError = pipe_write,
    };
    PROCESS_INFORMATION pi;

    std::string cli = std::format("{} {} {} {}", exePath, test, subtest, extra);

    if (!CreateProcessA(
        NULL,           // Application name
        cli.data(),        // Command line 
        NULL,           // Process security attributes
        NULL,           // Thread security attributes
        TRUE,           // Inherit handles
        0,             // Creation flags
        NULL,          // Use parent's environment
        NULL,          // Use parent's directory
        &si,           // STARTUPINFO pointer
        &pi            // PROCESS_INFORMATION pointer
    )) {
        // Handle error
    }

    // Close the write end of pipe in parent
    CloseHandle(pipe_write);

    // Read from pipe
    char buffer[4096];
    DWORD bytes_read;
    while (ReadFile(pipe_read, buffer, sizeof(buffer) - 1, &bytes_read, NULL)) {
        if (bytes_read == 0) break;
        buffer[bytes_read] = '\0';
        // Process output here
        printf("%s", buffer);
    }

    // Wait for process to complete
    WaitForSingleObject(pi.hProcess, INFINITE);

    // Cleanup
    CloseHandle(pipe_read);
    CloseHandle(pi.hProcess);
    CloseHandle(pi.hThread);
 #else
 int pipe_fd[2];
    if (pipe(pipe_fd) == -1) {
        perror("pipe");
        return;
    }

    pid_t pid = fork();
    if (pid == -1) {
        perror("fork");
        close(pipe_fd[0]);
        close(pipe_fd[1]);
        return;
    }

    if (pid == 0) {  // Child process
        // Close read end of pipe
        close(pipe_fd[0]);

        // Redirect stdout and stderr to pipe
        dup2(pipe_fd[1], STDOUT_FILENO);
        dup2(pipe_fd[1], STDERR_FILENO);
        close(pipe_fd[1]);

        // Convert arguments to strings
        std::string test_str = std::to_string(test);
        std::string subtest_str = std::to_string(subtest);
        std::string extra_str = std::to_string(extra);

        // Execute the program
        execl(exePath, exePath, 
              test_str.c_str(), 
              subtest_str.c_str(), 
              extra_str.c_str(), 
              (char*)nullptr);

        // If execl returns, there was an error
        perror("execl");
        exit(1);
    } else {  // Parent process
        // Close write end of pipe
        close(pipe_fd[1]);

        // Read from pipe
        char buffer[4096];
        ssize_t bytes_read;
        while ((bytes_read = read(pipe_fd[0], buffer, sizeof(buffer) - 1)) > 0) {
            buffer[bytes_read] = '\0';
            printf("%s", buffer);
        }

        // Close read end of pipe
        close(pipe_fd[0]);

        // Wait for child process to complete
        int status;
        waitpid(pid, &status, 0);
    }
 #endif

    PerformanceAPI_EndEvent();
 }

 enum Tests {
    VirtualPageCommit,
    VirtualGrowCommit,
    VirtualAllCommit,
    AllocCost
 };

 int main(int argc, char** argv) {
    using NumType = int;
    const std::vector<size_t> testBytes = { ONE_MB, ONE_MB * 10, ONE_MB * 100, ONE_GB, ONE_GB * 4, ONE_GB * 16, ONE_GB * 128, ONE_TB };
    const std::vector<std::string> testBytesStr = { "1 MiB", "10 MiB", "100 MiB", "1 GiB", "4 GiB", "16GiB", "128GiB", "1TiB" };
    const size_t COMMIT_SAFE_BYTES = 6;
    const std::vector<size_t> numAllocs = { 1, 5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 25000, 50000, 75000, 100000 };
    const std::vector<size_t> allocSizes = { ONE_MB, ONE_MB * 5, ONE_MB * 10, ONE_MB * 50, ONE_MB * 100, ONE_MB * 500, ONE_GB, ONE_GB * 4, ONE_GB * 8, ONE_GB * 16 };

    if (argc == 1) {
        std::cout << std::format("Page Size: {}\n", PAGE_SIZE);
        std::cout << std::format("Reserve Limit: {}\n", prettyBytes(RESERVE_LIMIT)); // ~127TB
        std::cout << std::format("Commit Limit: {}\n\n", prettyBytes(COMMIT_LIMIT)); // ~100GB

 #if _WIN32
        std::cout << "VirtualVec, reserve, no commit, no write\n";
        for (int i = 0; i < testBytes.size(); ++i) { run_subprocess(argv[0], VirtualPageCommit, i, 0); }
        std::cout << "\n";
 #endif

        std::cout << "VirtualVec, commit all, no write\n";
        for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualAllCommit, i, 0); }
        std::cout << "\n";

 #if _WIN32
        std::cout << "VirtualVec, commit page at a time, write all\n";
        for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualPageCommit, i, 1); }
        std::cout << "\n";

        std::cout << "VirtualVec, commit by 1.5x, write all\n";
        for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualGrowCommit, i, 1); }
        std::cout << "\n";
 #endif

        std::cout << "VirtualVec, commit all, write all\n";
        for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualAllCommit, i, 1); }
        std::cout << "\n";

        for (int j = 0; j < allocSizes.size(); ++j) {
            size_t allocSize = allocSizes[j];
            std::cout << std::format("Call VirtualAlloc({}) N times, no commit, no write\n", prettyBytes(allocSize));
            for (int i = 0; i < numAllocs.size(); ++i) {
                size_t virtualSize = allocSize * numAllocs[i];
                if (virtualSize < RESERVE_LIMIT / 2) {
                    run_subprocess(argv[0], AllocCost, i, allocSize);
                }
            }
            std::cout << "\n";
        }

        return 0;
    }

    size_t test = 0, subtest = 0, extra = 0; 
    std::from_chars(argv[1], argv[1] + strlen(argv[1]), test);
    std::from_chars(argv[2], argv[2] + strlen(argv[2]), subtest);
    std::from_chars(argv[3], argv[3] + strlen(argv[3]), extra);

    // Virtual, Page Commit
    if (test == VirtualPageCommit) {
        auto start = Clock::now();

        VirtualVec<NumType, CommitStrategy::Page> nums;
        size_t numBytes = testBytes[subtest];
        size_t numElements = numBytes / sizeof(NumType);
        nums.init(numElements);
        if (extra == 1) {
            for (int i = 0; i < numElements; ++i) {
                nums.push_back(i);
            }
        }

        auto end = Clock::now();
        std::cout << "    Bytes: " << testBytesStr[subtest] << "  Time: " << prettyTime(end - start) << "\n";
    }

    // Virtual, 1.5x Commit
    if (test == VirtualGrowCommit) {
        auto start = Clock::now();

        VirtualVec<NumType, CommitStrategy::Multiplier> nums;
        size_t numBytes = testBytes[subtest];
        size_t numElements = numBytes / sizeof(NumType);
        nums.init(numElements);
        if (extra == 1) {
            for (int i = 0; i < numElements; ++i) {
                nums.push_back(i);
            }
        }

        auto end = Clock::now();
        std::cout << "    Bytes: " << testBytesStr[subtest] << "  Time: " << prettyTime(end - start) << "\n";
    }

    // Virtual, Full Commit
    if (test == VirtualAllCommit) {
        auto start = Clock::now();

        VirtualVec<NumType, CommitStrategy::All> nums;
        size_t numBytes = testBytes[subtest];
        size_t numElements = numBytes / sizeof(NumType);
        nums.init(numElements);
        if (extra == 1) {
            for (int i = 0; i < numElements; ++i) {
                nums.push_back(i);
            }
        }

        auto end = Clock::now();
        std::cout << "    Bytes: " << testBytesStr[subtest] << "  Time: " << prettyTime(end - start) << "\n";
    }

    // std::vector, no reserve
    // std::vector, reserve
        
    // Virtual Alloc cost
    if (test == AllocCost) {
        const size_t num_bytes = extra;
        const size_t num_elements = num_bytes / sizeof(NumType);

        size_t numAlloc = numAllocs[subtest];

 #if _WIN32
        std::vector<VirtualVec<NumType, CommitStrategy::Page>> vecs;
 #else
        std::vector<VirtualVec<NumType, CommitStrategy::All>> vecs;
 #endif
        vecs.resize(numAlloc);
        auto start = Clock::now();
        duration<long long, std::nano> lastAlloc = {};
        for (int j = 0; j < vecs.size(); ++j) {
            if (j == vecs.size() - 1) {
                auto s = Clock::now();
                vecs[j].init(num_elements);
                auto e = Clock::now();
                lastAlloc = e - s;
            }
            else {
                vecs[j].init(num_elements);
            }
        }
        auto end = Clock::now();
        auto elapsedTotal = duration_cast<nanoseconds>(end - start);
        auto elapsedPer = elapsedTotal / numAlloc;
        size_t totalBytes = num_bytes * numAlloc;
        std::cout << "    N: " << numAllocs[subtest] 
            << "  TotalTime: "  << prettyTime(elapsedTotal) 
            << "  TotalReserved: " << prettyBytes(totalBytes)
            << "  PerVirtualAlloc: " << elapsedPer << " / " << prettyTime(elapsedPer) 
            << "  PerVirtualMib: " << prettyTime(elapsedTotal / (totalBytes / ONE_MB))
            << "\n";
    }
 }

 // https://devblogs.microsoft.com/oldnewthing/20160318-00/?p=93181
 // https://alwaysprocessing.blog/2022/02/20/size-matters
 // over 100,000 virtual allocs and it sometimes hangs

 // Linux: clang++ main.cpp -std=c++20 -stdlib=libc++ -lc++ -lc++abi -o x64/va
	// virtual_alloc.cpp : This file contains the 'main' function. Program execution begins and ends there.
	//

	#if _WIN32
	#define WIN32_LEAN_AND_MEAN
	#define NOMINMAX
	#include <Windows.h>
	#include <psapi.h>
	#else
	#include <sys/mman.h>
	#include <sys/sysinfo.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>
	#endif

	#include <algorithm>
	#include <assert.h>
	#include <charconv>
	#include <chrono>
	#include <cstdint>
	#include <format>
	#include <iostream>
	#include <vector>
	#include <type_traits>

	#define ENABLE_SUPERLUMINAL 0
	#if ENABLE_SUPERLUMINAL
	#include <Superluminal/PerformanceAPI_capi.h>
	#else
	#define PerformanceAPI_BeginEvent(a,b,c)
	#define PerformanceAPI_EndEvent()
	#endif

	using namespace std::chrono;
	using Clock = high_resolution_clock;

	static const size_t ONE_KB = 1024;
	static const size_t ONE_MB = 1024 * 1024;
	static const size_t ONE_GB = 1024 * 1024 * 1024;
	static const size_t ONE_TB = 1024llu * 1024 * 1024 * 1024;

	static const size_t PAGE_SIZE = []() -> size_t {
	#if _WIN32
	SYSTEM_INFO systemInfo;
	GetSystemInfo(&systemInfo);
	return systemInfo.dwPageSize;
	#else
	return sysconf(_SC_PAGESIZE);
	#endif
	}();

	static const size_t RESERVE_LIMIT = []() -> size_t {
	#ifdef _WIN32
	MEMORYSTATUSEX memStatus;
	memStatus.dwLength = sizeof(memStatus);
	GlobalMemoryStatusEx(&memStatus);
	return memStatus.ullTotalVirtual;
	#else
	struct sysinfo si;
	if (sysinfo(&si) == 0) {
	// Total virtual memory = RAM + swap
	return (si.totalram + si.totalswap) * si.mem_unit;
	}
	return 0;
	#endif
	}();

	static const size_t COMMIT_LIMIT = []() -> size_t {
	#if _WIN32
	PERFORMANCE_INFORMATION perfInfo;
	perfInfo.cb = sizeof(perfInfo);

	GetPerformanceInfo(&perfInfo, sizeof(perfInfo));
	size_t commitLimit = perfInfo.CommitLimit * perfInfo.PageSize;
	return commitLimit;
	#else
	struct sysinfo si;
	if (sysinfo(&si) == 0) {
	// On Linux, commit limit is typically RAM + swap
	return (si.totalram + si.totalswap) * si.mem_unit;
	}
	return 0;
	#endif
	}();


	// Small Helpers
	size_t roundUpToPage(size_t size) { return (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); }
	size_t roundDownToPage(size_t size) { return size & ~(PAGE_SIZE - 1); }
	void* offsetPtr(void* ptr, size_t offset) { return (void)((char)ptr + offset); }

	std::string prettyTime(nanoseconds ns) {
	if (ns.count() < 1000) return std::format("{}ns", ns.count());
	else if (ns.count() < 1000 * 1000) return std::format("{}us", ns.count() / 1000);
	else if (ns.count() < 1000llu * 1000llu * 1000llu * 10llu) return std::format("{}ms", ns.count() / 1000 / 1000);
	else return std::format("{}s", ns.count() / 1000 / 1000 / 1000);
	}

	std::string prettyBytes(size_t bytes) {
	if (bytes < 1024llu) return std::format("{}bytes", bytes);
	else if (bytes < 1024llu * 1024) return std::format("{}KiB", bytes / 1024);
	else if (bytes < 1024llu * 1024 * 1024) return std::format("{}MiB", bytes / 1024 / 1024);
	else if (bytes < 1024llu * 1024 * 1024 * 1024) return std::format("{}GiB", bytes / 1024 / 1024 / 1024);
	else return std::format("{}TiB", bytes / 1024 / 1024 / 1024 / 1024);
	}

	enum class CommitStrategy { Page, Multiplier, All };
	const double CommitMultiplier = 1.5;

	template<typename T, CommitStrategy commitStrategy>
	struct VirtualVec {
	T* data = nullptr;
	size_t count = 0;
	size_t numVirtualBytes = 0;
	void* nextPage = nullptr;
	void* endPage = nullptr; // one byte past end of last page

	VirtualVec() = default;
	~VirtualVec() {
	if (data) {
	if constexpr (!std::is_trivially_destructible_v<T>) {
	for (size_t i = 0; i < count; ++i) {
	data[i].~T(); // destroy!
	}
	}
	#if _WIN32
	VirtualFree(data, 0, MEM_RELEASE);
	#else
	munmap(data, numVirtualBytes); // Note: needs original size used in mmap
	#endif
	}
	}

	void init(size_t numElements) {
	#if !_WIN32
	// Linux only supports ALL
	assert(commitStrategy == CommitStrategy::All);
	#endif

	PerformanceAPI_BeginEvent("VirtualVec::init", nullptr, PERFORMANCEAPI_DEFAULT_COLOR);

	assert(data == nullptr); // don't handle re-init
	numVirtualBytes = roundUpToPage(numElements * sizeof(T)); // align to page size

	#if _WIN32
	if constexpr (commitStrategy == CommitStrategy::All) {
	// reserve AND commit
	void* raw = VirtualAlloc(nullptr, numVirtualBytes, MEM_COMMIT, PAGE_READWRITE);
	assert(raw != nullptr);
	data = static_cast<T*>(raw);
	endPage = offsetPtr(data, numVirtualBytes);
	nextPage = endPage;
	}
	else {
	// reserve but do NOT commit
	void* raw = VirtualAlloc(nullptr, numVirtualBytes, MEM_RESERVE, PAGE_READWRITE);
	assert(raw != nullptr);
	data = static_cast<T*>(raw);
	nextPage = data;
	endPage = offsetPtr(data, numVirtualBytes);
	}
	#else
	void* raw = mmap(nullptr, numVirtualBytes,
	PROT_READ \| PROT_WRITE, // Protection flags
	MAP_PRIVATE \| MAP_ANONYMOUS, // Mapping flags
	-1, // File descriptor (not used)
	0); // Offset (not used)
	assert(raw != nullptr);
	data = static_cast<T*>(raw);
	endPage = offsetPtr(data, numVirtualBytes);
	nextPage = endPage;
	#endif

	PerformanceAPI_EndEvent();
	}

	void push_back(T const& v) {
	assert(data != nullptr); // must call init first

	T* nextSlot = data + count;
	while (nextSlot >= nextPage) {
	reserveMore();
	}

	*nextSlot = v;
	count += 1;
	}

	void reserveMore() {
	#if _WIN32
	assert(nextPage < endPage);

	if constexpr (commitStrategy == CommitStrategy::Page) {
	VirtualAlloc(nextPage, PAGE_SIZE, MEM_COMMIT, PAGE_READWRITE); // commit first page
	nextPage = static_cast<void>(static_cast<char>(nextPage) + PAGE_SIZE);
	}
	else if constexpr (commitStrategy == CommitStrategy::Multiplier) {
	size_t currCommitedBytes = ((char)nextPage - (char)data);
	void* targetPage = nullptr;
	if (currCommitedBytes == 0) {
	targetPage = offsetPtr(data, PAGE_SIZE);
	}
	else {
	targetPage = offsetPtr(data, size_t(double(currCommitedBytes) * CommitMultiplier));
	}
	targetPage = std::min(targetPage, endPage);

	while (nextPage < targetPage && nextPage < endPage) {
	VirtualAlloc(nextPage, PAGE_SIZE, MEM_COMMIT, PAGE_READWRITE); // commit first page
	nextPage = static_cast<void>(static_cast<char>(nextPage) + PAGE_SIZE);
	}
	}
	else {
	}
	#else
	// Linux doesn't have to deal with this
	assert(false);
	#endif
	}
	};

	void run_subprocess(const char* exePath, size_t test, size_t subtest, size_t extra) {
	PerformanceAPI_BeginEvent("run_subprocess", nullptr, PERFORMANCEAPI_DEFAULT_COLOR);

	#if _WIN32
	HANDLE pipe_read, pipe_write;
	SECURITY_ATTRIBUTES sa = {
	.nLength = sizeof(SECURITY_ATTRIBUTES),
	.lpSecurityDescriptor = NULL,
	.bInheritHandle = TRUE,
	};

	// Create pipe for stdout
	if (!CreatePipe(&pipe_read, &pipe_write, &sa, 0)) {
	// Handle error
	}

	// Ensure the read handle is not inherited
	SetHandleInformation(pipe_read, HANDLE_FLAG_INHERIT, 0);

	STARTUPINFOA si = {
	.cb = sizeof(STARTUPINFO),
	.dwFlags = STARTF_USESTDHANDLES,
	.hStdOutput = pipe_write,
	.hStdError = pipe_write,
	};
	PROCESS_INFORMATION pi;

	std::string cli = std::format("{} {} {} {}", exePath, test, subtest, extra);

	if (!CreateProcessA(
	NULL, // Application name
	cli.data(), // Command line
	NULL, // Process security attributes
	NULL, // Thread security attributes
	TRUE, // Inherit handles
	0, // Creation flags
	NULL, // Use parent's environment
	NULL, // Use parent's directory
	&si, // STARTUPINFO pointer
	&pi // PROCESS_INFORMATION pointer
	)) {
	// Handle error
	}

	// Close the write end of pipe in parent
	CloseHandle(pipe_write);

	// Read from pipe
	char buffer[4096];
	DWORD bytes_read;
	while (ReadFile(pipe_read, buffer, sizeof(buffer) - 1, &bytes_read, NULL)) {
	if (bytes_read == 0) break;
	buffer[bytes_read] = '\0';
	// Process output here
	printf("%s", buffer);
	}

	// Wait for process to complete
	WaitForSingleObject(pi.hProcess, INFINITE);

	// Cleanup
	CloseHandle(pipe_read);
	CloseHandle(pi.hProcess);
	CloseHandle(pi.hThread);
	#else
	int pipe_fd[2];
	if (pipe(pipe_fd) == -1) {
	perror("pipe");
	return;
	}

	pid_t pid = fork();
	if (pid == -1) {
	perror("fork");
	close(pipe_fd[0]);
	close(pipe_fd[1]);
	return;
	}

	if (pid == 0) { // Child process
	// Close read end of pipe
	close(pipe_fd[0]);

	// Redirect stdout and stderr to pipe
	dup2(pipe_fd[1], STDOUT_FILENO);
	dup2(pipe_fd[1], STDERR_FILENO);
	close(pipe_fd[1]);

	// Convert arguments to strings
	std::string test_str = std::to_string(test);
	std::string subtest_str = std::to_string(subtest);
	std::string extra_str = std::to_string(extra);

	// Execute the program
	execl(exePath, exePath,
	test_str.c_str(),
	subtest_str.c_str(),
	extra_str.c_str(),
	(char*)nullptr);

	// If execl returns, there was an error
	perror("execl");
	exit(1);
	} else { // Parent process
	// Close write end of pipe
	close(pipe_fd[1]);

	// Read from pipe
	char buffer[4096];
	ssize_t bytes_read;
	while ((bytes_read = read(pipe_fd[0], buffer, sizeof(buffer) - 1)) > 0) {
	buffer[bytes_read] = '\0';
	printf("%s", buffer);
	}

	// Close read end of pipe
	close(pipe_fd[0]);

	// Wait for child process to complete
	int status;
	waitpid(pid, &status, 0);
	}
	#endif

	PerformanceAPI_EndEvent();
	}

	enum Tests {
	VirtualPageCommit,
	VirtualGrowCommit,
	VirtualAllCommit,
	AllocCost
	};

	int main(int argc, char** argv) {
	using NumType = int;
	const std::vector<size_t> testBytes = { ONE_MB, ONE_MB * 10, ONE_MB * 100, ONE_GB, ONE_GB * 4, ONE_GB * 16, ONE_GB * 128, ONE_TB };
	const std::vector<std::string> testBytesStr = { "1 MiB", "10 MiB", "100 MiB", "1 GiB", "4 GiB", "16GiB", "128GiB", "1TiB" };
	const size_t COMMIT_SAFE_BYTES = 6;
	const std::vector<size_t> numAllocs = { 1, 5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 25000, 50000, 75000, 100000 };
	const std::vector<size_t> allocSizes = { ONE_MB, ONE_MB * 5, ONE_MB * 10, ONE_MB * 50, ONE_MB * 100, ONE_MB * 500, ONE_GB, ONE_GB * 4, ONE_GB * 8, ONE_GB * 16 };

	if (argc == 1) {
	std::cout << std::format("Page Size: {}\n", PAGE_SIZE);
	std::cout << std::format("Reserve Limit: {}\n", prettyBytes(RESERVE_LIMIT)); // ~127TB
	std::cout << std::format("Commit Limit: {}\n\n", prettyBytes(COMMIT_LIMIT)); // ~100GB

	#if _WIN32
	std::cout << "VirtualVec, reserve, no commit, no write\n";
	for (int i = 0; i < testBytes.size(); ++i) { run_subprocess(argv[0], VirtualPageCommit, i, 0); }
	std::cout << "\n";
	#endif

	std::cout << "VirtualVec, commit all, no write\n";
	for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualAllCommit, i, 0); }
	std::cout << "\n";

	#if _WIN32
	std::cout << "VirtualVec, commit page at a time, write all\n";
	for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualPageCommit, i, 1); }
	std::cout << "\n";

	std::cout << "VirtualVec, commit by 1.5x, write all\n";
	for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualGrowCommit, i, 1); }
	std::cout << "\n";
	#endif

	std::cout << "VirtualVec, commit all, write all\n";
	for (int i = 0; i < COMMIT_SAFE_BYTES; ++i) { run_subprocess(argv[0], VirtualAllCommit, i, 1); }
	std::cout << "\n";

	for (int j = 0; j < allocSizes.size(); ++j) {
	size_t allocSize = allocSizes[j];
	std::cout << std::format("Call VirtualAlloc({}) N times, no commit, no write\n", prettyBytes(allocSize));
	for (int i = 0; i < numAllocs.size(); ++i) {
	size_t virtualSize = allocSize * numAllocs[i];
	if (virtualSize < RESERVE_LIMIT / 2) {
	run_subprocess(argv[0], AllocCost, i, allocSize);
	}
	}
	std::cout << "\n";
	}

	return 0;
	}

	size_t test = 0, subtest = 0, extra = 0;
	std::from_chars(argv[1], argv[1] + strlen(argv[1]), test);
	std::from_chars(argv[2], argv[2] + strlen(argv[2]), subtest);
	std::from_chars(argv[3], argv[3] + strlen(argv[3]), extra);

	// Virtual, Page Commit
	if (test == VirtualPageCommit) {
	auto start = Clock::now();

	VirtualVec<NumType, CommitStrategy::Page> nums;
	size_t numBytes = testBytes[subtest];
	size_t numElements = numBytes / sizeof(NumType);
	nums.init(numElements);
	if (extra == 1) {
	for (int i = 0; i < numElements; ++i) {
	nums.push_back(i);
	}
	}

	auto end = Clock::now();
	std::cout << " Bytes: " << testBytesStr[subtest] << " Time: " << prettyTime(end - start) << "\n";
	}

	// Virtual, 1.5x Commit
	if (test == VirtualGrowCommit) {
	auto start = Clock::now();

	VirtualVec<NumType, CommitStrategy::Multiplier> nums;
	size_t numBytes = testBytes[subtest];
	size_t numElements = numBytes / sizeof(NumType);
	nums.init(numElements);
	if (extra == 1) {
	for (int i = 0; i < numElements; ++i) {
	nums.push_back(i);
	}
	}

	auto end = Clock::now();
	std::cout << " Bytes: " << testBytesStr[subtest] << " Time: " << prettyTime(end - start) << "\n";
	}

	// Virtual, Full Commit
	if (test == VirtualAllCommit) {
	auto start = Clock::now();

	VirtualVec<NumType, CommitStrategy::All> nums;
	size_t numBytes = testBytes[subtest];
	size_t numElements = numBytes / sizeof(NumType);
	nums.init(numElements);
	if (extra == 1) {
	for (int i = 0; i < numElements; ++i) {
	nums.push_back(i);
	}
	}

	auto end = Clock::now();
	std::cout << " Bytes: " << testBytesStr[subtest] << " Time: " << prettyTime(end - start) << "\n";
	}

	// std::vector, no reserve
	// std::vector, reserve

	// Virtual Alloc cost
	if (test == AllocCost) {
	const size_t num_bytes = extra;
	const size_t num_elements = num_bytes / sizeof(NumType);

	size_t numAlloc = numAllocs[subtest];

	#if _WIN32
	std::vector<VirtualVec<NumType, CommitStrategy::Page>> vecs;
	#else
	std::vector<VirtualVec<NumType, CommitStrategy::All>> vecs;
	#endif
	vecs.resize(numAlloc);
	auto start = Clock::now();
	duration<long long, std::nano> lastAlloc = {};
	for (int j = 0; j < vecs.size(); ++j) {
	if (j == vecs.size() - 1) {
	auto s = Clock::now();
	vecs[j].init(num_elements);
	auto e = Clock::now();
	lastAlloc = e - s;
	}
	else {
	vecs[j].init(num_elements);
	}
	}
	auto end = Clock::now();
	auto elapsedTotal = duration_cast<nanoseconds>(end - start);
	auto elapsedPer = elapsedTotal / numAlloc;
	size_t totalBytes = num_bytes * numAlloc;
	std::cout << " N: " << numAllocs[subtest]
	<< " TotalTime: " << prettyTime(elapsedTotal)
	<< " TotalReserved: " << prettyBytes(totalBytes)
	<< " PerVirtualAlloc: " << elapsedPer << " / " << prettyTime(elapsedPer)
	<< " PerVirtualMib: " << prettyTime(elapsedTotal / (totalBytes / ONE_MB))
	<< "\n";
	}
	}

	// https://devblogs.microsoft.com/oldnewthing/20160318-00/?p=93181
	// https://alwaysprocessing.blog/2022/02/20/size-matters
	// over 100,000 virtual allocs and it sometimes hangs

	// Linux: clang++ main.cpp -std=c++20 -stdlib=libc++ -lc++ -lc++abi -o x64/va