Created
July 3, 2019 15:17
-
-
Save daramkun/456d81800ef076a23caed52f3dd7f808 to your computer and use it in GitHub Desktop.
Memory Copy Performance Measure (memcpy, ID3D11DeviceContext::CopyResource)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <Windows.h>
#include <d3d11.h>
#include <atlbase.h>
#pragma comment (lib, "d3d11.lib")

#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <execution>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <thread>
#include <vector>
// How long each benchmark is left running before it is stopped, in seconds.
constexpr double MEASURE_SECONDS = 10.0;
// Multiply a byte count by this to express it in gigabytes (1 GB = 1024^3 bytes here).
constexpr double GIGABYTE_MAKER = 1.0 / (1024.0 * 1024.0 * 1024.0);
/// Base class for a copy-throughput benchmark driven by a background thread.
///
/// A derived class implements do_measure(), which performs one batch of copies
/// and returns the number of bytes it moved. run() starts a worker thread that
/// calls do_measure() in a loop and accumulates the returned byte counts until
/// stop() is called.
///
/// The worker calls into the derived object, so stop() (which waits for the
/// worker to finish) must have returned before the derived part of the object
/// is destroyed. The base destructor also stops the worker, but by then the
/// derived members are already gone; treat it as a last-resort safety net.
class performance
{
public:
	performance () = default;
	// The worker thread captures `this`; copying or moving would leave it dangling.
	performance (const performance&) = delete;
	performance& operator= (const performance&) = delete;
	virtual ~performance () noexcept
	{
		// Never join a thread that was not started, and never wait forever on a
		// worker nobody told to stop.
		stop ();
	}
public:
	/// Bytes accumulated from do_measure() since the most recent run().
	uint64_t copy_bytes () const noexcept { return _copy_bytes; }
	/// Seconds elapsed since the most recent run(). The clock is not frozen by
	/// stop(); before the first run() the value is measured from the clock's epoch.
	std::chrono::duration<double> proceed_time () const noexcept
	{
		return std::chrono::steady_clock::now () - _started;
	}
	/// True between run() and stop().
	bool is_running () const noexcept { return _running; }
protected:
	/// Perform one batch of work and return the number of bytes copied.
	/// Called repeatedly on the worker thread.
	virtual size_t do_measure () noexcept = 0;
public:
	/// Reset the byte counter and start the worker thread.
	void run () noexcept
	{
		// Assigning over a joinable std::thread terminates the program, so a
		// repeated run() first winds down the previous worker.
		stop ();
		_copy_bytes = 0;
		_running = true;
		// _started is a steady_clock time_point, so sample that same clock.
		_started = std::chrono::steady_clock::now ();
		_run = std::thread ([this]()
		{
			do
			{
				_copy_bytes += this->do_measure ();
				std::this_thread::yield ();
			}
			while (_running);
		});
	}
	/// Ask the worker to finish and wait until it has. Safe to call repeatedly
	/// and before run(). After it returns, copy_bytes() no longer changes.
	void stop ()
	{
		_running = false;
		// A thread cannot join itself; skip the wait if stop() is ever reached
		// from inside do_measure().
		if (_run.joinable () && _run.get_id () != std::this_thread::get_id ())
			_run.join ();
	}
private:
	// Written by the worker, read by the controlling thread: must be atomic.
	std::atomic<uint64_t> _copy_bytes { 0 };
	std::chrono::steady_clock::time_point _started;
	std::thread _run;
	// Written by the controlling thread, polled by the worker: must be atomic.
	std::atomic<bool> _running { false };
};
class memcpy_performance : public performance | |
{ | |
private: | |
const size_t BUFFER_SIZE = 1024 * 1024 * 16; //< 16MB | |
public: | |
memcpy_performance () | |
{ | |
_dest.resize (BUFFER_SIZE); | |
_src.resize (BUFFER_SIZE); | |
for (unsigned int i = 0; i < std::thread::hardware_concurrency (); ++i) | |
_temp.push_back (i); | |
} | |
protected: | |
virtual size_t do_measure () noexcept override | |
{ | |
std::for_each (std::execution::par_unseq, _temp.begin (), _temp.end (), [this](unsigned int i) | |
{ | |
memcpy (_dest.data (), _src.data (), BUFFER_SIZE); | |
}); | |
return BUFFER_SIZE * _temp.size (); | |
} | |
private: | |
std::vector<uint8_t> _dest, _src; | |
std::vector<unsigned int> _temp; | |
}; | |
class D3D11CopyResourceRAM2VRAM_performance : public performance | |
{ | |
public: | |
D3D11CopyResourceRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM) | |
{ | |
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext); | |
assert (SUCCEEDED (hr)); | |
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC)); | |
_texDesc.Width = _texDesc.Height = size; | |
_texDesc.ArraySize = 1; | |
_texDesc.MipLevels = 1; | |
_texDesc.Format = format; | |
_texDesc.SampleDesc.Count = 1; | |
_texDesc.Usage = D3D11_USAGE_DEFAULT; | |
_texDesc.CPUAccessFlags = 0; | |
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE; | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest); | |
assert (SUCCEEDED (hr)); | |
_texDesc.Usage = D3D11_USAGE_STAGING; | |
_texDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; | |
_texDesc.BindFlags = 0; | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src); | |
assert (SUCCEEDED (hr)); | |
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16); | |
} | |
protected: | |
virtual size_t do_measure () noexcept override | |
{ | |
_immediateContext->CopyResource (_dest, _src); | |
_immediateContext->Flush (); | |
return _totalSize; | |
} | |
private: | |
D3D11_TEXTURE2D_DESC _texDesc; | |
CComPtr<ID3D11Device> _d3dDevice; | |
CComPtr<ID3D11DeviceContext> _immediateContext; | |
CComPtr<ID3D11Texture2D> _dest, _src; | |
size_t _totalSize; | |
}; | |
class D3D11CopyResourceVRAM2VRAM_performance : public performance | |
{ | |
public: | |
D3D11CopyResourceVRAM2VRAM_performance (size_t size = 4096, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM) | |
{ | |
HRESULT hr = D3D11CreateDevice (nullptr, D3D_DRIVER_TYPE_HARDWARE, nullptr, 0, nullptr, 0, D3D11_SDK_VERSION, &_d3dDevice, nullptr, &_immediateContext); | |
assert (SUCCEEDED (hr)); | |
memset (&_texDesc, 0, sizeof (D3D11_TEXTURE2D_DESC)); | |
_texDesc.Width = _texDesc.Height = size; | |
_texDesc.ArraySize = 1; | |
_texDesc.MipLevels = 1; | |
_texDesc.Format = format; | |
_texDesc.SampleDesc.Count = 1; | |
_texDesc.Usage = D3D11_USAGE_DEFAULT; | |
_texDesc.CPUAccessFlags = 0; | |
_texDesc.BindFlags = D3D11_BIND_SHADER_RESOURCE; | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_dest); | |
assert (SUCCEEDED (hr)); | |
hr = _d3dDevice->CreateTexture2D (&_texDesc, nullptr, &_src); | |
assert (SUCCEEDED (hr)); | |
_totalSize = size * size * (format == DXGI_FORMAT_R8G8B8A8_UNORM ? 4 : 16); | |
} | |
protected: | |
virtual size_t do_measure () noexcept override | |
{ | |
_immediateContext->CopyResource (_dest, _src); | |
_immediateContext->Flush (); | |
return _totalSize; | |
} | |
private: | |
D3D11_TEXTURE2D_DESC _texDesc; | |
CComPtr<ID3D11Device> _d3dDevice; | |
CComPtr<ID3D11DeviceContext> _immediateContext; | |
CComPtr<ID3D11Texture2D> _dest, _src; | |
size_t _totalSize; | |
}; | |
void measure (const char * testname, performance* perf) | |
{ | |
printf ("==== %s Performance Measure ====\n", testname); | |
std::shared_ptr<performance> _measure (perf); | |
_measure->run (); | |
while (_measure->is_running ()) | |
{ | |
if (_measure->proceed_time ().count () >= MEASURE_SECONDS) | |
_measure->stop (); | |
printf ("\r%3.3lfs... %lfGB/s... Total Copied: %lfGB", | |
_measure->proceed_time ().count (), | |
(_measure->copy_bytes () / _measure->proceed_time ().count ()) * GIGABYTE_MAKER, | |
_measure->copy_bytes () * GIGABYTE_MAKER); | |
std::this_thread::yield (); | |
} | |
printf ("\r%3.3lfs... %lfGB/s... Total Copied: %lfGB", | |
_measure->proceed_time ().count (), | |
(_measure->copy_bytes () / _measure->proceed_time ().count ()) * GIGABYTE_MAKER, | |
_measure->copy_bytes ()* GIGABYTE_MAKER); | |
putchar ('\n'); | |
} | |
int main (int argc, char* argv[]) | |
{ | |
measure (u8"CPU memcpy", new memcpy_performance ()); | |
measure (u8"Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM)); | |
measure (u8"Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT)); | |
measure (u8"Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R8G8B8A8_UNORM)); | |
measure (u8"Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF)", new D3D11CopyResourceVRAM2VRAM_performance (4096, DXGI_FORMAT_R32G32B32A32_FLOAT)); | |
return 0; | |
} |
CPU: AMD Ryzen 7 3700X (not overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (not overclocked; XMP RAM timings)
M/B: MSI B550M Mortar WiFi
GPU: AMD Radeon RX 5700XT Reference Model by Sapphire
Result:
==== CPU memcpy Performance Measure ====
10.005s... 14.043436GB/s... Total Copied: 140.500000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 22.462423GB/s... Total Copied: 224.625000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 23.749940GB/s... Total Copied: 237.500000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 183.493069GB/s... Total Copied: 1834.937500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 187.524475GB/s... Total Copied: 1875.250000GB
Background Information
PCI-Express 4.0 x16 Maximum Bandwidth: 31.5 GB/s.
CPU: AMD Ryzen 7 3700X (not overclocked)
RAM: ESSENCORE DDR4 2400MHz 16GBx2 (not overclocked; XMP RAM timings)
M/B: MSI B550M Mortar WiFi
GPU: GALAX NVIDIA GeForce RTX 3070 EX OC
Result:
==== CPU memcpy Performance Measure ====
10.006s... 14.916766GB/s... Total Copied: 149.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 22.962413GB/s... Total Copied: 229.625000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 25.024848GB/s... Total Copied: 250.250000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 328.060400GB/s... Total Copied: 3280.625000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 174.724219GB/s... Total Copied: 1747.250000GB
CPU: AMD Ryzen 3 2200U
RAM: Crucial DDR4 4GBx2
M/B: Lenovo ideapad 330S-15ARR (81FB)
GPU: AMD Radeon Vega 3 Mobile Graphics
Result:
==== CPU memcpy Performance Measure ====
10.007s... 5.233843GB/s... Total Copied: 52.375000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 6.624958GB/s... Total Copied: 66.250000GB
==== Direct3D Texture2D Copy Resource RAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 7.474972GB/s... Total Copied: 74.750000GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * R8G8B8A8) Performance Measure ====
10.000s... 10.406215GB/s... Total Copied: 104.062500GB
==== Direct3D Texture2D Copy Resource VRAM to VRAM(4096 * 4096 * RGBAF) Performance Measure ====
10.000s... 8.499961GB/s... Total Copied: 85.000000GB
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
CPU: AMD Ryzen 5 1600X (not overclocked)
RAM: DDR4 2133MHz 8GBx2 (not overclocked)
M/B: ASRock B350M Pro4
GPU: Zotac NVIDIA GeForce GTX 970
Result: