Created
September 25, 2011 02:27
-
-
Save rygorous/1240137 to your computer and use it in GitHub Desktop.
Does your GPU have pixel alignment requirements for quads?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <windows.h>
#include <D3D10.h>
#include <D3DX10.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <algorithm>
// "Does your GPU have pixel alignment requirements for quads?" tester | |
// Win32 only, D3D10+ only because it's a quick hack. | |
// | |
// The test works like this: | |
// We fill the screen with triangles laid out in a grid, with a spacing of 2 pixels | |
// in each direction. Each triangle has the vertex coordinates (in pixels): | |
// (x0, y0) | |
// (x0, y0 + 2.5) | |
// (x0 + 2.5, y0) | |
// which means each triangle covers exactly 3 pixels. They end up looking like this: | |
// * | |
// * * | |
// where the lower-left star is at the base vertex position (x0, y0). This is all set | |
// up in the vertex shader. We then try displacing the triangles by one pixel each in | |
// the x and y directions (as well as both at once). If the GPU can produce rasterized | |
// quads with arbitrary alignment, this shouldn't make any significant difference. If, | |
// however, the GPU tiles the render target with "quad footprints" and will rasterize | |
// any quads touched by a triangle, shifting our test triangles by 1 pixel in either x | |
// or y will cause 2 quads (instead of 1) to be rasterized per triangle; the case where | |
// we shift by 1 pixel in both x and y is even worse, since the triangle now covers 3
// quads (each of which has exactly one pixel lit).
// | |
// The PS just does random busy work (counting primes under a specified threshold) in | |
// a deliberately inefficient way to make pixels take long to shade so we get long | |
// enough frame times. The main point here is to just keep the ALUs busy and make sure | |
// that other potential limiting factors (such as memory bandwidth) are out of the | |
// equation. WorkFactor (set in the code below) controls how much work is done per pixel. | |
// You have to set this up properly so the driver doesn't think the GPU is hung working | |
// on the DrawPrimitive and does a reset. :) Choosing WorkFactor so the first test takes | |
// about 130ms worked fine for me. | |
// | |
// So, for unconstrained quad alignment we expect equal performance in all 4 cases, whereas | |
// for even-x/y alignment we expect one of the cases (probably the non-shifted one) to be | |
// fastest, the two cases that hit 2 quads each to be roughly twice as slow, and the last | |
// case (which hits 3 quads per triangle) to take roughly 3x as long as the fast case. | |
// | |
// Test results: | |
// | |
// AMD Radeon HD 5770 (WorkFactor = 100) - Evergreen series | |
// xOffset=0 yOffset=0: 191.7 ms | |
// xOffset=1 yOffset=0: 383.9 ms | |
// xOffset=0 yOffset=1: 384.3 ms | |
// xOffset=1 yOffset=1: 588.5 ms | |
// -> nearly perfectly linear - even alignment required. | |
// | |
// AMD Radeon HD 6900 (WorkFactor = 100) - Northern Islands series
// xOffset=0 yOffset=0: 137.7 ms | |
// xOffset=1 yOffset=0: 206.9 ms | |
// xOffset=0 yOffset=1: 208.0 ms | |
// xOffset=1 yOffset=1: 301.8 ms | |
// -> likely to have even-alignment requirement. | |
// | |
// NVidia GeForce 8800 GTX (WorkFactor = 50) - G80 architecture | |
// xOffset=0 yOffset=0: 127.6 ms | |
// xOffset=1 yOffset=0: 157.0 ms | |
// xOffset=0 yOffset=1: 158.5 ms | |
// xOffset=1 yOffset=1: 252.8 ms | |
// -> we definitely get some slowdown here, but not as much as expected. The ratios are | |
// fairly consistent between different work factors, so this isn't just some constant overhead | |
// that's distorting the relative frame times. I don't have detailed knowledge of how the | |
// GF 8x00 series handles rasterization and quad dispatch internally, so I can't say for sure | |
// what's going on here. | |
// | |
// NVidia GeForce GTX 465 (WorkFactor = 100) - Fermi architecture | |
// xOffset=0 yOffset=0: 71.8ms | |
// xOffset=1 yOffset=0: 139.3ms | |
// xOffset=0 yOffset=1: 138.3ms | |
// xOffset=1 yOffset=1: 207.9ms | |
// -> This one's crystal clear: alignment required. And whatever other bottlenecks besides shaders | |
// seemed to exist for this test in previous NV architectures seem to have been eliminated. | |
// Shader code
//
// HLSL effect source (fx_4_0), kept as one string so the test has no external
// files. It declares:
//   - cbAll:  pixelToNDC (xy = scale, zw = offset applied to pixel coordinates),
//             width (triangles per grid row) and upperBound (work per pixel).
//   - VS:     builds each triangle vertex purely from SV_VertexID; triangle
//             iTri sits at grid cell (iTri % width, iTri / width) with a spacing
//             of 2 pixels, and vertices 1 and 2 are pushed out by 2.5 pixels in
//             y and x respectively.
//   - PS:     counts primes below upperBound by brute-force trial division and
//             returns white if any were found, transparent black otherwise.
//   - Render: the single technique/pass binding VS and PS (no geometry shader).
static const char shaderCode[] =
    "cbuffer cbAll : register(cb0) {\n"
    " float4 pixelToNDC;\n"
    " uint width, upperBound;\n"
    "};\n"
    "struct PSIn {\n"
    " float4 Pos : SV_POSITION;\n"
    "};\n"
    "PSIn VS(uint i : SV_VertexID)\n"
    "{\n"
    " PSIn o;\n"
    " uint iTri = i / 3;\n"
    " uint iVertInTri = i % 3;\n"
    " float2 v;\n"
    " v.x = (float) (iTri % width)*2;\n"
    " v.y = (float) (iTri / width)*2;\n"
    " v.x += (iVertInTri == 2) ? 2.5f : 0.0f;\n"
    " v.y += (iVertInTri == 1) ? 2.5f : 0.0f;\n"
    " o.Pos.xy = v * pixelToNDC.xy + pixelToNDC.zw;\n"
    " o.Pos.z = 0.5f;\n"
    " o.Pos.w = 1.0f;\n"
    " return o;\n"
    "}\n"
    "float4 PS(PSIn x) : SV_Target\n"
    "{\n"
    " uint nPrimes = 0;\n"
    " for (uint i=2; i < upperBound; i++)\n"
    " {\n"
    " uint nFactors = 0;\n"
    " for (uint j=2; j <= i; j++)\n"
    " if (i % j == 0) nFactors++;\n"
    " nPrimes += (nFactors == 1) ? 1 : 0;\n"
    " }\n"
    " return (nPrimes != 0) ? float4(1,1,1,1) : float4(0,0,0,0);\n"
    "}\n"
    "technique10 Render {\n"
    " pass P0 {\n"
    " SetVertexShader(CompileShader(vs_4_0, VS()));\n"
    " SetGeometryShader(NULL);\n"
    " SetPixelShader(CompileShader(ps_4_0, PS()));\n"
    " }\n"
    "}\n";
// ---- App code. | |
#pragma comment(lib, "d3d10.lib") | |
#pragma comment(lib, "d3dx10.lib") | |
static const int WIDTH = 512, HEIGHT = 512; | |
static HWND hWnd; | |
static IDXGISwapChain *swapChain; | |
static ID3D10Device *device; | |
static ID3D10RenderTargetView *renderTargetView; | |
static ID3D10Effect *effect; | |
static void errorExit(const char *fmt, ...) | |
{ | |
char buffer[2048]; | |
va_list arg; | |
va_start(arg, fmt); | |
vsprintf_s(buffer, fmt, arg); | |
va_end(arg); | |
MessageBoxA(hWnd, buffer, "Error", MB_ICONERROR | MB_OK); | |
exit(1); | |
} | |
// Aborts via errorExit() when hr is a failure code; otherwise does nothing.
static void check(HRESULT hr)
{
    if (FAILED(hr))
        errorExit("D3D error code %08x\n", hr);
}
// Window procedure for the test window.
//
// - WM_CHAR:    swallowed (returns 0); an Escape character destroys the window.
// - WM_DESTROY: posts WM_QUIT with exit code 0, then falls through to the
//               default handler.
// Everything else is forwarded to DefWindowProc.
static LRESULT CALLBACK windowProc(HWND hWnd, UINT msg, WPARAM wparam, LPARAM lparam)
{
    const WPARAM escapeChar = 27;

    if (msg == WM_CHAR)
    {
        if (wparam == escapeChar)
            DestroyWindow(hWnd);
        return 0;
    }

    if (msg == WM_DESTROY)
        PostQuitMessage(0);

    return DefWindowProc(hWnd, msg, wparam, lparam);
}
// Registers the "quadtest" window class and creates the visible main window,
// storing its handle in the global hWnd. The outer window size is chosen so
// that the client area is WIDTH x HEIGHT pixels. Any failure is fatal.
static void createWindow(HINSTANCE hInst)
{
    // Fill the class description by name; every field not listed stays zero.
    WNDCLASS wndClass = { 0 };
    wndClass.lpfnWndProc = windowProc;
    wndClass.hInstance = hInst;
    wndClass.hCursor = LoadCursor(0, IDC_ARROW);
    wndClass.hbrBackground = (HBRUSH) GetStockObject(WHITE_BRUSH);
    wndClass.lpszClassName = TEXT("quadtest");

    if (!RegisterClass(&wndClass))
        errorExit("Couldn't register class.");

    // Grow the desired client rectangle by the frame/caption size.
    RECT bounds = { 0, 0, WIDTH, HEIGHT };
    AdjustWindowRect(&bounds, WS_OVERLAPPEDWINDOW, FALSE);
    const int outerWidth = bounds.right - bounds.left;
    const int outerHeight = bounds.bottom - bounds.top;

    hWnd = CreateWindow(TEXT("quadtest"), TEXT("quadtest"), WS_OVERLAPPEDWINDOW | WS_VISIBLE,
                        CW_USEDEFAULT, CW_USEDEFAULT, outerWidth, outerHeight,
                        NULL, NULL, hInst, NULL);
    if (!hWnd)
        errorExit("Error creating window.");
}
static void initD3D() | |
{ | |
DXGI_SWAP_CHAIN_DESC sd = { | |
{ | |
WIDTH, HEIGHT, { 60, 1 }, DXGI_FORMAT_R8G8B8A8_UNORM, | |
DXGI_MODE_SCANLINE_ORDER_UNSPECIFIED, DXGI_MODE_SCALING_UNSPECIFIED | |
}, | |
{ 1, 0 }, | |
DXGI_USAGE_RENDER_TARGET_OUTPUT, | |
1, | |
hWnd, | |
TRUE, | |
DXGI_SWAP_EFFECT_DISCARD, | |
0 | |
}; | |
check(D3D10CreateDeviceAndSwapChain(NULL, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0, D3D10_SDK_VERSION, | |
&sd, &swapChain, &device)); | |
// Create a render target view | |
ID3D10Texture2D *buffer; | |
check(swapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (void **)&buffer)); | |
check(device->CreateRenderTargetView(buffer, NULL, &renderTargetView)); | |
buffer->Release(); | |
// Compile the shaders | |
ID3D10Blob *errors; | |
HRESULT hr = D3DX10CreateEffectFromMemory(shaderCode, strlen(shaderCode), "shader.fx", NULL, | |
NULL, "fx_4_0", 0, 0, device, NULL, NULL, &effect, &errors, NULL); | |
if (FAILED(hr)) | |
errorExit("Effect compilation error: %s", errors->GetBufferPointer()); | |
if (errors) | |
errors->Release(); | |
// Initialize the viewport | |
D3D10_VIEWPORT vp = { 0, 0, WIDTH, HEIGHT, 0.0f, 1.0f }; | |
device->RSSetViewports(1, &vp); | |
} | |
static void deinitD3D() | |
{ | |
effect->Release(); | |
renderTargetView->Release(); | |
device->Release(); | |
swapChain->Release(); | |
} | |
static double frame(char *desc, int test, int nFrameInTest) | |
{ | |
static const float clearColor[4] = { 0, 0, 0, 0 }; | |
device->ClearRenderTargetView(renderTargetView, clearColor); | |
device->OMSetRenderTargets(1, &renderTargetView, NULL); | |
// Prepare for rendering | |
int WidthQuads = WIDTH/2 - 1; | |
int HeightQuads = HEIGHT/2 - 1; | |
int WorkFactor = 100; | |
int xOffset = (test & 1), yOffset = (test >> 1) & 1; | |
// During startup, don't do any significant work. | |
if (test == -1) | |
{ | |
strcpy_s(desc, 256, "Warmup"); | |
WidthQuads = HeightQuads = 1; | |
WorkFactor = 3; | |
xOffset = yOffset = 0; | |
} | |
else | |
sprintf_s(desc, 256, "xOffset=%d yOffset=%d", xOffset, yOffset); | |
float pixelToNDC[4]; | |
pixelToNDC[0] = 2.0f / WIDTH; | |
pixelToNDC[1] = 2.0f / HEIGHT; | |
pixelToNDC[2] = -1.0f + xOffset * pixelToNDC[0]; | |
pixelToNDC[3] = -1.0f + yOffset * pixelToNDC[1]; | |
effect->GetVariableByName("pixelToNDC")->AsVector()->SetFloatVector(pixelToNDC); | |
effect->GetVariableByName("width")->AsScalar()->SetInt(WidthQuads); | |
effect->GetVariableByName("upperBound")->AsScalar()->SetInt(WorkFactor); | |
// Actually render | |
device->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST); | |
effect->GetTechniqueByIndex(0)->GetPassByIndex(0)->Apply(0); | |
device->Draw(WidthQuads * HeightQuads * 3, 0); | |
// Present | |
swapChain->Present(0, 0); | |
// Stats | |
static LARGE_INTEGER lastFrame, freq; | |
LARGE_INTEGER now; | |
double msTime = 0.0; | |
QueryPerformanceCounter(&now); | |
if (nFrameInTest == 0) | |
QueryPerformanceFrequency(&freq); | |
else | |
msTime = 1000.0 * (now.QuadPart - lastFrame.QuadPart) / freq.QuadPart; | |
lastFrame = now; | |
return msTime; | |
} | |
// Program entry point.
//
// Creates the window and D3D state, then runs a combined message/render loop:
//   1. startTestFrame cheap warmup frames (10 ms sleep after each),
//   2. numTests tests of totalFramesPerTest frames each; the first
//      warmupFramesPerTest frames of a test are discarded and the median of
//      the remaining measureFramesPerTest frame times is recorded,
//   3. a message box with the results, after which the window is destroyed.
// The loop ends when WM_QUIT is received (posted when the window is
// destroyed, including via Escape).
//
// Returns the exit code carried by WM_QUIT. hPrevInstance, lpCmdLine and
// nCmdShow are unused.
int CALLBACK WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
{
    createWindow(hInstance);
    initD3D();

    static const int startTestFrame = 50;
    static const int warmupFramesPerTest = 3;
    static const int measureFramesPerTest = 7;
    static const int numTests = 4;
    static const int totalFramesPerTest = warmupFramesPerTest + measureFramesPerTest;

    double tempTimes[measureFramesPerTest];
    double testResult[numTests];
    char testDesc[numTests][256];
    char textBuffer[1024];

    int exitCode = 0;
    bool running = true;

    for (int frameCounter = 0; running; frameCounter++)
    {
        // Drain pending window messages before rendering.
        MSG msg;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE))
        {
            if (msg.message == WM_QUIT)
            {
                exitCode = (int) msg.wParam;
                running = false;
                break;
            }
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }
        if (!running)
            break;

        if (frameCounter < startTestFrame)
        {
            // Startup phase; textBuffer only serves as a scratch description buffer.
            frame(textBuffer, -1, frameCounter);
            Sleep(10);
            continue;
        }

        const int testFrame = frameCounter - startTestFrame;
        const int iTest = testFrame / totalFramesPerTest;
        const int frameInTest = testFrame % totalFramesPerTest;

        if (iTest >= numTests)
        {
            // Format results
            char *p = textBuffer;
            char *pEnd = textBuffer + sizeof(textBuffer) / sizeof(*textBuffer);
            for (int i = 0; i < numTests; i++)
                p += sprintf_s(p, pEnd - p, "%s: %.1f ms\n", testDesc[i], testResult[i]);
            p += sprintf_s(p, pEnd - p, "\nPress Ctrl+C to copy to clipboard!");

            // Present them in a message box. Destroying the window posts
            // WM_QUIT, which ends the loop on the next iteration.
            MessageBoxA(hWnd, textBuffer, "Test results", MB_ICONINFORMATION | MB_OK);
            DestroyWindow(hWnd);
            continue;
        }

        const double time = frame(testDesc[iTest], iTest, frameInTest);
        if (frameInTest >= warmupFramesPerTest)
            tempTimes[frameInTest - warmupFramesPerTest] = time;

        if (frameInTest == totalFramesPerTest - 1)
        {
            // Find and record the median
            std::nth_element(tempTimes, tempTimes + (measureFramesPerTest/2), tempTimes + measureFramesPerTest);
            testResult[iTest] = tempTimes[measureFramesPerTest/2];
        }
    }

    deinitD3D();
    return exitCode;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment