Skip to content

Instantly share code, notes, and snippets.

@rygorous
Created September 25, 2011 02:27
Show Gist options
  • Save rygorous/1240137 to your computer and use it in GitHub Desktop.
Save rygorous/1240137 to your computer and use it in GitHub Desktop.
Does your GPU have pixel alignment requirements for quads?
#include <windows.h>
#include <D3D10.h>
#include <D3DX10.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <algorithm>
// "Does your GPU have pixel alignment requirements for quads?" tester
// Win32 only, D3D10+ only because it's a quick hack.
//
// The test works like this:
// We fill the screen with triangles laid out in a grid, with a spacing of 2 pixels
// in each direction. Each triangle has the vertex coordinates (in pixels):
// (x0, y0)
// (x0, y0 + 2.5)
// (x0 + 2.5, y0)
// which means each triangle covers exactly 3 pixels. They end up looking like this:
// *
// * *
// where the lower-left star is at the base vertex position (x0, y0). This is all set
// up in the vertex shader. We then try displacing the triangles by one pixel each in
// the x and y directions (as well as both at once). If the GPU can produce rasterized
// quads with arbitrary alignment, this shouldn't make any significant difference. If,
// however, the GPU tiles the render target with "quad footprints" and will rasterize
// any quads touched by a triangle, shifting our test triangles by 1 pixel in either x
// or y will cause 2 quads (instead of 1) to be rasterized per triangle; the case where
// we shift by 1 pixel in both x and y is even worse, since the triangle now covers 3
// quads (each of which has exactly one pixel lit).
//
// The PS just does random busy work (counting primes under a specified threshold) in
// a deliberately inefficient way to make pixels take long to shade so we get long
// enough frame times. The main point here is to just keep the ALUs busy and make sure
// that other potential limiting factors (such as memory bandwidth) are out of the
// equation. WorkFactor (set in the code below) controls how much work is done per pixel.
// You have to set this up properly so the driver doesn't think the GPU is hung working
// on the DrawPrimitive and does a reset. :) Choosing WorkFactor so the first test takes
// about 130ms worked fine for me.
//
// So, for unconstrained quad alignment we expect equal performance in all 4 cases, whereas
// for even-x/y alignment we expect one of the cases (probably the non-shifted one) to be
// fastest, the two cases that hit 2 quads each to be roughly twice as slow, and the last
// case (which hits 3 quads per triangle) to take roughly 3x as long as the fast case.
//
// Test results:
//
// AMD Radeon HD 5770 (WorkFactor = 100) - Evergreen series
// xOffset=0 yOffset=0: 191.7 ms
// xOffset=1 yOffset=0: 383.9 ms
// xOffset=0 yOffset=1: 384.3 ms
// xOffset=1 yOffset=1: 588.5 ms
// -> nearly perfectly linear - even alignment required.
//
// AMD Radeon HD 6900 (WorkFactor = 100) - Northern islands series
// xOffset=0 yOffset=0: 137.7 ms
// xOffset=1 yOffset=0: 206.9 ms
// xOffset=0 yOffset=1: 208.0 ms
// xOffset=1 yOffset=1: 301.8 ms
// -> likely to have even-alignment requirement.
//
// NVidia GeForce 8800 GTX (WorkFactor = 50) - G80 architecture
// xOffset=0 yOffset=0: 127.6 ms
// xOffset=1 yOffset=0: 157.0 ms
// xOffset=0 yOffset=1: 158.5 ms
// xOffset=1 yOffset=1: 252.8 ms
// -> we definitely get some slowdown here, but not as much as expected. The ratios are
// fairly consistent between different work factors, so this isn't just some constant overhead
// that's distorting the relative frame times. I don't have detailed knowledge of how the
// GF 8x00 series handles rasterization and quad dispatch internally, so I can't say for sure
// what's going on here.
//
// NVidia GeForce GTX 465 (WorkFactor = 100) - Fermi architecture
// xOffset=0 yOffset=0: 71.8ms
// xOffset=1 yOffset=0: 139.3ms
// xOffset=0 yOffset=1: 138.3ms
// xOffset=1 yOffset=1: 207.9ms
// -> This one's crystal clear: alignment required. And whatever other bottlenecks besides shaders
// seemed to exist for this test in previous NV architectures seem to have been eliminated.
// Shader code
//
// HLSL effect source, compiled at startup with the "fx_4_0" profile.
//
// Constants (cbAll):
//   pixelToNDC - xy is the scale and zw the offset applied to the pixel-space
//                vertex position to produce the output position.
//   width      - number of triangles per grid row.
//   upperBound - exclusive upper limit of the prime search in the pixel shader.
//
// VS: needs no vertex buffer; everything is derived from SV_VertexID.
//     Triangle index is i / 3 and the corner index is i % 3. The base corner
//     sits at ((iTri % width) * 2, (iTri / width) * 2); corner 2 is moved
//     +2.5 in x and corner 1 is moved +2.5 in y.
//
// PS: counts the primes below upperBound by brute-force trial division
//     (deliberately slow busy work) and returns opaque white if any were
//     found, otherwise all zeros.
//
// Technique "Render", pass P0: VS as vs_4_0, no geometry shader, PS as ps_4_0.
static const char shaderCode[] =
"cbuffer cbAll : register(cb0) {\n"
" float4 pixelToNDC;\n"
" uint width, upperBound;\n"
"};\n"
"struct PSIn {\n"
" float4 Pos : SV_POSITION;\n"
"};\n"
"PSIn VS(uint i : SV_VertexID)\n"
"{\n"
" PSIn o;\n"
" uint iTri = i / 3;\n"
" uint iVertInTri = i % 3;\n"
" float2 v;\n"
" v.x = (float) (iTri % width)*2;\n"
" v.y = (float) (iTri / width)*2;\n"
" v.x += (iVertInTri == 2) ? 2.5f : 0.0f;\n"
" v.y += (iVertInTri == 1) ? 2.5f : 0.0f;\n"
" o.Pos.xy = v * pixelToNDC.xy + pixelToNDC.zw;\n"
" o.Pos.z = 0.5f;\n"
" o.Pos.w = 1.0f;\n"
" return o;\n"
"}\n"
"float4 PS(PSIn x) : SV_Target\n"
"{\n"
" uint nPrimes = 0;\n"
" for (uint i=2; i < upperBound; i++)\n"
" {\n"
" uint nFactors = 0;\n"
" for (uint j=2; j <= i; j++)\n"
" if (i % j == 0) nFactors++;\n"
" nPrimes += (nFactors == 1) ? 1 : 0;\n"
" }\n"
" return (nPrimes != 0) ? float4(1,1,1,1) : float4(0,0,0,0);\n"
"}\n"
"technique10 Render {\n"
" pass P0 {\n"
" SetVertexShader(CompileShader(vs_4_0, VS()));\n"
" SetGeometryShader(NULL);\n"
" SetPixelShader(CompileShader(ps_4_0, PS()));\n"
" }\n"
"}\n";
// ---- App code.
// Link against the Direct3D 10 and D3DX10 import libraries (MSVC-specific pragma).
#pragma comment(lib, "d3d10.lib")
#pragma comment(lib, "d3dx10.lib")
// Size in pixels of the window client area, the swap chain back buffer and the viewport.
static const int WIDTH = 512, HEIGHT = 512;
// Main window handle; set by createWindow() and used as the owner of message boxes.
static HWND hWnd;
// Direct3D objects created in initD3D() and released in deinitD3D().
static IDXGISwapChain *swapChain;
static ID3D10Device *device;
// View onto swap chain buffer 0; cleared and bound as the render target every frame.
static ID3D10RenderTargetView *renderTargetView;
// Effect compiled from shaderCode.
static ID3D10Effect *effect;
static void errorExit(const char *fmt, ...)
{
char buffer[2048];
va_list arg;
va_start(arg, fmt);
vsprintf_s(buffer, fmt, arg);
va_end(arg);
MessageBoxA(hWnd, buffer, "Error", MB_ICONERROR | MB_OK);
exit(1);
}
// Aborts via errorExit() with the HRESULT printed in hex if hr signals
// failure; returns normally for any success code.
static void check(HRESULT hr)
{
    if (FAILED(hr))
        errorExit("D3D error code %08x\n", hr);
}
// Window procedure for the test window.
//   WM_CHAR    - Escape destroys the window; every character message is
//                consumed (returns 0) whether or not it was Escape.
//   WM_DESTROY - posts WM_QUIT with exit code 0, then continues to default handling.
// Everything else is passed to DefWindowProc.
static LRESULT CALLBACK windowProc(HWND hWnd, UINT msg, WPARAM wparam, LPARAM lparam)
{
    const WPARAM escapeChar = 27;

    if (msg == WM_CHAR)
    {
        if (wparam == escapeChar)
            DestroyWindow(hWnd);
        return 0;
    }

    if (msg == WM_DESTROY)
        PostQuitMessage(0);

    return DefWindowProc(hWnd, msg, wparam, lparam);
}
// Registers the "quadtest" window class and creates the visible main window,
// sized so that its client area is WIDTH x HEIGHT pixels. The handle is stored
// in the global hWnd. Any failure is fatal (errorExit).
static void createWindow(HINSTANCE hInst)
{
    WNDCLASS wndClass;
    wndClass.style = 0;
    wndClass.lpfnWndProc = windowProc;
    wndClass.cbClsExtra = 0;
    wndClass.cbWndExtra = 0;
    wndClass.hInstance = hInst;
    wndClass.hIcon = 0;
    wndClass.hCursor = LoadCursor(0, IDC_ARROW);
    wndClass.hbrBackground = (HBRUSH) GetStockObject(WHITE_BRUSH);
    wndClass.lpszMenuName = NULL;
    wndClass.lpszClassName = TEXT("quadtest");

    if (!RegisterClass(&wndClass))
        errorExit("Couldn't register class.");

    // Grow the desired client rectangle to the outer window rectangle that
    // the overlapped-window frame and caption require.
    RECT bounds = { 0, 0, WIDTH, HEIGHT };
    AdjustWindowRect(&bounds, WS_OVERLAPPEDWINDOW, FALSE);
    const int outerWidth = bounds.right - bounds.left;
    const int outerHeight = bounds.bottom - bounds.top;

    hWnd = CreateWindow(TEXT("quadtest"), TEXT("quadtest"),
                        WS_OVERLAPPEDWINDOW | WS_VISIBLE,
                        CW_USEDEFAULT, CW_USEDEFAULT,
                        outerWidth, outerHeight,
                        NULL, NULL, hInst, NULL);
    if (!hWnd)
        errorExit("Error creating window.");
}
static void initD3D()
{
DXGI_SWAP_CHAIN_DESC sd = {
{
WIDTH, HEIGHT, { 60, 1 }, DXGI_FORMAT_R8G8B8A8_UNORM,
DXGI_MODE_SCANLINE_ORDER_UNSPECIFIED, DXGI_MODE_SCALING_UNSPECIFIED
},
{ 1, 0 },
DXGI_USAGE_RENDER_TARGET_OUTPUT,
1,
hWnd,
TRUE,
DXGI_SWAP_EFFECT_DISCARD,
0
};
check(D3D10CreateDeviceAndSwapChain(NULL, D3D10_DRIVER_TYPE_HARDWARE, NULL, 0, D3D10_SDK_VERSION,
&sd, &swapChain, &device));
// Create a render target view
ID3D10Texture2D *buffer;
check(swapChain->GetBuffer(0, __uuidof(ID3D10Texture2D), (void **)&buffer));
check(device->CreateRenderTargetView(buffer, NULL, &renderTargetView));
buffer->Release();
// Compile the shaders
ID3D10Blob *errors;
HRESULT hr = D3DX10CreateEffectFromMemory(shaderCode, strlen(shaderCode), "shader.fx", NULL,
NULL, "fx_4_0", 0, 0, device, NULL, NULL, &effect, &errors, NULL);
if (FAILED(hr))
errorExit("Effect compilation error: %s", errors->GetBufferPointer());
if (errors)
errors->Release();
// Initialize the viewport
D3D10_VIEWPORT vp = { 0, 0, WIDTH, HEIGHT, 0.0f, 1.0f };
device->RSSetViewports(1, &vp);
}
static void deinitD3D()
{
effect->Release();
renderTargetView->Release();
device->Release();
swapChain->Release();
}
static double frame(char *desc, int test, int nFrameInTest)
{
static const float clearColor[4] = { 0, 0, 0, 0 };
device->ClearRenderTargetView(renderTargetView, clearColor);
device->OMSetRenderTargets(1, &renderTargetView, NULL);
// Prepare for rendering
int WidthQuads = WIDTH/2 - 1;
int HeightQuads = HEIGHT/2 - 1;
int WorkFactor = 100;
int xOffset = (test & 1), yOffset = (test >> 1) & 1;
// During startup, don't do any significant work.
if (test == -1)
{
strcpy_s(desc, 256, "Warmup");
WidthQuads = HeightQuads = 1;
WorkFactor = 3;
xOffset = yOffset = 0;
}
else
sprintf_s(desc, 256, "xOffset=%d yOffset=%d", xOffset, yOffset);
float pixelToNDC[4];
pixelToNDC[0] = 2.0f / WIDTH;
pixelToNDC[1] = 2.0f / HEIGHT;
pixelToNDC[2] = -1.0f + xOffset * pixelToNDC[0];
pixelToNDC[3] = -1.0f + yOffset * pixelToNDC[1];
effect->GetVariableByName("pixelToNDC")->AsVector()->SetFloatVector(pixelToNDC);
effect->GetVariableByName("width")->AsScalar()->SetInt(WidthQuads);
effect->GetVariableByName("upperBound")->AsScalar()->SetInt(WorkFactor);
// Actually render
device->IASetPrimitiveTopology(D3D10_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
effect->GetTechniqueByIndex(0)->GetPassByIndex(0)->Apply(0);
device->Draw(WidthQuads * HeightQuads * 3, 0);
// Present
swapChain->Present(0, 0);
// Stats
static LARGE_INTEGER lastFrame, freq;
LARGE_INTEGER now;
double msTime = 0.0;
QueryPerformanceCounter(&now);
if (nFrameInTest == 0)
QueryPerformanceFrequency(&freq);
else
msTime = 1000.0 * (now.QuadPart - lastFrame.QuadPart) / freq.QuadPart;
lastFrame = now;
return msTime;
}
// Program entry point.
//
// Creates the window and D3D objects, renders startTestFrame warm-up frames,
// then runs numTests tests of totalFramesPerTest frames each. For every test
// the first warmupFramesPerTest frames are ignored and the median of the
// remaining frame times is recorded. When all tests are done the results are
// shown in a message box and the window is destroyed.
//
// Returns the exit code carried by WM_QUIT (its wParam).
int CALLBACK WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
{
    createWindow(hInstance);
    initD3D();

    static const int startTestFrame = 50;
    static const int warmupFramesPerTest = 3;
    static const int measureFramesPerTest = 7;
    static const int numTests = 4;
    static const int totalFramesPerTest = warmupFramesPerTest + measureFramesPerTest;

    double tempTimes[measureFramesPerTest];
    double testResult[numTests];
    char testDesc[numTests][256];
    char textBuffer[1024];
    int frameCounter = 0;
    int exitCode = 0;

    for (;;)
    {
        // Drain pending window messages before rendering the next frame.
        MSG msg;
        while (PeekMessage(&msg, 0, 0, 0, PM_REMOVE))
        {
            if (msg.message == WM_QUIT)
            {
                exitCode = (int) msg.wParam;
                goto Done;
            }
            TranslateMessage(&msg);
            DispatchMessage(&msg);
        }

        if (frameCounter < startTestFrame)
        {
            // Startup phase: trivial frames; textBuffer just absorbs the description.
            frame(textBuffer, -1, frameCounter);
            Sleep(10);
        }
        else
        {
            int testFrame = frameCounter - startTestFrame;
            int iTest = testFrame / totalFramesPerTest;
            int frameInTest = testFrame % totalFramesPerTest;

            if (iTest >= numTests)
            {
                // Format results
                char *p = textBuffer;
                char *pEnd = textBuffer + sizeof(textBuffer) / sizeof(*textBuffer);
                for (int i=0; i < numTests; i++)
                    p += sprintf_s(p, pEnd - p, "%s: %.1f ms\n", testDesc[i], testResult[i]);
                p += sprintf_s(p, pEnd-p, "\nPress Ctrl+C to copy to clipboard!");

                // Present them in a message box
                MessageBoxA(hWnd, textBuffer, "Test results", MB_ICONINFORMATION | MB_OK);

                // Destroying the window posts WM_QUIT (see windowProc), which
                // ends the loop on the next iteration.
                DestroyWindow(hWnd);
            }
            else
            {
                double time = frame(testDesc[iTest], iTest, frameInTest);
                if (frameInTest >= warmupFramesPerTest)
                    tempTimes[frameInTest - warmupFramesPerTest] = time;

                if (frameInTest == totalFramesPerTest - 1)
                {
                    // Find and record the median
                    std::nth_element(tempTimes, tempTimes + (measureFramesPerTest/2), tempTimes + measureFramesPerTest);
                    testResult[iTest] = tempTimes[measureFramesPerTest/2];
                }
            }
        }

        frameCounter++;
    }

Done:
    deinitD3D();
    // A non-void function must not flow off its end; report the WM_QUIT exit code.
    return exitCode;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment