Subv · February 23, 2015 22:38
diff --git a/gistfile1.diff b/gistfile1.diff
 diff --git a/externals/boost b/externals/boost
 --- a/externals/boost
 +++ b/externals/boost
 @@ -1 +1 @@
 -Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5
 +Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5-dirty
 diff --git a/externals/nihstro b/externals/nihstro
 --- a/externals/nihstro
 +++ b/externals/nihstro
 @@ -1 +1 @@
 -Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211
 +Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211-dirty
 diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
 index 8800235..08fb772 100644
 --- a/src/core/hw/gpu.cpp
 +++ b/src/core/hw/gpu.cpp
 @@ -122,6 +122,18 @@ inline void Write(u32 addr, const T data) {
 
             u32 output_width = config.output_width / pixel_skip;
 
 +            if (config.raw_copy) {
 +                LOG_ERROR(HW_GPU, "Raw display transfers are not yet implemented.");
 +                UNIMPLEMENTED();
 +                return;
 +            }
 +
 +            if (config.output_tiled0 || config.output_tiled1) {
 +                LOG_ERROR(HW_GPU, "Display transfers with tiled output are not yet implemented");
 +                UNIMPLEMENTED();
 +                return;
 +            }
 +
             for (u32 y = 0; y < config.output_height; ++y) {
                 // TODO: Why does the register seem to hold twice the framebuffer width?
 
 @@ -130,10 +142,29 @@ inline void Write(u32 addr, const T data) {
                         int r, g, b, a;
                     } source_color = { 0, 0, 0, 0 };
 
 +                    const unsigned int block_width = 8;
 +                    const unsigned int block_height = 8;
 +
 +                    const unsigned int coarse_x = x & ~7;
 +                    const unsigned int coarse_y = y & ~7;
 +
 +                    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
 +                    // arranged in a Z-order curve. More details on the bit manipulation at:
 +                    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 +                    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
 +                    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
 +                    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
 +                    i = (i | (i >> 7)) & 0x3F;
 +
 +                    const unsigned int offset = coarse_x * block_height;
 +
 +                    u32 dst_offset = (x + y * config.output_width) * GPU::Regs::BytesPerPixel(config.output_format) / pixel_skip;
 +                    u32 src_offset = (offset + i + coarse_y * output_width) * GPU::Regs::BytesPerPixel(config.input_format);
 +
                     switch (config.input_format) {
                     case Regs::PixelFormat::RGBA8:
                     {
 -                        u8* srcptr = source_pointer + (x * pixel_skip + y * config.input_width) * 4;
 +                        u8* srcptr = source_pointer + src_offset;
                         source_color.r = srcptr[3]; // red
                         source_color.g = srcptr[2]; // green
                         source_color.b = srcptr[1]; // blue
 @@ -143,7 +174,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGB5A1:
                     {
 -                        u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
 +                        u16 srcval = *(u16*)(source_pointer + src_offset);
                         source_color.r = Color::Convert5To8((srcval >> 11) & 0x1F); // red
                         source_color.g = Color::Convert5To8((srcval >>  6) & 0x1F); // green
                         source_color.b = Color::Convert5To8((srcval >>  1) & 0x1F); // blue
 @@ -153,7 +184,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGBA4:
                     {
 -                        u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
 +                        u16 srcval = *(u16*)(source_pointer + src_offset);
                         source_color.r = Color::Convert4To8((srcval >> 12) & 0xF); // red
                         source_color.g = Color::Convert4To8((srcval >>  8) & 0xF); // green
                         source_color.b = Color::Convert4To8((srcval >>  4) & 0xF); // blue
 @@ -167,20 +198,20 @@ inline void Write(u32 addr, const T data) {
                     }
 
                     switch (config.output_format) {
 -                    /*case Regs::PixelFormat::RGBA8:
 +                    case Regs::PixelFormat::RGBA8:
                     {
                         // TODO: Untested
 -                        u8* dstptr = (u32*)(dest_pointer + x * 4 + y * config.output_width * 4);
 -                        dstptr[0] = source_color.r;
 -                        dstptr[1] = source_color.g;
 -                        dstptr[2] = source_color.b;
 -                        dstptr[3] = source_color.a;
 +                        u8* dstptr = (u8*)(dest_pointer + dst_offset);
 +                        dstptr[3] = source_color.r;
 +                        dstptr[2] = source_color.g;
 +                        dstptr[1] = source_color.b;
 +                        dstptr[0] = source_color.a;
                         break;
 -                    }*/
 +                    }
 
                     case Regs::PixelFormat::RGB8:
                     {
 -                        u8* dstptr = dest_pointer + (x + y * output_width) * 3;
 +                        u8* dstptr = dest_pointer + dst_offset;
                         dstptr[2] = source_color.r; // red
                         dstptr[1] = source_color.g; // green
                         dstptr[0] = source_color.b; // blue
 @@ -189,7 +220,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGB5A1:
                     {
 -                        u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
 +                        u16* dstptr = (u16*)(dest_pointer + dst_offset);
                         *dstptr = ((source_color.r >> 3) << 11) | ((source_color.g >> 3) << 6)
                                 | ((source_color.b >> 3) <<  1) | ( source_color.a >> 7);
                         break;
 @@ -197,7 +228,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGBA4:
                     {
 -                        u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
 +                        u16* dstptr = (u16*)(dest_pointer + dst_offset);
                         *dstptr = ((source_color.r >> 4) << 12) | ((source_color.g >> 4) << 8)
                                 | ((source_color.b >> 4) <<  4) | ( source_color.a >> 4);
                         break;
 diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
 index 75f5244..b365ff1 100644
 --- a/src/core/hw/gpu.h
 +++ b/src/core/hw/gpu.h
 @@ -192,9 +192,11 @@ struct Regs {
             u32 flags;
 
             BitField< 0, 1, u32> flip_data;        // flips input data horizontally (TODO) if true
 +            BitField< 1, 1, u32> output_tiled0;    // Converts the output to tiles. TODO(Subv): Verify and implement
 +            BitField< 3, 1, u32> raw_copy;         // Copies the data without performing any processing
             BitField< 8, 3, PixelFormat> input_format;
             BitField<12, 3, PixelFormat> output_format;
 -            BitField<16, 1, u32> output_tiled;     // stores output in a tiled format
 +            BitField<16, 1, u32> output_tiled1;     // Converts the output to tiles. TODO(Subv): Verify and implement
 
             // TODO: Not really sure if this actually scales, or even resizes at all.
             BitField<24, 1, u32> scale_horizontally;
 diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
 index 17f8f70..94610a3 100644
 --- a/src/video_core/rasterizer.cpp
 +++ b/src/video_core/rasterizer.cpp
 @@ -7,6 +7,7 @@
 #include "common/common_types.h"
 #include "common/math_util.h"
 
 +#include "core/hw/gpu.h"
 #include "math.h"
 #include "pica.h"
 #include "rasterizer.h"
 @@ -26,10 +27,27 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     // NOTE: The framebuffer height register contains the actual FB height minus one.
     y = (registers.framebuffer.height - y);
 
 +    const unsigned int block_width = 8;
 +    const unsigned int block_height = 8;
 +
 +    const unsigned int coarse_x = x & ~7;
 +    const unsigned int coarse_y = y & ~7;
 +
 +    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
 +    // arranged in a Z-order curve. More details on the bit manipulation at:
 +    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 +    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
 +    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
 +    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
 +    i = (i | (i >> 7)) & 0x3F;
 +
 +    const unsigned int offset = coarse_x * block_height;
 +    u32 dst_offset = (offset + i + coarse_y * registers.framebuffer.width) * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
 +
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
     {
 -        u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
 +        u8* pixel = color_buffer + dst_offset;
         pixel[3] = color.r();
         pixel[2] = color.g();
         pixel[1] = color.b();
 @@ -48,12 +66,28 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
     u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));
 
     y = (registers.framebuffer.height - y);
 +    const unsigned int block_width = 8;
 +    const unsigned int block_height = 8;
 +
 +    const unsigned int coarse_x = x & ~7;
 +    const unsigned int coarse_y = y & ~7;
 +
 +    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
 +    // arranged in a Z-order curve. More details on the bit manipulation at:
 +    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 +    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
 +    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
 +    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
 +    i = (i | (i >> 7)) & 0x3F;
 +
 +    const unsigned int offset = coarse_x * block_height;
 +    u32 src_offset = (offset + i + coarse_y * registers.framebuffer.width) * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
 
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
     {
         Math::Vec4<u8> ret;
 -        u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
 +        u8* pixel = color_buffer + src_offset;
         ret.r() = pixel[3];
         ret.g() = pixel[2];
         ret.b() = pixel[1];
 @@ -70,22 +104,54 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
 
 static u32 GetDepth(int x, int y) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
 -    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 +    u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
 
     y = (registers.framebuffer.height - y);
 +    const unsigned int block_width = 8;
 +    const unsigned int block_height = 8;
 +
 +    const unsigned int coarse_x = x & ~7;
 +    const unsigned int coarse_y = y & ~7;
 +
 +    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
 +    // arranged in a Z-order curve. More details on the bit manipulation at:
 +    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 +    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
 +    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
 +    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
 +    i = (i | (i >> 7)) & 0x3F;
 +
 +    const unsigned int offset = coarse_x * block_height;
 +    u32 stride = registers.framebuffer.width * 2;
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
 -    return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
 +    return *(u16*)(depth_buffer + i * 2 + offset * 2 + coarse_y * stride);
 }
 
 static void SetDepth(int x, int y, u16 value) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
 -    u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 +    u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
 
     y = (registers.framebuffer.height - y);
 +    const unsigned int block_width = 8;
 +    const unsigned int block_height = 8;
 +
 +    const unsigned int coarse_x = x & ~7;
 +    const unsigned int coarse_y = y & ~7;
 +
 +    // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
 +    // arranged in a Z-order curve. More details on the bit manipulation at:
 +    // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
 +    unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
 +    i = (i ^ (i << 2)) & 0x1313;              // ---2 --10
 +    i = (i ^ (i << 1)) & 0x1515;              // ---2 -1-0
 +    i = (i | (i >> 7)) & 0x3F;
 +
 +    const unsigned int offset = coarse_x * block_height;
 +    u32 stride = registers.framebuffer.width * 2;
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
 -    *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 +    *(u16*)(depth_buffer + i * 2 + offset * 2 + coarse_y * stride) = value;
 }
 
 // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
 diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
 index 2726951..f357b5c 100644
 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp
 +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
 @@ -262,6 +262,7 @@ void RendererOpenGL::DrawScreens() {
 
     DrawSingleScreenRotated(textures[0], top_x, 0,
         (float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenTopHeight);
 +    glFlush();
     DrawSingleScreenRotated(textures[1], bottom_x, (float)VideoCore::kScreenTopHeight,
         (float)VideoCore::kScreenBottomWidth, (float)VideoCore::kScreenBottomHeight);
	diff --git a/externals/boost b/externals/boost
	--- a/externals/boost
	+++ b/externals/boost
	@@ -1 +1 @@
	-Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5
	+Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5-dirty
	diff --git a/externals/nihstro b/externals/nihstro
	--- a/externals/nihstro
	+++ b/externals/nihstro
	@@ -1 +1 @@
	-Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211
	+Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211-dirty
	diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
	index 8800235..08fb772 100644
	--- a/src/core/hw/gpu.cpp
	+++ b/src/core/hw/gpu.cpp
	@@ -122,6 +122,18 @@ inline void Write(u32 addr, const T data) {

	u32 output_width = config.output_width / pixel_skip;

	+ if (config.raw_copy) {
	+ LOG_ERROR(HW_GPU, "Raw display transfers are not yet implemented.");
	+ UNIMPLEMENTED();
	+ return;
	+ }
	+
	+ if (config.output_tiled0 || config.output_tiled1) {
	+ LOG_ERROR(HW_GPU, "Display transfers with tiled output are not yet implemented");
	+ UNIMPLEMENTED();
	+ return;
	+ }
	+
	for (u32 y = 0; y < config.output_height; ++y) {
	// TODO: Why does the register seem to hold twice the framebuffer width?

	@@ -130,10 +142,29 @@ inline void Write(u32 addr, const T data) {
	int r, g, b, a;
	} source_color = { 0, 0, 0, 0 };

	+ const unsigned int block_width = 8;
	+ const unsigned int block_height = 8;
	+
	+ const unsigned int coarse_x = x & ~7;
	+ const unsigned int coarse_y = y & ~7;
	+
	+ // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
	+ // arranged in a Z-order curve. More details on the bit manipulation at:
	+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
	+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
	+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
	+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
	+ i = (i | (i >> 7)) & 0x3F;
	+
	+ const unsigned int offset = coarse_x * block_height;
	+
	+ u32 dst_offset = (x + y * config.output_width) * GPU::Regs::BytesPerPixel(config.output_format) / pixel_skip;
	+ u32 src_offset = (offset + i + coarse_y * output_width) * GPU::Regs::BytesPerPixel(config.input_format);
	+
	switch (config.input_format) {
	case Regs::PixelFormat::RGBA8:
	{
	- u8* srcptr = source_pointer + (x * pixel_skip + y * config.input_width) * 4;
	+ u8* srcptr = source_pointer + src_offset;
	source_color.r = srcptr[3]; // red
	source_color.g = srcptr[2]; // green
	source_color.b = srcptr[1]; // blue
	@@ -143,7 +174,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGB5A1:
	{
	- u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
	+ u16 srcval = *(u16*)(source_pointer + src_offset);
	source_color.r = Color::Convert5To8((srcval >> 11) & 0x1F); // red
	source_color.g = Color::Convert5To8((srcval >> 6) & 0x1F); // green
	source_color.b = Color::Convert5To8((srcval >> 1) & 0x1F); // blue
	@@ -153,7 +184,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGBA4:
	{
	- u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
	+ u16 srcval = *(u16*)(source_pointer + src_offset);
	source_color.r = Color::Convert4To8((srcval >> 12) & 0xF); // red
	source_color.g = Color::Convert4To8((srcval >> 8) & 0xF); // green
	source_color.b = Color::Convert4To8((srcval >> 4) & 0xF); // blue
	@@ -167,20 +198,20 @@ inline void Write(u32 addr, const T data) {
	}

	switch (config.output_format) {
	- /*case Regs::PixelFormat::RGBA8:
	+ case Regs::PixelFormat::RGBA8:
	{
	// TODO: Untested
	- u8* dstptr = (u32*)(dest_pointer + x * 4 + y * config.output_width * 4);
	- dstptr[0] = source_color.r;
	- dstptr[1] = source_color.g;
	- dstptr[2] = source_color.b;
	- dstptr[3] = source_color.a;
	+ u8* dstptr = (u8*)(dest_pointer + dst_offset);
	+ dstptr[3] = source_color.r;
	+ dstptr[2] = source_color.g;
	+ dstptr[1] = source_color.b;
	+ dstptr[0] = source_color.a;
	break;
	- }*/
	+ }

	case Regs::PixelFormat::RGB8:
	{
	- u8* dstptr = dest_pointer + (x + y * output_width) * 3;
	+ u8* dstptr = dest_pointer + dst_offset;
	dstptr[2] = source_color.r; // red
	dstptr[1] = source_color.g; // green
	dstptr[0] = source_color.b; // blue
	@@ -189,7 +220,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGB5A1:
	{
	- u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
	+ u16* dstptr = (u16*)(dest_pointer + dst_offset);
	*dstptr = ((source_color.r >> 3) << 11) | ((source_color.g >> 3) << 6)
	| ((source_color.b >> 3) << 1) | ( source_color.a >> 7);
	break;
	@@ -197,7 +228,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGBA4:
	{
	- u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
	+ u16* dstptr = (u16*)(dest_pointer + dst_offset);
	*dstptr = ((source_color.r >> 4) << 12) | ((source_color.g >> 4) << 8)
	| ((source_color.b >> 4) << 4) | ( source_color.a >> 4);
	break;
	diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
	index 75f5244..b365ff1 100644
	--- a/src/core/hw/gpu.h
	+++ b/src/core/hw/gpu.h
	@@ -192,9 +192,11 @@ struct Regs {
	u32 flags;

	BitField< 0, 1, u32> flip_data; // flips input data horizontally (TODO) if true
	+ BitField< 1, 1, u32> output_tiled0; // Converts the output to tiles. TODO(Subv): Verify and implement
	+ BitField< 3, 1, u32> raw_copy; // Copies the data without performing any processing
	BitField< 8, 3, PixelFormat> input_format;
	BitField<12, 3, PixelFormat> output_format;
	- BitField<16, 1, u32> output_tiled; // stores output in a tiled format
	+ BitField<16, 1, u32> output_tiled1; // Converts the output to tiles. TODO(Subv): Verify and implement

	// TODO: Not really sure if this actually scales, or even resizes at all.
	BitField<24, 1, u32> scale_horizontally;
	diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
	index 17f8f70..94610a3 100644
	--- a/src/video_core/rasterizer.cpp
	+++ b/src/video_core/rasterizer.cpp
	@@ -7,6 +7,7 @@
	#include "common/common_types.h"
	#include "common/math_util.h"

	+#include "core/hw/gpu.h"
	#include "math.h"
	#include "pica.h"
	#include "rasterizer.h"
	@@ -26,10 +27,27 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
	// NOTE: The framebuffer height register contains the actual FB height minus one.
	y = (registers.framebuffer.height - y);

	+ const unsigned int block_width = 8;
	+ const unsigned int block_height = 8;
	+
	+ const unsigned int coarse_x = x & ~7;
	+ const unsigned int coarse_y = y & ~7;
	+
	+ // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
	+ // arranged in a Z-order curve. More details on the bit manipulation at:
	+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
	+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
	+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
	+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
	+ i = (i | (i >> 7)) & 0x3F;
	+
	+ const unsigned int offset = coarse_x * block_height;
	+ u32 dst_offset = (offset + i + coarse_y * registers.framebuffer.width) * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
	+
	switch (registers.framebuffer.color_format) {
	case registers.framebuffer.RGBA8:
	{
	- u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
	+ u8* pixel = color_buffer + dst_offset;
	pixel[3] = color.r();
	pixel[2] = color.g();
	pixel[1] = color.b();
	@@ -48,12 +66,28 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
	u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));

	y = (registers.framebuffer.height - y);
	+ const unsigned int block_width = 8;
	+ const unsigned int block_height = 8;
	+
	+ const unsigned int coarse_x = x & ~7;
	+ const unsigned int coarse_y = y & ~7;
	+
	+ // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
	+ // arranged in a Z-order curve. More details on the bit manipulation at:
	+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
	+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
	+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
	+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
	+ i = (i | (i >> 7)) & 0x3F;
	+
	+ const unsigned int offset = coarse_x * block_height;
	+ u32 src_offset = (offset + i + coarse_y * registers.framebuffer.width) * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));

	switch (registers.framebuffer.color_format) {
	case registers.framebuffer.RGBA8:
	{
	Math::Vec4<u8> ret;
	- u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
	+ u8* pixel = color_buffer + src_offset;
	ret.r() = pixel[3];
	ret.g() = pixel[2];
	ret.b() = pixel[1];
	@@ -70,22 +104,54 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {

	static u32 GetDepth(int x, int y) {
	const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
	- u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
	+ u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));

	y = (registers.framebuffer.height - y);
	+ const unsigned int block_width = 8;
	+ const unsigned int block_height = 8;
	+
	+ const unsigned int coarse_x = x & ~7;
	+ const unsigned int coarse_y = y & ~7;
	+
	+ // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
	+ // arranged in a Z-order curve. More details on the bit manipulation at:
	+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
	+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
	+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
	+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
	+ i = (i | (i >> 7)) & 0x3F;
	+
	+ const unsigned int offset = coarse_x * block_height;
	+ u32 stride = registers.framebuffer.width * 2;

	// Assuming 16-bit depth buffer format until actual format handling is implemented
	- return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
	+ return *(u16*)(depth_buffer + i * 2 + offset * 2 + coarse_y * stride);
	}

	static void SetDepth(int x, int y, u16 value) {
	const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
	- u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
	+ u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));

	y = (registers.framebuffer.height - y);
	+ const unsigned int block_width = 8;
	+ const unsigned int block_height = 8;
	+
	+ const unsigned int coarse_x = x & ~7;
	+ const unsigned int coarse_y = y & ~7;
	+
	+ // Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
	+ // arranged in a Z-order curve. More details on the bit manipulation at:
	+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
	+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
	+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
	+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
	+ i = (i | (i >> 7)) & 0x3F;
	+
	+ const unsigned int offset = coarse_x * block_height;
	+ u32 stride = registers.framebuffer.width * 2;

	// Assuming 16-bit depth buffer format until actual format handling is implemented
	- *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
	+ *(u16*)(depth_buffer + i * 2 + offset * 2 + coarse_y * stride) = value;
	}

	// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
	diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
	index 2726951..f357b5c 100644
	--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
	+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
	@@ -262,6 +262,7 @@ void RendererOpenGL::DrawScreens() {

	DrawSingleScreenRotated(textures[0], top_x, 0,
	(float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenTopHeight);
	+ glFlush();
	DrawSingleScreenRotated(textures[1], bottom_x, (float)VideoCore::kScreenTopHeight,
	(float)VideoCore::kScreenBottomWidth, (float)VideoCore::kScreenBottomHeight);