Subv · August 29, 2015 14:15
diff --git a/gistfile1.diff b/gistfile1.diff
 diff --git a/externals/boost b/externals/boost
 --- a/externals/boost
 +++ b/externals/boost
 @@ -1 +1 @@
 -Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5
 +Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5-dirty
 diff --git a/externals/nihstro b/externals/nihstro
 --- a/externals/nihstro
 +++ b/externals/nihstro
 @@ -1 +1 @@
 -Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211
 +Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211-dirty
 diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
 index 8800235..31a2045 100644
 --- a/src/core/hw/gpu.cpp
 +++ b/src/core/hw/gpu.cpp
 @@ -52,6 +52,37 @@ inline void Read(T &var, const u32 raw_addr) {
     var = g_regs[addr / 4];
 }
 
 +u32 Compact1By1(u32 x)
 +{
 +    x &= 0x55555555;                  // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
 +    x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
 +    x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
 +    x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
 +    x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
 +    return x;
 +}
 +
 +// Inverse of Part1By2 - "delete" all bits not at positions divisible by 3
 +u32 Compact1By2(u32 x)
 +{
 +    x &= 0x09249249;                  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
 +    x = (x ^ (x >> 2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
 +    x = (x ^ (x >> 4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
 +    x = (x ^ (x >> 8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
 +    x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
 +    return x;
 +}
 +
 +u32 DecodeMorton2X(u32 code)
 +{
 +    return Compact1By1(code >> 0);
 +}
 +
 +u32 DecodeMorton2Y(u32 code)
 +{
 +    return Compact1By1(code >> 1);
 +}
 +
 template <typename T>
 inline void Write(u32 addr, const T data) {
     addr -= 0x1EF00000;
 @@ -130,6 +161,10 @@ inline void Write(u32 addr, const T data) {
                         int r, g, b, a;
                     } source_color = { 0, 0, 0, 0 };
 
 +                    u32 morton_code = y * config.input_width + x;
 +                    u32 curve_x = DecodeMorton2X(morton_code);
 +                    u32 curve_y = DecodeMorton2Y(morton_code);
 +
                     switch (config.input_format) {
                     case Regs::PixelFormat::RGBA8:
                     {
 @@ -180,7 +215,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGB8:
                     {
 -                        u8* dstptr = dest_pointer + (x + y * output_width) * 3;
 +                        u8* dstptr = dest_pointer + (curve_x + curve_y * output_width) * 3;
                         dstptr[2] = source_color.r; // red
                         dstptr[1] = source_color.g; // green
                         dstptr[0] = source_color.b; // blue
 @@ -189,7 +224,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGB5A1:
                     {
 -                        u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
 +                        u16* dstptr = (u16*)(dest_pointer + curve_x * 2 + curve_y * config.output_width * 2);
                         *dstptr = ((source_color.r >> 3) << 11) | ((source_color.g >> 3) << 6)
                                 | ((source_color.b >> 3) <<  1) | ( source_color.a >> 7);
                         break;
 @@ -197,7 +232,7 @@ inline void Write(u32 addr, const T data) {
 
                     case Regs::PixelFormat::RGBA4:
                     {
 -                        u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
 +                        u16* dstptr = (u16*)(dest_pointer + curve_x * 2 + curve_y * config.output_width * 2);
                         *dstptr = ((source_color.r >> 4) << 12) | ((source_color.g >> 4) << 8)
                                 | ((source_color.b >> 4) <<  4) | ( source_color.a >> 4);
                         break;
 diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
 index 17f8f70..b932873 100644
 --- a/src/video_core/rasterizer.cpp
 +++ b/src/video_core/rasterizer.cpp
 @@ -18,6 +18,19 @@ namespace Pica {
 
 namespace Rasterizer {
 
 +    unsigned int SeparateBy1(unsigned int x) {
 +        x &= 0x0000ffff;                  // x = ---- ---- ---- ---- fedc ba98 7654 3210
 +        x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
 +        x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
 +        x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
 +        x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
 +        return x;
 +    }
 +
 +    u32 MortonCode2(unsigned int x, unsigned int y) {
 +        return SeparateBy1(x) | (SeparateBy1(y) << 1);
 +    }
 +
 static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
     u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));
 @@ -26,10 +39,12 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
     // NOTE: The framebuffer height register contains the actual FB height minus one.
     y = (registers.framebuffer.height - y);
 
 +    u32 code = MortonCode2(x, y);
 +
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
     {
 -        u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
 +        u8* pixel = color_buffer + code * 4;
         pixel[3] = color.r();
         pixel[2] = color.g();
         pixel[1] = color.b();
 @@ -48,12 +63,13 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
     u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));
 
     y = (registers.framebuffer.height - y);
 +    u32 code = MortonCode2(x, y);
 
     switch (registers.framebuffer.color_format) {
     case registers.framebuffer.RGBA8:
     {
         Math::Vec4<u8> ret;
 -        u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
 +        u8* pixel = color_buffer + code * 4;
         ret.r() = pixel[3];
         ret.g() = pixel[2];
         ret.b() = pixel[1];
 @@ -73,9 +89,10 @@ static u32 GetDepth(int x, int y) {
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     y = (registers.framebuffer.height - y);
 +    u32 code = MortonCode2(x, y);
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
 -    return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
 +    return depth_buffer[code];
 }
 
 static void SetDepth(int x, int y, u16 value) {
 @@ -83,9 +100,10 @@ static void SetDepth(int x, int y, u16 value) {
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 
     y = (registers.framebuffer.height - y);
 +    u32 code = MortonCode2(x, y);
 
     // Assuming 16-bit depth buffer format until actual format handling is implemented
 -    *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
 +    depth_buffer[code] = value;
 }
 
 // NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
 diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
 index 2726951..f357b5c 100644
 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp
 +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
 @@ -262,6 +262,7 @@ void RendererOpenGL::DrawScreens() {
 
     DrawSingleScreenRotated(textures[0], top_x, 0,
         (float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenTopHeight);
 +    glFlush();
     DrawSingleScreenRotated(textures[1], bottom_x, (float)VideoCore::kScreenTopHeight,
         (float)VideoCore::kScreenBottomWidth, (float)VideoCore::kScreenBottomHeight);
	diff --git a/externals/boost b/externals/boost
	--- a/externals/boost
	+++ b/externals/boost
	@@ -1 +1 @@
	-Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5
	+Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5-dirty
	diff --git a/externals/nihstro b/externals/nihstro
	--- a/externals/nihstro
	+++ b/externals/nihstro
	@@ -1 +1 @@
	-Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211
	+Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211-dirty
	diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
	index 8800235..31a2045 100644
	--- a/src/core/hw/gpu.cpp
	+++ b/src/core/hw/gpu.cpp
	@@ -52,6 +52,37 @@ inline void Read(T &var, const u32 raw_addr) {
	var = g_regs[addr / 4];
	}

	+u32 Compact1By1(u32 x)
	+{
	+ x &= 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
	+ x = (x ^ (x >> 1)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
	+ x = (x ^ (x >> 2)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
	+ x = (x ^ (x >> 4)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
	+ x = (x ^ (x >> 8)) & 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
	+ return x;
	+}
	+
	+// Inverse of Part1By2 - "delete" all bits not at positions divisible by 3
	+u32 Compact1By2(u32 x)
	+{
	+ x &= 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
	+ x = (x ^ (x >> 2)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
	+ x = (x ^ (x >> 4)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
	+ x = (x ^ (x >> 8)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
	+ x = (x ^ (x >> 16)) & 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
	+ return x;
	+}
	+
	+u32 DecodeMorton2X(u32 code)
	+{
	+ return Compact1By1(code >> 0);
	+}
	+
	+u32 DecodeMorton2Y(u32 code)
	+{
	+ return Compact1By1(code >> 1);
	+}
	+
	template <typename T>
	inline void Write(u32 addr, const T data) {
	addr -= 0x1EF00000;
	@@ -130,6 +161,10 @@ inline void Write(u32 addr, const T data) {
	int r, g, b, a;
	} source_color = { 0, 0, 0, 0 };

	+ u32 morton_code = y * config.input_width + x;
	+ u32 curve_x = DecodeMorton2X(morton_code);
	+ u32 curve_y = DecodeMorton2Y(morton_code);
	+
	switch (config.input_format) {
	case Regs::PixelFormat::RGBA8:
	{
	@@ -180,7 +215,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGB8:
	{
	- u8* dstptr = dest_pointer + (x + y * output_width) * 3;
	+ u8* dstptr = dest_pointer + (curve_x + curve_y * output_width) * 3;
	dstptr[2] = source_color.r; // red
	dstptr[1] = source_color.g; // green
	dstptr[0] = source_color.b; // blue
	@@ -189,7 +224,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGB5A1:
	{
	- u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
	+ u16* dstptr = (u16*)(dest_pointer + curve_x * 2 + curve_y * config.output_width * 2);
	*dstptr = ((source_color.r >> 3) << 11) | ((source_color.g >> 3) << 6)
	| ((source_color.b >> 3) << 1) | ( source_color.a >> 7);
	break;
	@@ -197,7 +232,7 @@ inline void Write(u32 addr, const T data) {

	case Regs::PixelFormat::RGBA4:
	{
	- u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
	+ u16* dstptr = (u16*)(dest_pointer + curve_x * 2 + curve_y * config.output_width * 2);
	*dstptr = ((source_color.r >> 4) << 12) | ((source_color.g >> 4) << 8)
	| ((source_color.b >> 4) << 4) | ( source_color.a >> 4);
	break;
	diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
	index 17f8f70..b932873 100644
	--- a/src/video_core/rasterizer.cpp
	+++ b/src/video_core/rasterizer.cpp
	@@ -18,6 +18,19 @@ namespace Pica {

	namespace Rasterizer {

	+ unsigned int SeparateBy1(unsigned int x) {
	+ x &= 0x0000ffff; // x = ---- ---- ---- ---- fedc ba98 7654 3210
	+ x = (x ^ (x << 8)) & 0x00ff00ff; // x = ---- ---- fedc ba98 ---- ---- 7654 3210
	+ x = (x ^ (x << 4)) & 0x0f0f0f0f; // x = ---- fedc ---- ba98 ---- 7654 ---- 3210
	+ x = (x ^ (x << 2)) & 0x33333333; // x = --fe --dc --ba --98 --76 --54 --32 --10
	+ x = (x ^ (x << 1)) & 0x55555555; // x = -f-e -d-c -b-a -9-8 -7-6 -5-4 -3-2 -1-0
	+ return x;
	+ }
	+
	+ u32 MortonCode2(unsigned int x, unsigned int y) {
	+ return SeparateBy1(x) | (SeparateBy1(y) << 1);
	+ }
	+
	static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
	const PAddr addr = registers.framebuffer.GetColorBufferPhysicalAddress();
	u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));
	@@ -26,10 +39,12 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
	// NOTE: The framebuffer height register contains the actual FB height minus one.
	y = (registers.framebuffer.height - y);

	+ u32 code = MortonCode2(x, y);
	+
	switch (registers.framebuffer.color_format) {
	case registers.framebuffer.RGBA8:
	{
	- u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
	+ u8* pixel = color_buffer + code * 4;
	pixel[3] = color.r();
	pixel[2] = color.g();
	pixel[1] = color.b();
	@@ -48,12 +63,13 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
	u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));

	y = (registers.framebuffer.height - y);
	+ u32 code = MortonCode2(x, y);

	switch (registers.framebuffer.color_format) {
	case registers.framebuffer.RGBA8:
	{
	Math::Vec4<u8> ret;
	- u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
	+ u8* pixel = color_buffer + code * 4;
	ret.r() = pixel[3];
	ret.g() = pixel[2];
	ret.b() = pixel[1];
	@@ -73,9 +89,10 @@ static u32 GetDepth(int x, int y) {
	u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));

	y = (registers.framebuffer.height - y);
	+ u32 code = MortonCode2(x, y);

	// Assuming 16-bit depth buffer format until actual format handling is implemented
	- return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
	+ return depth_buffer[code];
	}

	static void SetDepth(int x, int y, u16 value) {
	@@ -83,9 +100,10 @@ static void SetDepth(int x, int y, u16 value) {
	u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));

	y = (registers.framebuffer.height - y);
	+ u32 code = MortonCode2(x, y);

	// Assuming 16-bit depth buffer format until actual format handling is implemented
	- *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
	+ depth_buffer[code] = value;
	}

	// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
	diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
	index 2726951..f357b5c 100644
	--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
	+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
	@@ -262,6 +262,7 @@ void RendererOpenGL::DrawScreens() {

	DrawSingleScreenRotated(textures[0], top_x, 0,
	(float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenTopHeight);
	+ glFlush();
	DrawSingleScreenRotated(textures[1], bottom_x, (float)VideoCore::kScreenTopHeight,
	(float)VideoCore::kScreenBottomWidth, (float)VideoCore::kScreenBottomHeight);