Skip to content

Instantly share code, notes, and snippets.

@Subv
Created February 23, 2015 22:38
Show Gist options
  • Save Subv/6c246856452e8b77086a to your computer and use it in GitHub Desktop.
Save Subv/6c246856452e8b77086a to your computer and use it in GitHub Desktop.
diff --git a/externals/boost b/externals/boost
--- a/externals/boost
+++ b/externals/boost
@@ -1 +1 @@
-Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5
+Subproject commit 728a4d7d1c8b28355544ae829df9c4b5f28373c5-dirty
diff --git a/externals/nihstro b/externals/nihstro
--- a/externals/nihstro
+++ b/externals/nihstro
@@ -1 +1 @@
-Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211
+Subproject commit 0a8b4d221425f13e24a3cef9b02edc3221bab211-dirty
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 8800235..08fb772 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -122,6 +122,18 @@ inline void Write(u32 addr, const T data) {
u32 output_width = config.output_width / pixel_skip;
+ if (config.raw_copy) {
+ LOG_ERROR(HW_GPU, "Raw display transfers are not yet implemented.");
+ UNIMPLEMENTED();
+ return;
+ }
+
+ if (config.output_tiled0 || config.output_tiled1) {
+ LOG_ERROR(HW_GPU, "Display transfers with tiled output are not yet implemented");
+ UNIMPLEMENTED();
+ return;
+ }
+
for (u32 y = 0; y < config.output_height; ++y) {
// TODO: Why does the register seem to hold twice the framebuffer width?
@@ -130,10 +142,29 @@ inline void Write(u32 addr, const T data) {
int r, g, b, a;
} source_color = { 0, 0, 0, 0 };
+ const unsigned int block_width = 8;
+ const unsigned int block_height = 8;
+
+ const unsigned int coarse_x = x & ~7;
+ const unsigned int coarse_y = y & ~7;
+
+ // Interleave the lower 3 bits of x and y (x in the even bits, y in the odd bits) to get the
+ // index of this pixel within its 8x8 block, which is laid out in Z-order (Morton order). See:
+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
+ i = (i | (i >> 7)) & 0x3F;
+
+ const unsigned int offset = coarse_x * block_height;
+
+ u32 dst_offset = (x + y * config.output_width) * GPU::Regs::BytesPerPixel(config.output_format) / pixel_skip;
+ u32 src_offset = (offset + i + coarse_y * output_width) * GPU::Regs::BytesPerPixel(config.input_format);
+
switch (config.input_format) {
case Regs::PixelFormat::RGBA8:
{
- u8* srcptr = source_pointer + (x * pixel_skip + y * config.input_width) * 4;
+ u8* srcptr = source_pointer + src_offset;
source_color.r = srcptr[3]; // red
source_color.g = srcptr[2]; // green
source_color.b = srcptr[1]; // blue
@@ -143,7 +174,7 @@ inline void Write(u32 addr, const T data) {
case Regs::PixelFormat::RGB5A1:
{
- u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
+ u16 srcval = *(u16*)(source_pointer + src_offset);
source_color.r = Color::Convert5To8((srcval >> 11) & 0x1F); // red
source_color.g = Color::Convert5To8((srcval >> 6) & 0x1F); // green
source_color.b = Color::Convert5To8((srcval >> 1) & 0x1F); // blue
@@ -153,7 +184,7 @@ inline void Write(u32 addr, const T data) {
case Regs::PixelFormat::RGBA4:
{
- u16 srcval = *(u16*)(source_pointer + x * 4 * pixel_skip + y * config.input_width * 4 * pixel_skip);
+ u16 srcval = *(u16*)(source_pointer + src_offset);
source_color.r = Color::Convert4To8((srcval >> 12) & 0xF); // red
source_color.g = Color::Convert4To8((srcval >> 8) & 0xF); // green
source_color.b = Color::Convert4To8((srcval >> 4) & 0xF); // blue
@@ -167,20 +198,20 @@ inline void Write(u32 addr, const T data) {
}
switch (config.output_format) {
- /*case Regs::PixelFormat::RGBA8:
+ case Regs::PixelFormat::RGBA8:
{
// TODO: Untested
- u8* dstptr = (u32*)(dest_pointer + x * 4 + y * config.output_width * 4);
- dstptr[0] = source_color.r;
- dstptr[1] = source_color.g;
- dstptr[2] = source_color.b;
- dstptr[3] = source_color.a;
+ u8* dstptr = (u8*)(dest_pointer + dst_offset);
+ dstptr[3] = source_color.r;
+ dstptr[2] = source_color.g;
+ dstptr[1] = source_color.b;
+ dstptr[0] = source_color.a;
break;
- }*/
+ }
case Regs::PixelFormat::RGB8:
{
- u8* dstptr = dest_pointer + (x + y * output_width) * 3;
+ u8* dstptr = dest_pointer + dst_offset;
dstptr[2] = source_color.r; // red
dstptr[1] = source_color.g; // green
dstptr[0] = source_color.b; // blue
@@ -189,7 +220,7 @@ inline void Write(u32 addr, const T data) {
case Regs::PixelFormat::RGB5A1:
{
- u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
+ u16* dstptr = (u16*)(dest_pointer + dst_offset);
*dstptr = ((source_color.r >> 3) << 11) | ((source_color.g >> 3) << 6)
| ((source_color.b >> 3) << 1) | ( source_color.a >> 7);
break;
@@ -197,7 +228,7 @@ inline void Write(u32 addr, const T data) {
case Regs::PixelFormat::RGBA4:
{
- u16* dstptr = (u16*)(dest_pointer + x * 2 + y * config.output_width * 2);
+ u16* dstptr = (u16*)(dest_pointer + dst_offset);
*dstptr = ((source_color.r >> 4) << 12) | ((source_color.g >> 4) << 8)
| ((source_color.b >> 4) << 4) | ( source_color.a >> 4);
break;
diff --git a/src/core/hw/gpu.h b/src/core/hw/gpu.h
index 75f5244..b365ff1 100644
--- a/src/core/hw/gpu.h
+++ b/src/core/hw/gpu.h
@@ -192,9 +192,11 @@ struct Regs {
u32 flags;
BitField< 0, 1, u32> flip_data; // flips input data horizontally (TODO) if true
+ BitField< 1, 1, u32> output_tiled0; // Converts the output to tiles. TODO(Subv): Verify and implement
+ BitField< 3, 1, u32> raw_copy; // Copies the data without performing any processing
BitField< 8, 3, PixelFormat> input_format;
BitField<12, 3, PixelFormat> output_format;
- BitField<16, 1, u32> output_tiled; // stores output in a tiled format
+ BitField<16, 1, u32> output_tiled1; // Converts the output to tiles. TODO(Subv): Verify and implement
// TODO: Not really sure if this actually scales, or even resizes at all.
BitField<24, 1, u32> scale_horizontally;
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index 17f8f70..94610a3 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -7,6 +7,7 @@
#include "common/common_types.h"
#include "common/math_util.h"
+#include "core/hw/gpu.h"
#include "math.h"
#include "pica.h"
#include "rasterizer.h"
@@ -26,10 +27,27 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
// NOTE: The framebuffer height register contains the actual FB height minus one.
y = (registers.framebuffer.height - y);
+ const unsigned int block_width = 8;
+ const unsigned int block_height = 8;
+
+ const unsigned int coarse_x = x & ~7;
+ const unsigned int coarse_y = y & ~7;
+
+ // Interleave the lower 3 bits of x and y (x in the even bits, y in the odd bits) to get the
+ // index of this pixel within its 8x8 block, which is laid out in Z-order (Morton order). See:
+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
+ i = (i | (i >> 7)) & 0x3F;
+
+ const unsigned int offset = coarse_x * block_height;
+ u32 dst_offset = (offset + i + coarse_y * registers.framebuffer.width) * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
+
switch (registers.framebuffer.color_format) {
case registers.framebuffer.RGBA8:
{
- u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
+ u8* pixel = color_buffer + dst_offset;
pixel[3] = color.r();
pixel[2] = color.g();
pixel[1] = color.b();
@@ -48,12 +66,28 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
u8* color_buffer = Memory::GetPointer(PAddrToVAddr(addr));
y = (registers.framebuffer.height - y);
+ const unsigned int block_width = 8;
+ const unsigned int block_height = 8;
+
+ const unsigned int coarse_x = x & ~7;
+ const unsigned int coarse_y = y & ~7;
+
+ // Interleave the lower 3 bits of x and y (x in the even bits, y in the odd bits) to get the
+ // index of this pixel within its 8x8 block, which is laid out in Z-order (Morton order). See:
+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
+ i = (i | (i >> 7)) & 0x3F;
+
+ const unsigned int offset = coarse_x * block_height;
+ u32 src_offset = (offset + i + coarse_y * registers.framebuffer.width) * GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(registers.framebuffer.color_format.Value()));
switch (registers.framebuffer.color_format) {
case registers.framebuffer.RGBA8:
{
Math::Vec4<u8> ret;
- u8* pixel = color_buffer + (x + y * registers.framebuffer.GetWidth()) * 4;
+ u8* pixel = color_buffer + src_offset;
ret.r() = pixel[3];
ret.g() = pixel[2];
ret.b() = pixel[1];
@@ -70,22 +104,54 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
static u32 GetDepth(int x, int y) {
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
- u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
+ u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
y = (registers.framebuffer.height - y);
+ const unsigned int block_width = 8;
+ const unsigned int block_height = 8;
+
+ const unsigned int coarse_x = x & ~7;
+ const unsigned int coarse_y = y & ~7;
+
+ // Interleave the lower 3 bits of x and y (x in the even bits, y in the odd bits) to get the
+ // index of this depth value within its 8x8 block, which is laid out in Z-order (Morton order):
+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
+ i = (i | (i >> 7)) & 0x3F;
+
+ const unsigned int offset = coarse_x * block_height;
+ u32 stride = registers.framebuffer.width * 2;
// Assuming 16-bit depth buffer format until actual format handling is implemented
- return *(depth_buffer + x + y * registers.framebuffer.GetWidth());
+ return *(u16*)(depth_buffer + i * 2 + offset * 2 + coarse_y * stride);
}
static void SetDepth(int x, int y, u16 value) {
const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
- u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
+ u8* depth_buffer = Memory::GetPointer(PAddrToVAddr(addr));
y = (registers.framebuffer.height - y);
+ const unsigned int block_width = 8;
+ const unsigned int block_height = 8;
+
+ const unsigned int coarse_x = x & ~7;
+ const unsigned int coarse_y = y & ~7;
+
+ // Interleave the lower 3 bits of x and y (x in the even bits, y in the odd bits) to get the
+ // index of this depth value within its 8x8 block, which is laid out in Z-order (Morton order):
+ // https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ unsigned int i = (x | (y << 8)) & 0x0707; // ---- -210
+ i = (i ^ (i << 2)) & 0x1313; // ---2 --10
+ i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
+ i = (i | (i >> 7)) & 0x3F;
+
+ const unsigned int offset = coarse_x * block_height;
+ u32 stride = registers.framebuffer.width * 2;
// Assuming 16-bit depth buffer format until actual format handling is implemented
- *(depth_buffer + x + y * registers.framebuffer.GetWidth()) = value;
+ *(u16*)(depth_buffer + i * 2 + offset * 2 + coarse_y * stride) = value;
}
// NOTE: Assuming that rasterizer coordinates are 12.4 fixed-point values
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 2726951..f357b5c 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -262,6 +262,7 @@ void RendererOpenGL::DrawScreens() {
DrawSingleScreenRotated(textures[0], top_x, 0,
(float)VideoCore::kScreenTopWidth, (float)VideoCore::kScreenTopHeight);
+ glFlush();
DrawSingleScreenRotated(textures[1], bottom_x, (float)VideoCore::kScreenTopHeight,
(float)VideoCore::kScreenBottomWidth, (float)VideoCore::kScreenBottomHeight);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment