Subv · February 19, 2015 19:48
diff --git a/gistfile1.diff b/gistfile1.diff
 diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
 index ea3367e..ba3876a 100644
 --- a/src/video_core/clipper.cpp
 +++ b/src/video_core/clipper.cpp
 @@ -16,7 +16,10 @@ namespace Clipper {
 struct ClippingEdge {
 public:
     ClippingEdge(Math::Vec4<float24> coeffs,
 -                 Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0), float24::FromFloat32(0), float24::FromFloat32(0), float24::FromFloat32(0)))
 +                 Math::Vec4<float24> bias = Math::Vec4<float24>(float24::FromFloat32(0),
 +                                                                float24::FromFloat32(0),
 +                                                                float24::FromFloat32(0),
 +                                                                float24::FromFloat32(0)))
         : coeffs(coeffs),
           bias(bias)
     {
 @@ -87,28 +90,26 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
     auto* output_list = &buffer_a;
     auto* input_list  = &buffer_b;
 
 -    // TODO: Test if dropping the whole primitive in this case reflects hardware behavior.
 -/*    ClippingEdge near_edge{ ClippingEdge::POS_Z, Math::Vec4<float24>(float24::FromFloat32(0), float24::FromFloat32(0), float24::FromFloat32(1), float24::FromFloat32(0)) };
 -    if (near_edge.IsOutSide(v0) || near_edge.IsOutSide(v1) || near_edge.IsOutSide(v2))
 -        return;*/
 -
     // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
     // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
     //       epsilon possible within float24 accuracy.
     static const float24 EPSILON = float24::FromFloat32(0.00001);
 -    static const float24 zero = float24::FromFloat32(0.0);
 -    static const float24 one = float24::FromFloat32(1.0);
 +    static const float24 f0 = float24::FromFloat32(0.0);
 +    static const float24 f1 = float24::FromFloat32(1.0);
     static const std::array<ClippingEdge, 7> clipping_edges = {{
 -        { Math::MakeVec(one, zero, zero, -one) },   // x = +w
 -        { Math::MakeVec(-one, zero, zero, -one) },  // x = -w
 -        { Math::MakeVec(zero, one, zero, -one) },   // y = +w
 -        { Math::MakeVec(zero, -one, zero, -one) },  // y = -w
 -        { Math::MakeVec(zero, zero, one, zero) },   // z =  0
 -        { Math::MakeVec(zero, zero, -one, -one) },  // z = -w
 -        { Math::MakeVec(zero, zero, zero, -one), Math::Vec4<float24>(zero, zero, zero, EPSILON) }, // w = EPSILON
 -
 +        { Math::MakeVec( f1,  f0,  f0, -f1) },  // x = +w
 +        { Math::MakeVec(-f1,  f0,  f0, -f1) },  // x = -w
 +        { Math::MakeVec( f0,  f1,  f0, -f1) },  // y = +w
 +        { Math::MakeVec( f0, -f1,  f0, -f1) },  // y = -w
 +        { Math::MakeVec( f0,  f0,  f1,  f0) },  // z =  0
 +        { Math::MakeVec( f0,  f0, -f1, -f1) },  // z = -w
 +        { Math::MakeVec( f0,  f0,  f0, -f1), Math::Vec4<float24>(f0, f0, f0, EPSILON) }, // w = EPSILON
     }};
 
 +    // TODO: If one vertex lies outside one of the depth clipping planes, some platforms (e.g. Wii)
 +    //       drop the whole primitive instead of clipping the primitive properly. We should test if
 +    //       this happens on the 3DS, too.
 +
     // Simple implementation of the Sutherland-Hodgman clipping algorithm.
     // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
     for (auto edge : clipping_edges) {
 @@ -160,7 +161,6 @@ void ProcessTriangle(OutputVertex &v0, OutputVertex &v1, OutputVertex &v2) {
                   vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), vtx2.screenpos.z.ToFloat32());
 
         Rasterizer::ProcessTriangle(vtx0, vtx1, vtx2);
 -        Rasterizer::ProcessTriangle(vtx2, vtx1, vtx0);
     }
 }
 
 diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
 index d1165bb..586ad62 100644
 --- a/src/video_core/command_processor.cpp
 +++ b/src/video_core/command_processor.cpp
 @@ -30,6 +30,10 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
     if (id >= registers.NumIds())
         return;
 
 +    // If we're skipping this frame, ignore every register write except trigger_irq
 +    if (GPU::g_skip_frame && id != PICA_REG_INDEX(trigger_irq))
 +        return;
 +
     // TODO: Figure out how register masking acts on e.g. vs_uniform_setup.set_value
     u32 old_value = registers[id];
     registers[id] = (old_value & ~mask) | (value & mask);
 @@ -49,8 +53,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX(trigger_draw):
         case PICA_REG_INDEX(trigger_draw_indexed):
         {
 -            if (GPU::g_skip_frame) return;
 -
             DebugUtils::DumpTevStageConfig(registers.GetTevStages());
 
             if (g_debug_context)
 @@ -61,15 +63,15 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
 
             // Information about internal vertex attributes
             u32 vertex_attribute_sources[16];
 -            std::fill(vertex_attribute_sources, &vertex_attribute_sources[16], 0xdeadbeef);
 +            boost::fill(vertex_attribute_sources, 0xdeadbeef);
             u32 vertex_attribute_strides[16];
             u32 vertex_attribute_formats[16];
 -            u32 vertex_attribute_elements[16];
 -            u32 vertex_attribute_element_size[16];
 
             // HACK: Initialize vertex_attribute_elements to zero to prevent infinite loops below.
             // This is one of the hacks required to deal with uninitalized vertex attributes.
 -            boost::fill(vertex_attribute_elements, 0);
 +            // TODO: Fix this properly.
 +            u32 vertex_attribute_elements[16] = {};
 +            u32 vertex_attribute_element_size[16];
 
             // Setup attribute data from loaders
             for (int loader = 0; loader < 12; ++loader) {
 @@ -183,7 +185,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         }
 
         case PICA_REG_INDEX(vs_bool_uniforms):
 -            if (GPU::g_skip_frame) return;
             for (unsigned i = 0; i < 16; ++i)
                 VertexShader::GetBoolUniform(i) = (registers.vs_bool_uniforms.Value() & (1 << i)) != 0;
 
 @@ -194,7 +195,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[2], 0x2b3):
         case PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[3], 0x2b4):
         {
 -            if (GPU::g_skip_frame) return;
             int index = (id - PICA_REG_INDEX_WORKAROUND(vs_int_uniforms[0], 0x2b1));
             auto values = registers.vs_int_uniforms[index];
             VertexShader::GetIntUniform(index) = Math::Vec4<u8>(values.x, values.y, values.z, values.w);
 @@ -212,7 +212,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[6], 0x2c7):
         case PICA_REG_INDEX_WORKAROUND(vs_uniform_setup.set_value[7], 0x2c8):
         {
 -            if (GPU::g_skip_frame) return;
             auto& uniform_setup = registers.vs_uniform_setup;
 
             // TODO: Does actual hardware indeed keep an intermediate buffer or does
 @@ -280,7 +279,6 @@ static inline void WritePicaReg(u32 id, u32 value, u32 mask) {
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[6], 0x2dc):
         case PICA_REG_INDEX_WORKAROUND(vs_swizzle_patterns.set_word[7], 0x2dd):
         {
 -            if (GPU::g_skip_frame) return;
             VertexShader::SubmitSwizzleDataChange(registers.vs_swizzle_patterns.offset, value);
             registers.vs_swizzle_patterns.offset++;
             break;
 diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
 index b60f3d5..0beb72e 100644
 --- a/src/video_core/debug_utils/debug_utils.cpp
 +++ b/src/video_core/debug_utils/debug_utils.cpp
 @@ -16,7 +16,7 @@
 
 #include <nihstro/shader_binary.h>
 
 -#include "common/log.h"
 +#include "common/assert.h"
 #include "common/file_util.h"
 #include "common/math_util.h"
 
 @@ -189,7 +189,7 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
                                         );
 
                     if (it == output_info_table.end()) {
 -                        output_info_table.push_back({});
 +                        output_info_table.emplace_back();
                         output_info_table.back().type = type;
                         output_info_table.back().component_mask = component_mask;
                         output_info_table.back().id = i;
 @@ -197,7 +197,7 @@ void DumpShader(const u32* binary_data, u32 binary_size, const u32* swizzle_data
                         it->component_mask = it->component_mask | component_mask;
                     }
                 } catch (const std::out_of_range& ) {
 -                    _dbg_assert_msg_(HW_GPU, 0, "Unknown output attribute mapping");
 +                    DEBUG_ASSERT_MSG(false, "Unknown output attribute mapping");
                     LOG_ERROR(HW_GPU, "Unknown output attribute mapping: %03x, %03x, %03x, %03x",
                               (int)output_attributes[i].map_x.Value(),
                               (int)output_attributes[i].map_y.Value(),
 @@ -285,7 +285,7 @@ void OnPicaRegWrite(u32 id, u32 value)
     if (!is_pica_tracing)
         return;
 
 -    pica_trace->writes.push_back({id, value});
 +    pica_trace->writes.emplace_back(id, value);
 }
 
 std::unique_ptr<PicaTrace> FinishPicaTracing()
 @@ -489,7 +489,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
             BitField<34, 3, u64> table_index_2;
             BitField<37, 3, u64> table_index_1;
 
 -            union Union2 {
 +            union {
                 // delta value + base value
                 BitField<40, 3, s64> db;
                 BitField<43, 5, u64> b;
 @@ -501,7 +501,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
                 BitField<59, 5, u64> r;
             } differential;
 
 -            union Union3 {
 +            union {
                 BitField<40, 4, u64> b2;
                 BitField<44, 4, u64> b1;
 
 @@ -547,7 +547,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
                 // Add modifier
                 unsigned table_index = (x < 2) ? table_index_2.Value() : table_index_1.Value();
 
 -                static const std::array<std::array<u8, 2>, 8> etc1_modifier_table = std::array<std::array<u8, 2>, 8>{{
 +                static const auto etc1_modifier_table = std::array<std::array<u8, 2>, 8>{{
                     {  2,  8 }, {  5, 17 }, {  9,  29 }, { 13,  42 },
                     { 18, 60 }, { 24, 80 }, { 33, 106 }, { 47, 183 }
                 }};
 @@ -571,7 +571,7 @@ const Math::Vec4<u8> LookupTexture(const u8* source, int x, int y, const Texture
 
     default:
         LOG_ERROR(HW_GPU, "Unknown texture format: %x", (u32)info.format);
 -        /*_dbg_assert_(HW_GPU, 0);*/
 +        DEBUG_ASSERT(false);
         return {};
     }
 }
 diff --git a/src/video_core/gpu_debugger.h b/src/video_core/gpu_debugger.h
 index a51d49c..48ac269 100644
 --- a/src/video_core/gpu_debugger.h
 +++ b/src/video_core/gpu_debugger.h
 @@ -8,8 +8,6 @@
 #include <functional>
 #include <vector>
 
 -#include "common/log.h"
 -
 #include "core/hle/service/gsp_gpu.h"
 
 #include "command_processor.h"
 @@ -60,13 +58,13 @@ public:
         if (observers.empty())
             return;
 
 -        gx_command_history.push_back(GSP_GPU::Command());
 -        GSP_GPU::Command& cmd = gx_command_history[gx_command_history.size()-1];
 +        gx_command_history.emplace_back();
 +        GSP_GPU::Command& cmd = gx_command_history.back();
 
         memcpy(&cmd, command_data, sizeof(GSP_GPU::Command));
 
         ForEachObserver([this](DebuggerObserver* observer) {
 -                          observer->GXCommandProcessed(this->gx_command_history.size());
 +                          observer->GXCommandProcessed(static_cast<int>(this->gx_command_history.size()));
                         } );
     }
 
 diff --git a/src/video_core/math.h b/src/video_core/math.h
 index 9622e76..f9a8226 100644
 --- a/src/video_core/math.h
 +++ b/src/video_core/math.h
 @@ -457,27 +457,41 @@ public:
     const T& b() const { return z; }
     const T& a() const { return w; }
 
 -    // swizzlers - create a subvector of specific components
 +    // Swizzlers - Create a subvector of specific components
     // e.g. Vec2 uv() { return Vec2(x,y); }
 -    // _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
 +
 +    // _DEFINE_SWIZZLER2 defines a single such function
 +    // DEFINE_SWIZZLER2_COMP1 defines one-component functions for all component names (x<->r)
 +    // DEFINE_SWIZZLER2_COMP2 defines two-component functions for all component names (x<->r) and permutations (xy<->yx)
 #define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
 -#define DEFINE_SWIZZLER2(a, b, a2, b2) \
 +#define DEFINE_SWIZZLER2_COMP1(a, a2) \
 +    _DEFINE_SWIZZLER2(a, a, a##a); \
 +    _DEFINE_SWIZZLER2(a, a, a2##a2)
 +#define DEFINE_SWIZZLER2_COMP2(a, b, a2, b2) \
     _DEFINE_SWIZZLER2(a, b, a##b); \
     _DEFINE_SWIZZLER2(a, b, a2##b2); \
     _DEFINE_SWIZZLER2(b, a, b##a); \
     _DEFINE_SWIZZLER2(b, a, b2##a2)
 
 -    DEFINE_SWIZZLER2(x, y, r, g);
 -    DEFINE_SWIZZLER2(x, z, r, b);
 -    DEFINE_SWIZZLER2(x, w, r, a);
 -    DEFINE_SWIZZLER2(y, z, g, b);
 -    DEFINE_SWIZZLER2(y, w, g, a);
 -    DEFINE_SWIZZLER2(z, w, b, a);
 -#undef DEFINE_SWIZZLER2
 +    DEFINE_SWIZZLER2_COMP2(x, y, r, g);
 +    DEFINE_SWIZZLER2_COMP2(x, z, r, b);
 +    DEFINE_SWIZZLER2_COMP2(x, w, r, a);
 +    DEFINE_SWIZZLER2_COMP2(y, z, g, b);
 +    DEFINE_SWIZZLER2_COMP2(y, w, g, a);
 +    DEFINE_SWIZZLER2_COMP2(z, w, b, a);
 +    DEFINE_SWIZZLER2_COMP1(x, r);
 +    DEFINE_SWIZZLER2_COMP1(y, g);
 +    DEFINE_SWIZZLER2_COMP1(z, b);
 +    DEFINE_SWIZZLER2_COMP1(w, a);
 +#undef DEFINE_SWIZZLER2_COMP1
 +#undef DEFINE_SWIZZLER2_COMP2
 #undef _DEFINE_SWIZZLER2
 
 #define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
 -#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
 +#define DEFINE_SWIZZLER3_COMP1(a, a2) \
 +    _DEFINE_SWIZZLER3(a, a, a, a##a##a); \
 +    _DEFINE_SWIZZLER3(a, a, a, a2##a2##a2)
 +#define DEFINE_SWIZZLER3_COMP3(a, b, c, a2, b2, c2) \
     _DEFINE_SWIZZLER3(a, b, c, a##b##c); \
     _DEFINE_SWIZZLER3(a, c, b, a##c##b); \
     _DEFINE_SWIZZLER3(b, a, c, b##a##c); \
 @@ -491,11 +505,16 @@ public:
     _DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
     _DEFINE_SWIZZLER3(c, b, a, c2##b2##a2)
 
 -    DEFINE_SWIZZLER3(x, y, z, r, g, b);
 -    DEFINE_SWIZZLER3(x, y, w, r, g, a);
 -    DEFINE_SWIZZLER3(x, z, w, r, b, a);
 -    DEFINE_SWIZZLER3(y, z, w, g, b, a);
 -#undef DEFINE_SWIZZLER3
 +    DEFINE_SWIZZLER3_COMP3(x, y, z, r, g, b);
 +    DEFINE_SWIZZLER3_COMP3(x, y, w, r, g, a);
 +    DEFINE_SWIZZLER3_COMP3(x, z, w, r, b, a);
 +    DEFINE_SWIZZLER3_COMP3(y, z, w, g, b, a);
 +    DEFINE_SWIZZLER3_COMP1(x, r);
 +    DEFINE_SWIZZLER3_COMP1(y, g);
 +    DEFINE_SWIZZLER3_COMP1(z, b);
 +    DEFINE_SWIZZLER3_COMP1(w, a);
 +#undef DEFINE_SWIZZLER3_COMP1
 +#undef DEFINE_SWIZZLER3_COMP3
 #undef _DEFINE_SWIZZLER3
 };
 
 @@ -612,7 +631,7 @@ static inline Vec4<T> MakeVec(const Vec3<T>& xyz, const T& w)
 }
 
 template<typename T>
 -static inline Vec4<T> MakeVec(const T& x, const Vec2<T>& yzw)
 +static inline Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw)
 {
     return MakeVec(x, yzw[0], yzw[1], yzw[2]);
 }
 diff --git a/src/video_core/pica.h b/src/video_core/pica.h
 index bc863df..e4a5ef7 100644
 --- a/src/video_core/pica.h
 +++ b/src/video_core/pica.h
 @@ -236,19 +236,29 @@ struct Regs {
         };
 
         enum class ColorModifier : u32 {
 -            SourceColor         = 0,
 -            OneMinusSourceColor = 1,
 -            SourceAlpha         = 2,
 -            OneMinusSourceAlpha = 3,
 -
 -            // Other values seem to be non-standard extensions
 +            SourceColor         = 0x0,
 +            OneMinusSourceColor = 0x1,
 +            SourceAlpha         = 0x2,
 +            OneMinusSourceAlpha = 0x3,
 +            SourceRed           = 0x4,
 +            OneMinusSourceRed   = 0x5,
 +
 +            SourceGreen         = 0x8,
 +            OneMinusSourceGreen = 0x9,
 +
 +            SourceBlue          = 0xc,
 +            OneMinusSourceBlue  = 0xd,
         };
 
         enum class AlphaModifier : u32 {
 -            SourceAlpha         = 0,
 -            OneMinusSourceAlpha = 1,
 -
 -            // Other values seem to be non-standard extensions
 +            SourceAlpha         = 0x0,
 +            OneMinusSourceAlpha = 0x1,
 +            SourceRed           = 0x2,
 +            OneMinusSourceRed   = 0x3,
 +            SourceGreen         = 0x4,
 +            OneMinusSourceGreen = 0x5,
 +            SourceBlue          = 0x6,
 +            OneMinusSourceBlue  = 0x7,
         };
 
         enum class Operation : u32 {
 @@ -333,16 +343,30 @@ struct Regs {
         };
 
         union {
 -            enum BlendEquation : u32 {
 -                Add = 0,
 +            enum class BlendEquation : u32 {
 +                Add             = 0,
 +                Subtract        = 1,
 +                ReverseSubtract = 2,
 +                Min             = 3,
 +                Max             = 4
             };
 
             enum BlendFactor : u32 {
 -                Zero = 0,
 -                One = 1,
 -
 -                SourceAlpha = 6,
 -                OneMinusSourceAlpha = 7,
 +                Zero                    = 0,
 +                One                     = 1,
 +                SourceColor             = 2,
 +                OneMinusSourceColor     = 3,
 +                DestColor               = 4,
 +                OneMinusDestColor       = 5,
 +                SourceAlpha             = 6,
 +                OneMinusSourceAlpha     = 7,
 +                DestAlpha               = 8,
 +                OneMinusDestAlpha       = 9,
 +                ConstantColor           = 10,
 +                OneMinusConstantColor   = 11,
 +                ConstantAlpha           = 12,
 +                OneMinusConstantAlpha   = 13,
 +                SourceAlphaSaturate     = 14
             };
 
             BitField< 0, 8, BlendEquation> blend_equation_rgb;
 @@ -363,7 +387,12 @@ struct Regs {
             BitField<0, 4, Op> op;
         } logic_op;
 
 -        INSERT_PADDING_WORDS(0x1);
 +        union {
 +            BitField< 0, 8, u32> r;
 +            BitField< 8, 8, u32> g;
 +            BitField<16, 8, u32> b;
 +            BitField<24, 8, u32> a;
 +        } blend_const;
 
         union {
             BitField< 0, 1, u32> enable;
 diff --git a/src/video_core/primitive_assembly.cpp b/src/video_core/primitive_assembly.cpp
 index 242a07e..1776a19 100644
 --- a/src/video_core/primitive_assembly.cpp
 +++ b/src/video_core/primitive_assembly.cpp
 @@ -6,6 +6,7 @@
 #include "primitive_assembly.h"
 #include "vertex_shader.h"
 
 +#include "common/logging/log.h"
 #include "video_core/debug_utils/debug_utils.h"
 
 namespace Pica {
 diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
 index c9e0a79..81df09b 100644
 --- a/src/video_core/rasterizer.cpp
 +++ b/src/video_core/rasterizer.cpp
 @@ -5,6 +5,7 @@
 #include <algorithm>
 
 #include "common/common_types.h"
 +#include "common/math_util.h"
 
 #include "math.h"
 #include "pica.h"
 @@ -35,7 +36,7 @@ static void DrawPixel(int x, int y, const Math::Vec4<u8>& color) {
 
     default:
         LOG_CRITICAL(Render_Software, "Unknown framebuffer color format %x", registers.framebuffer.color_format);
 -        exit(1);
 +        UNIMPLEMENTED();
     }
 }
 
 @@ -57,8 +58,6 @@ static const Math::Vec4<u8> GetPixel(int x, int y) {
 static u32 GetDepth(int x, int y) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 -    if (!depth_buffer)
 -        return 0;
 
     y = (registers.framebuffer.height - y);
 
 @@ -69,8 +68,6 @@ static u32 GetDepth(int x, int y) {
 static void SetDepth(int x, int y, u16 value) {
     const PAddr addr = registers.framebuffer.GetDepthBufferPhysicalAddress();
     u16* depth_buffer = reinterpret_cast<u16*>(Memory::GetPointer(PAddrToVAddr(addr)));
 -    if (!depth_buffer)
 -        return;
 
     y = (registers.framebuffer.height - y);
 
 @@ -113,47 +110,45 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1,
     return Math::Cross(vec1, vec2).z;
 };
 
 -void ProcessTriangle(const VertexShader::OutputVertex& v0_,
 -                     const VertexShader::OutputVertex& v1_,
 -                     const VertexShader::OutputVertex& v2_)
 +/**
 + * Helper function for ProcessTriangle with the "reversed" flag to allow for implementing
 + * culling via recursion.
 + */
 +static void ProcessTriangleInternal(const VertexShader::OutputVertex& v0,
 +                                    const VertexShader::OutputVertex& v1,
 +                                    const VertexShader::OutputVertex& v2,
 +                                    bool reversed = false)
 {
     // vertex positions in rasterizer coordinates
 -    auto FloatToFix = [](float24 flt) {
 -                          // TODO: Rounding here is necessary to prevent garbage pixels at
 -                          //       triangle borders. Is it that the correct solution, though?
 -                          return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
 -                      };
 -    auto ScreenToRasterizerCoordinates = [FloatToFix](const Math::Vec3<float24> vec) {
 -                                             return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
 -                                         };
 -
 -    VertexShader::OutputVertex v0 = v0_;
 -    VertexShader::OutputVertex v1 = v1_;
 -    VertexShader::OutputVertex v2 = v2_;
 +    static auto FloatToFix = [](float24 flt) {
 +        // TODO: Rounding here is necessary to prevent garbage pixels at
 +        //       triangle borders. Is that the correct solution, though?
 +        return Fix12P4(static_cast<unsigned short>(round(flt.ToFloat32() * 16.0f)));
 +    };
 +    static auto ScreenToRasterizerCoordinates = [](const Math::Vec3<float24>& vec) {
 +        return Math::Vec3<Fix12P4>{FloatToFix(vec.x), FloatToFix(vec.y), FloatToFix(vec.z)};
 +    };
 +
     Math::Vec3<Fix12P4> vtxpos[3]{ ScreenToRasterizerCoordinates(v0.screenpos),
                                    ScreenToRasterizerCoordinates(v1.screenpos),
                                    ScreenToRasterizerCoordinates(v2.screenpos) };
 
 -    if (registers.cull_mode == Regs::CullMode::KeepCounterClockWise) {
 -        // Reverse vertex order and use the CW code path.
 -        std::swap(vtxpos[1], vtxpos[2]);
 -        std::swap(v1, v2);
 -    }
 -
 -    if (registers.cull_mode != Regs::CullMode::KeepAll) {
 -        // Cull away triangles which are wound counter-clockwise.
 -        // TODO: Make work :(
 -        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
 -            std::swap(vtxpos[1], vtxpos[2]);
 -            std::swap(v1, v2);
 -//            return;
 +    if (registers.cull_mode == Regs::CullMode::KeepAll) {
 +        // Make sure we always end up with a triangle wound counter-clockwise
 +        if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
 +            ProcessTriangleInternal(v0, v2, v1, true);
 +            return;
         }
     } else {
 -        // TODO: Consider a check for degenerate triangles ("SignedArea == 0")
 -        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
 -            std::swap(vtxpos[1], vtxpos[2]);
 -            std::swap(v1, v2);
 +        if (!reversed && registers.cull_mode == Regs::CullMode::KeepClockWise) {
 +            // Reverse vertex order and use the CCW code path.
 +            ProcessTriangleInternal(v0, v2, v1, true);
 +            return;
         }
 +
 +        // Cull away triangles which are wound clockwise.
 +        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0)
 +            return;
     }
 
     // TODO: Proper scissor rect test!
 @@ -255,7 +250,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                 if (!texture.enabled)
                     continue;
 
 -                //_dbg_assert_(HW_GPU, 0 != texture.config.address);
 +                DEBUG_ASSERT(0 != texture.config.address);
 
                 int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
                 int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
 @@ -267,25 +262,25 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                             return val;
 
                         case Regs::TextureConfig::Repeat:
 -                            return (int)(((unsigned)val) % size);
 +                            return (int)((unsigned)val % size);
 
                         case Regs::TextureConfig::MirroredRepeat:
                         {
 -                            int val = (int)(((unsigned)val) % (2*size));
 -                            if (val >= size)
 -                                val = 2 * size - 1 - val;
 -                            return val;
 +                            int coord = (int)((unsigned)val % (2 * size));
 +                            if (coord >= size)
 +                                coord = 2 * size - 1 - coord;
 +                            return coord;
                         }
 
                         default:
                             LOG_ERROR(HW_GPU, "Unknown texture coordinate wrapping mode %x\n", (int)mode);
 -                            //_dbg_assert_(HW_GPU, 0);
 +                            UNIMPLEMENTED();
                             return 0;
                     }
                 };
 
                 // Textures are laid out from bottom to top, hence we invert the t coordinate.
 -                // NOTE: This may not be the right to place the inversion.
 +                // NOTE: This may not be the right place for the inversion.
                 // TODO: Check if this applies to ETC textures, too.
                 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
                 t = texture.config.height - 1 - GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
 @@ -335,41 +330,13 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
 
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner source %d\n", (int)source);
 -                        //_dbg_assert_(HW_GPU, 0);
 +                        UNIMPLEMENTED();
                         return {};
                     }
                 };
 
 -                auto GetAlphaSource = [&](Source source) -> u8 {
 -                    switch (source) {
 -                    case Source::PrimaryColor:
 -                        return primary_color.a();
 -
 -                    case Source::Texture0:
 -                        return texture_color[0].a();
 -
 -                    case Source::Texture1:
 -                        return texture_color[1].a();
 -
 -                    case Source::Texture2:
 -                        return texture_color[2].a();
 -
 -                    case Source::Constant:
 -                        return tev_stage.const_a;
 -
 -                    case Source::Previous:
 -                        return combiner_output.a();
 -
 -                    default:
 -                        LOG_ERROR(HW_GPU, "Unknown alpha combiner source %d\n", (int)source);
 -                        //_dbg_assert_(HW_GPU, 0);
 -                        return 0;
 -                    }
 -                };
 -
                 static auto GetColorModifier = [](ColorModifier factor, const Math::Vec4<u8>& values) -> Math::Vec3<u8> {
 -                    switch (factor)
 -                    {
 +                    switch (factor) {
                     case ColorModifier::SourceColor:
                         return values.rgb();
 
 @@ -377,12 +344,28 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                         return (Math::Vec3<u8>(255, 255, 255) - values.rgb()).Cast<u8>();
 
                     case ColorModifier::SourceAlpha:
 -                        return { values.a(), values.a(), values.a() };
 +                        return values.aaa();
 
 -                    default:
 -                        LOG_ERROR(HW_GPU, "Unknown color factor %d\n", (int)factor);
 -                        //_dbg_assert_(HW_GPU, 0);
 -                        return {};
 +                    case ColorModifier::OneMinusSourceAlpha:
 +                        return (Math::Vec3<u8>(255, 255, 255) - values.aaa()).Cast<u8>();
 +
 +                    case ColorModifier::SourceRed:
 +                        return values.rrr();
 +
 +                    case ColorModifier::OneMinusSourceRed:
 +                        return (Math::Vec3<u8>(255, 255, 255) - values.rrr()).Cast<u8>();
 +
 +                    case ColorModifier::SourceGreen:
 +                        return values.ggg();
 +
 +                    case ColorModifier::OneMinusSourceGreen:
 +                        return (Math::Vec3<u8>(255, 255, 255) - values.ggg()).Cast<u8>();
 +
 +                    case ColorModifier::SourceBlue:
 +                        return values.bbb();
 +
 +                    case ColorModifier::OneMinusSourceBlue:
 +                        return (Math::Vec3<u8>(255, 255, 255) - values.bbb()).Cast<u8>();
                     }
                 };
 
 @@ -394,10 +377,23 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                     case AlphaModifier::OneMinusSourceAlpha:
                         return 255 - values.a();
 
 -                    default:
 -                        LOG_ERROR(HW_GPU, "Unknown alpha factor %d\n", (int)factor);
 -                        //_dbg_assert_(HW_GPU, 0);
 -                        return 0;
 +                    case AlphaModifier::SourceRed:
 +                        return values.r();
 +
 +                    case AlphaModifier::OneMinusSourceRed:
 +                        return 255 - values.r();
 +
 +                    case AlphaModifier::SourceGreen:
 +                        return values.g();
 +
 +                    case AlphaModifier::OneMinusSourceGreen:
 +                        return 255 - values.g();
 +
 +                    case AlphaModifier::SourceBlue:
 +                        return values.b();
 +
 +                    case AlphaModifier::OneMinusSourceBlue:
 +                        return 255 - values.b();
                     }
                 };
 
 @@ -451,7 +447,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
 
                     default:
                         LOG_ERROR(HW_GPU, "Unknown color combiner operation %d\n", (int)op);
 -                        //_dbg_assert_(HW_GPU, 0);
 +                        UNIMPLEMENTED();
                         return {};
                     }
                 };
 @@ -481,7 +477,7 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
 
                     default:
                         LOG_ERROR(HW_GPU, "Unknown alpha combiner operation %d\n", (int)op);
 -                        //_dbg_assert_(HW_GPU, 0);
 +                        UNIMPLEMENTED();
                         return 0;
                     }
                 };
 @@ -607,28 +603,58 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                 auto params = registers.output_merger.alpha_blending;
 
                 auto LookupFactorRGB = [&](decltype(params)::BlendFactor factor) -> Math::Vec3<u8> {
 -                    switch(factor) {
 +                    switch (factor) { // Every returned channel is an 8-bit blend weight (255 stands for 1.0)
                     case params.Zero:
                         return Math::Vec3<u8>(0, 0, 0);
 
                     case params.One:
                         return Math::Vec3<u8>(255, 255, 255);
 
 +                    case params.SourceColor:
 +                        return combiner_output.rgb();
 +
 +                    case params.OneMinusSourceColor:
 +                        return Math::Vec3<u8>(255 - combiner_output.r(), 255 - combiner_output.g(), 255 - combiner_output.b());
 +
 +                    case params.DestColor:
 +                        return dest.rgb();
 +
 +                    case params.OneMinusDestColor:
 +                        return Math::Vec3<u8>(255 - dest.r(), 255 - dest.g(), 255 - dest.b());
 +
                     case params.SourceAlpha:
 -                        return Math::MakeVec(combiner_output.a(), combiner_output.a(), combiner_output.a());
 +                        return Math::Vec3<u8>(combiner_output.a(), combiner_output.a(), combiner_output.a());
 
                     case params.OneMinusSourceAlpha:
 -                        return Math::Vec3<u8>(255-combiner_output.a(), 255-combiner_output.a(), 255-combiner_output.a());
 +                        return Math::Vec3<u8>(255 - combiner_output.a(), 255 - combiner_output.a(), 255 - combiner_output.a());
 +
 +                    case params.DestAlpha:
 +                        return Math::Vec3<u8>(dest.a(), dest.a(), dest.a());
 +
 +                    case params.OneMinusDestAlpha:
 +                        return Math::Vec3<u8>(255 - dest.a(), 255 - dest.a(), 255 - dest.a());
 +
 +                    case params.ConstantColor:
 +                        return Math::Vec3<u8>(registers.output_merger.blend_const.r, registers.output_merger.blend_const.g, registers.output_merger.blend_const.b);
 +
 +                    case params.OneMinusConstantColor:
 +                        return Math::Vec3<u8>(255 - registers.output_merger.blend_const.r, 255 - registers.output_merger.blend_const.g, 255 - registers.output_merger.blend_const.b);
 +
 +                    case params.ConstantAlpha:
 +                        return Math::Vec3<u8>(registers.output_merger.blend_const.a, registers.output_merger.blend_const.a, registers.output_merger.blend_const.a);
 +
 +                    case params.OneMinusConstantAlpha:
 +                        return Math::Vec3<u8>(255 - registers.output_merger.blend_const.a, 255 - registers.output_merger.blend_const.a, 255 - registers.output_merger.blend_const.a);
 
                     default:
 -                        return Math::Vec3<u8>(0, 0, 0); //LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
 -                        //exit(0);
 -                        break;
 +                        LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor);
 +                        UNIMPLEMENTED();
 +                        return Math::Vec3<u8>(0, 0, 0); // Defined fallback in case UNIMPLEMENTED() does not abort
                     }
                 };
 
                 auto LookupFactorA = [&](decltype(params)::BlendFactor factor) -> u8 {
 -                    switch(factor) {
 +                    switch (factor) { // Returns the 8-bit blend weight applied to the alpha channel
                     case params.Zero:
                         return 0;
 
 @@ -641,11 +667,73 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                     case params.OneMinusSourceAlpha:
                         return 255 - combiner_output.a();
 
 +                    case params.DestAlpha:
 +                        return dest.a();
 +
 +                    case params.OneMinusDestAlpha:
 +                        return 255 - dest.a();
 +
 +                    case params.ConstantAlpha:
 +                        return registers.output_merger.blend_const.a;
 +
 +                    case params.OneMinusConstantAlpha:
 +                        return 255 - registers.output_merger.blend_const.a;
 +
                     default:
 -                        return 0; //LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
 -                        //exit(0);
 +                        LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor);
 +                        UNIMPLEMENTED();
 +                        return 0; // Defined fallback in case UNIMPLEMENTED() does not abort
 +                    }
 +                };
 +
 +                using BlendEquation = decltype(params)::BlendEquation;
 +                static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor,
 +                                                       const Math::Vec4<u8>& dest, const Math::Vec4<u8>& destfactor,
 +                                                       BlendEquation equation) {
 +                    // Combines src and dest per channel with the given equation; the result is clamped to [0, 255].
 +                    Math::Vec4<int> result(0, 0, 0, 0); // Zeroed so an unknown equation still yields a defined value
 +                    auto src_result = (src  *  srcfactor).Cast<int>();
 +                    auto dst_result = (dest * destfactor).Cast<int>();
 +
 +                    switch (equation) {
 +                    case BlendEquation::Add:
 +                        result = (src_result + dst_result) / 255;
                         break;
 +
 +                    case BlendEquation::Subtract:
 +                        result = (src_result - dst_result) / 255;
 +                        break;
 +
 +                    case BlendEquation::ReverseSubtract:
 +                        result = (dst_result - src_result) / 255;
 +                        break;
 +
 +                    // TODO: How do these two actually work?
 +                    //       OpenGL doesn't include the blend factors in the min/max computations,
 +                    //       but is this what the 3DS actually does?
 +                    case BlendEquation::Min:
 +                        result.r() = std::min(src.r(), dest.r());
 +                        result.g() = std::min(src.g(), dest.g());
 +                        result.b() = std::min(src.b(), dest.b());
 +                        result.a() = std::min(src.a(), dest.a());
 +                        break;
 +
 +                    case BlendEquation::Max:
 +                        result.r() = std::max(src.r(), dest.r());
 +                        result.g() = std::max(src.g(), dest.g());
 +                        result.b() = std::max(src.b(), dest.b());
 +                        result.a() = std::max(src.a(), dest.a());
 +                        break;
 +
 +                    default:
 +                        LOG_CRITICAL(HW_GPU, "Unknown blend equation %x", equation);
 +                        UNIMPLEMENTED();
                     }
 +
 +                    return Math::Vec4<u8>(MathUtil::Clamp(result.r(), 0, 255),
 +                                    MathUtil::Clamp(result.g(), 0, 255),
 +                                    MathUtil::Clamp(result.b(), 0, 255),
 +                                    MathUtil::Clamp(result.a(), 0, 255));
                 };
 
                 auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb),
 @@ -653,38 +741,11 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
                 auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb),
                                                LookupFactorA(params.factor_dest_a));
 
 -                switch (params.blend_equation_rgb) {
 -                case params.Add:
 -                {
 -                    auto result = (combiner_output * srcfactor + dest * dstfactor) / 255;
 -                    result.r() = std::min(255, result.r());
 -                    result.g() = std::min(255, result.g());
 -                    result.b() = std::min(255, result.b());
 -                    blend_output = result.Cast<u8>();
 -                    break;
 -                }
 -
 -                default:
 -                    LOG_CRITICAL(HW_GPU, "Unknown RGB blend equation %x", params.blend_equation_rgb.Value());
 -                    exit(0);
 -                }
 -
 -                switch (params.blend_equation_a) {
 -                case params.Add:
 -                {
 -                    auto result = (combiner_output * srcfactor + dest * dstfactor) / 255;
 -                    result.a() = std::min(255, result.a());
 -                    blend_output.a() = result.Cast<u8>().a();
 -                    break;
 -                }
 -
 -                default:
 -                    LOG_CRITICAL(HW_GPU, "Unknown alpha blend equation %x", params.blend_equation_a.Value());
 -                    exit(0);
 -                }
 +                blend_output     = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb);
 +                blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a();
             } else {
                 LOG_CRITICAL(HW_GPU, "logic op: %x", registers.output_merger.logic_op);
 -                exit(0);
 +                UNIMPLEMENTED();
             }
 
             const Math::Vec4<u8> result = {
 @@ -699,6 +760,12 @@ void ProcessTriangle(const VertexShader::OutputVertex& v0_,
     }
 }
 
 +// Forwards the triangle's three vertices, unchanged and in order, to ProcessTriangleInternal.
 +void ProcessTriangle(const VertexShader::OutputVertex& v0,
 +                     const VertexShader::OutputVertex& v1, const VertexShader::OutputVertex& v2) {
 +    ProcessTriangleInternal(v0, v1, v2);
 +}
 +
 } // namespace Rasterizer
 
 } // namespace Pica
 diff --git a/src/video_core/renderer_opengl/gl_shader_util.cpp b/src/video_core/renderer_opengl/gl_shader_util.cpp
 index e982e37..42d0e59 100644
 --- a/src/video_core/renderer_opengl/gl_shader_util.cpp
 +++ b/src/video_core/renderer_opengl/gl_shader_util.cpp
 @@ -3,7 +3,7 @@
 // Refer to the license.txt file included.
 
 #include "gl_shader_util.h"
 -#include "common/log.h"
 +#include "common/logging/log.h"
 
 #include <vector>
 #include <algorithm>
 diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
 index aa47bd6..2726951 100644
 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp
 +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
 @@ -61,15 +61,13 @@ void RendererOpenGL::SwapBuffers() {
     for(int i : {0, 1}) {
         const auto& framebuffer = GPU::g_regs.framebuffer_config[i];
 
 -        if (textures[i].width != (GLsizei)framebuffer.width || textures[i].height != (GLsizei)framebuffer.height) {
 +        if (textures[i].width != (GLsizei)framebuffer.width ||
 +            textures[i].height != (GLsizei)framebuffer.height ||
 +            textures[i].format != framebuffer.color_format) {
             // Reallocate texture if the framebuffer size has changed.
             // This is expected to not happen very often and hence should not be a
             // performance problem.
 -            glBindTexture(GL_TEXTURE_2D, textures[i].handle);
 -            glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, framebuffer.width, framebuffer.height, 0,
 -                GL_BGR, GL_UNSIGNED_BYTE, nullptr);
 -            textures[i].width = framebuffer.width;
 -            textures[i].height = framebuffer.height;
 +            ConfigureFramebufferTexture(textures[i], framebuffer);
         }
 
         LoadFBToActiveGLTexture(GPU::g_regs.framebuffer_config[i], textures[i]);
 @@ -98,16 +96,15 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
 
     const u8* framebuffer_data = Memory::GetPointer(framebuffer_vaddr);
 
 -    // TODO: Handle other pixel formats
 -    _dbg_assert_msg_(Render_OpenGL, framebuffer.color_format == GPU::Regs::PixelFormat::RGB8,
 -                     "Unsupported 3DS pixel format.");
 +    int bpp = GPU::Regs::BytesPerPixel(framebuffer.color_format);
 +    size_t pixel_stride = framebuffer.stride / bpp;
 
 -    size_t pixel_stride = framebuffer.stride / 3;
     // OpenGL only supports specifying a stride in units of pixels, not bytes, unfortunately
 -    _dbg_assert_(Render_OpenGL, pixel_stride * 3 == framebuffer.stride);
 +    ASSERT(pixel_stride * bpp == framebuffer.stride);
 +
     // Ensure no bad interactions with GL_UNPACK_ALIGNMENT, which by default
     // only allows rows to have a memory alignement of 4.
 -    _dbg_assert_(Render_OpenGL, pixel_stride % 4 == 0);
 +    ASSERT(pixel_stride % 4 == 0);
 
     glBindTexture(GL_TEXTURE_2D, texture.handle);
     glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint)pixel_stride);
 @@ -118,7 +115,7 @@ void RendererOpenGL::LoadFBToActiveGLTexture(const GPU::Regs::FramebufferConfig&
     // TODO: Applications could theoretically crash Citra here by specifying too large
     //       framebuffer sizes. We should make sure that this cannot happen.
     glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, framebuffer.width, framebuffer.height,
 -        GL_BGR, GL_UNSIGNED_BYTE, framebuffer_data);
 +        texture.gl_format, texture.gl_type, framebuffer_data);
 
     glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 
 @@ -171,6 +168,59 @@ void RendererOpenGL::InitOpenGLObjects() {
     glBindTexture(GL_TEXTURE_2D, 0);
 }
 
 +// (Re)allocates the GL storage of `texture` for the framebuffer's size and pixel format and records the
 +// matching upload format/type in `texture`. An unsupported pixel format leaves `texture` untouched.
 +void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
 +                                                 const GPU::Regs::FramebufferConfig& framebuffer) {
 +    GPU::Regs::PixelFormat format = framebuffer.color_format;
 +    GLint internal_format;
 +
 +    switch (format) {
 +    case GPU::Regs::PixelFormat::RGBA8:
 +        internal_format = GL_RGBA;
 +        texture.gl_format = GL_RGBA;
 +        texture.gl_type = GL_UNSIGNED_INT_8_8_8_8;
 +        break;
 +    case GPU::Regs::PixelFormat::RGB8:
 +        // This pixel format uses BGR since GL_UNSIGNED_BYTE specifies byte-order, unlike every
 +        // specific OpenGL type used in this function using native-endian (that is, little-endian
 +        // mostly everywhere) for words or half-words.
 +        // TODO: check how those behave on big-endian processors.
 +        internal_format = GL_RGB;
 +        texture.gl_format = GL_BGR;
 +        texture.gl_type = GL_UNSIGNED_BYTE;
 +        break;
 +    case GPU::Regs::PixelFormat::RGB565:
 +        internal_format = GL_RGB;
 +        texture.gl_format = GL_RGB;
 +        texture.gl_type = GL_UNSIGNED_SHORT_5_6_5;
 +        break;
 +    case GPU::Regs::PixelFormat::RGB5A1:
 +        internal_format = GL_RGBA;
 +        texture.gl_format = GL_RGBA;
 +        texture.gl_type = GL_UNSIGNED_SHORT_5_5_5_1;
 +        break;
 +    case GPU::Regs::PixelFormat::RGBA4:
 +        internal_format = GL_RGBA;
 +        texture.gl_format = GL_RGBA;
 +        texture.gl_type = GL_UNSIGNED_SHORT_4_4_4_4;
 +        break;
 +    default:
 +        // UNIMPLEMENTED() may not abort; never pass an uninitialized internal_format to OpenGL.
 +        UNIMPLEMENTED();
 +        return;
 +    }
 +
 +    // Commit the new configuration only once the pixel format is known to be supported.
 +    texture.format = format;
 +    texture.width = framebuffer.width;
 +    texture.height = framebuffer.height;
 +
 +    glBindTexture(GL_TEXTURE_2D, texture.handle);
 +    glTexImage2D(GL_TEXTURE_2D, 0, internal_format, texture.width, texture.height, 0,
 +            texture.gl_format, texture.gl_type, nullptr);
 +}
 +
 /**
  * Draws a single texture to the emulator window, rotating the texture to correct for the 3DS's LCD rotation.
  */
 diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
 index cf78c1e..bcabab5 100644
 --- a/src/video_core/renderer_opengl/renderer_opengl.h
 +++ b/src/video_core/renderer_opengl/renderer_opengl.h
 @@ -43,9 +43,14 @@ private:
         GLuint handle;
         GLsizei width;
         GLsizei height;
 +        GPU::Regs::PixelFormat format;
 +        GLenum gl_format;
 +        GLenum gl_type;
     };
 
     void InitOpenGLObjects();
 +    static void ConfigureFramebufferTexture(TextureInfo& texture,
 +                                            const GPU::Regs::FramebufferConfig& framebuffer);
     void DrawScreens();
     void DrawSingleScreenRotated(const TextureInfo& texture, float x, float y, float w, float h);
     void UpdateFramerate();
 diff --git a/src/video_core/vertex_shader.cpp b/src/video_core/vertex_shader.cpp
 index cb6785a..def868a 100644
 --- a/src/video_core/vertex_shader.cpp
 +++ b/src/video_core/vertex_shader.cpp
 @@ -85,9 +85,11 @@ struct VertexShaderState {
     };
 
     struct CallStackElement {
 -        u32 final_address;
 -        u32 return_address;
 -        std::function<int(VertexShaderState&)> branch_end_callback;
 +        u32 final_address;  // Program counter offset at which this element is evaluated
 +        u32 return_address; // Offset execution continues at once the element is popped
 +        u8 repeat_counter;  // Tested against zero, then decremented, each time final_address is hit
 +        u8 loop_increment;  // Added to address_registers[2] each time final_address is hit
 +                            // TODO: Should this be a signed value? Does it even matter?
     };
 
     // TODO: Is there a maximal size for this?
 @@ -106,10 +108,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
     while (true) {
         if (!state.call_stack.empty()) {
 -            if (state.program_counter - shader_memory.data() == state.call_stack.top().final_address) {
 +            auto& top = state.call_stack.top();
 +            if (state.program_counter - shader_memory.data() == top.final_address) {
 +                state.address_registers[2] += top.loop_increment;
 
 -                if (state.call_stack.top().branch_end_callback(state)) {
 -                    state.program_counter = &shader_memory[state.call_stack.top().return_address];
 +                if (top.repeat_counter-- == 0) {
 +                    state.program_counter = &shader_memory[top.return_address];
                     state.call_stack.pop();
                 }
 
 @@ -122,11 +126,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
         const Instruction& instr = *(const Instruction*)state.program_counter;
         const SwizzlePattern& swizzle = *(SwizzlePattern*)&swizzle_data[instr.common.operand_desc_id];
 
 -        auto call = [&](VertexShaderState& state, u32 offset, u32 num_instructions,
 -                        u32 return_offset,
 -                        std::function<int(VertexShaderState&)> branch_end_callback) {
 +        static auto call = [](VertexShaderState& state, u32 offset, u32 num_instructions,
 +                              u32 return_offset, u8 repeat_count, u8 loop_increment) { // Jump to offset, push exit info
             state.program_counter = &shader_memory[offset] - 1; // -1 to make sure when incrementing the PC we end up at the correct offset
 -            state.call_stack.push({ offset + num_instructions, return_offset, branch_end_callback });
 +            state.call_stack.push({ offset + num_instructions, return_offset, repeat_count, loop_increment });
         };
         u32 binary_offset = state.program_counter - shader_memory.data();
 
 @@ -152,13 +155,10 @@ static void ProcessShaderCode(VertexShaderState& state) {
         case Instruction::OpCodeType::Arithmetic:
         {
             bool is_inverted = 0 != (instr.opcode.GetInfo().subtype & Instruction::OpCodeInfo::SrcInversed);
 -            if (is_inverted) {
 -                // TODO: We don't really support this properly: For instance, the address register
 -                //       offset needs to be applied to SRC2 instead, etc.
 -                //       For now, we just abort in this situation.
 -                LOG_CRITICAL(HW_GPU, "Bad condition...");
 -                exit(0);
 -            }
 +            // TODO: We don't really support this properly: For instance, the address register
 +            //       offset needs to be applied to SRC2 instead, etc.
 +            //       For now, we just abort in this situation.
 +            ASSERT_MSG(!is_inverted, "Bad condition...");
 
             const int address_offset = (instr.common.address_register_index == 0)
                                        ? 0 : state.address_registers[instr.common.address_register_index - 1];
 @@ -166,8 +166,6 @@ static void ProcessShaderCode(VertexShaderState& state) {
             const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + address_offset);
             const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted));
 
 -            if (!src1_ || !src2_) break;
 -
             const bool negate_src1 = ((bool)swizzle.negate_src1 != false);
             const bool negate_src2 = ((bool)swizzle.negate_src2 != false);
 
 @@ -263,7 +261,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
                     // TODO: Be stable against division by zero!
                     // TODO: I think this might be wrong... we should only use one component here
 -                    dest[i] = float24::FromFloat32(1.0 / src1[i].ToFloat32());
 +                    dest[i] = float24::FromFloat32(1.0f / src1[i].ToFloat32());
                 }
 
                 break;
 @@ -278,7 +276,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
                     // TODO: Be stable against division by zero!
                     // TODO: I think this might be wrong... we should only use one component here
 -                    dest[i] = float24::FromFloat32(1.0 / sqrt(src1[i].ToFloat32()));
 +                    dest[i] = float24::FromFloat32(1.0f / sqrt(src1[i].ToFloat32()));
                 }
 
                 break;
 @@ -350,7 +348,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
             default:
                 LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x",
                           (int)instr.opcode.Value(), instr.opcode.GetInfo().name, instr.hex);
 -                _dbg_assert_(HW_GPU, 0);
 +                DEBUG_ASSERT(false);
                 break;
             }
 
 @@ -468,8 +466,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                 call(state,
                      instr.flow_control.dest_offset,
                      instr.flow_control.num_instructions,
 -                     binary_offset + 1,
 -                     [](VertexShaderState&) { return true; });
 +                     binary_offset + 1, 0, 0);
                 break;
 
             case Instruction::OpCode::CALLU:
 @@ -477,9 +474,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
 -                        binary_offset + 1,
 -                        [](VertexShaderState&) { return true; }
 -                        );
 +                        binary_offset + 1, 0, 0);
                 }
                 break;
 
 @@ -488,9 +483,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                         instr.flow_control.dest_offset,
                         instr.flow_control.num_instructions,
 -                        binary_offset + 1,
 -                        [](VertexShaderState&) { return true; }
 -                    );
 +                        binary_offset + 1, 0, 0);
                 }
                 break;
 
 @@ -502,14 +495,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                          binary_offset + 1,
                          instr.flow_control.dest_offset - binary_offset - 1,
 -                         instr.flow_control.dest_offset + instr.flow_control.num_instructions,
 -                         [](VertexShaderState&) { return true; });
 +                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
                          instr.flow_control.dest_offset,
                          instr.flow_control.num_instructions,
 -                         instr.flow_control.dest_offset + instr.flow_control.num_instructions,
 -                         [](VertexShaderState&) { return true; });
 +                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 }
 
                 break;
 @@ -522,14 +513,12 @@ static void ProcessShaderCode(VertexShaderState& state) {
                     call(state,
                          binary_offset + 1,
                          instr.flow_control.dest_offset - binary_offset - 1,
 -                         instr.flow_control.dest_offset + instr.flow_control.num_instructions,
 -                         [](VertexShaderState&) { return true; });
 +                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 } else {
                     call(state,
                          instr.flow_control.dest_offset,
                          instr.flow_control.num_instructions,
 -                         instr.flow_control.dest_offset + instr.flow_control.num_instructions,
 -                         [](VertexShaderState&) { return true; });
 +                         instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, 0);
                 }
 
                 break;
 @@ -537,25 +526,14 @@ static void ProcessShaderCode(VertexShaderState& state) {
 
             case Instruction::OpCode::LOOP:
             {
 -                        LOG_ERROR(HW_GPU, "%x %x %x %x %x", state.address_registers[2],
 -                                  shader_uniforms.i[instr.flow_control.int_uniform_id].x,
 -                                  shader_uniforms.i[instr.flow_control.int_uniform_id].y,
 -                                  shader_uniforms.i[instr.flow_control.int_uniform_id].z,
 -                                  shader_uniforms.i[instr.flow_control.int_uniform_id].w);
                 state.address_registers[2] = shader_uniforms.i[instr.flow_control.int_uniform_id].y;
 
                 call(state,
                      binary_offset + 1,
                      instr.flow_control.dest_offset - binary_offset + 1,
                      instr.flow_control.dest_offset + 1,
 -                     [&instr, binary_offset](VertexShaderState& state) { // Capture by value intended!
 -//                        state.address_registers[2] += shader_uniforms.i[instr.flow_control.int_uniform_id].z;
 -                        state.address_registers[2] += 1;
 -                        state.program_counter = &shader_memory[binary_offset+1];
 -                        return state.address_registers[2] > shader_uniforms.i[instr.flow_control.int_uniform_id].x +
 -                                                         shader_uniforms.i[instr.flow_control.int_uniform_id].y;
 -                     }
 -                    );
 +                     shader_uniforms.i[instr.flow_control.int_uniform_id].x,
 +                     shader_uniforms.i[instr.flow_control.int_uniform_id].z);
                 break;
             }
 
 diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
 index c9707e5..0a23659 100644
 --- a/src/video_core/video_core.cpp
 +++ b/src/video_core/video_core.cpp
 @@ -4,7 +4,6 @@
 
 #include "common/common.h"
 #include "common/emu_window.h"
 -#include "common/log.h"
 
 #include "core/core.h"