JayFoxRox · August 29, 2015 14:01
diff --git a/debugger b/debugger
 /*
  * Writes the GLSL source of the first shader attached to the currently bound
  * GL program to `file`.
  *
  * file:       path of the output file (created/truncated).
  * kelvin:     only read when `standalone` is set; supplies the constants.
  * standalone: when true, a prologue is written in front of the source that
  *             defines `attribute` and `uniform` away and renames the shader's
  *             main() to shader(), and an epilogue adds a setup() function
  *             that assigns the constants and sets v0 from gl_Vertex.
  *
  * The prologue ends in "// ", so the first line of the dumped source ends up
  * commented out (presumably its own #version line - confirm).
  *
  * If the file cannot be opened a message is printed and nothing is written.
  * If no shader is attached, or its source cannot be fetched, an empty source
  * is written instead of reading uninitialised memory.
  */
 static void debugger_export_vertex_shader(const char* file, KelvinState* kelvin, bool standalone) {
    int i;
    GLint prog = 0;
    glGetIntegerv(GL_CURRENT_PROGRAM,&prog);
    FILE* f = fopen(file,"wb");
    if (f == NULL) {
        fprintf(stderr, "debugger: cannot open '%s' for writing\n", file);
        return;
    }
    if (standalone) {
        fprintf(f,"#version 110\n"
                  "\n"
                  // Writeable registers
                  "#define attribute\n"
                  "#define uniform\n"
                  "\n"
                  // Move entrypoint
                  "void setup(void);\n"
                  "void shader(void);\n"
                  "void main(void) {\n"
                  "  setup();\n"
                  "  shader();\n"
                  "}\n"
                  "#define main(void) shader(void)\n"
                  "\n"
                  "\n// ");
    }
    char program[20*1024];
    GLsizei count = 0;
    GLsizei length = 0;
    GLuint shader = 0;
    /* Start with an empty string so a failing GL call leaves valid data */
    program[0] = '\0';
    glGetAttachedShaders(prog,  1,  &count,  &shader);
    if (count > 0) {
        glGetShaderSource(shader,  sizeof(program), &length,  program);
        /* Never index outside the buffer, whatever the driver reported */
        if (length < 0) {
            length = 0;
        } else if ((size_t)length >= sizeof(program)) {
            length = sizeof(program) - 1;
        }
        program[length] = '\0';
    }
    fprintf(f,"%s",program);
    if (standalone) {
        fprintf(f,"\n"
                  "void setup(void) {\n");
        for (i = 0; i < 192; i++) {
            float* c = kelvin->constants[i].data;
            /* Skip constants whose four components are all the same value
             * with a magnitude of at most 1e-20 (effectively all zero) */
            if (!((c[0] == c[1]) && (c[1] == c[2]) && (c[2] == c[3]) && (fabsf(c[3]) <= 1.0e-20f))) {
                fprintf(f,"  c[%d] = vec4(%f, %f, %f, %f);\n",i,c[0],c[1],c[2],c[3]);
            }
        }
        fprintf(f,"  v0 = gl_Vertex;\n"
                  "}\n");
    }
    if (fclose(f) != 0) {
        fprintf(stderr, "debugger: error while writing '%s'\n", file);
    }
 }
diff --git a/nv2a_vsh.c b/nv2a_vsh.c
 /*
 * QEMU Geforce NV2A GPU vertex shader translation
 *
 * Copyright (c) 2014 Jannik Vogel
 * Copyright (c) 2012 espes
 *
 * Based on:
 * Cxbx, VertexShader.cpp
 * Copyright (c) 2004 Aaron Robinson <[email protected]>
 *                    Kingofc <[email protected]>
 * Dxbx, uPushBuffer.pas
 * Copyright (c) 2007 Shadow_tj, PatrickvL
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 or
 * (at your option) version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

 #include <stdio.h>
 #include <string.h>
 #include <stdbool.h>
 #include <assert.h>

 #include "hw/xbox/nv2a_gpu_vsh.h"

 #define VSH_D3DSCM_CORRECTION 96

 #define VSH_TOKEN_SIZE 4

 /*
  * Names one bit field of a 4-dword vertex shader instruction.
  *
  * The numeric value is used directly as the index into field_mapping[], and
  * the decoder reaches the swizzle fields of an input by adding to the value
  * of its *_NEG / *_SWZ_X field, so neither the order nor the values may
  * change.
  */
 typedef enum {
    FLD_ILU          = 0,
    FLD_MAC          = 1,
    FLD_CONST        = 2,
    FLD_V            = 3,
    /* Input A */
    FLD_A_NEG        = 4,
    FLD_A_SWZ_X      = 5,
    FLD_A_SWZ_Y      = 6,
    FLD_A_SWZ_Z      = 7,
    FLD_A_SWZ_W      = 8,
    FLD_A_R          = 9,
    FLD_A_MUX        = 10,
    /* Input B */
    FLD_B_NEG        = 11,
    FLD_B_SWZ_X      = 12,
    FLD_B_SWZ_Y      = 13,
    FLD_B_SWZ_Z      = 14,
    FLD_B_SWZ_W      = 15,
    FLD_B_R          = 16,
    FLD_B_MUX        = 17,
    /* Input C */
    FLD_C_NEG        = 18,
    FLD_C_SWZ_X      = 19,
    FLD_C_SWZ_Y      = 20,
    FLD_C_SWZ_Z      = 21,
    FLD_C_SWZ_W      = 22,
    FLD_C_R_HIGH     = 23,
    FLD_C_R_LOW      = 24,
    FLD_C_MUX        = 25,
    /* Output */
    FLD_OUT_MAC_MASK = 26,
    FLD_OUT_R        = 27,
    FLD_OUT_ILU_MASK = 28,
    FLD_OUT_O_MASK   = 29,
    FLD_OUT_ORB      = 30,
    FLD_OUT_ADDRESS  = 31,
    FLD_OUT_MUX      = 32,
    /* Relative addressing */
    FLD_A0X          = 33,
    /* Final instruction */
    FLD_FINAL        = 34
 } VshFieldName;


 /*
  * Source selected by an input's MUX field. decode_opcode_input() prints
  * PARAM_R as "R<n>", PARAM_V as "v<n>" and PARAM_C as "c[<n>]"; any other
  * value trips its assertion.
  */
 typedef enum {
    PARAM_UNKNOWN = 0,
    PARAM_R       = 1,
    PARAM_V       = 2,
    PARAM_C       = 3
 } VshParameterType;

 /* Value of the FLD_OUT_ORB field: write to a constant (c) or an output (o)
  * register. */
 typedef enum {
    OUTPUT_C = 0,
    OUTPUT_O = 1
 } VshOutputType;

 /* Value of the FLD_OUT_MUX field: which unit's result goes to the c/o
  * register destination. */
 typedef enum {
    OMUX_MAC = 0,
    OMUX_ILU = 1
 } VshOutputMux;

 /* ILU opcodes (value of the FLD_ILU field). The values index ilu_opcode[],
  * ilu_opcode_params[] and ilu_force_scalar[]. */
 typedef enum {
    ILU_NOP = 0,
    ILU_MOV = 1,
    ILU_RCP = 2,
    ILU_RCC = 3,
    ILU_RSQ = 4,
    ILU_EXP = 5,
    ILU_LOG = 6,
    ILU_LIT = 7
 } VshILU;

 /* MAC opcodes (value of the FLD_MAC field). The values index mac_opcode[]
  * and mac_opcode_params[]. */
 typedef enum {
    MAC_NOP = 0,
    MAC_MOV = 1,
    MAC_MUL = 2,
    MAC_ADD = 3,
    MAC_MAD = 4,
    MAC_DP3 = 5,
    MAC_DPH = 6,
    MAC_DP4 = 7,
    MAC_DST = 8,
    MAC_MIN = 9,
    MAC_MAX = 10,
    MAC_SLT = 11,
    MAC_SGE = 12,
    MAC_ARL = 13
 } VshMAC;

 /* Component selected by a 2-bit swizzle field; decode_swizzle() uses the
  * value as an index into "xyzw". */
 typedef enum {
    SWIZZLE_X = 0,
    SWIZZLE_Y = 1,
    SWIZZLE_Z = 2,
    SWIZZLE_W = 3
 } VshSwizzle;


 /* Location of one instruction field inside the instruction's dwords. */
 typedef struct VshFieldMapping VshFieldMapping;

 struct VshFieldMapping {
    VshFieldName field_name; /* field described by this entry */
    uint8_t subtoken;        /* index of the dword holding the field */
    uint8_t start_bit;       /* position of the field's lowest bit */
    uint8_t bit_length;      /* width of the field in bits */
 };

 static const VshFieldMapping field_mapping[] = {
    // Field Name         DWORD BitPos BitSize
    {  FLD_ILU,              1,   25,     3 },
    {  FLD_MAC,              1,   21,     4 },
    {  FLD_CONST,            1,   13,     8 },
    {  FLD_V,                1,    9,     4 },
    // INPUT A
    {  FLD_A_NEG,            1,    8,     1 },
    {  FLD_A_SWZ_X,          1,    6,     2 },
    {  FLD_A_SWZ_Y,          1,    4,     2 },
    {  FLD_A_SWZ_Z,          1,    2,     2 },
    {  FLD_A_SWZ_W,          1,    0,     2 },
    {  FLD_A_R,              2,   28,     4 },
    {  FLD_A_MUX,            2,   26,     2 },
    // INPUT B
    {  FLD_B_NEG,            2,   25,     1 },
    {  FLD_B_SWZ_X,          2,   23,     2 },
    {  FLD_B_SWZ_Y,          2,   21,     2 },
    {  FLD_B_SWZ_Z,          2,   19,     2 },
    {  FLD_B_SWZ_W,          2,   17,     2 },
    {  FLD_B_R,              2,   13,     4 },
    {  FLD_B_MUX,            2,   11,     2 },
    // INPUT C
    {  FLD_C_NEG,            2,   10,     1 },
    {  FLD_C_SWZ_X,          2,    8,     2 },
    {  FLD_C_SWZ_Y,          2,    6,     2 },
    {  FLD_C_SWZ_Z,          2,    4,     2 },
    {  FLD_C_SWZ_W,          2,    2,     2 },
    {  FLD_C_R_HIGH,         2,    0,     2 },
    {  FLD_C_R_LOW,          3,   30,     2 },
    {  FLD_C_MUX,            3,   28,     2 },
    // Output
    {  FLD_OUT_MAC_MASK,     3,   24,     4 },
    {  FLD_OUT_R,            3,   20,     4 },
    {  FLD_OUT_ILU_MASK,     3,   16,     4 },
    {  FLD_OUT_O_MASK,       3,   12,     4 },
    {  FLD_OUT_ORB,          3,   11,     1 },
    {  FLD_OUT_ADDRESS,      3,    3,     8 },
    {  FLD_OUT_MUX,          3,    2,     1 },
    // Other
    {  FLD_A0X,              3,    1,     1 },
    {  FLD_FINAL,            3,    0,     1 }
 };


 /* Tells which of the three inputs (A, B, C) an opcode reads. */
 typedef struct VshOpcodeParams VshOpcodeParams;

 struct VshOpcodeParams {
    bool A; /* opcode reads input A */
    bool B; /* opcode reads input B */
    bool C; /* opcode reads input C */
 };

 /* Inputs read by each ILU opcode, indexed by VshILU value. Every ILU opcode
  * other than NOP reads input C only. */
 static const VshOpcodeParams ilu_opcode_params[] = {
    /*        A      B      C                                          */
    [0] = { false, false, false }, /* ILU_NOP (Dxbx note : Unused) */
    [1] = { false, false, true  }, /* ILU_MOV */
    [2] = { false, false, true  }, /* ILU_RCP */
    [3] = { false, false, true  }, /* ILU_RCC */
    [4] = { false, false, true  }, /* ILU_RSQ */
    [5] = { false, false, true  }, /* ILU_EXP */
    [6] = { false, false, true  }, /* ILU_LOG */
    [7] = { false, false, true  }, /* ILU_LIT */
 };

 /* Inputs read by each MAC opcode, indexed by VshMAC value. Note that ADD
  * takes its second operand from input C, not input B. */
 static const VshOpcodeParams mac_opcode_params[] = {
    /*         A      B      C                                         */
    [0]  = { false, false, false }, /* MAC_NOP (Dxbx note : Unused) */
    [1]  = { true,  false, false }, /* MAC_MOV */
    [2]  = { true,  true,  false }, /* MAC_MUL */
    [3]  = { true,  false, true  }, /* MAC_ADD */
    [4]  = { true,  true,  true  }, /* MAC_MAD */
    [5]  = { true,  true,  false }, /* MAC_DP3 */
    [6]  = { true,  true,  false }, /* MAC_DPH */
    [7]  = { true,  true,  false }, /* MAC_DP4 */
    [8]  = { true,  true,  false }, /* MAC_DST */
    [9]  = { true,  true,  false }, /* MAC_MIN */
    [10] = { true,  true,  false }, /* MAC_MAX */
    [11] = { true,  true,  false }, /* MAC_SLT */
    [12] = { true,  true,  false }, /* MAC_SGE */
    [13] = { true,  false, false }, /* MAC_ARL */
 };


 /*
  * Write-mask text, indexed by a 4-bit mask field (bit 3 = x, bit 2 = y,
  * bit 1 = z, bit 0 = w). Every non-empty entry starts with a comma because
  * decode_opcode() appends it straight after the destination name, making the
  * mask the second argument of the generated macro call.
  */
 static const char* mask_str[] = {
    /* 0000 ____ */ "",
    /* 0001 ___w */ ",w",
    /* 0010 __z_ */ ",z",
    /* 0011 __zw */ ",zw",
    /* 0100 _y__ */ ",y",
    /* 0101 _y_w */ ",yw",
    /* 0110 _yz_ */ ",yz",
    /* 0111 _yzw */ ",yzw",
    /* 1000 x___ */ ",x",
    /* 1001 x__w */ ",xw",
    /* 1010 x_z_ */ ",xz",
    /* 1011 x_zw */ ",xzw",
    /* 1100 xy__ */ ",xy",
    /* 1101 xy_w */ ",xyw",
    /* 1110 xyz_ */ ",xyz",
    /* 1111 xyzw */ ",xyzw"
 };

 /* Note: OpenGL seems to be case-sensitive, and requires upper-case opcodes! */
 /* Text emitted for each MAC opcode, indexed by VshMAC value. */
 static const char* mac_opcode[] = {
    [0]  = "NOP",
    [1]  = "MOV",
    [2]  = "MUL",
    [3]  = "ADD",
    [4]  = "MAD",
    [5]  = "DP3",
    [6]  = "DPH",
    [7]  = "DP4",
    [8]  = "DST",
    [9]  = "MIN",
    [10] = "MAX",
    [11] = "SLT",
    [12] = "SGE",
    [13] = "ARL A0.x", /* Dxbx note : Alias for "mov a0.x" */
 };

 /* Text emitted for each ILU opcode, indexed by VshILU value. */
 static const char* ilu_opcode[] = {
    [0] = "NOP",
    [1] = "MOV",
    [2] = "RCP",
    [3] = "RCC",
    [4] = "RSQ",
    [5] = "EXP",
    [6] = "LOG",
    [7] = "LIT",
 };

 /* Indexed by VshILU value: true when decode_swizzle() must replicate the
  * first swizzle component of input C to all four components. */
 static bool ilu_force_scalar[] = {
    [0] = false, /* NOP */
    [1] = false, /* MOV */
    [2] = true,  /* RCP */
    [3] = true,  /* RCC */
    [4] = true,  /* RSQ */
    [5] = true,  /* EXP */
    [6] = true,  /* LOG */
    [7] = false, /* LIT */
 };

 /* Name of the output register selected by the low 4 bits of the
  * FLD_OUT_ADDRESS field; "???" marks indices without a name. */
 static const char* out_reg_name[] = {
    [0]  = "oPos",
    [1]  = "???",
    [2]  = "???",
    [3]  = "oD0",
    [4]  = "oD1",
    [5]  = "oFog",
    [6]  = "oPts",
    [7]  = "oB0",
    [8]  = "oB1",
    [9]  = "oT0",
    [10] = "oT1",
    [11] = "oT2",
    [12] = "oT3",
    [13] = "???",
    [14] = "???",
    [15] = "A0.x",
 };



 // Retrieves a number of bits in the instruction token
 /*
  * Extracts `bit_length` bits, starting at bit `start_bit` (bit 0 is the least
  * significant bit), from dword `subtoken` of an instruction.
  *
  * shader_token: first dword of the instruction.
  * Returns the field value, right-aligned.
  *
  * Shifting a 32-bit value by 32 or more is undefined in C, so out-of-range
  * positions/widths are handled explicitly: a start bit beyond the dword
  * yields 0 and a width of 32 or more keeps every remaining bit.
  */
 static int vsh_get_from_token(const uint32_t *shader_token,
                               uint8_t subtoken,
                               uint8_t start_bit,
                               uint8_t bit_length)
 {
    uint32_t value;

    if (start_bit >= 32) {
        return 0;
    }
    value = shader_token[subtoken] >> start_bit;
    if (bit_length < 32) {
        value &= ((uint32_t)1 << bit_length) - 1;
    }
    return (int)value;
 }
 /* Reads the named field of an instruction, using the bit layout stored in
  * field_mapping[]. The value is truncated to 8 bits. */
 static uint8_t vsh_get_field(uint32_t *shader_token, VshFieldName field_name)
 {
    const VshFieldMapping *layout = &field_mapping[field_name];
    int raw = vsh_get_from_token(shader_token,
                                 layout->subtoken,
                                 layout->start_bit,
                                 layout->bit_length);

    return (uint8_t)raw;
 }


 // Converts the C register address to disassembly format
 /*
  * Maps an encoded constant register number to an index into c[]: the
  * register is split into a bank (bits 5..7) and an offset (bits 0..4),
  * rebased to the signed range and then shifted by VSH_D3DSCM_CORRECTION.
  */
 static int16_t convert_c_register(const int16_t c_reg)
 {
    const int bank = (c_reg >> 5) & 7;
    const int offset = c_reg & 31;
    int16_t index = ((bank - 3) * 32) + offset;

    index += VSH_D3DSCM_CORRECTION; /* to map -96..95 to 0..191 */
    return index; //FIXME: = c_reg?!
 }



 /*
  * Decodes the four swizzle fields that start at `swizzle_field` into GLSL
  * swizzle text.
  *
  * Returns a new QString: "" for the identity swizzle (xyzw), ".c" when all
  * four selectors name the same component c, otherwise the full ".abcd".
  */
 static QString* decode_swizzle(uint32_t *shader_token,
                                VshFieldName swizzle_field)
 {
    static const char component_names[] = "xyzw";
    VshSwizzle sel[4];
    char text[6];
    int i;

    if (swizzle_field == FLD_C_SWZ_X
        && ilu_force_scalar[vsh_get_field(shader_token, FLD_ILU)]) {
        /* some microcode instructions force a scalar value: replicate the
         * first selector of input C */
        VshSwizzle first = vsh_get_field(shader_token, swizzle_field);
        for (i = 0; i < 4; i++) {
            sel[i] = first;
        }
    } else {
        /* the X, Y, Z and W selectors are consecutive field names */
        for (i = 0; i < 4; i++) {
            sel[i] = vsh_get_field(shader_token, swizzle_field + i);
        }
    }

    /* Don't print the swizzle if it's .xyzw */
    if (sel[0] == SWIZZLE_X && sel[1] == SWIZZLE_Y
        && sel[2] == SWIZZLE_Z && sel[3] == SWIZZLE_W) {
        return qstring_from_str("");
    }

    text[0] = '.';
    if (sel[0] == sel[1] && sel[1] == sel[2] && sel[2] == sel[3]) {
        /* Don't print duplicates */
        text[1] = component_names[sel[0]];
        text[2] = '\0';
    } else {
        /* Normal swizzle mask */
        for (i = 0; i < 4; i++) {
            text[1 + i] = component_names[sel[i]];
        }
        text[5] = '\0';
    }
    return qstring_from_str(text);
 }

 /*
  * Decodes one instruction input (A, B or C) into operand text of the form
  * [-]<register>[<swizzle>].
  *
  * shader_token: the instruction's dwords.
  * param:        register file selected by the input's MUX field.
  * neg_field:    the input's NEG field; its swizzle fields directly follow it
  *               in VshFieldName.
  * reg_num:      R-register number, only used for PARAM_R.
  *
  * Returns a new QString owned by the caller.
  */
 static QString* decode_opcode_input(uint32_t *shader_token,
                                     VshParameterType param,
                                     VshFieldName neg_field,
                                     int reg_num)
 {
    QString *ret_str = qstring_new();


    if (vsh_get_field(shader_token, neg_field) > 0) {
        qstring_append_chr(ret_str, '-');
    }

    /* PARAM_R uses the supplied reg_num, but the other two need to be
     * determined.
     * tmp starts out empty so that an unknown parameter type appends nothing
     * (instead of uninitialised memory) when assertions are compiled out. */
    char tmp[40] = "";
    switch (param) {
    case PARAM_R:
        snprintf(tmp, sizeof(tmp), "R%d", reg_num);
        break;
    case PARAM_V:
        reg_num = vsh_get_field(shader_token, FLD_V);
        snprintf(tmp, sizeof(tmp), "v%d", reg_num);
        break;
    case PARAM_C:
        reg_num = convert_c_register(vsh_get_field(shader_token, FLD_CONST));
        if (vsh_get_field(shader_token, FLD_A0X) > 0) {
            /* FIXME: does this really require the "correction" done in
             * convert_c_register?! */
            snprintf(tmp, sizeof(tmp), "c[A0+%d]", reg_num);
        } else {
            snprintf(tmp, sizeof(tmp), "c[%d]", reg_num);
        }
        break;
    default:
        printf("Param: 0x%x\n", param);
        assert(false);
    }
    qstring_append(ret_str, tmp);

    {
        /* swizzle bits are next to the neg bit */
        QString *swizzle_str = decode_swizzle(shader_token, neg_field+1);
        qstring_append(ret_str, qstring_get_str(swizzle_str));
        QDECREF(swizzle_str);
    }

    return ret_str;
 }


 /*
  * Builds the GLSL macro invocation(s) for one half (MAC or ILU) of an
  * instruction.
  *
  * shader_token: the instruction's dwords.
  * out_mux:      the unit `opcode` belongs to (OMUX_MAC or OMUX_ILU).
  * mask:         write mask for the R-register destination; 0 means the
  *               R-register write is skipped.
  * opcode:       macro name taken from mac_opcode[] / ilu_opcode[].
  * inputs:       operand list, every operand preceded by ", ".
  *
  * Up to two statements are produced: one writing an R register (or A0 for
  * ARL) and, if the output mux selects this unit and the output mask is not
  * empty, one writing a c/o register.
  *
  * Returns a new QString owned by the caller; it may be empty.
  */
 static QString* decode_opcode(uint32_t *shader_token,
                               VshOutputMux out_mux,
                               uint32_t mask,
                               const char* opcode,
                               QString *inputs)
 {
    QString *ret = qstring_new();
    int reg_num = vsh_get_field(shader_token, FLD_OUT_R);

    /* Test for paired opcodes (in other words : Are both <> NOP?) */
    if (out_mux == OMUX_MAC
          &&  vsh_get_field(shader_token, FLD_ILU) != ILU_NOP
          && reg_num == 1) {
        /* Ignore paired MAC opcodes that write to R1 */
        mask = 0;
    } else if (out_mux == OMUX_ILU
               && vsh_get_field(shader_token, FLD_MAC) != MAC_NOP) {
        /* Paired ILU opcodes can only write to R1 */
        reg_num = 1;
    }

    if (mask > 0) {
        if (strcmp(opcode, mac_opcode[MAC_ARL]) == 0) {
            /* The generated header declares the address register as "A0"
             * (GLSL is case-sensitive) and its ARL macro takes
             * (dest, mask, src) like every other opcode macro, so emit all
             * three arguments and close the call. */
            qstring_append(ret, "  ARL(A0");
            qstring_append(ret, mask_str[mask]);
            qstring_append(ret, qstring_get_str(inputs));
            qstring_append(ret, ");\n");
        } else {
            qstring_append(ret, "  ");
            qstring_append(ret, opcode);
            qstring_append(ret, "(");
            qstring_append(ret, "R");
            qstring_append_int(ret, reg_num);
            qstring_append(ret, mask_str[mask]);
            qstring_append(ret, qstring_get_str(inputs));
            qstring_append(ret, ");\n");
        }
    }

    /* See if we must add a muxed opcode too: */
    if (vsh_get_field(shader_token, FLD_OUT_MUX) == out_mux
        /* Only if it's not masked away: */
        && vsh_get_field(shader_token, FLD_OUT_O_MASK) != 0) {

        qstring_append(ret, "  ");
        qstring_append(ret, opcode);
        qstring_append(ret, "(");

        if (vsh_get_field(shader_token, FLD_OUT_ORB) == OUTPUT_C) {
            /* TODO : Emulate writeable const registers */
            qstring_append(ret, "c");
            qstring_append_int(ret,
                convert_c_register(
                    vsh_get_field(shader_token, FLD_OUT_ADDRESS)));
        } else {
            /* only the low 4 bits select the output register */
            qstring_append(ret,
                out_reg_name[
                    vsh_get_field(shader_token, FLD_OUT_ADDRESS) & 0xF]);
        }
        qstring_append(ret,
            mask_str[
                vsh_get_field(shader_token, FLD_OUT_O_MASK)]);
        qstring_append(ret, qstring_get_str(inputs));
        qstring_append(ret, ");\n");
    }

    return ret;
 }


 static QString* decode_token(uint32_t *shader_token)
 {
    QString *ret;

    /* Since it's potentially used twice, decode input C once: */
    QString *input_c =
        decode_opcode_input(shader_token,
                            vsh_get_field(shader_token, FLD_C_MUX),
                            FLD_C_NEG,
                            (vsh_get_field(shader_token, FLD_C_R_HIGH) << 2)
                                | vsh_get_field(shader_token, FLD_C_R_LOW));

    /* See what MAC opcode is written to (if not masked away): */
    VshMAC mac = vsh_get_field(shader_token, FLD_MAC);
    if (mac != MAC_NOP) {
        QString *inputs_mac = qstring_new();
        if (mac_opcode_params[mac].A) {
            QString *input_a =
                decode_opcode_input(shader_token,
                                    vsh_get_field(shader_token, FLD_A_MUX),
                                    FLD_A_NEG,
                                    vsh_get_field(shader_token, FLD_A_R));
            qstring_append(inputs_mac, ", ");
            qstring_append(inputs_mac, qstring_get_str(input_a));
            QDECREF(input_a);
        }
        if (mac_opcode_params[mac].B) {
            QString *input_b =
                decode_opcode_input(shader_token,
                                    vsh_get_field(shader_token, FLD_B_MUX),
                                    FLD_B_NEG,
                                    vsh_get_field(shader_token, FLD_B_R));
            qstring_append(inputs_mac, ", ");
            qstring_append(inputs_mac, qstring_get_str(input_b));
            QDECREF(input_b);
        }
        if (mac_opcode_params[mac].C) {
            qstring_append(inputs_mac, ", ");
            qstring_append(inputs_mac, qstring_get_str(input_c));
        }

        /* Then prepend these inputs with the actual opcode, mask, and input : */
        ret = decode_opcode(shader_token,
                            OMUX_MAC,
                            vsh_get_field(shader_token, FLD_OUT_MAC_MASK),
                            mac_opcode[mac],
                            inputs_mac);
        QDECREF(inputs_mac);
    } else {
        ret = qstring_new();
    }

    /* See if a ILU opcode is present too: */
    VshILU ilu = vsh_get_field(shader_token, FLD_ILU);
    if (ilu != ILU_NOP) {
        QString *inputs_c = qstring_from_str(", ");
        qstring_append(inputs_c, qstring_get_str(input_c));

        /* Append the ILU opcode, mask and (the already determined) input C: */
        QString *ilu_op =
            decode_opcode(shader_token,
                          OMUX_ILU,
                          vsh_get_field(shader_token, FLD_OUT_ILU_MASK),
                          ilu_opcode[ilu],
                          inputs_c);

        qstring_append(ret, qstring_get_str(ilu_op));

        QDECREF(inputs_c);
        QDECREF(ilu_op);
    }

    QDECREF(input_c);

    return ret;
 }

 /* Vertex shader header, mapping Xbox1 registers to the ARB syntax (original
 * version by KingOfC). Note about the use of 'conventional' attributes in here:
 * Since we prefer to use only one shader for both immediate and deferred mode
 * rendering, we alias all attributes to conventional inputs as much as possible.
 * Only when there's no conventional attribute available, we use generic
 * attributes. So in the following header, we use conventional attributes first,
 * and generic attributes for the rest of the vertex attribute slots. This makes
 * it possible to support immediate and deferred mode rendering with the same
 * shader, and the use of the OpenGL fixed-function pipeline without a shader.
 */
 /*
  * GLSL prologue placed in front of every translated shader: register
  * declarations plus one macro and one helper function per opcode.
  *
  * Every opcode macro has the form OP(dest, mask, src...) and expands to
  * "dest.mask = _OP(src...).mask". The write mask selects which components of
  * the 4-component result are stored, and each stored component comes from
  * the SAME component of the result (MOV(R0,yz, v0) copies v0.y and v0.z).
  * Plain macro substitution is enough for this, so the header does not need
  * the preprocessor's "##" pasting operator.
  */
 static const char* vsh_header =
    "#version 110\n"
    "\n"
    //FIXME: I just assumed this is true for all registers?!
    "vec4 R0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R2 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R3 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R4 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R5 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R6 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R7 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R8 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R9 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R10 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R11 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R12 = vec4(0.0,0.0,0.0,1.0);\n"
    "\n"
    //FIXME: What is a0 initialized as?
    "int A0 = 0;\n"
    "\n"
 #if 0
    "ATTRIB v0 = vertex.position;" // (See "conventional" note above)
    "ATTRIB v1 = vertex.%s;" // Note : We replace this with "weight" or "attrib[1]" depending GL_ARB_vertex_blend
    "ATTRIB v2 = vertex.normal;"
    "ATTRIB v3 = vertex.color.primary;"
    "ATTRIB v4 = vertex.color.secondary;"
    "ATTRIB v5 = vertex.fogcoord;"
    "ATTRIB v6 = vertex.attrib[6];"
    "ATTRIB v7 = vertex.attrib[7];"
    "ATTRIB v8 = vertex.texcoord[0];"
    "ATTRIB v9 = vertex.texcoord[1];"
    "ATTRIB v10 = vertex.texcoord[2];"
    "ATTRIB v11 = vertex.texcoord[3];"
 #else
    "attribute vec4 v0;\n"
    "attribute vec4 v1;\n"
    "attribute vec4 v2;\n"
    "attribute vec4 v3;\n"
    "attribute vec4 v4;\n"
    "attribute vec4 v5;\n"
    "attribute vec4 v6;\n"
    "attribute vec4 v7;\n"
    "attribute vec4 v8;\n"
    "attribute vec4 v9;\n"
    "attribute vec4 v10;\n"
    "attribute vec4 v11;\n"
 #endif
    "attribute vec4 v12;\n"
    "attribute vec4 v13;\n"
    "attribute vec4 v14;\n"
    "attribute vec4 v15;\n"

    "\n"

 /*
 //FIXME: temp var?
    "OUTPUT oPos = result.position;\n"
    "OUTPUT oD0 = result.color.front.primary;\n"
    "OUTPUT oD1 = result.color.front.secondary;\n"
    "OUTPUT oB0 = result.color.back.primary;\n"
    "OUTPUT oB1 = result.color.back.secondary;\n"
    "OUTPUT oPts = result.pointsize;\n"
    "OUTPUT oFog = result.fogcoord;\n"
    "OUTPUT oT0 = result.texcoord[0];\n"
    "OUTPUT oT1 = result.texcoord[1];\n"
    "OUTPUT oT2 = result.texcoord[2];\n"
    "OUTPUT oT3 = result.texcoord[3];\n"
 */
    "#define oPos R12 /* oPos is a mirror of R12 */\n"
    "vec4 oD0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oD1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oB0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oB1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oPts = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oFog = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT2 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT3 = vec4(0.0,0.0,0.0,1.0);\n"

    "\n"

    /* All constants in 1 array declaration */
 //FIXME: it's probably wise to change the c[x] to c##x later because it forces us to allocate and reupload around 100*4*4 bytes (~1.5kB) of useless data on/to the GPU :P
   "uniform vec4 c[192];\n"
   "#define viewport_scale c[58] /* This seems to be hardwired? See comment in nv2a_gpu.c */\n"
   "#define viewport_offset c[59] /* Same as above */\n"
   "uniform vec2 cliprange;\n"

 /*
  * Write masks: a masked write stores result component i into destination
  * component i. The source side therefore uses the same mask as the
  * destination (dest.yz = OP().yz); compacting the result to its leading
  * components (dest.yz = OP().xy) would store the wrong values for
  * component-wise opcodes such as MOV, MUL, ADD and MAD.
  */

 // Code from pages linked here http://msdn.microsoft.com/en-us/library/windows/desktop/bb174703%28v=vs.85%29.aspx
 // and also https://www.opengl.org/registry/specs/NV/vertex_program1_1.txt
 // Some code was also written from scratch because it seemed easy - if you are bored verify the behaviour!
    "\n"
    "#define MOV(dest,mask, src) dest.mask = _MOV(vec4(src)).mask\n"
    "vec4 _MOV(vec4 src)\n"
    "{\n"
    "  return src;\n"
    "}\n"
    "\n"
    "#define MUL(dest,mask, src0, src1) dest.mask = _MUL(vec4(src0), vec4(src1)).mask\n"
    "vec4 _MUL(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return src0 * src1;\n"
    "}\n"
    "\n"
    "#define ADD(dest,mask, src0, src1) dest.mask = _ADD(vec4(src0), vec4(src1)).mask\n"
    "vec4 _ADD(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return src0 + src1;\n"
    "}\n"
    "\n"
    "#define MAD(dest,mask, src0, src1, src2) dest.mask = _MAD(vec4(src0), vec4(src1), vec4(src2)).mask\n"
    "vec4 _MAD(vec4 src0, vec4 src1, vec4 src2)\n"
    "{\n"
    "  return src0 * src1 + src2;\n"
    "}\n"
    "\n"
    "#define DP3(dest,mask, src0, src1) dest.mask = _DP3(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DP3(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(dot(src0.xyz, src1.xyz));\n"
    "}\n"
    "\n"
    "#define DPH(dest,mask, src0, src1) dest.mask = _DPH(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DPH(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(dot(vec4(src0.xyz, 1.0), src1));\n"
    "}\n"
    "\n"
    "#define DP4(dest,mask, src0, src1) dest.mask = _DP4(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DP4(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(dot(src0, src1));\n"
    "}\n"
    "\n"
    "#define DST(dest,mask, src0, src1) dest.mask = _DST(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DST(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(1.0,\n"
    "              src0.y * src1.y,\n"
    "              src0.z,\n"
    "              src1.w);\n"
    "}\n"
    "\n"
    "#define MIN(dest,mask, src0, src1) dest.mask = _MIN(vec4(src0), vec4(src1)).mask\n"
    "vec4 _MIN(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return min(src0, src1);\n"
    "}\n"
    "\n"
    "#define MAX(dest,mask, src0, src1) dest.mask = _MAX(vec4(src0), vec4(src1)).mask\n"
    "vec4 _MAX(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return max(src0, src1);\n"
    "}\n"
    "\n"
    "#define SLT(dest,mask, src0, src1) dest.mask = _SLT(vec4(src0), vec4(src1)).mask\n"
    "vec4 _SLT(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(src0.x < src1.x ? 1.0 : 0.0,\n"
    "              src0.y < src1.y ? 1.0 : 0.0,\n"
    "              src0.z < src1.z ? 1.0 : 0.0,\n"
    "              src0.w < src1.w ? 1.0 : 0.0);\n"
    "}\n"
    "\n"
    /* ARL rounds towards negative infinity (floor); a plain int() conversion
     * would round negative values towards zero instead. */
    "#define ARL(dest,mask, src) dest = _ARL(vec4(src).x)\n"
    "int _ARL(float src)\n"
    "{\n"
    "  return int(floor(src));\n"
    "}\n"
    "\n"
    "#define SGE(dest,mask, src0, src1) dest.mask = _SGE(vec4(src0), vec4(src1)).mask\n"
    "vec4 _SGE(vec4 src0, vec4 src1)\n"
    "{\n"
    "  return vec4(src0.x >= src1.x ? 1.0 : 0.0,\n"
    "              src0.y >= src1.y ? 1.0 : 0.0,\n"
    "              src0.z >= src1.z ? 1.0 : 0.0,\n"
    "              src0.w >= src1.w ? 1.0 : 0.0);\n"
    "}\n"
    "\n"
    "#define RCP(dest,mask, src) dest.mask = _RCP(vec4(src).x).mask\n"
    "vec4 _RCP(float src)\n"
    "{\n"
    "  return vec4(1.0 / src);\n"
    "}\n"
    "\n"
    "#define RCC(dest,mask, src) dest.mask = _RCC(vec4(src).x).mask\n"
    "vec4 _RCC(float src)\n"
    "{\n"
    "  float t = 1.0 / src;\n"
    "  if (t > 0.0) {\n"
    "    t = min(t, 1.884467e+019);\n"
    "    t = max(t, 5.42101e-020);\n"
    "  } else {\n"
    "    t = max(t, -1.884467e+019);\n"
    "    t = min(t, -5.42101e-020);\n"
    "  }\n"
    "  return vec4(t);\n"
    "}\n"
    "\n"
    "#define RSQ(dest,mask, src) dest.mask = _RSQ(vec4(src).x).mask\n"
    "vec4 _RSQ(float src)\n"
    "{\n"
    "  return vec4(1.0 / sqrt(src));\n"
    "}\n"
    "\n"
    "#define EXP(dest,mask, src) dest.mask = _EXP(vec4(src).x).mask\n"
    "vec4 _EXP(float src)\n"
    "{\n"
    "  return vec4(exp2(src));\n"
    "}\n"
    "\n"
    "#define LOG(dest,mask, src) dest.mask = _LOG(vec4(src).x).mask\n"
    "vec4 _LOG(float src)\n"
    "{\n"
    "  return vec4(log2(src));\n"
    "}\n"
    "\n"
    "#define LIT(dest,mask, src) dest.mask = _LIT(vec4(src)).mask\n"
    "vec4 _LIT(vec4 src)\n"
    "{\n"
    "  vec4 t = vec4(1.0, 0.0, 0.0, 1.0);\n"
    "  float power = src.w;\n"
 #if 0
    //XXX: Limitation for 8.8 fixed point
    "  power = max(power, -127.9961);\n"
    "  power = min(power, 127.9961);\n"
 #endif
    "  if (src.x > 0.0) {\n"
    "    t.y = src.x;\n"
    "    if (src.y > 0.0) {\n"
    //XXX: Allowed approximation is EXP(power * LOG(src.y))
    "      t.z = pow(src.y, power);\n"
    "    }\n"
    "  }\n"
    "  return t;\n"
    "}\n";

 QString* vsh_translate(uint16_t version,
                       uint32_t *tokens, unsigned int tokens_length)
 {
    QString *body = qstring_from_str("\n");
    QString *header = qstring_from_str(vsh_header);
                          
 #ifdef DEBUG_NV2A_GPU_SHADER_FEEDBACK
    qstring_append(header,
                   "\n"
                   "/* Debug stuff */\n"
                   "varying vec4 debug_v0;\n"
                   "varying vec4 debug_v1;\n"
                   "varying vec4 debug_v2;\n"
                   "varying vec4 debug_v3;\n"
                   "varying vec4 debug_v4;\n"
                   "varying vec4 debug_v5;\n"
                   "varying vec4 debug_v6;\n"
                   "varying vec4 debug_v7;\n"
                   "varying vec4 debug_v8;\n"
                   "varying vec4 debug_v9;\n"
                   "varying vec4 debug_v10;\n"
                   "varying vec4 debug_v11;\n"
                   "varying vec4 debug_v12;\n"
                   "varying vec4 debug_v13;\n"
                   "varying vec4 debug_v14;\n"
                   "varying vec4 debug_v15;\n"
                   "varying vec4 debug_oPos;\n"
                   "varying vec4 debug_oD0;\n"
                   "varying vec4 debug_oD1;\n"
                   "varying vec4 debug_oB0;\n"
                   "varying vec4 debug_oB1;\n"
                   "varying vec4 debug_oPts;\n"
                   "varying vec4 debug_oFog;\n"
                   "varying vec4 debug_oT0;\n"
                   "varying vec4 debug_oT1;\n"
                   "varying vec4 debug_oT2;\n"
                   "varying vec4 debug_oT3;\n"
                   "\n"
                   "#define DEBUG_VAR(slot,var) debug_ ## slot ## _ ## var = var;\n"
                   "#define DEBUG(slot) \\\n"
                   "  DEBUG_VAR(slot,R0) \\\n"
                   "  DEBUG_VAR(slot,R1) \\\n"
                   "  DEBUG_VAR(slot,R2) \\\n"
                   "  DEBUG_VAR(slot,R3) \\\n"
                   "  DEBUG_VAR(slot,R4) \\\n"
                   "  DEBUG_VAR(slot,R5) \\\n"
                   "  DEBUG_VAR(slot,R6) \\\n"
                   "  DEBUG_VAR(slot,R7) \\\n"
                   "  DEBUG_VAR(slot,R8) \\\n"
                   "  DEBUG_VAR(slot,R9) \\\n"
                   "  DEBUG_VAR(slot,R10) \\\n"
                   "  DEBUG_VAR(slot,R11) \\\n"
                   "  DEBUG_VAR(slot,R12)\n"
                   "\n"
                   "#define DEBUG_VARYING_VAR(slot,var) varying vec4 debug_ ## slot ## _ ## var;\n"
                   "#define DEBUG_VARYING(slot) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R0) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R1) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R2) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R3) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R4) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R5) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R6) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R7) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R8) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R9) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R10) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R11) \\\n"
                   "  DEBUG_VARYING_VAR(slot,R12)\n"
                   "\n");
    qstring_append(body,
                   "  /* Debug input */\n"
                   "  debug_v0 = v0;\n"
                   "  debug_v1 = v1;\n"
                   "  debug_v2 = v2;\n"
                   "  debug_v3 = v3;\n"
                   "  debug_v4 = v4;\n"
                   "  debug_v5 = v5;\n"
                   "  debug_v6 = v6;\n"
                   "  debug_v7 = v7;\n"
                   "  debug_v8 = v8;\n"
                   "  debug_v9 = v9;\n"
                   "  debug_v10 = v10;\n"
                   "  debug_v11 = v11;\n"
                   "  debug_v12 = v12;\n"
                   "  debug_v13 = v13;\n"
                   "  debug_v14 = v14;\n"
                   "  debug_v15 = v15;\n"
                   "\n");
 #endif


    bool has_final = false;
    uint32_t *cur_token = tokens;
    while (cur_token-tokens < tokens_length) {
        unsigned int slot = (cur_token-tokens) / VSH_TOKEN_SIZE;
        QString *token_str = decode_token(cur_token);
        qstring_append_fmt(body,
                           "  /* Slot %d: 0x%08X 0x%08X 0x%08X 0x%08X */\n",
                           slot,
                           cur_token[0],cur_token[1],cur_token[2],cur_token[3]);
        qstring_append(body, qstring_get_str(token_str));
 #ifdef DEBUG_NV2A_GPU_SHADER_FEEDBACK
        qstring_append_fmt(header,"DEBUG_VARYING(%d)\n",slot);
        qstring_append_fmt(body,"  DEBUG(%d)\n",slot);
 #endif
        qstring_append(body, "\n");
        QDECREF(token_str);

        if (vsh_get_field(cur_token, FLD_FINAL)) {
            printf("Final at %u\n",slot);
            has_final = true;
            break;
        }
        cur_token += VSH_TOKEN_SIZE;
    }
    assert(has_final);

    /* Note : Since we replaced oPos with r12 in the above decoding,
     * we have to assign oPos at the end; This can be done in two ways;
     * 1) When the shader is complete (including transformations),
     *    we could just do a 'MOV oPos, R12;' and be done with it.
     */
    qstring_append(body,
 /*
    '# Dxbx addition : Transform the vertex to clip coordinates :'
    "DP4 R0.x, mvp[0], R12;"
    "DP4 R0.y, mvp[1], R12;"
    "DP4 R0.z, mvp[2], R12;"
    "DP4 R0.w, mvp[3], R12;"
    "MOV R12, R0;"
 */


        /* The shaders leave the result in screen space, while
         * OpenGL expects it in clip coordinates.
         * Use the magic viewport constants for now,
         * but they're not necessarily present.
         * Same idea as above, I think, but it is unclear what the mvp stuff is about...
        */
 #ifdef DEBUG_NV2A_GPU_SHADER_FEEDBACK
        "  /* Debug output */\n"
        "  debug_oPos = oPos;\n"
        "  debug_oD0 = oD0;\n"
        "  debug_oD1 = oD1;\n"
        "  debug_oB0 = oB0;\n"
        "  debug_oB1 = oB1;\n"
        "  debug_oPts = oPts;\n"
        "  debug_oFog = oFog;\n"
        "  debug_oT0 = oT0;\n"
        "  debug_oT1 = oT1;\n"
        "  debug_oT2 = oT2;\n"
        "  debug_oT3 = oT3;\n"
        "\n"
 #endif
 #if 1
        "  /* Un-screenspace transform */\n"
        "  R12.xyz = R12.xyz - viewport_offset.xyz;\n"
        "  R1.x = 1.0 / viewport_scale.x;\n"
        "  R1.y = 1.0 / viewport_scale.y;\n"

        /* scale_z = view_z == 0 ? 1 : (1 / view_z) */
        "  if (viewport_scale.z == 0.0) {\n"
        "    R1.z = 1.0;\n"
        "  } else {\n"
        "    R1.z = 1.0 / viewport_scale.z;\n"
        "  }\n"

        "  R12.xyz = R12.xyz * R1.xyz;\n"
        "  R12.xyz *= R12.w;\n" //This breaks 2D? Maybe w is zero?
        "\n"
 #else
 //FIXME: Use surface width / height / zeta max
      "R12.z /= 16777215.0;\n" // Z[0;1]
      "R12.z *= (cliprange.y - cliprange.x) / 16777215.0;\n" // Scale so [0;zmax] -> [0;cliprange_size]
      "R12.z -= cliprange.x / 16777215.0;\n" // Move down so [clipmin_min;clipmin_max]
      // X = [0;surface_width]; Y = [surface_height;0]; Z = [0;1]; W = ???
      "R12.xyz = R12.xyz / vec3(640.0,480.0,1.0);\n"
      // X,Z = [0;1]; Y = [1;0]; W = ???
      "R12.xyz = R12.xyz * vec3(2.0) - vec3(1.0);\n"
      "R12.y *= -1.0;\n"
      "R12.w = 1.0;\n"
      // X,Y,Z = [-1;+1]; W = 1
        "\n"
 #endif
        /* undo the perspective divide? */
        //"MUL R12.xyz, R12, R12.w;\n"

        /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection
         * in state.c
         *
         * Basically we want (in homogeneous coordinates) z = z * 2 - 1. However,
         * shaders are run before the homogeneous divide, so we have to take the w
         * into account: z = ((z / w) * 2 - 1) * w, which is the same as
         * z = z * 2 - w.
         */
        //"# Apply Z coord mapping\n"
        //"ADD R12.z, R12.z, R12.z;\n"
        //"ADD R12.z, R12.z, -R12.w;\n"
        "  /* Set outputs */\n"
        "  gl_Position = oPos;\n"
        "  gl_FrontColor = oD0;\n"
        "  gl_FrontSecondaryColor = oD1;\n"
        "  gl_BackColor = oB0;\n"
        "  gl_BackSecondaryColor = oB1;\n"
        "  gl_PointSize = oPts.x;\n"
        "  gl_FogFragCoord = oFog.x;\n"
        "  gl_TexCoord[0] = oT0;\n"
        "  gl_TexCoord[1] = oT1;\n"
        "  gl_TexCoord[2] = oT2;\n"
        "  gl_TexCoord[3] = oT3;\n"
        "\n"
    );

    QString *ret = qstring_new();
    qstring_append(ret, qstring_get_str(header));
    qstring_append(ret,"\n"
                       "void main(void)\n"
                       "{\n");
    qstring_append(ret, qstring_get_str(body));
    qstring_append(ret,"}\n");
    QDECREF(header);
    QDECREF(body);
    return ret;
 }
	/*
	 * Write the GLSL source of a shader attached to the currently bound GL
	 * program into `file`.
	 *
	 * file:       path of the output file; it is opened with "wb" and therefore
	 *             truncated if it already exists.
	 * kelvin:     state whose constants[0..191] are emitted as "c[i] = vec4(...)"
	 *             assignments; only read when `standalone` is true.
	 * standalone: when true, the dump is wrapped so that it carries its own
	 *             inputs: `attribute` and `uniform` are defined away, the
	 *             original main() is renamed to shader(), and a generated
	 *             setup() assigns the constants and v0 before shader() runs.
	 *
	 * Only the first shader object reported by glGetAttachedShaders() is
	 * exported (presumably the vertex shader -- TODO confirm attachment order).
	 * The source is read into a fixed 20 KiB buffer, so longer sources are
	 * truncated.
	 *
	 * On failure (no shader attached, file cannot be opened) a message is
	 * printed to stderr and the function returns without writing a dump.
	 */
	static void debugger_export_vertex_shader(const char* file, KelvinState* kelvin, bool standalone) {
	    int i;
	    GLint prog = 0;
	    glGetIntegerv(GL_CURRENT_PROGRAM,&prog);

	    /* Fetch the shader source before touching the file system, so that a
	     * GL-side failure does not leave a half-written dump behind. */
	    GLsizei attached = 0;
	    GLuint shader = 0;
	    glGetAttachedShaders(prog, 1, &attached, &shader);
	    if (attached < 1) {
	        fprintf(stderr,
	                "debugger_export_vertex_shader: no shader attached to the current program\n");
	        return;
	    }

	    char program[20*1024];
	    GLsizei l = 0;
	    program[0] = '\0';
	    glGetShaderSource(shader, sizeof(program), &l, program);
	    /* Never trust the reported length as an index: keep it inside the
	     * buffer even if the GL call failed and left it untouched. */
	    if (l < 0) {
	        l = 0;
	    } else if (l >= (GLsizei)sizeof(program)) {
	        l = (GLsizei)sizeof(program) - 1;
	    }
	    program[l] = '\0';

	    FILE* f = fopen(file,"wb");
	    if (f == NULL) {
	        fprintf(stderr,
	                "debugger_export_vertex_shader: cannot open '%s' for writing\n",
	                file);
	        return;
	    }

	    if (standalone) {
	        fprintf(f,"#version 110\n"
	                  "\n"
	                  // Writeable registers
	                  "#define attribute\n"
	                  "#define uniform\n"
	                  "\n"
	                  // Move entrypoint
	                  "void setup(void);\n"
	                  "void shader(void);\n"
	                  "void main(void) {\n"
	                  " setup();\n"
	                  " shader();\n"
	                  "}\n"
	                  "#define main(void) shader(void)\n"
	                  "\n"
	                  // The trailing "// " turns the first line of the dumped
	                  // source into a comment.
	                  "\n// ");
	    }

	    fprintf(f,"%s",program);

	    if (standalone) {
	        fprintf(f,"\n"
	                  "void setup(void) {\n");
	        for (i = 0; i < 192; i++) {
	            float* c = kelvin->constants[i].data;
	            /* Skip constants whose four components are all equal and
	             * (practically) zero; only the remaining ones are emitted. */
	            if (!((c[0] == c[1]) && (c[1] == c[2]) && (c[2] == c[3]) && (fabsf(c[3]) <= 1.0e-20f))) {
	                fprintf(f," c[%d] = vec4(%f, %f, %f, %f);\n",i,c[0],c[1],c[2],c[3]);
	            }
	        }
	        fprintf(f," v0 = gl_Vertex;\n"
	                  "}\n");
	    }

	    /* fclose() flushes buffered output, so a failure here means the dump
	     * may be incomplete. */
	    if (fclose(f) != 0) {
	        fprintf(stderr,
	                "debugger_export_vertex_shader: error while closing '%s'\n",
	                file);
	    }
	}