Skip to content

Instantly share code, notes, and snippets.

@JayFoxRox
Last active August 29, 2015 14:01
Show Gist options
  • Save JayFoxRox/aa5f30ad95fc71e020fe to your computer and use it in GitHub Desktop.
Save JayFoxRox/aa5f30ad95fc71e020fe to your computer and use it in GitHub Desktop.
/*
 * Dump the source of the first shader attached to the currently bound GL
 * program into `file`.
 *
 * file       - path of the output file (created/truncated).
 * kelvin     - state whose constants[0..191] are written out as initial
 *              values when `standalone` is set.
 * standalone - when true, wrap the dump so it can be compiled on its own:
 *              `attribute`/`uniform` are defined away (making the registers
 *              plain writable globals), the shader's main() is renamed to
 *              shader(), and a generated setup() fills in the constants.
 *
 * Nothing is written if no shader is attached or the file cannot be opened;
 * both cases are reported on stderr.
 */
static void debugger_export_vertex_shader(const char* file, KelvinState* kelvin, bool standalone) {
    GLint prog = 0;
    glGetIntegerv(GL_CURRENT_PROGRAM, &prog);

    /* Only the first attached shader is dumped.
     * NOTE(review): presumably that is the vertex shader - confirm. */
    GLsizei attached = 0;
    GLuint shader = 0;
    glGetAttachedShaders(prog, 1, &attached, &shader);
    if (attached < 1) {
        fprintf(stderr, "debugger_export_vertex_shader: no shader attached\n");
        return;
    }

    char program[20*1024];
    GLsizei source_len = 0;
    glGetShaderSource(shader, sizeof(program), &source_len, program);
    if (source_len < 0 || (size_t)source_len >= sizeof(program)) {
        source_len = sizeof(program) - 1;
    }
    program[source_len] = '\0';

    FILE* f = fopen(file, "wb");
    if (f == NULL) {
        perror(file);
        return;
    }

    if (standalone) {
        fprintf(f,"#version 110\n"
                  "\n"
                  /* Writeable registers */
                  "#define attribute\n"
                  "#define uniform\n"
                  "\n"
                  /* Move entrypoint */
                  "void setup(void);\n"
                  "void shader(void);\n"
                  "void main(void) {\n"
                  " setup();\n"
                  " shader();\n"
                  "}\n"
                  "#define main(void) shader(void)\n"
                  "\n"
                  /* The trailing "// " comments out the first line of the
                   * dumped source (presumably its own #version directive). */
                  "\n// ");
    }

    fprintf(f, "%s", program);

    if (standalone) {
        fprintf(f,"\n"
                  "void setup(void) {\n");
        for (int i = 0; i < 192; i++) {
            float* c = kelvin->constants[i].data;
            /* Skip constants whose four components are all equal and
             * (near) zero. */
            if (!((c[0] == c[1]) && (c[1] == c[2]) && (c[2] == c[3]) && (fabsf(c[3]) <= 1.0e-20f))) {
                fprintf(f," c[%d] = vec4(%f, %f, %f, %f);\n",i,c[0],c[1],c[2],c[3]);
            }
        }
        fprintf(f," v0 = gl_Vertex;\n"
                  "}\n");
    }

    if (fclose(f) != 0) {
        perror(file);
    }
}
/*
* QEMU Geforce NV2A GPU vertex shader translation
*
* Copyright (c) 2014 Jannik Vogel
* Copyright (c) 2012 espes
*
* Based on:
* Cxbx, VertexShader.cpp
* Copyright (c) 2004 Aaron Robinson <[email protected]>
* Kingofc <[email protected]>
* Dxbx, uPushBuffer.pas
* Copyright (c) 2007 Shadow_tj, PatrickvL
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 or
* (at your option) version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <assert.h>
#include "hw/xbox/nv2a_gpu_vsh.h"
/* Bias added by convert_c_register(); per the note there it maps the
 * signed constant range -96..95 onto 0..191. */
#define VSH_D3DSCM_CORRECTION 96
/* Number of 32-bit words per instruction slot; vsh_translate() advances
 * through the token stream by this stride. */
#define VSH_TOKEN_SIZE 4
/*
 * Names of the bit fields of one instruction. Each value is used as an
 * index into field_mapping[], so the order here must match that table.
 * decode_opcode_input() and decode_swizzle() additionally rely on each
 * input's NEG, SWZ_X, SWZ_Y, SWZ_Z, SWZ_W members being consecutive.
 */
typedef enum {
FLD_ILU = 0,
FLD_MAC,
FLD_CONST,
FLD_V,
// Input A
FLD_A_NEG,
FLD_A_SWZ_X,
FLD_A_SWZ_Y,
FLD_A_SWZ_Z,
FLD_A_SWZ_W,
FLD_A_R,
FLD_A_MUX,
// Input B
FLD_B_NEG,
FLD_B_SWZ_X,
FLD_B_SWZ_Y,
FLD_B_SWZ_Z,
FLD_B_SWZ_W,
FLD_B_R,
FLD_B_MUX,
// Input C
FLD_C_NEG,
FLD_C_SWZ_X,
FLD_C_SWZ_Y,
FLD_C_SWZ_Z,
FLD_C_SWZ_W,
// The input C register number is split across two words
FLD_C_R_HIGH,
FLD_C_R_LOW,
FLD_C_MUX,
// Output
FLD_OUT_MAC_MASK,
FLD_OUT_R,
FLD_OUT_ILU_MASK,
FLD_OUT_O_MASK,
FLD_OUT_ORB,
FLD_OUT_ADDRESS,
FLD_OUT_MUX,
// Relative addressing
FLD_A0X,
// Final instruction
FLD_FINAL
} VshFieldName;
/* Source kind of an instruction input; decode_token() passes the raw value
 * of the FLD_*_MUX fields as this type. */
typedef enum {
PARAM_UNKNOWN = 0,
PARAM_R, // printed as "R<n>" by decode_opcode_input()
PARAM_V, // printed as "v<n>", n taken from FLD_V
PARAM_C // printed as "c[<n>]", n taken from FLD_CONST
} VshParameterType;
/* Values of the FLD_OUT_ORB field: selects whether FLD_OUT_ADDRESS names a
 * constant register (OUTPUT_C) or an entry of out_reg_name[] (OUTPUT_O). */
typedef enum {
OUTPUT_C = 0,
OUTPUT_O
} VshOutputType;
/* Values of the FLD_OUT_MUX field: which unit's result (MAC or ILU) is
 * written to the output selected by FLD_OUT_ORB/FLD_OUT_ADDRESS. */
typedef enum {
OMUX_MAC = 0,
OMUX_ILU
} VshOutputMux;
/* ILU opcodes as stored in the FLD_ILU field; used as index into
 * ilu_opcode[] and ilu_force_scalar[]. */
typedef enum {
ILU_NOP = 0,
ILU_MOV,
ILU_RCP,
ILU_RCC,
ILU_RSQ,
ILU_EXP,
ILU_LOG,
ILU_LIT
} VshILU;
/* MAC opcodes as stored in the FLD_MAC field; used as index into
 * mac_opcode[] and mac_opcode_params[]. */
typedef enum {
MAC_NOP,
MAC_MOV,
MAC_MUL,
MAC_ADD,
MAC_MAD,
MAC_DP3,
MAC_DPH,
MAC_DP4,
MAC_DST,
MAC_MIN,
MAC_MAX,
MAC_SLT,
MAC_SGE,
MAC_ARL
} VshMAC;
/* Component selector stored in the 2-bit swizzle fields; decode_swizzle()
 * uses it as an index into the string "xyzw". */
typedef enum {
SWIZZLE_X = 0,
SWIZZLE_Y,
SWIZZLE_Z,
SWIZZLE_W
} VshSwizzle;
/* Location of one bit field inside an instruction. */
typedef struct VshFieldMapping {
VshFieldName field_name; // field described by this entry
uint8_t subtoken; // index of the 32-bit word within the instruction
uint8_t start_bit; // position of the field's least significant bit
uint8_t bit_length; // width of the field in bits
} VshFieldMapping;
static const VshFieldMapping field_mapping[] = {
// Field Name DWORD BitPos BitSize
{ FLD_ILU, 1, 25, 3 },
{ FLD_MAC, 1, 21, 4 },
{ FLD_CONST, 1, 13, 8 },
{ FLD_V, 1, 9, 4 },
// INPUT A
{ FLD_A_NEG, 1, 8, 1 },
{ FLD_A_SWZ_X, 1, 6, 2 },
{ FLD_A_SWZ_Y, 1, 4, 2 },
{ FLD_A_SWZ_Z, 1, 2, 2 },
{ FLD_A_SWZ_W, 1, 0, 2 },
{ FLD_A_R, 2, 28, 4 },
{ FLD_A_MUX, 2, 26, 2 },
// INPUT B
{ FLD_B_NEG, 2, 25, 1 },
{ FLD_B_SWZ_X, 2, 23, 2 },
{ FLD_B_SWZ_Y, 2, 21, 2 },
{ FLD_B_SWZ_Z, 2, 19, 2 },
{ FLD_B_SWZ_W, 2, 17, 2 },
{ FLD_B_R, 2, 13, 4 },
{ FLD_B_MUX, 2, 11, 2 },
// INPUT C
{ FLD_C_NEG, 2, 10, 1 },
{ FLD_C_SWZ_X, 2, 8, 2 },
{ FLD_C_SWZ_Y, 2, 6, 2 },
{ FLD_C_SWZ_Z, 2, 4, 2 },
{ FLD_C_SWZ_W, 2, 2, 2 },
{ FLD_C_R_HIGH, 2, 0, 2 },
{ FLD_C_R_LOW, 3, 30, 2 },
{ FLD_C_MUX, 3, 28, 2 },
// Output
{ FLD_OUT_MAC_MASK, 3, 24, 4 },
{ FLD_OUT_R, 3, 20, 4 },
{ FLD_OUT_ILU_MASK, 3, 16, 4 },
{ FLD_OUT_O_MASK, 3, 12, 4 },
{ FLD_OUT_ORB, 3, 11, 1 },
{ FLD_OUT_ADDRESS, 3, 3, 8 },
{ FLD_OUT_MUX, 3, 2, 1 },
// Other
{ FLD_A0X, 3, 1, 1 },
{ FLD_FINAL, 3, 0, 1 }
};
/* Tells which of the three instruction inputs an opcode consumes. */
typedef struct VshOpcodeParams {
bool A; // opcode reads input A
bool B; // opcode reads input B
bool C; // opcode reads input C
} VshOpcodeParams;
static const VshOpcodeParams ilu_opcode_params[] = {
/* ILU OP ParamA ParamB ParamC */
/* ILU_NOP */ { false, false, false }, // Dxbx note : Unused
/* ILU_MOV */ { false, false, true },
/* ILU_RCP */ { false, false, true },
/* ILU_RCC */ { false, false, true },
/* ILU_RSQ */ { false, false, true },
/* ILU_EXP */ { false, false, true },
/* ILU_LOG */ { false, false, true },
/* ILU_LIT */ { false, false, true },
};
static const VshOpcodeParams mac_opcode_params[] = {
/* MAC OP ParamA ParamB ParamC */
/* MAC_NOP */ { false, false, false }, // Dxbx note : Unused
/* MAC_MOV */ { true, false, false },
/* MAC_MUL */ { true, true, false },
/* MAC_ADD */ { true, false, true },
/* MAC_MAD */ { true, true, true },
/* MAC_DP3 */ { true, true, false },
/* MAC_DPH */ { true, true, false },
/* MAC_DP4 */ { true, true, false },
/* MAC_DST */ { true, true, false },
/* MAC_MIN */ { true, true, false },
/* MAC_MAX */ { true, true, false },
/* MAC_SLT */ { true, true, false },
/* MAC_SGE */ { true, true, false },
/* MAC_ARL */ { true, false, false },
};
/*
 * Text inserted after the destination register for each 4-bit write mask
 * (bit 3 = x, bit 2 = y, bit 1 = z, bit 0 = w). The leading comma turns the
 * mask into a separate argument of the opcode macros, e.g. "MOV(R0,xy, ...)".
 * Entry 0 is empty; decode_opcode() only emits non-zero masks.
 */
static const char* mask_str[] = {
    [0x0] = "",       /* ____ */
    [0x1] = ",w",     /* ___w */
    [0x2] = ",z",     /* __z_ */
    [0x3] = ",zw",    /* __zw */
    [0x4] = ",y",     /* _y__ */
    [0x5] = ",yw",    /* _y_w */
    [0x6] = ",yz",    /* _yz_ */
    [0x7] = ",yzw",   /* _yzw */
    [0x8] = ",x",     /* x___ */
    [0x9] = ",xw",    /* x__w */
    [0xA] = ",xz",    /* x_z_ */
    [0xB] = ",xzw",   /* x_zw */
    [0xC] = ",xy",    /* xy__ */
    [0xD] = ",xyw",   /* xy_w */
    [0xE] = ",xyz",   /* xyz_ */
    [0xF] = ",xyzw",  /* xyzw */
};
/*
 * Macro names emitted for each MAC opcode, indexed by VshMAC.
 * Note: OpenGL seems to be case-sensitive, and requires upper-case opcodes!
 * The ARL entry is never emitted verbatim; decode_opcode() compares against
 * it to recognise address-register loads.
 */
static const char* mac_opcode[] = {
    [0]  = "NOP",
    [1]  = "MOV",
    [2]  = "MUL",
    [3]  = "ADD",
    [4]  = "MAD",
    [5]  = "DP3",
    [6]  = "DPH",
    [7]  = "DP4",
    [8]  = "DST",
    [9]  = "MIN",
    [10] = "MAX",
    [11] = "SLT",
    [12] = "SGE",
    [13] = "ARL A0.x", /* Dxbx note : Alias for "mov a0.x" */
};
/* Macro names emitted for each ILU opcode, indexed by VshILU. */
static const char* ilu_opcode[] = {
    [0] = "NOP",
    [1] = "MOV",
    [2] = "RCP",
    [3] = "RCC",
    [4] = "RSQ",
    [5] = "EXP",
    [6] = "LOG",
    [7] = "LIT",
};
/* Per ILU opcode (VshILU order): true when decode_swizzle() must replicate
 * the first swizzle component of input C to all four lanes. */
static bool ilu_force_scalar[] = {
    [0] = false, /* NOP */
    [1] = false, /* MOV */
    [2] = true,  /* RCP */
    [3] = true,  /* RCC */
    [4] = true,  /* RSQ */
    [5] = true,  /* EXP */
    [6] = true,  /* LOG */
    [7] = false, /* LIT */
};
/* Output register names, indexed by the low four bits of FLD_OUT_ADDRESS.
 * "???" marks addresses for which no name is known. */
static const char* out_reg_name[] = {
    [0]  = "oPos",
    [1]  = "???",
    [2]  = "???",
    [3]  = "oD0",
    [4]  = "oD1",
    [5]  = "oFog",
    [6]  = "oPts",
    [7]  = "oB0",
    [8]  = "oB1",
    [9]  = "oT0",
    [10] = "oT1",
    [11] = "oT2",
    [12] = "oT3",
    [13] = "???",
    [14] = "???",
    [15] = "A0.x",
};
/*
 * Extract `bit_length` bits starting at bit `start_bit` (LSB = 0) from word
 * `subtoken` of the instruction pointed to by `shader_token`.
 *
 * Shifting a 32-bit value by 32 or more is undefined behaviour in C, so the
 * degenerate widths are handled explicitly: a start bit beyond the word
 * yields 0, and a length of 32 or more returns all remaining bits.
 */
static int vsh_get_from_token(uint32_t *shader_token,
                              uint8_t subtoken,
                              uint8_t start_bit,
                              uint8_t bit_length)
{
    if (start_bit >= 32) {
        return 0;
    }

    const uint32_t shifted = shader_token[subtoken] >> start_bit;
    if (bit_length >= 32) {
        return (int)shifted;
    }

    const uint32_t field_mask = (UINT32_C(1) << bit_length) - 1u;
    return (int)(shifted & field_mask);
}
/* Read the named field of an instruction, using the bit position recorded
 * for it in field_mapping[]. The value is truncated to 8 bits. */
static uint8_t vsh_get_field(uint32_t *shader_token, VshFieldName field_name)
{
    const VshFieldMapping *where = &field_mapping[field_name];
    int raw = vsh_get_from_token(shader_token,
                                 where->subtoken,
                                 where->start_bit,
                                 where->bit_length);
    return (uint8_t)raw;
}
/*
 * Convert a constant-register address to the index used in the generated
 * shader. Bits 5..7 select a bank of 32 registers which is rebased by -3
 * banks; VSH_D3DSCM_CORRECTION is then added to map -96..95 to 0..191.
 * FIXME (from the original author): is the result simply c_reg?
 */
static int16_t convert_c_register(const int16_t c_reg)
{
    const int16_t bank = (c_reg >> 5) & 7;
    const int16_t offset = c_reg & 31;
    const int16_t signed_index = ((bank - 3) * 32) + offset;
    return signed_index + VSH_D3DSCM_CORRECTION;
}
/*
 * Build the swizzle suffix for one input.
 *
 * swizzle_field is the input's *_SWZ_X field; the Y, Z and W selectors are
 * read from the three fields that follow it. Returns a new QString:
 *   ""      for the identity swizzle (.xyzw),
 *   ".c"    when all four selectors name the same component,
 *   ".abcd" otherwise.
 */
static QString* decode_swizzle(uint32_t *shader_token,
                               VshFieldName swizzle_field)
{
    static const char component_name[] = "xyzw";
    VshSwizzle sel[4];
    int lane;

    /* Some microcode instructions force a scalar value: for input C the
     * first selector is replicated when the ILU opcode demands it. */
    const int force_scalar =
        swizzle_field == FLD_C_SWZ_X
        && ilu_force_scalar[vsh_get_field(shader_token, FLD_ILU)];

    for (lane = 0; lane < 4; lane++) {
        sel[lane] = vsh_get_field(shader_token,
                                  force_scalar ? swizzle_field
                                               : swizzle_field + lane);
    }

    /* The identity swizzle is not printed at all. */
    if (sel[0] == SWIZZLE_X && sel[1] == SWIZZLE_Y
        && sel[2] == SWIZZLE_Z && sel[3] == SWIZZLE_W) {
        return qstring_from_str("");
    }

    char text[6];
    text[0] = '.';
    if (sel[0] == sel[1] && sel[1] == sel[2] && sel[2] == sel[3]) {
        /* A replicated component is printed once. */
        text[1] = component_name[sel[0]];
        text[2] = '\0';
    } else {
        /* General case: spell out all four selectors. */
        for (lane = 0; lane < 4; lane++) {
            text[1 + lane] = component_name[sel[lane]];
        }
        text[5] = '\0';
    }
    return qstring_from_str(text);
}
/*
 * Decode one instruction input into its textual form, e.g. "-R3.x",
 * "v0" or "c[A0+12].xyzz".
 *
 * shader_token - the four words of the instruction
 * param        - which register file the input comes from
 * neg_field    - the input's *_NEG field; the swizzle fields are expected
 *                to follow it directly in VshFieldName
 * reg_num      - register number, used for PARAM_R only; for PARAM_V and
 *                PARAM_C the number is read from FLD_V / FLD_CONST
 *
 * Returns a new QString owned by the caller.
 */
static QString* decode_opcode_input(uint32_t *shader_token,
                                    VshParameterType param,
                                    VshFieldName neg_field,
                                    int reg_num)
{
    QString *ret_str = qstring_new();
    if (vsh_get_field(shader_token, neg_field) > 0) {
        qstring_append_chr(ret_str, '-');
    }

    /* Start out empty so that an unknown param type appends nothing rather
     * than uninitialised stack contents when assert() is compiled out. */
    char tmp[40] = "";
    switch (param) {
    case PARAM_R:
        snprintf(tmp, sizeof(tmp), "R%d", reg_num);
        break;
    case PARAM_V:
        reg_num = vsh_get_field(shader_token, FLD_V);
        snprintf(tmp, sizeof(tmp), "v%d", reg_num);
        break;
    case PARAM_C:
        reg_num = convert_c_register(vsh_get_field(shader_token, FLD_CONST));
        if (vsh_get_field(shader_token, FLD_A0X) > 0) {
            /* FIXME: does relative addressing really require the
             * "correction" done in convert_c_register? */
            snprintf(tmp, sizeof(tmp), "c[A0+%d]", reg_num);
        } else {
            snprintf(tmp, sizeof(tmp), "c[%d]", reg_num);
        }
        break;
    default:
        printf("Param: 0x%x\n", (unsigned int)param);
        assert(false);
        break;
    }
    qstring_append(ret_str, tmp);

    /* The swizzle bits are next to the neg bit. */
    QString *swizzle_str = decode_swizzle(shader_token, neg_field+1);
    qstring_append(ret_str, qstring_get_str(swizzle_str));
    QDECREF(swizzle_str);

    return ret_str;
}
/*
 * Emit the macro invocation(s) for one unit (MAC or ILU) of an instruction.
 *
 * shader_token - the four words of the instruction
 * out_mux      - which unit is being decoded (OMUX_MAC or OMUX_ILU)
 * mask         - the unit's temporary-register write mask
 * opcode       - macro name from mac_opcode[] / ilu_opcode[]
 * inputs       - already formatted ", in0, in1..." argument list
 *
 * Up to two statements are produced: one writing the temporary register
 * (if `mask` is non-zero) and one writing the output/constant register (if
 * FLD_OUT_MUX selects this unit and FLD_OUT_O_MASK is non-zero).
 * Returns a new QString owned by the caller.
 */
static QString* decode_opcode(uint32_t *shader_token,
                              VshOutputMux out_mux,
                              uint32_t mask,
                              const char* opcode,
                              QString *inputs)
{
    QString *ret = qstring_new();
    int reg_num = vsh_get_field(shader_token, FLD_OUT_R);

    /* Test for paired opcodes (in other words : Are both <> NOP?) */
    if (out_mux == OMUX_MAC
        && vsh_get_field(shader_token, FLD_ILU) != ILU_NOP
        && reg_num == 1) {
        /* Ignore paired MAC opcodes that write to R1 */
        mask = 0;
    } else if (out_mux == OMUX_ILU
               && vsh_get_field(shader_token, FLD_MAC) != MAC_NOP) {
        /* Paired ILU opcodes can only write to R1 */
        reg_num = 1;
    }

    if (mask > 0) {
        if (strcmp(opcode, mac_opcode[MAC_ARL]) == 0) {
            /* The GLSL header declares "int A0" and a three-parameter
             * ARL(dest,mask, src) macro whose mask argument is unused, so
             * pass a placeholder mask and close the call properly. */
            qstring_append(ret, " ARL(A0,x");
            qstring_append(ret, qstring_get_str(inputs));
            qstring_append(ret, ");\n");
        } else {
            qstring_append(ret, " ");
            qstring_append(ret, opcode);
            qstring_append(ret, "(");
            qstring_append(ret, "R");
            qstring_append_int(ret, reg_num);
            qstring_append(ret, mask_str[mask]);
            qstring_append(ret, qstring_get_str(inputs));
            qstring_append(ret, ");\n");
        }
    }

    /* See if we must add a muxed opcode too: */
    if (vsh_get_field(shader_token, FLD_OUT_MUX) == out_mux
        /* Only if it's not masked away: */
        && vsh_get_field(shader_token, FLD_OUT_O_MASK) != 0) {
        qstring_append(ret, " ");
        qstring_append(ret, opcode);
        qstring_append(ret, "(");
        if (vsh_get_field(shader_token, FLD_OUT_ORB) == OUTPUT_C) {
            /* TODO : Emulate writeable const registers */
            qstring_append(ret, "c");
            qstring_append_int(ret,
                convert_c_register(
                    vsh_get_field(shader_token, FLD_OUT_ADDRESS)));
        } else {
            qstring_append(ret,
                out_reg_name[
                    vsh_get_field(shader_token, FLD_OUT_ADDRESS) & 0xF]);
        }
        qstring_append(ret,
            mask_str[
                vsh_get_field(shader_token, FLD_OUT_O_MASK)]);
        qstring_append(ret, qstring_get_str(inputs));
        qstring_append(ret, ");\n");
    }

    return ret;
}
/*
 * Translate one instruction (four 32-bit words) into GLSL statements.
 *
 * An instruction can carry a MAC opcode and an ILU opcode at the same time;
 * the MAC statement(s) are emitted first, followed by the ILU ones.
 * Returns a new QString owned by the caller (empty when both are NOP).
 */
static QString* decode_token(uint32_t *shader_token)
{
    QString *ret;

    /* Since it's potentially used twice, decode input C once: */
    QString *input_c =
        decode_opcode_input(shader_token,
                            vsh_get_field(shader_token, FLD_C_MUX),
                            FLD_C_NEG,
                            (vsh_get_field(shader_token, FLD_C_R_HIGH) << 2)
                                | vsh_get_field(shader_token, FLD_C_R_LOW));

    /* See what MAC opcode is written to (if not masked away). The MAC field
     * is 4 bits wide but only 14 opcodes are defined; treat the undefined
     * encodings like NOP instead of indexing past the opcode tables. */
    VshMAC mac = vsh_get_field(shader_token, FLD_MAC);
    const unsigned int mac_count =
        sizeof(mac_opcode_params) / sizeof(mac_opcode_params[0]);
    if ((unsigned int)mac >= mac_count) {
        fprintf(stderr, "vsh: ignoring unknown MAC opcode %u\n",
                (unsigned int)mac);
        ret = qstring_new();
    } else if (mac != MAC_NOP) {
        QString *inputs_mac = qstring_new();
        if (mac_opcode_params[mac].A) {
            QString *input_a =
                decode_opcode_input(shader_token,
                                    vsh_get_field(shader_token, FLD_A_MUX),
                                    FLD_A_NEG,
                                    vsh_get_field(shader_token, FLD_A_R));
            qstring_append(inputs_mac, ", ");
            qstring_append(inputs_mac, qstring_get_str(input_a));
            QDECREF(input_a);
        }
        if (mac_opcode_params[mac].B) {
            QString *input_b =
                decode_opcode_input(shader_token,
                                    vsh_get_field(shader_token, FLD_B_MUX),
                                    FLD_B_NEG,
                                    vsh_get_field(shader_token, FLD_B_R));
            qstring_append(inputs_mac, ", ");
            qstring_append(inputs_mac, qstring_get_str(input_b));
            QDECREF(input_b);
        }
        if (mac_opcode_params[mac].C) {
            qstring_append(inputs_mac, ", ");
            qstring_append(inputs_mac, qstring_get_str(input_c));
        }

        /* Then prepend these inputs with the actual opcode, mask, and input : */
        ret = decode_opcode(shader_token,
                            OMUX_MAC,
                            vsh_get_field(shader_token, FLD_OUT_MAC_MASK),
                            mac_opcode[mac],
                            inputs_mac);
        QDECREF(inputs_mac);
    } else {
        ret = qstring_new();
    }

    /* See if a ILU opcode is present too (the field is 3 bits wide and all
     * 8 encodings are defined, so no range check is needed here): */
    VshILU ilu = vsh_get_field(shader_token, FLD_ILU);
    if (ilu != ILU_NOP) {
        QString *inputs_c = qstring_from_str(", ");
        qstring_append(inputs_c, qstring_get_str(input_c));

        /* Append the ILU opcode, mask and (the already determined) input C: */
        QString *ilu_op =
            decode_opcode(shader_token,
                          OMUX_ILU,
                          vsh_get_field(shader_token, FLD_OUT_ILU_MASK),
                          ilu_opcode[ilu],
                          inputs_c);
        qstring_append(ret, qstring_get_str(ilu_op));
        QDECREF(inputs_c);
        QDECREF(ilu_op);
    }

    QDECREF(input_c);
    return ret;
}
/*
 * GLSL prologue prepended to every translated vertex shader.
 *
 * It declares the register files as globals (temporaries R0..R12, address
 * register A0, inputs v0..v15 as generic attributes, outputs oD0..oT3, and
 * the constants as one uniform array) and implements every microcode
 * opcode as a macro of the form
 *
 *     OP(dest,mask, src...)  ->  dest.mask = _OP(src...).mask
 *
 * which is what decode_opcode() emits. The write mask is applied
 * component-wise: "MOV(R0,zw, v1)" stores v1.z and v1.w, not v1.x and v1.y.
 *
 * Semantics follow the D3D vertex shader instruction reference
 * (http://msdn.microsoft.com/en-us/library/windows/desktop/bb174703%28v=vs.85%29.aspx)
 * and https://www.opengl.org/registry/specs/NV/vertex_program1_1.txt.
 * Some code was also written from scratch - verify the behaviour if in doubt.
 */
static const char* vsh_header =
    "#version 110\n"
    "\n"
    /* FIXME: I just assumed this is true for all registers?! */
    "vec4 R0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R2 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R3 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R4 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R5 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R6 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R7 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R8 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R9 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R10 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R11 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 R12 = vec4(0.0,0.0,0.0,1.0);\n"
    "\n"
    /* FIXME: What is a0 initialized as? */
    "int A0 = 0;\n"
    "\n"
    /* Vertex inputs, all bound as generic attributes. */
    "attribute vec4 v0;\n"
    "attribute vec4 v1;\n"
    "attribute vec4 v2;\n"
    "attribute vec4 v3;\n"
    "attribute vec4 v4;\n"
    "attribute vec4 v5;\n"
    "attribute vec4 v6;\n"
    "attribute vec4 v7;\n"
    "attribute vec4 v8;\n"
    "attribute vec4 v9;\n"
    "attribute vec4 v10;\n"
    "attribute vec4 v11;\n"
    "attribute vec4 v12;\n"
    "attribute vec4 v13;\n"
    "attribute vec4 v14;\n"
    "attribute vec4 v15;\n"
    "\n"
    /* Outputs are ordinary globals; vsh_translate() copies them to the
     * gl_* built-ins at the end of main(). */
    "#define oPos R12 /* oPos is a mirror of R12 */\n"
    "vec4 oD0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oD1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oB0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oB1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oPts = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oFog = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT0 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT1 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT2 = vec4(0.0,0.0,0.0,1.0);\n"
    "vec4 oT3 = vec4(0.0,0.0,0.0,1.0);\n"
    "\n"
    /* All constants in 1 array declaration.
     * FIXME: it's probably wise to change the c[x] to c##x later because
     * the array forces us to allocate and reupload around 100*4*4 bytes
     * (~1.5kB) of useless data on/to the GPU. */
    "uniform vec4 c[192];\n"
    "#define viewport_scale c[58] /* This seems to be hardwired? See comment in nv2a_gpu.c */\n"
    "#define viewport_offset c[59] /* Same as above */\n"
    "uniform vec2 cliprange;\n"
    "\n"
    "#define MOV(dest,mask, src) dest.mask = _MOV(vec4(src)).mask\n"
    "vec4 _MOV(vec4 src)\n"
    "{\n"
    " return src;\n"
    "}\n"
    "\n"
    "#define MUL(dest,mask, src0, src1) dest.mask = _MUL(vec4(src0), vec4(src1)).mask\n"
    "vec4 _MUL(vec4 src0, vec4 src1)\n"
    "{\n"
    " return src0 * src1;\n"
    "}\n"
    "\n"
    "#define ADD(dest,mask, src0, src1) dest.mask = _ADD(vec4(src0), vec4(src1)).mask\n"
    "vec4 _ADD(vec4 src0, vec4 src1)\n"
    "{\n"
    " return src0 + src1;\n"
    "}\n"
    "\n"
    "#define MAD(dest,mask, src0, src1, src2) dest.mask = _MAD(vec4(src0), vec4(src1), vec4(src2)).mask\n"
    "vec4 _MAD(vec4 src0, vec4 src1, vec4 src2)\n"
    "{\n"
    " return src0 * src1 + src2;\n"
    "}\n"
    "\n"
    "#define DP3(dest,mask, src0, src1) dest.mask = _DP3(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DP3(vec4 src0, vec4 src1)\n"
    "{\n"
    " return vec4(dot(src0.xyz, src1.xyz));\n"
    "}\n"
    "\n"
    "#define DPH(dest,mask, src0, src1) dest.mask = _DPH(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DPH(vec4 src0, vec4 src1)\n"
    "{\n"
    " return vec4(dot(vec4(src0.xyz, 1.0), src1));\n"
    "}\n"
    "\n"
    "#define DP4(dest,mask, src0, src1) dest.mask = _DP4(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DP4(vec4 src0, vec4 src1)\n"
    "{\n"
    " return vec4(dot(src0, src1));\n"
    "}\n"
    "\n"
    "#define DST(dest,mask, src0, src1) dest.mask = _DST(vec4(src0), vec4(src1)).mask\n"
    "vec4 _DST(vec4 src0, vec4 src1)\n"
    "{\n"
    " return vec4(1.0,\n"
    " src0.y * src1.y,\n"
    " src0.z,\n"
    " src1.w);\n"
    "}\n"
    "\n"
    "#define MIN(dest,mask, src0, src1) dest.mask = _MIN(vec4(src0), vec4(src1)).mask\n"
    "vec4 _MIN(vec4 src0, vec4 src1)\n"
    "{\n"
    " return min(src0, src1);\n"
    "}\n"
    "\n"
    "#define MAX(dest,mask, src0, src1) dest.mask = _MAX(vec4(src0), vec4(src1)).mask\n"
    "vec4 _MAX(vec4 src0, vec4 src1)\n"
    "{\n"
    " return max(src0, src1);\n"
    "}\n"
    "\n"
    "#define SLT(dest,mask, src0, src1) dest.mask = _SLT(vec4(src0), vec4(src1)).mask\n"
    "vec4 _SLT(vec4 src0, vec4 src1)\n"
    "{\n"
    " return vec4(src0.x < src1.x ? 1.0 : 0.0,\n"
    " src0.y < src1.y ? 1.0 : 0.0,\n"
    " src0.z < src1.z ? 1.0 : 0.0,\n"
    " src0.w < src1.w ? 1.0 : 0.0);\n"
    "}\n"
    "\n"
    /* The mask parameter is unused: ARL always writes the scalar A0.
     * The address register is loaded with the floor of the source, so
     * negative values must not be truncated toward zero. */
    "#define ARL(dest,mask, src) dest = _ARL(vec4(src).x)\n"
    "int _ARL(float src)\n"
    "{\n"
    " return int(floor(src));\n"
    "}\n"
    "\n"
    "#define SGE(dest,mask, src0, src1) dest.mask = _SGE(vec4(src0), vec4(src1)).mask\n"
    "vec4 _SGE(vec4 src0, vec4 src1)\n"
    "{\n"
    " return vec4(src0.x >= src1.x ? 1.0 : 0.0,\n"
    " src0.y >= src1.y ? 1.0 : 0.0,\n"
    " src0.z >= src1.z ? 1.0 : 0.0,\n"
    " src0.w >= src1.w ? 1.0 : 0.0);\n"
    "}\n"
    "\n"
    "#define RCP(dest,mask, src) dest.mask = _RCP(vec4(src).x).mask\n"
    "vec4 _RCP(float src)\n"
    "{\n"
    " return vec4(1.0 / src);\n"
    "}\n"
    "\n"
    "#define RCC(dest,mask, src) dest.mask = _RCC(vec4(src).x).mask\n"
    "vec4 _RCC(float src)\n"
    "{\n"
    " float t = 1.0 / src;\n"
    " if (t > 0.0) {\n"
    " t = min(t, 1.884467e+019);\n"
    " t = max(t, 5.42101e-020);\n"
    " } else {\n"
    " t = max(t, -1.884467e+019);\n"
    " t = min(t, -5.42101e-020);\n"
    " }\n"
    " return vec4(t);\n"
    "}\n"
    "\n"
    "#define RSQ(dest,mask, src) dest.mask = _RSQ(vec4(src).x).mask\n"
    "vec4 _RSQ(float src)\n"
    "{\n"
    " return vec4(1.0 / sqrt(src));\n"
    "}\n"
    "\n"
    "#define EXP(dest,mask, src) dest.mask = _EXP(vec4(src).x).mask\n"
    "vec4 _EXP(float src)\n"
    "{\n"
    " return vec4(exp2(src));\n"
    "}\n"
    "\n"
    "#define LOG(dest,mask, src) dest.mask = _LOG(vec4(src).x).mask\n"
    "vec4 _LOG(float src)\n"
    "{\n"
    " return vec4(log2(src));\n"
    "}\n"
    "\n"
    "#define LIT(dest,mask, src) dest.mask = _LIT(vec4(src)).mask\n"
    "vec4 _LIT(vec4 src)\n"
    "{\n"
    " vec4 t = vec4(1.0, 0.0, 0.0, 1.0);\n"
    " float power = src.w;\n"
    /* XXX: the 8.8 fixed point limitation (clamping power to
     * +-127.9961) is intentionally not emulated. */
    " if (src.x > 0.0) {\n"
    " t.y = src.x;\n"
    " if (src.y > 0.0) {\n"
    /* XXX: Allowed approximation is EXP(power * LOG(src.y)) */
    " t.z = pow(src.y, power);\n"
    " }\n"
    " }\n"
    " return t;\n"
    "}\n";
/*
 * Translate a vertex program into a complete GLSL vertex shader.
 *
 * version       - currently unused
 * tokens        - instruction stream, VSH_TOKEN_SIZE words per instruction
 * tokens_length - number of 32-bit words available in `tokens`
 *
 * Instructions are decoded one slot at a time until one with the FINAL bit
 * set has been emitted; running out of complete instructions before that
 * trips an assertion. The generated main() ends with a transform of R12
 * (oPos) using the viewport constants and copies of all outputs to the
 * gl_* built-ins.
 *
 * Returns a new QString owned by the caller.
 */
QString* vsh_translate(uint16_t version,
                       uint32_t *tokens, unsigned int tokens_length)
{
    QString *body = qstring_from_str("\n");
    QString *header = qstring_from_str(vsh_header);

#ifdef DEBUG_NV2A_GPU_SHADER_FEEDBACK
    qstring_append(header,
        "\n"
        "/* Debug stuff */\n"
        "varying vec4 debug_v0;\n"
        "varying vec4 debug_v1;\n"
        "varying vec4 debug_v2;\n"
        "varying vec4 debug_v3;\n"
        "varying vec4 debug_v4;\n"
        "varying vec4 debug_v5;\n"
        "varying vec4 debug_v6;\n"
        "varying vec4 debug_v7;\n"
        "varying vec4 debug_v8;\n"
        "varying vec4 debug_v9;\n"
        "varying vec4 debug_v10;\n"
        "varying vec4 debug_v11;\n"
        "varying vec4 debug_v12;\n"
        "varying vec4 debug_v13;\n"
        "varying vec4 debug_v14;\n"
        "varying vec4 debug_v15;\n"
        "varying vec4 debug_oPos;\n"
        "varying vec4 debug_oD0;\n"
        "varying vec4 debug_oD1;\n"
        "varying vec4 debug_oB0;\n"
        "varying vec4 debug_oB1;\n"
        "varying vec4 debug_oPts;\n"
        "varying vec4 debug_oFog;\n"
        "varying vec4 debug_oT0;\n"
        "varying vec4 debug_oT1;\n"
        "varying vec4 debug_oT2;\n"
        "varying vec4 debug_oT3;\n"
        "\n"
        "#define DEBUG_VAR(slot,var) debug_ ## slot ## _ ## var = var;\n"
        "#define DEBUG(slot) \\\n"
        " DEBUG_VAR(slot,R0) \\\n"
        " DEBUG_VAR(slot,R1) \\\n"
        " DEBUG_VAR(slot,R2) \\\n"
        " DEBUG_VAR(slot,R3) \\\n"
        " DEBUG_VAR(slot,R4) \\\n"
        " DEBUG_VAR(slot,R5) \\\n"
        " DEBUG_VAR(slot,R6) \\\n"
        " DEBUG_VAR(slot,R7) \\\n"
        " DEBUG_VAR(slot,R8) \\\n"
        " DEBUG_VAR(slot,R9) \\\n"
        " DEBUG_VAR(slot,R10) \\\n"
        " DEBUG_VAR(slot,R11) \\\n"
        " DEBUG_VAR(slot,R12)\n"
        "\n"
        "#define DEBUG_VARYING_VAR(slot,var) varying vec4 debug_ ## slot ## _ ## var;\n"
        "#define DEBUG_VARYING(slot) \\\n"
        " DEBUG_VARYING_VAR(slot,R0) \\\n"
        " DEBUG_VARYING_VAR(slot,R1) \\\n"
        " DEBUG_VARYING_VAR(slot,R2) \\\n"
        " DEBUG_VARYING_VAR(slot,R3) \\\n"
        " DEBUG_VARYING_VAR(slot,R4) \\\n"
        " DEBUG_VARYING_VAR(slot,R5) \\\n"
        " DEBUG_VARYING_VAR(slot,R6) \\\n"
        " DEBUG_VARYING_VAR(slot,R7) \\\n"
        " DEBUG_VARYING_VAR(slot,R8) \\\n"
        " DEBUG_VARYING_VAR(slot,R9) \\\n"
        " DEBUG_VARYING_VAR(slot,R10) \\\n"
        " DEBUG_VARYING_VAR(slot,R11) \\\n"
        " DEBUG_VARYING_VAR(slot,R12)\n"
        "\n");
    qstring_append(body,
        " /* Debug input */\n"
        " debug_v0 = v0;\n"
        " debug_v1 = v1;\n"
        " debug_v2 = v2;\n"
        " debug_v3 = v3;\n"
        " debug_v4 = v4;\n"
        " debug_v5 = v5;\n"
        " debug_v6 = v6;\n"
        " debug_v7 = v7;\n"
        " debug_v8 = v8;\n"
        " debug_v9 = v9;\n"
        " debug_v10 = v10;\n"
        " debug_v11 = v11;\n"
        " debug_v12 = v12;\n"
        " debug_v13 = v13;\n"
        " debug_v14 = v14;\n"
        " debug_v15 = v15;\n"
        "\n");
#endif

    bool has_final = false;
    uint32_t *cur_token = tokens;

    /* Every slot reads VSH_TOKEN_SIZE words, so only decode a slot when all
     * of its words lie inside the buffer. */
    while ((size_t)(cur_token - tokens) + VSH_TOKEN_SIZE <= tokens_length) {
        unsigned int slot = (unsigned int)(cur_token - tokens) / VSH_TOKEN_SIZE;
        QString *token_str = decode_token(cur_token);
        qstring_append_fmt(body,
                           " /* Slot %u: 0x%08X 0x%08X 0x%08X 0x%08X */\n",
                           slot,
                           cur_token[0],cur_token[1],cur_token[2],cur_token[3]);
        qstring_append(body, qstring_get_str(token_str));
#ifdef DEBUG_NV2A_GPU_SHADER_FEEDBACK
        qstring_append_fmt(header,"DEBUG_VARYING(%u)\n",slot);
        qstring_append_fmt(body," DEBUG(%u)\n",slot);
#endif
        qstring_append(body, "\n");
        QDECREF(token_str);

        if (vsh_get_field(cur_token, FLD_FINAL)) {
            printf("Final at %u\n",slot);
            has_final = true;
            break;
        }
        cur_token += VSH_TOKEN_SIZE;
    }
    assert(has_final);

    /* Note : Since we replaced oPos with r12 in the above decoding,
     * oPos has to be assigned at the end.
     *
     * The shaders leave the result in screen space, while OpenGL expects it
     * in clip coordinates. Use the magic viewport constants for now, but
     * they're not necessarily present.
     */
    qstring_append(body,
#ifdef DEBUG_NV2A_GPU_SHADER_FEEDBACK
        " /* Debug output */\n"
        " debug_oPos = oPos;\n"
        " debug_oD0 = oD0;\n"
        " debug_oD1 = oD1;\n"
        " debug_oB0 = oB0;\n"
        " debug_oB1 = oB1;\n"
        " debug_oPts = oPts;\n"
        " debug_oFog = oFog;\n"
        " debug_oT0 = oT0;\n"
        " debug_oT1 = oT1;\n"
        " debug_oT2 = oT2;\n"
        " debug_oT3 = oT3;\n"
        "\n"
#endif
        /* FIXME: an alternative would be to derive the transform from the
         * surface width / height / zeta max and cliprange instead of the
         * viewport constants. */
        " /* Un-screenspace transform */\n"
        " R12.xyz = R12.xyz - viewport_offset.xyz;\n"
        " R1.x = 1.0 / viewport_scale.x;\n"
        " R1.y = 1.0 / viewport_scale.y;\n"
        /* scale_z = view_z == 0 ? 1 : (1 / view_z) */
        " if (viewport_scale.z == 0.0) {\n"
        " R1.z = 1.0;\n"
        " } else {\n"
        " R1.z = 1.0 / viewport_scale.z;\n"
        " }\n"
        " R12.xyz = R12.xyz * R1.xyz;\n"
        /* Undo the perspective divide. This breaks 2D? Maybe w is zero? */
        " R12.xyz *= R12.w;\n"
        "\n"
        /* No Z coord [0;1]->[-1;1] remapping (z = z * 2 - w) is applied
         * here; see transform_projection in state.c. */
        " /* Set outputs */\n"
        " gl_Position = oPos;\n"
        " gl_FrontColor = oD0;\n"
        " gl_FrontSecondaryColor = oD1;\n"
        " gl_BackColor = oB0;\n"
        " gl_BackSecondaryColor = oB1;\n"
        " gl_PointSize = oPts.x;\n"
        " gl_FogFragCoord = oFog.x;\n"
        " gl_TexCoord[0] = oT0;\n"
        " gl_TexCoord[1] = oT1;\n"
        " gl_TexCoord[2] = oT2;\n"
        " gl_TexCoord[3] = oT3;\n"
        "\n"
    );

    /* Assemble: header, then main() wrapped around the body. */
    QString *ret = qstring_new();
    qstring_append(ret, qstring_get_str(header));
    qstring_append(ret,"\n"
                       "void main(void)\n"
                       "{\n");
    qstring_append(ret, qstring_get_str(body));
    qstring_append(ret,"}\n");
    QDECREF(header);
    QDECREF(body);
    return ret;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment