Created
September 10, 2017 14:59
-
-
Save Mati365/e2eb4c3eae483cf9fa99d99a18715939 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const R = require('ramda'); | |
const {Either} = require('ramda-fantasy'); | |
/**
 * Error message templates. `%{n}` placeholders are positional and are
 * filled in by `format`; index 0 is the upper-cased opcode name.
 */
const messages = {
  unknown: {
    opcode: 'Unknown %{0} opcode',
    argument: 'Wrong %{0} instruction %{1} argument',
    combination: 'Unknown %{0} opcode argument combination',
  },
};
/**
 * Splits a value into two bytes in little-endian order.
 * Only the low 16 bits of `num` are used.
 *
 * @param {Number} num value to split
 * @returns {Number[]} `[low byte, high byte]`
 */
const extract16 = num => ([
  num & 0xFF,
  (num >> 0x8) & 0xFF,
]);
/**
 * Renders binary data as a lower-case hexadecimal string.
 *
 * @param {Uint8Array|Array|String} val anything accepted by `Buffer.from`
 * @returns {String} two hex digits per byte, no separators
 */
const hexValue = val => (
  Buffer
    .from(val)
    .toString('hex')
);
/**
 * Writes a register index into the `reg` field (bits 3-5) of the ModR/M
 * byte that leads an encoded memory operand.
 *
 * @param {Object} reg register token, `{value: {index, size}}`
 * @param {Number[]} encoded `[modrm, ...displacement]` as returned by `extractMemToken`
 * @returns {Number[]} same bytes with the `reg` field filled in
 */
const withRegField = (reg, [modrm, ...displacement]) => [
  modrm | ((reg.value.index & 0x7) << 0x3),
  ...displacement,
];

/**
 * Instruction encoders, addressed as OPCODES[mnemonic][arg1 type][arg2 type].
 * Type keys are built by `getLineInstructionCaller`: `r<size>` for registers,
 * `m` for memory operands, `i` for immediates.
 *
 * Every encoder receives the argument tokens produced by
 * `genArgumentTemplate` (objects with a `value` property, NOT the raw
 * `[index, size]` arrays stored in `REGISTERS`) and returns a Uint8Array.
 */
const OPCODES = {
  mov: {
    m: {
      // MOV r/m16, r16 => 89 /r
      r2: (mem, reg16) => new Uint8Array([
        0x89,
        ...withRegField(reg16, extractMemToken(mem.value)),
      ]),
    },
    r2: {
      // MOV r16, imm16 => B8+rw iw
      i: (reg16, imm16) => new Uint8Array([
        0xB8 + reg16.value.index,
        ...extract16(imm16.value),
      ]),
      // MOV r16, r/m16 => 8B /r
      m: (reg16, mem) => new Uint8Array([
        0x8B,
        ...withRegField(reg16, extractMemToken(mem.value)),
      ]),
    },
    r1: {
      // MOV r8, imm8 => B0+rb ib
      i: (reg8, imm8) => new Uint8Array([
        0xB0 + reg8.value.index,
        imm8.value,
      ]),
    },
  },
};
/**
 * Operand size keywords mapped to a width in bytes. Looked up by
 * `extractInstructionLine` with the upper-cased keyword (e.g. `byte`).
 */
const SIZE = {
  BYTE: 1,
  WORD: 2,
  DWORD: 4,
};
/**
 * Legacy instruction prefixes, one object per prefix group
 * (mnemonic => prefix byte). Only the first group is defined so far.
 *
 * @see http://wiki.osdev.org/X86-64_Instruction_Encoding#Legacy_Prefixes
 */
const PREFIXES = [
  // first group
  {
    lock: 0xF0, // lock
    repne: 0xF2, repnz: 0xF2, // REPNE/REPNZ
    rep: 0xF3, repe: 0xF3, repz: 0xF3, // REP, REPE/REPZ
  },
];
/**
 * Register table, name => [index, size].
 * `index` is the register number listed below; `size` is 1, 2 or 4 for the
 * 8-, 16- and 32-bit names respectively.
 *
 * AL = 0   AX = 0   EAX = 0
 * CL = 1   CX = 1   ECX = 1
 * DL = 2   DX = 2   EDX = 2
 * BL = 3   BX = 3   EBX = 3
 * AH = 4   SP = 4   ESP = 4
 * CH = 5   BP = 5   EBP = 5
 * DH = 6   SI = 6   ESI = 6
 * BH = 7   DI = 7   EDI = 7
 */
const REGISTERS = {
  al: [0x0, 0x1], ah: [0x4, 0x1], ax: [0x0, 0x2], eax: [0x0, 0x4],
  cl: [0x1, 0x1], ch: [0x5, 0x1], cx: [0x1, 0x2], ecx: [0x1, 0x4],
  dl: [0x2, 0x1], dh: [0x6, 0x1], dx: [0x2, 0x2], edx: [0x2, 0x4],
  bl: [0x3, 0x1], bh: [0x7, 0x1], bx: [0x3, 0x2], ebx: [0x3, 0x4],
  sp: [0x4, 0x2],
  bp: [0x5, 0x2],
  si: [0x6, 0x2],
  di: [0x7, 0x2],
};
/**
 * Each line should be parsed into a package
 * with a type property; it will help the compiler
 * to link all labels after compilation.
 */
const LINE_TOKEN_TYPE = {
  CALLER: 0, // resolved instruction encoder plus its arguments
  LABEL: 1, // label definition
  ERROR: 2, // line that could not be resolved
  BINARY: 3, // compiled instruction
};
/**
 * Generates a single ModR/M byte laid out as `mod(2) | reg(3) | rm(3)`.
 * Each field is masked to its width before being placed.
 *
 * @see http://www.c-jump.com/CIS77/CPU/x86/X77_0030_encoding_format.htm
 *
 * @param {Number} mod addressing mode, bits 6-7
 * @param {Number} reg register / opcode extension, bits 3-5
 * @param {Number} rm register or memory operand, bits 0-2
 * @returns {Number} encoded byte
 */
const genMOD = (mod, reg, rm) => (
  (rm & 0x7) | ((reg & 0x7) << 0x3) | ((mod & 0x3) << 0x6)
);
/**
 * Generates a SIB byte. It is packed exactly like a ModR/M byte
 * (2 + 3 + 3 bits), so the encoding is delegated to `genMOD`.
 *
 * @param {Number} ss scale, bits 6-7
 * @param {Number} index index register, bits 3-5
 * @param {Number} base base register, bits 0-2
 * @returns {Number} encoded byte
 */
const genSIB = (ss, index, base) => genMOD(ss, index, base);
/**
 * Returns the number of bytes required to store a value.
 *
 * The value is consumed eight bits at a time with an unsigned shift, so
 * negative numbers are measured as their 32-bit two's complement pattern
 * (4 bytes) instead of looping forever on the sign bit.
 *
 * @param {Number} num value to measure
 * @returns {Number} byte count; 0 for 0 and NaN
 */
const getBytesCount = (num) => {
  let rest = num;
  let bytes = 0;
  while (rest) {
    rest >>>= 0x8;
    bytes += 1;
  }
  return bytes;
};
/**
 * Replaces `%{n}` template placeholders with `args[n]`.
 * The whole index is read, so multi-digit placeholders such as `%{10}`
 * resolve to the right argument.
 *
 * @param {String} str template
 * @param {Array} args template args
 * @returns {String} formatted message
 */
const format = (str, args) => (
  str.replace(
    /%\{(\d+)\}/g,
    (match, index) => args[Number(index)],
  )
);
/**
 * Prints a compilation error for a source line. Curried, so the line
 * number can be bound before the message is known.
 *
 * @param {Number} line line number
 * @param {String} message message content
 */
const printLineError = R.curry(
  (line, message) => console.log(`error at line ${line} :: ${message}`),
);
/**
 * Detects whether a value is a number or a string that can be cast to one
 * (decimal, `0x`, `0b` and `0o` literals are accepted by `Number`).
 * Blank strings and non-string, non-number values are rejected; the global
 * `isNaN` would coerce them to 0 and report them as numbers.
 *
 * @param {*} value token to test
 * @returns {Boolean}
 */
const isNumber = (value) => {
  if (typeof value === 'number')
    return !Number.isNaN(value);

  if (typeof value !== 'string' || value.trim() === '')
    return false;

  return !Number.isNaN(Number(value));
};
/**
 * Token type predicates; they match on the `type` tag attached by
 * `genArgumentTemplate` ('r' = register, 'i' = immediate).
 *
 * @todo Make it better, testing via string is fragile
 */
const isRegister = R.propEq('type', 'r');
const isImmediate = R.propEq('type', 'i');
/**
 * Builds a function that wraps a raw value into a typed token
 * `{value, type}`. Used by `genArgumentTemplate` to tag every instruction
 * token, so that the tags can later form a path into the OPCODES table.
 *
 * @example
 *  prependTemplateType('i')(32) // => {value: 32, type: 'i'}
 *
 * @param {String} type token type tag
 * @returns {Function} value => {value, type}
 */
const prependTemplateType = type => R.compose(
  R.assoc('type', type),
  R.objOf('value'),
);
/**
 * Converts a single source token (string) into a typed token. The first
 * matching rule wins:
 *
 *  number     => {type: 'i', value: Number}
 *  [address]  => {type: 'm', value: 'text between the brackets'}
 *  register   => {type: 'r', value: {index, size}, name}
 *  otherwise  => {type: 'keyword', value: token}
 *
 * Immediates are converted with `Number`, the same coercion `isNumber`
 * validates with, so every literal form that passes the check (`0x`, `0b`,
 * `0o`, decimal) keeps its value; `parseInt` would turn `0b101` into 0.
 */
const genArgumentTemplate = R.cond([
  [
    isNumber,
    R.compose(
      prependTemplateType('i'),
      Number,
    ),
  ], // number size is not known during compilation
  [
    R.test(/^\[[a-zA-Z\s\d\+\*]+\]$/),
    R.compose(
      prependTemplateType('m'),
      R.slice(1, -1),
    ),
  ], // extract address
  [
    R.flip(R.has)(REGISTERS),
    R.converge(
      R.mergeDeepLeft,
      [
        R.compose(
          prependTemplateType('r'),
          R.zipObj([
            'index',
            'size',
          ]),
          R.flip(R.prop)(REGISTERS),
        ),
        R.objOf('name'),
      ],
    ),
  ], // pick register opcode length template, {name, type, value: {index, size}}
  [
    R.T,
    prependTemplateType('keyword'),
  ], // ignore if not found
]);
/**
 * 16-bit addressing: register combination => ModR/M `rm` field.
 *
 * @see
 *  http://www.swansontec.com/sintel.html
 *  http://wiki.osdev.org/X86-64_Instruction_Encoding
 */
const RM16_ENCODING = {
  BX_SI: 0b000,
  BX_DI: 0b001,
  BP_SI: 0b010,
  BP_DI: 0b011,
  SI: 0b100,
  DI: 0b101,
  BP: 0b110,
  BX: 0b111,
};

// Base registers first, index registers second - the order used by the keys above.
const RM16_REGISTER_ORDER = ['BX', 'BP', 'SI', 'DI'];

/**
 * Accepts an array of typed address tokens and returns the `rm` field for
 * the registers found in it. Non-register tokens are ignored and register
 * order does not matter (`[di + bp]` equals `[bp + di]`).
 *
 * NOTE(review): combinations missing from the table (including "no register
 * at all") still fall back to 0b000, which is also the BX+SI encoding.
 *
 * @param {Array} tokens typed tokens, registers carry `type: 'r'` and `name`
 * @returns {Number} rm field, 0-7
 */
const rm16Byte = (tokens) => {
  const key = tokens
    .filter(token => token.type === 'r')
    .map(token => token.name.toUpperCase())
    .sort((a, b) => RM16_REGISTER_ORDER.indexOf(a) - RM16_REGISTER_ORDER.indexOf(b))
    .join('_');

  return Object.prototype.hasOwnProperty.call(RM16_ENCODING, key)
    ? RM16_ENCODING[key]
    : 0x0;
};
/**
 * Encodes a memory operand (the text between the square brackets) into
 * `[ModR/M byte, ...displacement bytes]`.
 *
 * The text is split on `+`, every part is trimmed and converted with
 * `genArgumentTemplate`; registers select the `rm` field and a trailing
 * immediate becomes the displacement. The `reg` field is left at 0.
 *
 * NOTE(review): the displacement byte count is used directly as the `mod`
 * field, which only makes sense for 1 and 2 byte displacements; larger
 * values are emitted as a single array element. An immediate is recognised
 * only when it is the last part of the expression.
 *
 * @param {String} mem address expression, e.g. `bp + di + 0xFFF`
 * @returns {Number[]} ModR/M byte followed by the displacement, if any
 */
const extractMemToken = R.compose(
  (args) => {
    const imm = R.ifElse(
      isImmediate,
      R.prop('value'),
      R.always(0x0),
    )(R.last(args));

    const immSize = imm && getBytesCount(imm);
    const mod = genMOD(
      immSize,
      0x0,
      rm16Byte(args),
    );

    if (imm) {
      return [
        mod,
        ...(
          immSize === 2
            ? extract16(imm)
            : [imm]
        ),
      ];
    }
    return [mod];
  },
  R.map(
    R.compose(
      genArgumentTemplate,
      R.trim,
    ),
  ),
  R.split('+'),
);
/**
 * Splits a line of typed tokens into prefixes, opcode, arguments and the
 * optional operand size keyword.
 *
 * @param {Array} line tokens with types (output of `genArgumentTemplate`)
 * @returns {{prefixes: Array, opcode: Object, args: Array, size: ?Number}}
 */
const extractInstructionLine = (line) => {
  // first group prefixes should be placed before opcode, so only the
  // leading run of prefix keywords is consumed
  const isPrefixToken = token => (
    token.type === 'keyword' && R.has(token.value, PREFIXES[0])
  );
  const precedingPrefixes = R.takeWhile(isPrefixToken, line);

  /**
   * @todo Add 32bit prefixes support
   */
  let instructionArgs = R.drop(precedingPrefixes.length, line);
  let size = null;

  /**
   * decode size instruction property
   * @example:
   *  mov byte [0x2], 0x3
   */
  const sizeToken = instructionArgs[1];
  if (sizeToken && sizeToken.type === 'keyword') {
    const sizeName = R.toUpper(sizeToken.value);

    // keywords that are not size specifiers stay in the argument list
    if (R.has(sizeName, SIZE)) {
      size = SIZE[sizeName];
      instructionArgs = R.remove(1, 1, instructionArgs);
    }
  }

  return {
    prefixes: precedingPrefixes,
    opcode: R.head(instructionArgs), // first instruction arg is opcode type
    args: R.tail(instructionArgs), // rest args are opcode parameters, usually 2 - 3
    size,
  };
};
/**
 * Resolves the encoder function for a line of tokens by walking the
 * OPCODES table: OPCODES[opcode][arg1 type][arg2 type]...
 *
 * @param {Array} line tokens
 * @returns {Either} Right({type: CALLER, args, caller, prefixes, size}) on
 *  success, Left({type: ERROR, message}) when the opcode or its argument
 *  combination is not known
 */
const getLineInstructionCaller = (line) => {
  const instruction = R.map(genArgumentTemplate)(line);
  const {
    opcode,
    args,
    ...instructionDescription
  } = extractInstructionLine(instruction);

  // opcode is missing when the line holds prefixes only; register tokens
  // carry their text in `name`, every other token in `value`
  const opcodeName = opcode
    ? String(R.propOr(opcode.value, 'name', opcode))
    : '';

  const generateError = (message, messageArgs = []) => (
    new Either.Left(
      {
        type: LINE_TOKEN_TYPE.ERROR,
        message: format(message, [
          R.toUpper(opcodeName),
          ...messageArgs,
        ]),
      },
    )
  );

  if (!R.has(opcodeName, OPCODES))
    return generateError(messages.unknown.opcode);

  // generate tokens from template
  let caller = OPCODES[opcodeName];
  for (let i = 0; i < args.length; ++i) {
    // if type has value.size property
    // merge it with type, useful in register
    // path generation
    const {type, value} = args[i];
    const token = `${type}${R.is(Number, value.size) ? value.size : ''}`;

    if (caller[token])
      caller = caller[token];
    else if (i > 0)
      return generateError(messages.unknown.argument, [i]);
    else
      break;
  }

  // the walk stopped on a table node, not on an encoder
  if (!R.is(Function, caller))
    return generateError(messages.unknown.combination);

  return new Either.Right(
    {
      type: LINE_TOKEN_TYPE.CALLER,
      args, // opcode token is already dropped
      caller,
      ...instructionDescription,
    },
  );
};
/**
 * Extracts all tokens from a line. Comments (everything after `;`) are
 * dropped, a bracketed address is kept as one token and a trailing colon
 * stays attached to its word so label lines can be recognised.
 *
 * @example
 *  input: mov ax, 2 ; test
 *  output: ['mov', 'ax', '2']
 *
 *  input: start:
 *  output: ['start:']
 *
 * @param {String} line single source line
 * @returns {String[]} tokens, empty array for blank / comment-only lines
 */
const extractLine = line => (
  line
    .replace(/;.*/g, '')
    .trim()
    .match(/\w+:?|\[[^[]+\]/g) || []
);
/**
 * Splits source text on newlines and tokenizes every line with
 * `extractLine`.
 *
 * @param {String} source whole program text
 * @returns {Array} one token array per source line
 */
const extractLines = R.compose(
  R.map(extractLine),
  R.split(/\n/ig),
);
/**
 * Detects a label line: exactly one token and that token ends with `:`.
 *
 * @param {String[]} tokens tokens of a single line
 * @returns {Boolean}
 */
const isLabelLine = R.both(
  R.pipe(R.length, R.equals(1)),
  R.pipe(R.nth(0), R.endsWith(':')),
);
/**
 * Classifies the tokens of one line: a label line becomes
 * Right({type: LABEL, name}) with the trailing colon removed, anything else
 * is handed to `getLineInstructionCaller`.
 *
 * @param {String[]} tokens tokens of a single line
 * @returns {Either}
 */
const parseExtractedTokens = R.cond([
  [
    isLabelLine,
    ([token]) => new Either.Right({
      type: LINE_TOKEN_TYPE.LABEL,
      name: R.dropLast(1, token),
    }),
  ],
  // add some extra cases like nasm #include
  [
    R.T,
    getLineInstructionCaller,
  ],
]);
/**
 * Parses every non-empty tokenized line into a line token (label, caller or
 * error). Errors are printed with the index of the line and kept in the
 * result; empty lines produce no token.
 * @todo Add merge #include instructions
 *
 * @param {Array} extractedLines token arrays, one per source line
 * @returns {Array} line tokens
 */
const linkLinesCallers = (extractedLines) => {
  const parseExtractedLine = index => R.pipe(
    parseExtractedTokens,
    Either.either(
      (lineToken) => {
        printLineError(index, lineToken.message);
        return lineToken;
      },
      R.identity,
    ),
  );

  // an empty line is returned as-is (an empty array), which chain flattens away
  return R.addIndex(R.chain)(
    (line, index) => (
      R.unless(R.isEmpty)(
        parseExtractedLine(index),
      )(line)
    ),
    extractedLines,
  );
};
/**
 * Transforms a resolved instruction into binary data by calling its encoder
 * with the instruction arguments. A Uint8Array result is converted to a hex
 * string; any other result is passed through unchanged.
 *
 * @param {Object} instruction caller token, `{caller, args}`
 * @returns {{type: Number, code: *}} token of type BINARY
 */
const compileInstruction = ({caller, args}) => {
  const code = R.when(
    R.both(
      R.complement(R.isNil),
      R.is(Uint8Array),
    ),
    hexValue,
  )(
    R.apply(caller, args),
  );

  return {
    type: LINE_TOKEN_TYPE.BINARY,
    code,
  };
};
/**
 * Compiles assembly source: tokenizes the lines, resolves a line token for
 * each of them and encodes every CALLER token. Label and error tokens are
 * left in the output untouched.
 *
 * @param {String} source program text
 * @returns {Array} line tokens
 */
const compile = R.compose(
  R.map(
    R.when(
      R.propEq('type', LINE_TOKEN_TYPE.CALLER),
      compileInstruction,
    ),
  ),
  linkLinesCallers,
  extractLines,
);
// Demo: compile a single instruction and print the resulting line tokens.
console.log(
  compile(`
    mov ax, [bp + di + 0xFFF]
  `),
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment