Skip to content

Instantly share code, notes, and snippets.

@Mati365
Created September 10, 2017 14:59
Show Gist options
  • Save Mati365/e2eb4c3eae483cf9fa99d99a18715939 to your computer and use it in GitHub Desktop.
Save Mati365/e2eb4c3eae483cf9fa99d99a18715939 to your computer and use it in GitHub Desktop.
const R = require('ramda');
const {Either} = require('ramda-fantasy');
/**
 * Error message templates used when a line cannot be assembled.
 * `%{n}` placeholders are substituted by `format` with the n-th argument.
 */
const messages = {
  unknown: {
    opcode: 'Unknown %{0} opcode',
    argument: 'Wrong %{0} instruction %{1} argument',
    combination: 'Unknown %{0} opcode argument combination',
  },
};
/**
 * Split a 16-bit value into its two bytes, least significant byte first
 * (little endian).
 *
 * @param {Number} num value to split; only the low 16 bits are used
 * @returns {Number[]} [low byte, high byte]
 */
const extract16 = (num) => {
  const low = num & 0xFF;
  const high = (num >> 0x8) & 0xFF;
  return [low, high];
};
/**
 * Render binary data as a lowercase hex string.
 *
 * @param {*} val anything accepted by Buffer.from (e.g. a Uint8Array)
 * @returns {String} two hex digits per byte, no separators
 */
const hexValue = (val) => {
  const buffer = Buffer.from(val);
  return buffer.toString('hex');
};
/**
 * Machine-code emitters for the supported instructions.
 *
 * Layout: OPCODES[mnemonic][first operand key][second operand key], where
 * an operand key is the token type (`r` register, `m` memory, `i` immediate)
 * followed by the register size in bytes when the token has one
 * (e.g. `r2` is a 16-bit register).
 *
 * Every emitter receives the operand tokens produced by genArgumentTemplate:
 *   register:  {type: 'r', name, value: {index, size}}
 *   immediate: {type: 'i', value: Number}
 *   memory:    {type: 'm', value: String} (text between the brackets)
 * and returns the encoded instruction as a Uint8Array.
 */
const OPCODES = {
  mov: {
    m: {
      // MOV r/m16, r16 => 89 /r
      r2: (mem, reg16) => {
        const [modrm, ...displacement] = extractMemToken(mem.value);
        return new Uint8Array([
          0x89,
          // the source register goes into the ModRM reg field (bits 3-5)
          modrm | ((reg16.value.index & 0x7) << 0x3),
          ...displacement,
        ]);
      },
    },
    r2: {
      // MOV r16, imm16 => B8+rw iw
      i: (reg16, imm16) => new Uint8Array([
        0xB8 + reg16.value.index,
        ...extract16(imm16.value),
      ]),
      // MOV r16, r/m16 => 8B /r
      m: (reg16, mem) => {
        const [modrm, ...displacement] = extractMemToken(mem.value);
        return new Uint8Array([
          0x8B,
          // the destination register goes into the ModRM reg field (bits 3-5)
          modrm | ((reg16.value.index & 0x7) << 0x3),
          ...displacement,
        ]);
      },
    },
    r1: {
      // MOV r8, imm8 => B0+rb ib
      i: (reg8, imm8) => new Uint8Array([
        0xB0 + reg8.value.index,
        imm8.value,
      ]),
    },
  },
};
/**
 * Operand sizes in bytes, keyed by the upper-cased size keyword
 * (looked up by extractInstructionLine).
 */
const SIZE = {
  BYTE: 0x1,
  WORD: 0x2,
  DWORD: 0x4,
};
/**
 * Instruction prefix bytes, one object per prefix group.
 * Only the first group is defined; several mnemonics map to the same byte.
 *
 * @see http://wiki.osdev.org/X86-64_Instruction_Encoding#Legacy_Prefixes
 */
const PREFIXES = [
  // first group
  {
    // LOCK
    lock: 0xF0,
    // REPNE / REPNZ
    repne: 0xF2,
    repnz: 0xF2,
    // REP, REPE / REPZ
    rep: 0xF3,
    repe: 0xF3,
    repz: 0xF3,
  },
];
/**
 * Register table keyed by lower-case register name.
 * Each entry is [index, size]: the register number used in instruction
 * encodings and the register width in bytes.
 *
 * AL = 0 AX = 0 EAX = 0
 * CL = 1 CX = 1 ECX = 1
 * DL = 2 DX = 2 EDX = 2
 * BL = 3 BX = 3 EBX = 3
 * AH = 4 SP = 4 ESP = 4
 * CH = 5 BP = 5 EBP = 5
 * DH = 6 SI = 6 ESI = 6
 * BH = 7 DI = 7 EDI = 7
 */
const REGISTERS = {
  al: [0x0, 0x1], ah: [0x4, 0x1], ax: [0x0, 0x2], eax: [0x0, 0x4],
  cl: [0x1, 0x1], ch: [0x5, 0x1], cx: [0x1, 0x2], ecx: [0x1, 0x4],
  dl: [0x2, 0x1], dh: [0x6, 0x1], dx: [0x2, 0x2], edx: [0x2, 0x4],
  bl: [0x3, 0x1], bh: [0x7, 0x1], bx: [0x3, 0x2], ebx: [0x3, 0x4],
  // the 32-bit forms below were listed in the table above but missing here
  sp: [0x4, 0x2], esp: [0x4, 0x4],
  bp: [0x5, 0x2], ebp: [0x5, 0x4],
  si: [0x6, 0x2], esi: [0x6, 0x4],
  di: [0x7, 0x2], edi: [0x7, 0x4],
};
/**
 * Kinds of packages a parsed source line can turn into.
 * Every parsed line carries one of these in its `type` property, which
 * lets the compiler link labels after compilation.
 */
const LINE_TOKEN_TYPE = {
  CALLER: 0x0,
  LABEL: 0x1,
  ERROR: 0x2,
  BINARY: 0x3,
};
/**
 * Generate a single ModRM byte: mod in bits 6-7, reg in bits 3-5,
 * rm in bits 0-2. Each field is masked to its width before packing.
 * @see http://www.c-jump.com/CIS77/CPU/x86/X77_0030_encoding_format.htm
 *
 * @param {Number} mod addressing mode (2 bits)
 * @param {Number} reg register / opcode extension (3 bits)
 * @param {Number} rm register / memory operand (3 bits)
 * @returns {Number} packed byte
 */
const genMOD = (mod, reg, rm) => (
  // reg must be masked first and shifted afterwards; masking with
  // (0x7 << 0x3) would test bits 3-5 of the input and drop the value
  (rm & 0x7) | ((reg & 0x7) << 0x3) | ((mod & 0x3) << 0x6)
);
/**
 * Generate a SIB byte. The three fields are packed by genMOD, i.e. with
 * the same bit layout as a ModRM byte.
 *
 * @param {Number} ss value for bits 6-7
 * @param {Number} index value for the middle field
 * @param {Number} base value for bits 0-2
 * @returns {Number} packed byte
 */
const genSIB = (ss, index, base) => {
  const sib = genMOD(ss, index, base);
  return sib;
};
/**
 * Returns the number of bytes required to store a value.
 *
 * Zero (and anything that is not a finite number) needs 0 bytes.
 * Negative values are measured as their 32-bit two's complement form.
 *
 * @param {Number} num
 * @returns {Number} byte count
 */
const getBytesCount = (num) => {
  const value = Number(num);
  if (!Number.isFinite(value))
    return 0;

  // `>>` is sign-propagating: shifting a negative value (or any value with
  // bit 31 set) never reaches zero, so count with division instead
  let remaining = value < 0 ? value >>> 0 : Math.floor(value);
  let bytes = 0;
  while (remaining > 0) {
    remaining = Math.floor(remaining / 0x100);
    bytes++;
  }
  return bytes;
};
/**
 * Replaces template variables with array values.
 * A variable is written `%{n}`, where n is the index into `args`.
 * Placeholders whose index is outside `args` are left untouched.
 *
 * @param {String} str template
 * @param {Array} args template args
 * @returns {String} formatted string
 */
const format = (str, args) => (
  str.replace(
    // capture the whole index so multi-digit placeholders (%{10}) work
    /%\{(\d+)\}/g,
    (match, index) => (
      Number(index) < args.length
        ? String(args[Number(index)])
        : match
    ),
  )
);
/**
 * Log an assembly error for a given line to the console. Curried.
 *
 * @param {Number} line line number
 * @param {String} message message content
 */
const printLineError = R.curry((line, message) => {
  const report = `error at line ${line} :: ${message}`;
  return console.log(report);
});
/**
 * Detects whether a value can be cast to a number.
 * Empty and whitespace-only strings are rejected explicitly: they coerce
 * to 0 and would otherwise be reported as numbers.
 *
 * @param {*} value
 * @returns {Boolean}
 */
const isNumber = (value) => {
  if (typeof value === 'string' && value.trim() === '')
    return false;

  return !Number.isNaN(Number(value));
};
/**
 * True when a token's `type` is 'r' (register).
 *
 * @todo Find something better than comparing type strings
 */
const isRegister = R.pipe(
  R.prop('type'),
  R.equals('r'),
);
/**
 * True when a token's `type` is 'i' (immediate value).
 */
const isImmediate = R.pipe(
  R.prop('type'),
  R.equals('i'),
);
/**
 * Builds a tagging function for instruction tokens: the returned function
 * wraps its argument as `{value, type}`.
 *
 * genArgumentTemplate uses it to mark each token as immediate ('i'),
 * memory ('m'), register ('r') or 'keyword'; those types (plus the
 * register size) later form the lookup path into the OPCODES table.
 *
 * @example
 *  prependTemplateType('i')(32) // => {value: 32, type: 'i'}
 *
 * @param {String} type token type tag
 * @returns {Function} value => {value, type}
 */
const prependTemplateType = type => R.pipe(
  R.objOf('value'),
  R.assoc('type', type),
);
/**
 * Classifies a single source token and wraps it into a typed template.
 * The first matching rule wins:
 *
 *  - numeric literal     => {type: 'i', value: Number}
 *  - `[...]` expression  => {type: 'm', value: String} (brackets stripped)
 *  - register name       => {type: 'r', name, value: {index, size}}
 *  - anything else       => {type: 'keyword', value: String}
 *
 * @param {String} token
 * @returns {Object} typed token
 */
const genArgumentTemplate = R.cond([
  [
    isNumber,
    R.compose(
      prependTemplateType('i'),
      // convert with Number, the same coercion isNumber tests with;
      // parseInt disagrees on e.g. '0b101' (0) and '1e3' (1)
      Number,
    ),
  ], // number size is not known during compilation
  [
    R.test(/^\[[a-zA-Z\s\d\+\*]+\]$/),
    R.compose(
      prependTemplateType('m'),
      R.slice(1, -1),
    ),
  ], // extract address
  [
    R.flip(R.has)(REGISTERS),
    R.converge(
      R.mergeDeepLeft,
      [
        R.compose(
          prependTemplateType('r'),
          R.zipObj([
            'index',
            'size',
          ]),
          R.flip(R.prop)(REGISTERS),
        ),
        R.objOf('name'),
      ],
    ),
  ], // pick register opcode length template, {name, type, value: {index, size}}
  [
    R.T,
    prependTemplateType('keyword'),
  ], // ignore if not found
]);
/**
 * Accepts an array of typed ASM tokens (the terms of a memory operand)
 * and returns the 16-bit addressing R/M code for the registers used.
 *
 * Register tokens are picked out, upper-cased and put in canonical order
 * (base register BX/BP first, then index register SI/DI), so `[si + bx]`
 * resolves exactly like `[bx + si]`. Non-register tokens are ignored.
 * Combinations that are not in the table fall back to 0.
 *
 * @see
 *  http://www.swansontec.com/sintel.html
 *  http://wiki.osdev.org/X86-64_Instruction_Encoding
 *
 * @param {Array} tokens typed tokens, registers as {type: 'r', name}
 * @returns {Number} R/M code (0-7)
 */
const rm16Byte = (tokens) => {
  const RM_CODES = {
    BX_SI: 0b000,
    BX_DI: 0b001,
    BP_SI: 0b010,
    BP_DI: 0b011,
    SI: 0b100,
    DI: 0b101,
    BP: 0b110,
    BX: 0b111,
  };
  const isBase = name => name === 'BX' || name === 'BP';

  const names = tokens
    .filter(token => token.type === 'r')
    .map(token => token.name.toUpperCase());

  // base registers first, relative order otherwise preserved
  const key = [
    ...names.filter(isBase),
    ...names.filter(name => !isBase(name)),
  ].join('_');

  return Object.prototype.hasOwnProperty.call(RM_CODES, key)
    ? RM_CODES[key]
    : 0x0;
};
/**
 * Encode a 16-bit memory operand (the text between the brackets, terms
 * separated by `+`) into a ModRM byte followed by its displacement bytes.
 * The ModRM reg field is left at 0 for the caller to fill in.
 *
 * Encoding rules (16-bit addressing forms):
 *  - no register at all   => mod=00, rm=110, 16-bit direct address
 *  - register(s), no disp => mod=00 (except plain [bp], see below)
 *  - disp fits signed 8   => mod=01 + 1 byte
 *  - otherwise            => mod=10 + 2 bytes (little endian)
 *
 * All immediate terms are summed into the displacement, wherever they
 * appear in the expression; the sum wraps to 16 bits.
 *
 * NOTE(review): register combinations unknown to rm16Byte still fall
 * back to rm=0 there; no error is reported from here.
 *
 * @param {String} mem
 * @returns {Number[]} ModRM byte followed by 0, 1 or 2 displacement bytes
 */
const extractMemToken = R.compose(
  (args) => {
    const hasRegister = R.any(token => token.type === 'r', args);
    const displacement = R.reduce(
      (total, token) => (isImmediate(token) ? total + token.value : total),
      0x0,
      args,
    ) & 0xFFFF;

    // direct address, e.g. [0x1234]
    if (!hasRegister) {
      return [
        genMOD(0b00, 0x0, 0b110),
        ...extract16(displacement),
      ];
    }

    const rm = rm16Byte(args);

    // mod=00 with rm=110 means "direct address", so plain [bp] cannot
    // use the displacement-free form and is emitted as [bp + 0] below
    if (displacement === 0x0 && rm !== 0b110)
      return [genMOD(0b00, 0x0, rm)];

    // disp8 is sign-extended by the CPU: only -128..127 fit in one byte
    if (displacement <= 0x7F || displacement >= 0xFF80) {
      return [
        genMOD(0b01, 0x0, rm),
        displacement & 0xFF,
      ];
    }

    return [
      genMOD(0b10, 0x0, rm),
      ...extract16(displacement),
    ];
  },
  R.map(
    R.compose(
      genArgumentTemplate,
      R.trim,
    ),
  ),
  R.split('+'),
);
/**
 * Split a typed token line into prefixes, opcode, arguments and an
 * optional operand size.
 *
 * @param {Array} line tokens with types ({type, value, ...})
 * @returns {Object} {prefixes, opcode, args, size}
 *  - prefixes: leading keyword tokens found in the first prefix group
 *  - opcode: first token after the prefixes (undefined when none is left)
 *  - args: remaining tokens, without the size keyword
 *  - size: operand size in bytes from SIZE, or null when not specified
 */
const extractInstructionLine = (line) => {
  // first group prefixes should be placed before opcode; tokens are
  // objects, so the mnemonic has to be read from `value`
  const isPrefixToken = token => (
    token.type === 'keyword'
      && R.has(token.value, PREFIXES[0])
  );
  const precedingPrefixes = R.takeWhile(isPrefixToken, line);

  /**
   * @todo Add 32bit prefixes support
   */
  const instructionArgs = R.drop(precedingPrefixes.length, line);

  /**
   * decode size instruction property
   * @example:
   *  mov byte [0x2], 0x3
   *
   * Only keywords that really name a size are consumed; other keywords
   * (e.g. a label used as a jump target) stay in the argument list.
   */
  const sizeToken = instructionArgs[1];
  const hasSizeKeyword = (
    sizeToken !== undefined
      && sizeToken.type === 'keyword'
      && R.has(R.toUpper(sizeToken.value), SIZE)
  );

  return {
    prefixes: precedingPrefixes,
    opcode: R.head(instructionArgs), // first instruction arg is opcode type
    args: R.drop(hasSizeKeyword ? 2 : 1, instructionArgs), // rest args are opcode parameters, usually 2 - 3
    size: hasSizeKeyword
      ? SIZE[R.toUpper(sizeToken.value)]
      : null,
  };
};
/**
 * Get the assembler caller function for a line of raw tokens by walking
 * the OPCODES table: OPCODES[opcode][arg0 key][arg1 key]...
 *
 * @param {Array} line tokens
 * @returns {Either}
 *  Right({type: CALLER, caller, args, prefixes, size}) on success,
 *  Left({type: ERROR, message}) when the opcode or its argument
 *  combination is not supported
 */
const getLineInstructionCaller = (line) => {
  const instruction = R.map(genArgumentTemplate)(line);
  const {
    opcode,
    args,
    ...instructionDescription
  } = extractInstructionLine(instruction);

  // text used for the opcode in error messages; registers keep their
  // table entry in `value`, so use `name` for them
  const opcodeLabel = opcode
    ? String(opcode.type === 'r' ? opcode.name : opcode.value)
    : '';

  const generateError = (message, messageArgs = []) => (
    new Either.Left(
      {
        type: LINE_TOKEN_TYPE.ERROR,
        message: format(message, [
          R.toUpper(opcodeLabel),
          ...messageArgs,
        ]),
      },
    )
  );

  // own-property check: a missing opcode must not crash the walk below
  // and inherited keys (e.g. "constructor") must not resolve
  const isKnownOpcode = (
    opcode !== undefined
      && opcode.type === 'keyword'
      && R.has(opcode.value, OPCODES)
  );
  if (!isKnownOpcode)
    return generateError(messages.unknown.opcode);

  // table key of an argument: its type, followed by value.size when the
  // token has a numeric one (registers), e.g. "r2", "i", "m"
  const toTemplateKey = ({type, value}) => (
    `${type}${typeof value.size === 'number' ? value.size : ''}`
  );

  // generate tokens from template
  let caller = OPCODES[opcode.value];
  for (let i = 0; i < args.length; ++i) {
    const next = caller[toTemplateKey(args[i])];
    if (next)
      caller = next;
    else if (i > 0)
      return generateError(messages.unknown.argument, [i]);
    else
      break;
  }

  // walk stopped on a table node instead of an emitter function
  if (!R.is(Function, caller))
    return generateError(messages.unknown.combination);

  return new Either.Right(
    {
      type: LINE_TOKEN_TYPE.CALLER,
      args, // drop first opcode parameter, its found
      caller,
      ...instructionDescription,
    },
  );
};
/**
 * Extracts all tokens from a single source line. Comments (`;` to the end
 * of the line) are dropped, a bracketed memory operand is kept as one
 * token, and a trailing `:` stays attached to its word so label lines can
 * be recognised later. Tokens are returned as strings.
 *
 * @example
 *  input: mov ax, 2 ; test
 *  output: ['mov', 'ax', '2']
 *
 *  input: start:
 *  output: ['start:']
 *
 * @param {String} line
 * @returns {String[]} tokens, empty array for blank / comment-only lines
 */
const extractLine = line => (
  line
    .replace(/(?:;.*)/ig, '')
    .trim()
    .match(/\w+:?|\[[^\]]+\]/g) || []
);
/**
 * Break source text into lines and tokenize every line with extractLine.
 *
 * @param {String} source
 * @returns {Array} one token array per source line
 */
const extractLines = R.pipe(
  R.split(/\n/ig),
  R.map(extractLine),
);
/**
 * Detects a label line: a token list holding exactly one token that
 * ends with ':'. Everything else is treated as an opcode line.
 */
const isLabelLine = R.allPass([
  R.pipe(R.length, R.equals(1)),
  R.pipe(R.nth(0), R.endsWith(':')),
]);
/**
 * Turn one line of tokens into an Either-wrapped line package:
 * label lines become Right({type: LABEL, name}) with the trailing ':'
 * removed, every other line is handed to getLineInstructionCaller.
 */
const parseExtractedTokens = R.cond([
  [
    isLabelLine,
    (tokens) => {
      const label = R.head(tokens);
      return new Either.Right({
        type: LINE_TOKEN_TYPE.LABEL,
        name: R.dropLast(1, label),
      });
    },
  ],
  // add some extra cases like nasm #include
  [
    R.T,
    getLineInstructionCaller,
  ],
]);
/**
 * Add type property and caller to each line: every non-empty token line
 * is parsed into a line package (label, caller or error). Errors are
 * printed with their 1-based source line number and kept in the result;
 * empty lines are dropped.
 * @todo Add merge #include instructions
 *
 * @param {Array} extractedLines token arrays, one per source line
 * @returns {Array} line packages
 */
const linkLinesCallers = (extractedLines) => {
  const parseExtractedLine = lineNumber => R.pipe(
    parseExtractedTokens,
    Either.either(
      (lineToken) => {
        printLineError(lineNumber, lineToken.message);
        return lineToken;
      },
      R.identity,
    ),
  );

  return R.addIndex(R.chain)(
    (line, index) => (
      R.unless(R.isEmpty)(
        // array index is 0-based, reported line numbers start at 1
        parseExtractedLine(index + 1),
      )(line)
    ),
    extractedLines,
  );
};
/**
 * Transform an extracted instruction into binary data by invoking its
 * caller with the instruction arguments. A Uint8Array result is converted
 * to a hex string; any other result is stored unchanged.
 *
 * @param {Instruction} instruction package with `caller` and `args`
 * @returns {Object} {type: BINARY, code}
 */
const compileInstruction = ({caller, args}) => {
  const output = caller(...args);
  const isBinary = !R.isNil(output) && R.is(Uint8Array, output);

  return {
    type: LINE_TOKEN_TYPE.BINARY,
    code: isBinary ? hexValue(output) : output,
  };
};
/**
 * Compile assembly source: tokenize the lines, resolve each line to a
 * line package, then replace every CALLER package with its compiled form.
 * Packages of other types are passed through unchanged.
 */
const compile = R.pipe(
  extractLines,
  linkLinesCallers,
  R.map(
    R.when(
      R.propEq('type', LINE_TOKEN_TYPE.CALLER),
      compileInstruction,
    ),
  ),
);
// Demo run: compile a single instruction and print the resulting packages
const demoSource = `
mov ax, [bp + di + 0xFFF]
`;
console.log(compile(demoSource));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment