Mati365 · September 10, 2017 14:59
diff --git a/ass.js b/ass.js
 const R = require('ramda');
 const {Either} = require('ramda-fantasy');

 /**
 * Error message templates grouped by category.
 * `%{N}` placeholders are replaced with the N-th template argument.
 */
 const messages = {
  unknown: {
    // %{0} - opcode name
    opcode: 'Unknown %{0} opcode',

    // %{0} - opcode name, %{1} - argument index
    argument: 'Wrong %{0} instruction %{1} argument',

    // %{0} - opcode name
    combination: 'Unknown %{0} opcode argument combination',
  },
 };

 /**
 * Splits a number into its two low-order bytes, least significant
 * byte first (little endian). Bits above bit 15 are ignored.
 *
 * @param {Number} num
 * @returns {Number[]} [low byte, high byte]
 */
 const extract16 = (num) => {
  const low = num & 0xFF;
  const high = (num >> 0x8) & 0xFF;

  return [low, high];
 };

 /**
 * Encodes a sequence of bytes as a lowercase hexadecimal string.
 *
 * @param {Uint8Array|Array|String} val  any value accepted by Buffer.from
 * @returns {String}
 */
 const hexValue = (val) => {
  const buffer = Buffer.from(val);

  return buffer.toString('hex');
 };

 /**
 * Places a register index in the reg field (bits 3-5) of a ModR/M byte.
 *
 * @param {Number} regIndex  register index, only the low 3 bits are used
 * @param {Number[]} bytes   [ModR/M byte, ...displacement bytes]
 * @returns {Number[]}       copy of bytes with the reg field set
 */
 const withModRmReg = (regIndex, [modrm, ...displacement]) => [
  modrm | ((regIndex & 0x7) << 0x3),
  ...displacement,
 ];

 /**
 * Opcode encoders, addressed as
 * OPCODES[mnemonic][first argument template][second argument template].
 * A template key is the argument type followed, for registers, by the
 * register size in bytes ('r1', 'r2', 'm', 'i').
 *
 * Each encoder receives the parsed argument tokens:
 * register  {type: 'r', name, value: {index, size}}
 * immediate {type: 'i', value: Number}
 * memory    {type: 'm', value: String}
 * and returns the encoded instruction as Uint8Array.
 */
 const OPCODES = {
  mov: {
    m: {
      // MOV r/m16, r16 => 89 /r
      r2: (mem, reg16) => new Uint8Array([
        0x89,
        ...withModRmReg(reg16.value.index, extractMemToken(mem.value)),
      ]),
    },
    r2: {
      // MOV r16, imm16 => B8+rw iw
      i: (reg16, imm16) => new Uint8Array([
        0xB8 + reg16.value.index,
        ...extract16(imm16.value),
      ]),
      // MOV r16, r/m16 => 8B /r
      m: (reg16, mem) => new Uint8Array([
        0x8B,
        ...withModRmReg(reg16.value.index, extractMemToken(mem.value)),
      ]),
    },
    r1: {
      // MOV r8, imm8 => B0+rb ib
      i: (reg8, imm8) => new Uint8Array([
        0xB0 + reg8.value.index,
        imm8.value,
      ]),
    },
  },
 };

 /**
 * Operand size keywords mapped to their width in bytes.
 */
 const SIZE = {
  BYTE: 0x1,
  WORD: 0x2,
  DWORD: 0x4,
 };

 /**
 * Instruction prefixes by group; each group maps a mnemonic to its byte.
 *
 * @see http://wiki.osdev.org/X86-64_Instruction_Encoding#Legacy_Prefixes
 */
 const PREFIXES = [
  // first group
  {
    // lock
    lock: 0xF0,

    // REPNE/REPNZ
    repne: 0xF2,
    repnz: 0xF2,

    // REP, REPE/REPZ
    rep: 0xF3,
    repe: 0xF3,
    repz: 0xF3,
  },
 ];

 /**
 * Registers table, each entry is [index, size in bytes]
 *
 * AL = 0 AX = 0 EAX = 0
 * CL = 1 CX = 1 ECX = 1
 * DL = 2 DX = 2 EDX = 2
 * BL = 3 BX = 3 EBX = 3
 * AH = 4 SP = 4 ESP = 4
 * CH = 5 BP = 5 EBP = 5
 * DH = 6 SI = 6 ESI = 6
 * BH = 7 DI = 7 EDI = 7
 */
 const REGISTERS = {
  al: [0x0, 0x1], ah: [0x4, 0x1], ax: [0x0, 0x2], eax: [0x0, 0x4],
  cl: [0x1, 0x1], ch: [0x5, 0x1], cx: [0x1, 0x2], ecx: [0x1, 0x4],
  dl: [0x2, 0x1], dh: [0x6, 0x1], dx: [0x2, 0x2], edx: [0x2, 0x4],
  bl: [0x3, 0x1], bh: [0x7, 0x1], bx: [0x3, 0x2], ebx: [0x3, 0x4],

  // pointer and index registers; the 32 bit forms listed in the table
  // above were missing from the map
  sp: [0x4, 0x2], esp: [0x4, 0x4],
  bp: [0x5, 0x2], ebp: [0x5, 0x4],
  si: [0x6, 0x2], esi: [0x6, 0x4],
  di: [0x7, 0x2], edi: [0x7, 0x4],
 };

 /**
 * Kinds of packages a source line is turned into. Every parsed line
 * carries one of these values in its `type` property.
 */
 const LINE_TOKEN_TYPE = {
  // instruction waiting to be encoded
  CALLER: 0,

  // label definition
  LABEL: 1,

  // line that could not be parsed
  ERROR: 2,

  // encoded instruction
  BINARY: 3,
 };

 /**
 * Generate single ModR/M byte, layout: mod(2 bits) reg(3 bits) rm(3 bits)
 * @see http://www.c-jump.com/CIS77/CPU/x86/X77_0030_encoding_format.htm
 *
 * @param {Number} mod  addressing mode, bits 6-7
 * @param {Number} reg  register index or opcode extension, bits 3-5
 * @param {Number} rm   register / memory operand, bits 0-2
 * @returns {Number}    byte value
 */
 const genMOD = (mod, reg, rm) => (
  // each field is masked to its width first and shifted afterwards,
  // masking reg with an already shifted mask would always drop it
  (rm & 0x7) | ((reg & 0x7) << 0x3) | ((mod & 0x3) << 0x6)
 );

 /**
 * Generate single SIB byte, it shares the 2-3-3 bit layout of ModR/M
 *
 * @param {Number} ss     scale, bits 6-7
 * @param {Number} index  index register, bits 3-5
 * @param {Number} base   base register, bits 0-2
 * @returns {Number}      byte value
 */
 const genSIB = (ss, index, base) => genMOD(
  ss, index, base,
 );

 /**
 * Returns number of bytes required to store variable,
 * the value is treated as an unsigned 32 bit integer
 *
 * @param {Number} num
 * @returns {Number} 0 for zero, otherwise 1 - 4
 */
 const getBytesCount = (num) => {
  let rest = num;
  let bytes = 0;

  // unsigned shift, a signed one never reaches zero for
  // negative values and values with the highest bit set
  while (rest) {
    rest >>>= 0x8;
    bytes++;
  }
  return bytes;
 };

 /**
 * Replaces template variables with array values,
 * variable has form %{N} where N is index in args
 *
 * @example
 * format('Unknown %{0} opcode', ['MOV']) // 'Unknown MOV opcode'
 *
 * @param {String}  str   template
 * @param {Array}   args  template args
 * @returns {String}
 */
 const format = (str, args) => (
  str.replace(
    // whole number is captured, so indexes above 9 work too
    /%\{(\d+)\}/g,
    (match, index) => args[Number(index)],
  )
 );

 /**
 * Prints error message of single line to the console, curried
 *
 * @param {Number}  line    line number
 * @param {String}  message message content
 */
 const printLineError = R.curry((line, message) => {
  console.log(`error at line ${line} :: ${message}`);
 });

 /**
 * Detects if string can be casted to number,
 * empty and whitespace only strings are not numbers
 * even though they are coerced to 0
 *
 * @param {String|Number} value
 * @returns {Boolean}
 */
 const isNumber = (value) => {
  if (typeof value === 'string' && value.trim() === '')
    return false;

  return !Number.isNaN(Number(value));
 };

 /**
 * Argument token type predicates
 *
 * @todo Make it better, testing via
 * string is stupid
 *
 * @param {Object} token  token with type property
 * @returns {Boolean}
 */
 const isRegister = token => token.type === 'r';
 const isImmediate = token => token.type === 'i';

 /**
 * Creates function that wraps value into typed token,
 * token type is used later to generate path to OPCODE table
 *
 * @example
 * prependTemplateType('i')(32) // {value: 32, type: 'i'}
 *
 * @param {String} type  token type
 * @returns {Function}   value => {value, type}
 */
 const prependTemplateType = type => value => ({
  value,
  type,
 });

 /**
 * Replaces single instruction token with typed template,
 * first matching rule wins:
 *
 * number           => {type: 'i', value: Number}
 * [address]        => {type: 'm', value: text between brackets}
 * register name    => {type: 'r', name, value: {index, size}}
 * anything else    => {type: 'keyword', value: token}
 *
 * @param {String} token
 * @returns {Object}
 */
 const genArgumentTemplate = R.cond([
  [
    isNumber,
    R.compose(
      prependTemplateType('i'),
      // Number uses the same coercion as the isNumber check, parseInt
      // does not and returned 0 for 0b11 or 1 for 1e3
      Number,
    ),
  ], // number size is not known during compilation
  [
    R.test(/^\[[a-zA-Z\s\d\+\*]+\]$/),
    R.compose(
      prependTemplateType('m'),
      R.slice(1, -1),
    ),
  ], // extract address
  [
    R.flip(R.has)(REGISTERS),
    R.converge(
      R.mergeDeepLeft,
      [
        R.compose(
          prependTemplateType('r'),
          R.zipObj([
            'index',
            'size',
          ]),
          R.flip(R.prop)(REGISTERS),
        ),
        R.objOf('name'),
      ],
    ),
  ], // pick register opcode length template, {name, type, value: {index, size}}
  [
    R.T,
    prependTemplateType('keyword'),
  ], // ignore if not found
 ]);

 /**
 * Accepts array of address tokens and picks the 16 bit R/M value
 * for the registers used in it. Names of register tokens are joined
 * with underscore in order of appearance and looked up in the table,
 * unknown combination gives 0x0.
 *
 * @see
 * http://www.swansontec.com/sintel.html
 * http://wiki.osdev.org/X86-64_Instruction_Encoding
 *
 * @param {Array} tokens  typed address tokens
 * @returns {Number}
 */
 const rm16Byte = (tokens) => {
  const table = {
    BX_SI: 0b000,
    BX_DI: 0b001,
    BP_SI: 0b010,
    BP_DI: 0b011,
    SI: 0b100,
    DI: 0b101,
    BP: 0b110,
    BX: 0b111,
  };

  const key = tokens
    .filter(token => token.type === 'r')
    .map(token => R.toUpper(token.name))
    .join('_');

  return R.defaultTo(0x0, table[key]);
 };

 /**
 * Extract addressing mode, generates bytes that describe memory
 * operand: single MOD byte and after it displacement, if present.
 * Steps are listed bottom-up because of R.compose.
 *
 * @param {String} mem  address text without brackets, e.g. bp + di + 0xFFF
 * @returns {Array}     [mod byte, ...displacement bytes]
 */
 const extractMemToken = R.compose(
  // 3. generate bytes from typed tokens
  (args) => {
    // displacement is read only from the last part of address,
    // 0x0 if last part is not a number
    const imm = R.ifElse(
      isImmediate,
      R.prop('value'),
      R.always(0x0),
    )(R.last(args));

    // bytes count of displacement is passed as mod field,
    // reg field is always 0x0 here
    // NOTE(review): genMOD keeps only 2 lowest bits of mod, so
    // displacement longer than 2 bytes gives wrong mode - confirm
    const immSize = imm && getBytesCount(imm);
    const mod = genMOD(
      immSize,
      0x0,
      rm16Byte(args),
    );

    const bytes = R.of(mod);
    if (imm) {
      return [
        ...bytes,
        // 2 bytes displacement is stored as little endian,
        // any other size is appended as single array item
        ...(
          immSize === 2
            ? extract16(imm)
            : R.of(imm)
        ),
      ];
    }
    return bytes;
  },
  // 2. trim each part and detect its type
  R.map(
    R.compose(
      genArgumentTemplate,
      R.trim,
    ),
  ),
  // 1. split address into parts
  R.split('+'),
 );

 /**
 * Splits typed tokens of single line into prefixes,
 * opcode, opcode arguments and optional operand size
 *
 * @param {Array} line  tokens with types
 * @returns {Object}    {prefixes, opcode, args, size}
 */
 const extractInstructionLine = (line) => {
  // first group prefixes should be placed before opcode, tokens are
  // objects so prefix name must be read from value property
  const isPrefix = token => (
    token.type === 'keyword' && R.has(token.value, PREFIXES[0])
  );
  const precedingPrefixes = R.takeWhile(isPrefix, line);

  /**
   * @todo Add 32bit prefixes support
   */
  let instructionArgs = R.drop(precedingPrefixes.length, line);
  let size = null;

  /**
   * decode size instruction property, token is removed
   * only if it is known size name, other keywords stay in args
   * @example:
   * mov byte [0x2], 0x3
   */
  const sizeToken = instructionArgs[1];
  if (sizeToken && sizeToken.type === 'keyword') {
    const sizeName = R.toUpper(sizeToken.value);

    if (R.has(sizeName, SIZE)) {
      size = SIZE[sizeName];
      instructionArgs = R.remove(1, 1, instructionArgs);
    }
  }

  return {
    prefixes: precedingPrefixes,
    opcode: R.head(instructionArgs), // first instruction arg is opcode type
    args: R.tail(instructionArgs), // rest args are opcode parameters, usually 2 - 3
    size,
  };
 };

 /**
 * Get assembler caller function from tokens,
 * simple generate path to opcodes table
 *
 * @param {Array} line  tokens
 * @returns {Either}    Right with CALLER package or Left with ERROR package
 */
 const getLineInstructionCaller = (line) => {
  const instruction = R.map(genArgumentTemplate)(line);
  const {
    opcode,
    args,
    ...instructionDescription
  } = extractInstructionLine(instruction);

  // opcode token may be missing or may not be a keyword (register,
  // number), name is always a string so error can be reported
  const opcodeName = opcode
    ? String(opcode.name !== undefined ? opcode.name : opcode.value)
    : '';

  const generateError = (message, extraArgs = []) => (
    new Either.Left(
      {
        type: LINE_TOKEN_TYPE.ERROR,
        message: format(message, [
          opcodeName.toUpperCase(),
          ...extraArgs,
        ]),
      },
    )
  );

  // unknown opcode must be rejected before table lookup
  if (!R.has(opcodeName, OPCODES)) {
    return generateError(messages.unknown.opcode);
  }

  // generate tokens from template
  let caller = OPCODES[opcodeName];
  for (let i = 0; i < args.length; ++i) {
    // if type has value.size property
    // merge it with type, useful in register
    // path generation
    const {type, value} = args[i];
    const token = `${type}${R.is(Number, value.size) ? value.size : ''}`;

    if (caller[token]) {
      caller = caller[token];
    } else if (i > 0) {
      return generateError(messages.unknown.argument, [i]);
    } else {
      break;
    }
  }

  // path ended before reaching encoder function
  if (!R.is(Function, caller)) {
    return generateError(messages.unknown.combination);
  }

  return new Either.Right(
    {
      type: LINE_TOKEN_TYPE.CALLER,
      args, // drop first opcode parameter, its found
      caller,
      ...instructionDescription,
    },
  );
 };

 /**
 * Extracts all tokens from line, comment is dropped. Token is
 * a word (with optional trailing colon, so labels can be detected)
 * or whole address in brackets.
 *
 * @example
 * input: mov ax, 2 ; test
 * output: ['mov', 'ax', '2']
 *
 * input: loop:
 * output: ['loop:']
 *
 * @param {String} line
 * @returns {String[]}
 */
 const extractLine = (line) => {
  const code = line
    .replace(/(?:;.*)/ig, '')
    .trim();

  // match returns null if there are no tokens
  return code.match(/\w+:?|\[[^[]+\]/g) || [];
 };

 /**
 * Splits source by new line character and
 * extracts tokens from each line
 *
 * @param {String} source
 * @returns {Array}  list of tokens lists, one per line
 */
 const extractLines = source => (
  source
    .split(/\n/ig)
    .map(line => extractLine(line))
 );

 /**
 * Detects if tokens list is label definition,
 * it must be single token that ends with colon
 *
 * @param {Array} tokens
 * @returns {Boolean}
 */
 const isLabelLine = tokens => (
  tokens.length === 1 && tokens[0].endsWith(':')
 );

 /**
 * Transforms tokens of single line into Either with package,
 * label line gives LABEL package with name without colon,
 * any other line is parsed as instruction
 *
 * @param {Array} tokens
 * @returns {Either}
 */
 const parseExtractedTokens = (tokens) => {
  if (isLabelLine(tokens)) {
    const [token] = tokens;

    return new Either.Right({
      type: LINE_TOKEN_TYPE.LABEL,
      name: token.slice(0, -1),
    });
  }

  // add some extra cases like nasm #include
  return getLineInstructionCaller(tokens);
 };

 /**
 * Transforms tokens of each line into package with type property,
 * parse errors are printed with index of line and returned
 * as packages too
 * @todo Add merge #include instructions
 *
 * @param {Array} extractedLines  list of tokens lists, one per line
 */
 const linkLinesCallers = (extractedLines) => {
  // creates parser for line with given index, Either is unwrapped here:
  // Left is printed and returned, Right is returned without changes
  const parseExtractedLine = index => R.pipe(
    parseExtractedTokens,
    Either.either(
      (lineToken) => {
        printLineError(index, lineToken.message);
        return lineToken;
      },
      R.identity,
    ),
  );

  // line without tokens is not parsed and stays an empty array
  // NOTE(review): presumably R.chain flattens these empty arrays
  // away and keeps packages as they are - confirm
  return R.addIndex(R.chain)(
    (line, index) => (
      R.unless(R.isEmpty)(
        parseExtractedLine(index),
      )(line)
    ),
    extractedLines,
  );
 };

 /**
 * Calls instruction encoder with instruction arguments, encoded
 * bytes are stored as hex string, any other result is stored as is
 *
 * @param {Instruction}  package with caller and args
 * @returns {Object}     BINARY package
 */
 const compileInstruction = ({caller, args}) => {
  const result = R.apply(caller, args);
  const code = R.is(Uint8Array, result)
    ? hexValue(result)
    : result;

  return {
    type: LINE_TOKEN_TYPE.BINARY,
    code,
  };
 };

 /**
 * Compiles source code: extracts tokens from lines, transforms
 * them into packages and encodes each CALLER package
 *
 * @param {String} source
 * @returns {Array}  list of packages
 */
 const compile = R.pipe(
  extractLines,
  linkLinesCallers,
  R.map(
    R.when(
      R.propEq('type', LINE_TOKEN_TYPE.CALLER),
      compileInstruction,
    ),
  ),
 );

 // demo: compiles sample instruction and prints list of packages,
 // it is executed each time the file is loaded
 console.log(
  compile(`
    mov ax, [bp + di + 0xFFF]
  `),
 );
	const R = require('ramda');
	const {Either} = require('ramda-fantasy');

	/**
	 * Error message templates grouped by category.
	 * `%{N}` placeholders are replaced with the N-th template argument.
	 */
	const messages = {
	  unknown: {
	    // %{0} - opcode name
	    opcode: 'Unknown %{0} opcode',

	    // %{0} - opcode name, %{1} - argument index
	    argument: 'Wrong %{0} instruction %{1} argument',

	    // %{0} - opcode name
	    combination: 'Unknown %{0} opcode argument combination',
	  },
	};

	/**
	 * Splits a number into its two low-order bytes, least significant
	 * byte first (little endian). Bits above bit 15 are ignored.
	 *
	 * @param {Number} num
	 * @returns {Number[]} [low byte, high byte]
	 */
	const extract16 = (num) => {
	  const low = num & 0xFF;
	  const high = (num >> 0x8) & 0xFF;

	  return [low, high];
	};

	/**
	 * Encodes a sequence of bytes as a lowercase hexadecimal string.
	 *
	 * @param {Uint8Array|Array|String} val  any value accepted by Buffer.from
	 * @returns {String}
	 */
	const hexValue = (val) => {
	  const buffer = Buffer.from(val);

	  return buffer.toString('hex');
	};

	/**
	 * Places a register index in the reg field (bits 3-5) of a ModR/M byte.
	 *
	 * @param {Number} regIndex  register index, only the low 3 bits are used
	 * @param {Number[]} bytes   [ModR/M byte, ...displacement bytes]
	 * @returns {Number[]}       copy of bytes with the reg field set
	 */
	const withModRmReg = (regIndex, [modrm, ...displacement]) => [
	  modrm | ((regIndex & 0x7) << 0x3),
	  ...displacement,
	];

	/**
	 * Opcode encoders, addressed as
	 * OPCODES[mnemonic][first argument template][second argument template].
	 * A template key is the argument type followed, for registers, by the
	 * register size in bytes ('r1', 'r2', 'm', 'i').
	 *
	 * Each encoder receives the parsed argument tokens:
	 * register  {type: 'r', name, value: {index, size}}
	 * immediate {type: 'i', value: Number}
	 * memory    {type: 'm', value: String}
	 * and returns the encoded instruction as Uint8Array.
	 */
	const OPCODES = {
	  mov: {
	    m: {
	      // MOV r/m16, r16 => 89 /r
	      r2: (mem, reg16) => new Uint8Array([
	        0x89,
	        ...withModRmReg(reg16.value.index, extractMemToken(mem.value)),
	      ]),
	    },
	    r2: {
	      // MOV r16, imm16 => B8+rw iw
	      i: (reg16, imm16) => new Uint8Array([
	        0xB8 + reg16.value.index,
	        ...extract16(imm16.value),
	      ]),
	      // MOV r16, r/m16 => 8B /r
	      m: (reg16, mem) => new Uint8Array([
	        0x8B,
	        ...withModRmReg(reg16.value.index, extractMemToken(mem.value)),
	      ]),
	    },
	    r1: {
	      // MOV r8, imm8 => B0+rb ib
	      i: (reg8, imm8) => new Uint8Array([
	        0xB0 + reg8.value.index,
	        imm8.value,
	      ]),
	    },
	  },
	};

	/**
	 * Operand size keywords mapped to their width in bytes.
	 */
	const SIZE = {
	  BYTE: 0x1,
	  WORD: 0x2,
	  DWORD: 0x4,
	};

	/**
	 * Instruction prefixes by group; each group maps a mnemonic to its byte.
	 *
	 * @see http://wiki.osdev.org/X86-64_Instruction_Encoding#Legacy_Prefixes
	 */
	const PREFIXES = [
	  // first group
	  {
	    // lock
	    lock: 0xF0,

	    // REPNE/REPNZ
	    repne: 0xF2,
	    repnz: 0xF2,

	    // REP, REPE/REPZ
	    rep: 0xF3,
	    repe: 0xF3,
	    repz: 0xF3,
	  },
	];

	/**
	 * Registers table, each entry is [index, size in bytes]
	 *
	 * AL = 0 AX = 0 EAX = 0
	 * CL = 1 CX = 1 ECX = 1
	 * DL = 2 DX = 2 EDX = 2
	 * BL = 3 BX = 3 EBX = 3
	 * AH = 4 SP = 4 ESP = 4
	 * CH = 5 BP = 5 EBP = 5
	 * DH = 6 SI = 6 ESI = 6
	 * BH = 7 DI = 7 EDI = 7
	 */
	const REGISTERS = {
	  al: [0x0, 0x1], ah: [0x4, 0x1], ax: [0x0, 0x2], eax: [0x0, 0x4],
	  cl: [0x1, 0x1], ch: [0x5, 0x1], cx: [0x1, 0x2], ecx: [0x1, 0x4],
	  dl: [0x2, 0x1], dh: [0x6, 0x1], dx: [0x2, 0x2], edx: [0x2, 0x4],
	  bl: [0x3, 0x1], bh: [0x7, 0x1], bx: [0x3, 0x2], ebx: [0x3, 0x4],

	  // pointer and index registers; the 32 bit forms listed in the table
	  // above were missing from the map
	  sp: [0x4, 0x2], esp: [0x4, 0x4],
	  bp: [0x5, 0x2], ebp: [0x5, 0x4],
	  si: [0x6, 0x2], esi: [0x6, 0x4],
	  di: [0x7, 0x2], edi: [0x7, 0x4],
	};

	/**
	 * Kinds of packages a source line is turned into. Every parsed line
	 * carries one of these values in its `type` property.
	 */
	const LINE_TOKEN_TYPE = {
	  // instruction waiting to be encoded
	  CALLER: 0,

	  // label definition
	  LABEL: 1,

	  // line that could not be parsed
	  ERROR: 2,

	  // encoded instruction
	  BINARY: 3,
	};

	/**
	 * Generate single ModR/M byte, layout: mod(2 bits) reg(3 bits) rm(3 bits)
	 * @see http://www.c-jump.com/CIS77/CPU/x86/X77_0030_encoding_format.htm
	 *
	 * @param {Number} mod  addressing mode, bits 6-7
	 * @param {Number} reg  register index or opcode extension, bits 3-5
	 * @param {Number} rm   register / memory operand, bits 0-2
	 * @returns {Number}    byte value
	 */
	const genMOD = (mod, reg, rm) => (
	  // each field is masked to its width first and shifted afterwards,
	  // masking reg with an already shifted mask would always drop it
	  (rm & 0x7) | ((reg & 0x7) << 0x3) | ((mod & 0x3) << 0x6)
	);

	/**
	 * Generate single SIB byte, it shares the 2-3-3 bit layout of ModR/M
	 *
	 * @param {Number} ss     scale, bits 6-7
	 * @param {Number} index  index register, bits 3-5
	 * @param {Number} base   base register, bits 0-2
	 * @returns {Number}      byte value
	 */
	const genSIB = (ss, index, base) => genMOD(
	  ss, index, base,
	);

	/**
	 * Returns number of bytes required to store variable,
	 * the value is treated as an unsigned 32 bit integer
	 *
	 * @param {Number} num
	 * @returns {Number} 0 for zero, otherwise 1 - 4
	 */
	const getBytesCount = (num) => {
	  let rest = num;
	  let bytes = 0;

	  // unsigned shift, a signed one never reaches zero for
	  // negative values and values with the highest bit set
	  while (rest) {
	    rest >>>= 0x8;
	    bytes++;
	  }
	  return bytes;
	};

	/**
	 * Replaces template variables with array values,
	 * variable has form %{N} where N is index in args
	 *
	 * @example
	 * format('Unknown %{0} opcode', ['MOV']) // 'Unknown MOV opcode'
	 *
	 * @param {String} str template
	 * @param {Array} args template args
	 * @returns {String}
	 */
	const format = (str, args) => (
	  str.replace(
	    // whole number is captured, so indexes above 9 work too
	    /%\{(\d+)\}/g,
	    (match, index) => args[Number(index)],
	  )
	);

	/**
	 * Prints error message of single line to the console, curried
	 *
	 * @param {Number} line line number
	 * @param {String} message message content
	 */
	const printLineError = R.curry((line, message) => {
	  console.log(`error at line ${line} :: ${message}`);
	});

	/**
	 * Detects if string can be casted to number,
	 * empty and whitespace only strings are not numbers
	 * even though they are coerced to 0
	 *
	 * @param {String|Number} value
	 * @returns {Boolean}
	 */
	const isNumber = (value) => {
	  if (typeof value === 'string' && value.trim() === '')
	    return false;

	  return !Number.isNaN(Number(value));
	};

	/**
	 * Argument token type predicates
	 *
	 * @todo Make it better, testing via
	 * string is stupid
	 *
	 * @param {Object} token  token with type property
	 * @returns {Boolean}
	 */
	const isRegister = token => token.type === 'r';
	const isImmediate = token => token.type === 'i';

	/**
	 * Creates function that wraps value into typed token,
	 * token type is used later to generate path to OPCODE table
	 *
	 * @example
	 * prependTemplateType('i')(32) // {value: 32, type: 'i'}
	 *
	 * @param {String} type  token type
	 * @returns {Function}   value => {value, type}
	 */
	const prependTemplateType = type => value => ({
	  value,
	  type,
	});

	/**
	 * Replaces single instruction token with typed template,
	 * first matching rule wins:
	 *
	 * number           => {type: 'i', value: Number}
	 * [address]        => {type: 'm', value: text between brackets}
	 * register name    => {type: 'r', name, value: {index, size}}
	 * anything else    => {type: 'keyword', value: token}
	 *
	 * @param {String} token
	 * @returns {Object}
	 */
	const genArgumentTemplate = R.cond([
	  [
	    isNumber,
	    R.compose(
	      prependTemplateType('i'),
	      // Number uses the same coercion as the isNumber check, parseInt
	      // does not and returned 0 for 0b11 or 1 for 1e3
	      Number,
	    ),
	  ], // number size is not known during compilation
	  [
	    R.test(/^\[[a-zA-Z\s\d\+\*]+\]$/),
	    R.compose(
	      prependTemplateType('m'),
	      R.slice(1, -1),
	    ),
	  ], // extract address
	  [
	    R.flip(R.has)(REGISTERS),
	    R.converge(
	      R.mergeDeepLeft,
	      [
	        R.compose(
	          prependTemplateType('r'),
	          R.zipObj([
	            'index',
	            'size',
	          ]),
	          R.flip(R.prop)(REGISTERS),
	        ),
	        R.objOf('name'),
	      ],
	    ),
	  ], // pick register opcode length template, {name, type, value: {index, size}}
	  [
	    R.T,
	    prependTemplateType('keyword'),
	  ], // ignore if not found
	]);

	/**
	 * Accepts array of address tokens and picks the 16 bit R/M value
	 * for the registers used in it. Names of register tokens are joined
	 * with underscore in order of appearance and looked up in the table,
	 * unknown combination gives 0x0.
	 *
	 * @see
	 * http://www.swansontec.com/sintel.html
	 * http://wiki.osdev.org/X86-64_Instruction_Encoding
	 *
	 * @param {Array} tokens  typed address tokens
	 * @returns {Number}
	 */
	const rm16Byte = (tokens) => {
	  const table = {
	    BX_SI: 0b000,
	    BX_DI: 0b001,
	    BP_SI: 0b010,
	    BP_DI: 0b011,
	    SI: 0b100,
	    DI: 0b101,
	    BP: 0b110,
	    BX: 0b111,
	  };

	  const key = tokens
	    .filter(token => token.type === 'r')
	    .map(token => R.toUpper(token.name))
	    .join('_');

	  return R.defaultTo(0x0, table[key]);
	};

	/**
	* Extract addressing mode, generates bytes that describe memory
	* operand: single MOD byte and after it displacement, if present.
	* Steps are listed bottom-up because of R.compose.
	*
	* @param {String} mem  address text without brackets, e.g. bp + di + 0xFFF
	* @returns {Array}     [mod byte, ...displacement bytes]
	*/
	const extractMemToken = R.compose(
	// 3. generate bytes from typed tokens
	(args) => {
	// displacement is read only from the last part of address,
	// 0x0 if last part is not a number
	const imm = R.ifElse(
	isImmediate,
	R.prop('value'),
	R.always(0x0),
	)(R.last(args));

	// bytes count of displacement is passed as mod field,
	// reg field is always 0x0 here
	// NOTE(review): genMOD keeps only 2 lowest bits of mod, so
	// displacement longer than 2 bytes gives wrong mode - confirm
	const immSize = imm && getBytesCount(imm);
	const mod = genMOD(
	immSize,
	0x0,
	rm16Byte(args),
	);

	const bytes = R.of(mod);
	if (imm) {
	return [
	...bytes,
	// 2 bytes displacement is stored as little endian,
	// any other size is appended as single array item
	...(
	immSize === 2
	? extract16(imm)
	: R.of(imm)
	),
	];
	}
	return bytes;
	},
	// 2. trim each part and detect its type
	R.map(
	R.compose(
	genArgumentTemplate,
	R.trim,
	),
	),
	// 1. split address into parts
	R.split('+'),
	);

	/**
	 * Splits typed tokens of single line into prefixes,
	 * opcode, opcode arguments and optional operand size
	 *
	 * @param {Array} line tokens with types
	 * @returns {Object}   {prefixes, opcode, args, size}
	 */
	const extractInstructionLine = (line) => {
	  // first group prefixes should be placed before opcode, tokens are
	  // objects so prefix name must be read from value property
	  const isPrefix = token => (
	    token.type === 'keyword' && R.has(token.value, PREFIXES[0])
	  );
	  const precedingPrefixes = R.takeWhile(isPrefix, line);

	  /**
	   * @todo Add 32bit prefixes support
	   */
	  let instructionArgs = R.drop(precedingPrefixes.length, line);
	  let size = null;

	  /**
	   * decode size instruction property, token is removed
	   * only if it is known size name, other keywords stay in args
	   * @example:
	   * mov byte [0x2], 0x3
	   */
	  const sizeToken = instructionArgs[1];
	  if (sizeToken && sizeToken.type === 'keyword') {
	    const sizeName = R.toUpper(sizeToken.value);

	    if (R.has(sizeName, SIZE)) {
	      size = SIZE[sizeName];
	      instructionArgs = R.remove(1, 1, instructionArgs);
	    }
	  }

	  return {
	    prefixes: precedingPrefixes,
	    opcode: R.head(instructionArgs), // first instruction arg is opcode type
	    args: R.tail(instructionArgs), // rest args are opcode parameters, usually 2 - 3
	    size,
	  };
	};

	/**
	 * Get assembler caller function from tokens,
	 * simple generate path to opcodes table
	 *
	 * @param {Array} line tokens
	 * @returns {Either}   Right with CALLER package or Left with ERROR package
	 */
	const getLineInstructionCaller = (line) => {
	  const instruction = R.map(genArgumentTemplate)(line);
	  const {
	    opcode,
	    args,
	    ...instructionDescription
	  } = extractInstructionLine(instruction);

	  // opcode token may be missing or may not be a keyword (register,
	  // number), name is always a string so error can be reported
	  const opcodeName = opcode
	    ? String(opcode.name !== undefined ? opcode.name : opcode.value)
	    : '';

	  const generateError = (message, extraArgs = []) => (
	    new Either.Left(
	      {
	        type: LINE_TOKEN_TYPE.ERROR,
	        message: format(message, [
	          opcodeName.toUpperCase(),
	          ...extraArgs,
	        ]),
	      },
	    )
	  );

	  // unknown opcode must be rejected before table lookup
	  if (!R.has(opcodeName, OPCODES)) {
	    return generateError(messages.unknown.opcode);
	  }

	  // generate tokens from template
	  let caller = OPCODES[opcodeName];
	  for (let i = 0; i < args.length; ++i) {
	    // if type has value.size property
	    // merge it with type, useful in register
	    // path generation
	    const {type, value} = args[i];
	    const token = `${type}${R.is(Number, value.size) ? value.size : ''}`;

	    if (caller[token]) {
	      caller = caller[token];
	    } else if (i > 0) {
	      return generateError(messages.unknown.argument, [i]);
	    } else {
	      break;
	    }
	  }

	  // path ended before reaching encoder function
	  if (!R.is(Function, caller)) {
	    return generateError(messages.unknown.combination);
	  }

	  return new Either.Right(
	    {
	      type: LINE_TOKEN_TYPE.CALLER,
	      args, // drop first opcode parameter, its found
	      caller,
	      ...instructionDescription,
	    },
	  );
	};

	/**
	 * Extracts all tokens from line, comment is dropped. Token is
	 * a word (with optional trailing colon, so labels can be detected)
	 * or whole address in brackets.
	 *
	 * @example
	 * input: mov ax, 2 ; test
	 * output: ['mov', 'ax', '2']
	 *
	 * input: loop:
	 * output: ['loop:']
	 *
	 * @param {String} line
	 * @returns {String[]}
	 */
	const extractLine = (line) => {
	  const code = line
	    .replace(/(?:;.*)/ig, '')
	    .trim();

	  // match returns null if there are no tokens
	  return code.match(/\w+:?|\[[^[]+\]/g) || [];
	};

	/**
	 * Splits source by new line character and
	 * extracts tokens from each line
	 *
	 * @param {String} source
	 * @returns {Array}  list of tokens lists, one per line
	 */
	const extractLines = source => (
	  source
	    .split(/\n/ig)
	    .map(line => extractLine(line))
	);

	/**
	 * Detects if tokens list is label definition,
	 * it must be single token that ends with colon
	 *
	 * @param {Array} tokens
	 * @returns {Boolean}
	 */
	const isLabelLine = tokens => (
	  tokens.length === 1 && tokens[0].endsWith(':')
	);

	/**
	 * Transforms tokens of single line into Either with package,
	 * label line gives LABEL package with name without colon,
	 * any other line is parsed as instruction
	 *
	 * @param {Array} tokens
	 * @returns {Either}
	 */
	const parseExtractedTokens = (tokens) => {
	  if (isLabelLine(tokens)) {
	    const [token] = tokens;

	    return new Either.Right({
	      type: LINE_TOKEN_TYPE.LABEL,
	      name: token.slice(0, -1),
	    });
	  }

	  // add some extra cases like nasm #include
	  return getLineInstructionCaller(tokens);
	};

	/**
	* Transforms tokens of each line into package with type property,
	* parse errors are printed with index of line and returned
	* as packages too
	* @todo Add merge #include instructions
	*
	* @param {Array} extractedLines  list of tokens lists, one per line
	*/
	const linkLinesCallers = (extractedLines) => {
	// creates parser for line with given index, Either is unwrapped here:
	// Left is printed and returned, Right is returned without changes
	const parseExtractedLine = index => R.pipe(
	parseExtractedTokens,
	Either.either(
	(lineToken) => {
	printLineError(index, lineToken.message);
	return lineToken;
	},
	R.identity,
	),
	);

	// line without tokens is not parsed and stays an empty array
	// NOTE(review): presumably R.chain flattens these empty arrays
	// away and keeps packages as they are - confirm
	return R.addIndex(R.chain)(
	(line, index) => (
	R.unless(R.isEmpty)(
	parseExtractedLine(index),
	)(line)
	),
	extractedLines,
	);
	};

	/**
	 * Calls instruction encoder with instruction arguments, encoded
	 * bytes are stored as hex string, any other result is stored as is
	 *
	 * @param {Instruction}  package with caller and args
	 * @returns {Object}     BINARY package
	 */
	const compileInstruction = ({caller, args}) => {
	  const result = R.apply(caller, args);
	  const code = R.is(Uint8Array, result)
	    ? hexValue(result)
	    : result;

	  return {
	    type: LINE_TOKEN_TYPE.BINARY,
	    code,
	  };
	};

	/**
	 * Compiles source code: extracts tokens from lines, transforms
	 * them into packages and encodes each CALLER package
	 *
	 * @param {String} source
	 * @returns {Array}  list of packages
	 */
	const compile = R.pipe(
	  extractLines,
	  linkLinesCallers,
	  R.map(
	    R.when(
	      R.propEq('type', LINE_TOKEN_TYPE.CALLER),
	      compileInstruction,
	    ),
	  ),
	);

	// demo: compiles sample instruction and prints list of packages,
	// it is executed each time the file is loaded
	console.log(
	compile(`
	mov ax, [bp + di + 0xFFF]
	`),
	);