MicroCorruption/25-Halifax/assemble.py

# Taken from https://github.com/ValShaped/MSProbe
# Forked from https://github.com/Swiftloke/MSProbe
# © 2018-2023 Swiftloke

import sys
import pdb
import re

from typing import Callable

jumpOpcodes = ['jne', 'jeq', 'jlo', 'jhs', 'jn', 'jge', 'jl', 'jmp']
twoOpOpcodes = ['!!!', '!!!', '!!!', '!!!', 'mov', 'add', 'addc', 'subc', 'sub', 'cmp', 'dadd', 'bit', 'bic', 'bis', 'xor', 'and']
oneOpOpcodes = ['rrc', 'swpb', 'rra', 'sxt', 'push', 'call', 'reti']
emulatedOpcodes = {
'ret' : 'mov @sp+, pc',
'clrc' : 'bic #1, sr',
'setc' : 'bis #1, sr',
'clrz' : 'bic #2, sr',
'setz' : 'bis #2, sr',
'clrn' : 'bic #4, sr',
'setn' : 'bis #4, sr',
'dint' : 'bic #8, sr',
'eint' : 'bis #8, sr',
'nop'  : 'mov r3, r3', #Any register would do the same
'br'   : 'mov {reg}, pc',
'pop'  : 'mov @sp+, {reg}',
'rla'  : 'add {reg}, {reg}',
'rlc'  : 'addc {reg}, {reg}',
'inv'  : 'xor #0xffff, {reg}',
'clr'  : 'mov #0, {reg}',
'tst'  : 'cmp #0, {reg}',
'dec'  : 'sub #1, {reg}',
'decd' : 'sub #2, {reg}',
'inc'  : 'add #1, {reg}',
'incd' : 'add #2, {reg}',
'adc'  : 'addc #0, {reg}',
'dadc' : 'dadd #0, {reg}',
'sbc'  : 'subc #0, {reg}',
'jnc'  : 'jlo {reg}', #jlo, jhs are aliases of jnc, jc
'jnz'  : 'jne {reg}', #jnz, jz are aliases of jne, jeq
'jc'   : 'jhs {reg}',
'jz'   : 'jeq {reg}',
}

def bitrep(number, bits = 16):
	"""Converts to binary form, fixing leading zeroes."""
	mask = int('0b' + '1' * bits, 2)
	binstr = str(bin(number & mask))[2:]
	#negative = binstr[0] == '-'
	bitcount = len(binstr)
	leading0s = bits - bitcount
	return ('0' * leading0s) + binstr

def hexrep(number, zeroes = 4):
	"""Converts to hex form, fixing leading zeroes."""
	mask = int('0b' + '1' * (zeroes * 4), 2)
	hexstr = hex(number & mask)[2:]
	hexcount = len(hexstr)
	leading0s = zeroes - hexcount
	return ('0' * leading0s) + hexstr

def highlight(string: str, substring: str) -> str:
	"""Highlight a substring in a string"""
	return string.replace(substring, f"\033[4m{substring}\033[0m") if substring else string

class AssemblyError(Exception):
	"""
	The base class for all Assembly Exceptions
	"""
	def __init__(self, name: str, reason: str) -> None:
		self.type = "Improperly defined AssemblyError"
		self.name = name
		self.reason = reason

class OpcodeError(AssemblyError):
	"""
	`OpcodeError` is raised when an opcode mnemonic is not found in the opcode map
	"""
	def __init__(self, opcode, reason = "Opcode not found in opcode map."):
		super().__init__(name=opcode, reason=reason)
		self.type = "Invalid opcode mnemonic"

class RedefinedLabelError(AssemblyError):
	"""
	`RedefinedLabelError` is raised when a label is defined multiple times in the same source file.
	Since labels are resolved after compilation, it cannot be known whether you intend to reference a past
	or future definition of a label.
	"""
	def __init__(self, label, reason = "Label already defined."):
		super().__init__(name=label, reason=reason)
		self.type = "Redefined Label"

class UndefinedLabelError(AssemblyError):
	"""
	`UndefinedLabelError` is raised when a label used in a jump instruction is not defined in the source
	"""
	def __init__(self, operand: str, reason: str):
		super().__init__(name=operand, reason=reason)
		self.type = "Undefined label"

class AddressingModeError(AssemblyError):
	"""
	`AddressingModeError` is raised when the operand of an instruction is specified with an
	unrepresentable addressing mode.
	"""
	def __init__(self, operand: str, reason: str):
		super().__init__(name=operand, reason=reason)
		self.type = "Invalid addressing mode"

class JumpOffsetError(AssemblyError):
	"""
	`JumpOffsetError` is raised when a jump offset cannot be encoded.
	Jump offsets are a 12 bit signed integer representing the number of processor words to jump.
	As such, they can only encode jump offsets from -0x3fe to +0x400
	"""
	def __init__(self, offset: str, reason: str):
		super().__init__(name=offset, reason=reason)
		self.type = "Invalid jump offset"

class RegisterError(AssemblyError):
	"""
	`RegisterError` is raised when a register isn't one of
	[`pc`, `sp`, `sr`, `cg`, `r0`, ..., `r15`]
	"""
	def __init__(self, register: str, reason: str = "Valid registers are pc, sp, sr, cg, or r0-r15."):
		super().__init__(name=register, reason=reason)
		self.type = "Invalid register mnemonic"

preprocessor = []
"""
`preprocessorHooks` are functions which take a line from the source file, and return a line.
All registered hooks are called for each line of the source file.

Registering a `preprocessorHook` shall be done through the `registerPreprocessorHook` function.

Their signature is as follows:
```py
hook(instruction_line: str) -> str:
```
"""

postprocessor = []
"""
postprocessorHooks are functions which act on the output stream as a monolithic entity.
Each postprocessorHook is called exactly once per source file, after assembly and before output.

Registering a `postprocessorHook` shall be done through the `registerPostprocessorHook` function.

Their signature is as follows:
```py
hook():
"""

PC = 0  #Incremented by each instruction, incremented in words NOT bytes
labels = {} #Label name and its PC location
"""
`labels` are a label name, followed by a the address of the label relative to the loadaddr
"""
jumps = {} #PC location of jump and its corresponding label
"""
`jumps` are the address of a jump instruction and its corresponding label
During jump resolution, each jump in jumps is modified with a relative offset
Example jump:
{0: "loop"}
"""
output = [] #Output hex

def asmMain(asm_file, outfile=None, silent=False):
	line_number = 0
	global PC #Get PC

	outFP = open(outfile, 'w') if outfile else None

	if not asm_file:
		#Provide a prompt for entry
		instructions = ''
		ins = ''
		print('Input assembly. Terminate input with the ".end" directive, or Ctrl+D (EOF).')
		while True:
			ins = sys.stdin.readline()
			if ins == '.end\n' or ins == '':
				break
			instructions = instructions + ins
	else:
		with open(asm_file) as fp:
			instructions = fp.read()


	for ins in instructions.splitlines():
		#Strip leading and trailing whitespace
		ins = ins.strip()
		ins = re.split(r'\s*[/;]', ins)[0] #Remove comments
		#Skip empty lines or lines beginning with a comment
		if len(ins) == 0 or ins.startswith((';', '//')):
			continue

		#Handle .directives
		if ins.startswith('.'):
			if ins.startswith(".define"):
				registerDefine(ins)
			#Allow passing the .end directive in input files, for compatibility with stdin input
			if ins.startswith(".end"):
				break
			continue

		#Handle preprocessor substitution hooks
		for hook in preprocessor:
			ins = hook(ins)

		#Handle label registration
		if ':' in ins:
			try:
				registerLabel(ins)
			except RedefinedLabelError as exp:
				print('Label "' + exp.label + '" at line number ' + str(line_number + 1) + ' already defined')
				sys.exit(-1)
		else:
			try:
				assemble(ins)
			except AssemblyError as exp:
				ins = highlight(ins, exp.name)
				print(f'{exp.type} found on line {line_number + 1}: "{ins}"\n{exp.reason}')
				sys.exit(-1)

		line_number += 1

	#Handle postprocessor hooks.
	#These functions manipulate the raw output data, and perform tasks such as link resolution
	for hook in postprocessor:
		hook()

	#Output the object as hex
	for i in output:
		if not silent:
			print(hexrep(i), end='', file=sys.stdout)# + ' (' + bitrep(i, 16) + ')')
		if outFP:
			print(hexrep(i), end='', file=outFP)
	if not silent:
		print('') #End hex representation with a newline
	if outFP:
		outFP.close()

def registerPreprocessorHook(hook: Callable):
	if hook not in preprocessor:
		preprocessor.append(hook)

def registerPostprocessorHook(hook: Callable):
	if hook not in postprocessor:
		postprocessor.append(hook)

def processDirectives(ins: str) -> str:
	pass

def resolveJumps():
	"""Resolve pending jumps in the jumps list"""
	global labels, jumps, output
	#Resolve jump labels
	for pc, label in jumps.items():
		try:
			labelpos = labels[label]
		except KeyError:
			print(f'Label "{label}" does not exist, but a jump instruction attempts to jump to it')
			sys.exit(-1)
		#Modify the jump instruction
		#Get in little-endian format
		ins = hexrep(output[pc])
		ins = int(ins[2:4] + ins[0:2], 16)
		ins = [bit for bit in bitrep(ins, 16)]
		offset = (labelpos - pc) * 2 #Words versus bytes
		#Jump offsets are multiplied by two, added by two (PC increment), and sign extendedB
		ins[6:] = bitrep((offset - 2) // 2, 10)
		#Output again in little endian
		strword = hexrep(int(''.join(str(e) for e in ins), 2), 4)
		output[pc] = int(strword[2:] + strword[0:2], 16)

#TODO: Resolve labels in calls

def registerLabel(ins: str):
	"""Registers a label for later replacement"""
	global labels #Get labels
	global PC #Get PC
	label, addr = ins.split(sep=':')
	if label in labels:
		raise RedefinedLabelError(label)
	labels[label] = int(addr) if addr != '' else PC

# -- Defines --
def resolveDefines(ins: str) -> str:
	global defines
	for define in defines:
		ins = ins.replace(define, defines[define])
	return ins

def registerDefine(ins: str):
	"""
	Registers a define for replacement on subsequent lines
	A define is of format
	```asm
	.define identifier text...
	"""
	global defines, preprocessor
	if 'defines' not in globals():
		defines = {}
	#Define is of format .define [identifier] [any text]
	#Space(s) not required, but if spaces are not used, ':' or '=' must be used in its place
	define: tuple = re.match(r'.define\s*(\w+)[\s:=]+(.*)\s*', ins).groups()
	if define != ():
		label, replacement = define
		defines[label] = replacement
		registerPreprocessorHook(resolveDefines)

def registerJumpInstruction(PC, label):
	"""Defer jump offset calculation until labels are defined"""
	global jumps #Get jump instructions
	jumps[PC] = label
	registerPostprocessorHook(resolveJumps)

def assemble(ins):
	"""Assemble a single instruction, and append results to the output stream."""
	opcode, notUsed = getOpcode(ins)
	if opcode in jumpOpcodes:
		return assembleJumpInstruction(ins)
	elif opcode in oneOpOpcodes:
		return assembleOneOpInstruction(ins)
	elif opcode in twoOpOpcodes:
		return assembleTwoOpInstruction(ins)
	elif opcode in emulatedOpcodes:
		return assembleEmulatedInstruction(ins)
	else:
		raise OpcodeError(opcode)

def assembleEmulatedInstruction(ins):
	"""Assembles a zero- or one-operand 'emulated' instruction."""
	#Emulated instructions are either zero or one operand instructions.
	opcode, notUsed = getOpcode(ins)
	if '{reg}' in emulatedOpcodes[opcode]:
		register = ins[ins.find(' ') + 1 : ]
		ins = emulatedOpcodes[opcode].format(reg=register)
	else:
		ins = emulatedOpcodes[opcode]
	return assemble(ins)

def assembleOneOpInstruction(ins):
	"""Assembles a one-operand (format I) instruction."""
	out = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	out[0:6] = '000100' #One op identifier

	opcode, byteMode = getOpcode(ins)
	out[6:9] = bitrep(oneOpOpcodes.index(opcode), 3)
	out[9] = bitrep(byteMode, 1)

	#Figure out where the operand is
	start = ins.find(' ') + 1
	reg = ins[start :]

	#We need to provide the opcode here to detect the push bug; see the function itself
	extensionWord, adrmode, regID = assembleRegister(reg, opcode=opcode)

	out[10:12] = bitrep(adrmode, 2)
	out[12:] = bitrep(regID, 4)
	appendWord(int(''.join(str(e) for e in out), 2))
	if extensionWord:
		appendWord(int(extensionWord, 16))

def assembleTwoOpInstruction(ins):
	"""Assembles a two-operand (format III) instruction."""
	out = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

	opcode, byteMode = getOpcode(ins)
	out[0:4] = bitrep(twoOpOpcodes.index(opcode), 4)
	out[9] = bitrep(byteMode, 1)

	#Find the location of the first operand
	start = ins.find(' ') + 1
	end = ins.find(',')
	regSrc = ins[start : end]

	extensionWordSrc, adrmodeSrc, regIDSrc = assembleRegister(regSrc)

	out[10:12] = bitrep(adrmodeSrc, 2)
	out[4:8] = bitrep(regIDSrc, 4)

	#Figure out where the comment is
	start = end + 2 #Right after the comma, and the space after the comma
	regDest = ins[start :]

	extensionWordDest, adrmodeDest, regIDDest = assembleRegister(regDest, isDestReg = True)

	out[8] = bitrep(adrmodeDest, 1)
	out[12:] = bitrep(regIDDest, 4)

	appendWord(int(''.join(str(e) for e in out), 2))
	if extensionWordSrc:
		appendWord(int(extensionWordSrc, 16))
	if extensionWordDest:
		appendWord(int(extensionWordDest, 16))

def assembleJumpInstruction(ins):
	"""Assembles a jump instruction. If the offset is supplied, it is assembled
	immediately. Otherwise, if a label is provided, resolution of the offset is delayed
	so that all labels can be read (including those further ahead in the instruction stream)."""
	out = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
	out[0:3] = '001' #Jump identifier
	opcode, byteMode = getOpcode(ins)

	if byteMode: #Cannot have "jmp.b", how does that even make sense
		raise OpcodeError(opcode + '.b')

	out[3:6] = bitrep(jumpOpcodes.index(opcode), 3)

	#Figure out where the operand is
	start = ins.find(' ') + 1
	dest = ''.join(ins[start :].split()) #Remove whitespace

	#Immediate offset
	char1 = dest[0]
	#Is this a number?
	if re.match(r'[+\-]?[0x|0b]?[0-9A-Fa-f]+', dest):
		offset = int(dest, 16)
		if offset % 2 != 0:
			raise JumpOffsetError(dest, "Jump offset cannot be odd.")
		if offset <= -0x3fe or offset >= 0x400:
			raise JumpOffsetError(dest, "Jump offset out of range. Range is -3fe bytes through +400 bytes.")
		#Jump offsets are multiplied by two, added by two (PC increment), and sign extended
		out[6:] = bitrep((offset - 2) // 2, 10)
	else:
		registerJumpInstruction(PC, dest)

	appendWord(int(''.join(str(e) for e in out), 2))


def getRegister(registerName: str):
	"""Decodes special register names (or normal register names)."""
	registerName = registerName.strip().lower() #Strip leading and trailing whitespace, and convert to lowercase
	specialRegisterNames = {'pc': 0, 'sp': 1, 'sr': 2, 'cg': 3}
	if registerName in specialRegisterNames:
		return specialRegisterNames[registerName]
	elif registerName.startswith('r'):
		#FIXME: this allows registers with any integer name
		return int(registerName[1:]) #Remove 'r'
	else:
		raise RegisterError(registerName)

def getOpcode(ins: str):
	"""Returns the opcode and whether byte mode is being used."""
	#Split the opcode on characters that can't be used in an identifier
	#Example: [mov].b r15, r15
	opcode = re.split(r'[\.\W]', ins)[0]
	byteMode = False
	if '.b' in ins:
		byteMode = True
	return opcode, byteMode

def appendWord(word: int):
	"""Add a word to the output instruction stream, handling little endian format."""
	global PC #Get PC
	global output #Get output
	#Append in little-endian format
	strword = hexrep(word, 4)
	output.append(int(strword[2:] + strword[0:2], 16))
	PC += 1

def assembleRegister(reg: str, opcode=None, isDestReg = False):
	"""Assembles an operand, returning the extension word used (if applicable),
	the addressing mode, and the register ID."""
	extensionWord = None
	adrmode = 0
	regID = 0

	if '(' in reg: #Indexed mode (mode 1)
		extensionWord = reg[0 : reg.find('(')]
		adrmode = 1
		regID = getRegister(reg[reg.find('(') + 1 : reg.find(')')])
	elif '@' in reg and '+' in reg: #Indirect with post-increment mode (mode 3)
		#Destinations don't support indirect or indirect + post-increment.
		if isDestReg:
			raise AddressingModeError(reg,
				'Cannot use indirect with post-increment form for destination register.')
		adrmode = 3
		regID = getRegister(reg[reg.find('@') + 1 : reg.find('+')])
	elif '@' in reg: #Indirect mode (mode 2)
		#Destinations don't support indirect or indirect + post-increment.
		#Indirect can be faked with an index of 0. What a waste.
		if isDestReg:
			adrmode = 1
			extensionWord = 0
		else:
			adrmode = 2
			regID = getRegister(reg[reg.find('@') + 1 : ])
	elif '#' in reg: #Use PC to specify an immediate constant
		if isDestReg:
			raise AddressingModeError(reg,
				'Because immediates are encoded as @pc+, immediates cannot be used for ' +
				'destinations.\nConsider using &dest absolute addressing form instead.')
		adrmode = 3
		regID = 0
		constant = reg[reg.find('#') + 1 :].strip()

		#This might be an immediate constant supported by the hardware

		#A CPU bug prevents push #4 and push #8 with r2/SR encoding from working,
		#so one must simply use a 16-bit immediate there (what a waste, again)
		if constant == '4' and opcode != 'push':
			regID = 2
			adrmode = 2
		elif constant == '8' and opcode != 'push':
			regID = 2
			adrmode = 3
		elif constant == '0':
			regID = 3
			adrmode = 0
		elif constant == '1':
			regID = 3
			adrmode = 1
		elif constant == '2':
			regID = 3
			adrmode = 2
		elif constant == '-1' or constant.lower() == '0xffff':
			regID = 3
			adrmode = 3
		else:
			extensionWord = constant
	elif '&' in reg: #Direct addressing. An extension word is fetched and used as the raw address.
		regID = 2
		adrmode = 1
		extensionWord = reg[reg.find('&') + 1 : ]
	else: #Regular register access (mode 0)
		adrmode = 0
		regID = getRegister(reg)

	return extensionWord, adrmode, regID