================ @@ -0,0 +1,486 @@ +""" +Specification, compiler, disassembler, and interpreter +for LLDB dataformatter bytecode. + +See formatter-bytecode.md for more details. +""" +from __future__ import annotations + +# Types +type_String = 1 +type_Int = 2 +type_UInt = 3 +type_Object = 4 +type_Type = 5 + +# Opcodes +opcode = dict() + + +def define_opcode(n, mnemonic, name): + globals()["op_" + name] = n + if mnemonic: + opcode[mnemonic] = n + opcode[n] = mnemonic + + +define_opcode(1, "dup", "dup") +define_opcode(2, "drop", "drop") +define_opcode(3, "pick", "pick") +define_opcode(4, "over", "over") +define_opcode(5, "swap", "swap") +define_opcode(6, "rot", "rot") + +define_opcode(0x10, "{", "begin") +define_opcode(0x11, "if", "if") +define_opcode(0x12, "ifelse", "ifelse") + +define_opcode(0x20, None, "lit_uint") +define_opcode(0x21, None, "lit_int") +define_opcode(0x22, None, "lit_string") +define_opcode(0x23, None, "lit_selector") + +define_opcode(0x30, "+", "plus") +define_opcode(0x31, "-", "minus") +define_opcode(0x32, "*", "mul") +define_opcode(0x33, "/", "div") +define_opcode(0x34, "%", "mod") +define_opcode(0x35, "<<", "shl") +define_opcode(0x36, ">>", "shr") +define_opcode(0x37, "shra", "shra") + +define_opcode(0x40, "&", "and") +define_opcode(0x41, "|", "or") +define_opcode(0x42, "^", "xor") +define_opcode(0x43, "~", "not") + +define_opcode(0x50, "=", "eq") +define_opcode(0x51, "!=", "neq") +define_opcode(0x52, "<", "lt") +define_opcode(0x53, ">", "gt") +define_opcode(0x54, "=<", "le") +define_opcode(0x55, ">=", "ge") + +define_opcode(0x60, "call", "call") + +# Function signatures +sig_summary = 0 +sig_init = 1 +sig_get_num_children = 2 +sig_get_child_index = 3 +sig_get_child_at_index = 4 + +# Selectors +selector = dict() + + +def define_selector(n, name): + globals()["sel_" + name] = n + selector["@" + name] = n + selector[n] = "@" + name + + +define_selector(0, "summary") +define_selector(1, "type_summary") + +define_selector(0x10, "get_num_children") +define_selector(0x11, "get_child_at_index") +define_selector(0x12, "get_child_with_name") +define_selector(0x13, "get_child_index") +define_selector(0x15, "get_type") +define_selector(0x16, "get_template_argument_type") +define_selector(0x20, "get_value") +define_selector(0x21, "get_value_as_unsigned") +define_selector(0x22, "get_value_as_signed") +define_selector(0x23, "get_value_as_address") +define_selector(0x24, "cast") + +define_selector(0x40, "read_memory_byte") +define_selector(0x41, "read_memory_uint32") +define_selector(0x42, "read_memory_int32") +define_selector(0x43, "read_memory_unsigned") +define_selector(0x44, "read_memory_signed") +define_selector(0x45, "read_memory_address") +define_selector(0x46, "read_memory") + +define_selector(0x50, "fmt") +define_selector(0x51, "sprintf") +define_selector(0x52, "strlen") + + +################################################################################ +# Compiler. +################################################################################ + + +def compile(assembler: str) -> bytearray: + """Compile assembler into bytecode""" + # This is a stack of all in-flight/unterminated blocks. + bytecode = [bytearray()] + + def emit(byte): + bytecode[-1].append(byte) + + tokens = list(assembler.split(" ")) + tokens.reverse() + while tokens: + tok = tokens.pop() + if tok == "": + pass + elif tok == "{": + bytecode.append(bytearray()) + elif tok == "}": + block = bytecode.pop() + emit(op_begin) + emit(len(block)) # FIXME: uleb + bytecode[-1].extend(block) + elif tok[0].isdigit(): + if tok[-1] == "u": + emit(op_lit_uint) + emit(int(tok[:-1])) # FIXME + else: + emit(op_lit_int) + emit(int(tok)) # FIXME + elif tok[0] == "@": + emit(op_lit_selector) + emit(selector[tok]) + elif tok[0] == '"': + s = bytearray() + done = False + chrs = tok[1:] + while not done: + quoted = False + for c in chrs: + if quoted: + s.append(ord(c)) # FIXME + quoted = False + elif c == "\\": + quoted = True + elif c == '"': + done = True + break + # FIXME assert this is last in token + else: + s.append(ord(c)) + if not done: + s.append(ord(" ")) + chrs = tokens.pop() + + emit(op_lit_string) + emit(len(s)) + bytecode[-1].extend(s) + else: + emit(opcode[tok]) + assert len(bytecode) == 1 # unterminated { + return bytecode[0] + + +################################################################################ +# Disassembler. +################################################################################ + + +def disassemble(bytecode: bytearray) -> (str, int): + """Disassemble bytecode into (assembler, token starts)""" + asm = "" + all_bytes = list(bytecode) + all_bytes.reverse() + blocks = [] + tokens = [0] + + def next_byte(): + """Fetch the next byte in the bytecode and keep track of all + in-flight blocks""" + for i in range(len(blocks)): + blocks[i] -= 1 + tokens.append(len(asm)) + return all_bytes.pop() + + while all_bytes: + b = next_byte() + if b == op_begin: + asm += "{" + length = next_byte() + blocks.append(length) + elif b == op_lit_uint: + b = next_byte() + asm += str(b) # FIXME uleb + asm += "u" + elif b == op_lit_int: + b = next_byte() + asm += str(b) + elif b == op_lit_selector: + b = next_byte() + asm += selector[b] + elif b == op_lit_string: + length = next_byte() + s = "'" + while length: + s += chr(next_byte()) + length -= 1 + asm += '"' + repr(s)[2:] + else: + asm += opcode[b] + + while blocks and blocks[-1] == 0: + asm += " }" + blocks.pop() + + if all_bytes: + asm += " " + + if blocks: + asm += "ERROR" + return asm, tokens + + +################################################################################ +# Interpreter. +################################################################################ + + +def count_fmt_params(fmt: str) -> int: + """Count the number of parameters in a format string""" + from string import Formatter + + f = Formatter() + n = 0 + for _, name, _, _ in f.parse(fmt): + if name > n: + n = name + return n + + +def interpret(bytecode: bytearray, control: list, data: list, tracing: bool = False): + """Interpret bytecode""" + frame = [] + frame.append((0, len(bytecode))) + + def trace(): + """print a trace of the execution for debugging purposes""" + + def fmt(d): + if isinstance(d, int): + return str(d) + if isinstance(d, str): + return d + return repr(type(d)) + + pc, end = frame[-1] + asm, tokens = disassemble(bytecode) + print( + "=== frame = {1}, data = {2}, opcode = {0}".format( + opcode[b], frame, [fmt(d) for d in data] + ) + ) + print(asm) + print(" " * (tokens[pc]) + "^") + + def next_byte(): + """Fetch the next byte and update the PC""" + pc, end = frame[-1] + assert pc < len(bytecode) + b = bytecode[pc] + frame[-1] = pc + 1, end + # At the end of a block? + while pc >= end: + frame.pop() + if not frame: + return None + pc, end = frame[-1] + if pc >= end: + return None + b = bytecode[pc] + frame[-1] = pc + 1, end + return b + + while frame[-1][0] < len(bytecode): + b = next_byte() + if b == None: + break + if tracing: + trace() + # Data stack manipulation. + if b == op_dup: + data.append(data[-1]) + elif b == op_drop: + data.pop() + elif b == op_pick: + data.append(data[data.pop()]) + elif b == op_over: + data.append(data[-2]) + elif b == op_swap: + x = data.pop() + y = data.pop() + data.append(x) + data.append(y) + elif b == op_rot: + z = data.pop() + y = data.pop() + x = data.pop() + data.append(z) + data.append(x) + data.append(y) + + # Control stack manipulation. + elif b == op_begin: + length = next_byte() + pc, end = frame[-1] + control.append((pc, pc + length)) + frame[-1] = pc + length, end + elif b == op_if: + if data.pop(): + frame.append(control.pop()) + elif b == op_ifelse: + if data.pop(): + control.pop() + frame.append(control.pop()) + else: + frame.append(control.pop()) + control.pop() + + # Literals. + elif b == op_lit_uint: + b = next_byte() # FIXME uleb + data.append(int(b)) + elif b == op_lit_int: + b = next_byte() # FIXME uleb + data.append(int(b)) + elif b == op_lit_selector: + b = next_byte() + data.append(b) + elif b == op_lit_string: + length = next_byte() + s = "" + while length: + s += chr(next_byte()) + length -= 1 + data.append(s) + + # Arithmetic, logic, etc. + elif b == op_plus: + data.append(data.pop() + data.pop()) + elif b == op_minus: + data.append(-data.pop() + data.pop()) + elif b == op_mul: + data.append(data.pop() * data.pop()) + elif b == op_div: + y = data.pop() + data.append(data.pop() / y) + elif b == op_mod: + y = data.pop() + data.append(data.pop() % y) + elif b == op_shl: + y = data.pop() + data.append(data.pop() << y) + elif b == op_shr: + y = data.pop() + data.append(data.pop() >> y) + elif b == op_shra: + y = data.pop() + data.append(data.pop() >> y) # FIXME + elif b == op_and: + data.append(data.pop() & data.pop()) + elif b == op_or: + data.append(data.pop() | data.pop()) + elif b == op_xor: + data.append(data.pop() ^ data.pop()) + elif b == op_not: + data.append(not data.pop()) + elif b == op_eq: + data.append(data.pop() == data.pop()) + elif b == op_neq: + data.append(data.pop() != data.pop()) + elif b == op_lt: + data.append(data.pop() > data.pop()) + elif b == op_gt: + data.append(data.pop() < data.pop()) + elif b == op_le: + data.append(data.pop() >= data.pop()) + elif b == op_ge: + data.append(data.pop() <= data.pop()) + + # Function calls. + elif b == op_call: + sel = data.pop() + if sel == sel_summary: + data.append(data.pop().GetSummary()) + elif sel == sel_get_num_children: + data.append(data.pop().GetNumChildren()) + elif sel == sel_get_child_at_index: + index = data.pop() + valobj = data.pop() + data.append(valobj.GetChildAtIndex(index)) + elif sel == sel_get_child_with_name: + name = data.pop() + valobj = data.pop() + data.append(valobj.GetChildMemberWithName(name)) + elif sel == sel_get_child_index: + name = data.pop() + valobj = data.pop() + data.append(valobj.GetIndexOfChildWithName(name)) + elif sel == sel_get_type: + data.append(data.pop().GetType()) + elif sel == sel_get_template_argument_type: + n = data.pop() + valobj = data.pop() + data.append(valobj.GetTemplateArgumentType(n)) + elif sel == sel_get_value: + data.append(data.pop().GetValue()) + elif sel == sel_get_value_as_unsigned: + data.append(data.pop().GetValueAsUnsigned()) + elif sel == sel_get_value_as_signed: + data.append(data.pop().GetValueAsSigned()) + elif sel == sel_get_value_as_address: + data.append(data.pop().GetValueAsAddress()) + elif sel == sel_cast: + sbtype = data.pop() + valobj = data.pop() + data.append(valobj.Cast(sbtype)) + elif sel == sel_strlen: + data.append(len(data.pop())) + elif sel == sel_fmt: + fmt = data.pop() + n = count_fmt_params(fmt) + args = [] + for i in range(n): + args.append(data.pop()) + data.append(fmt.format(*args)) + else: + print("not implemented: " + selector[sel]) + assert False + pass + return data[-1] + + +################################################################################ +# Tests. +################################################################################ + +import unittest + + +class TestCompiler(unittest.TestCase): + def test(self): + self.assertEqual(compile("1u dup").hex(), "200101") + self.assertEqual(compile('"1u dup"').hex(), "2206317520647570") + self.assertEqual(compile("16 < { dup } if").hex(), "21105210010111") + self.assertEqual(compile('{ { " } " } }').hex(), "100710052203207d20") + + def roundtrip(asm): + self.assertEqual(disassemble(compile(asm))[0], asm) + + roundtrip("1u dup") + roundtrip('1u dup "1u dup"') + roundtrip("16 < { dup } if") + roundtrip('{ { " } " } }') + + self.assertEqual(interpret(compile("1 1 +"), [], []), 2) + self.assertEqual(interpret(compile("2 1 1 + *"), [], []), 4) + self.assertEqual( + interpret(compile('2 1 > { "yes" } { "no" } ifelse'), [], []), "yes" + ) + + +if __name__ == "__main__": + unittest.main() ---------------- DavidSpickett wrote:
You can also run unittests with `python3 -m unittest discover -p <filename>`. Which means your `main` can do actual work if you want the file to be runnable without running the tests each time. https://github.com/llvm/llvm-project/pull/113398 _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits