UserUnknownFactor · February 24, 2024 07:59
diff --git a/unity_typetree_to_kaitai_conv.py b/unity_typetree_to_kaitai_conv.py
 from sys import argv
 import json, re
 from ctypes import c_uint32
 from tqdm import tqdm

 # For usage with KaitaiStruct visualisers
 #  (like https://userunknownfactor.github.io/ for example)

 # Name of the class to convert.  The conversion loop at the bottom of the
 # script compares it case-insensitively against the keys of the loaded dump.
 extract_class = "ClassName"

 # Parsed typetree dump; overwritten with the loaded JSON further down.
 ASSEMBLY_TREES: dict = dict()
 # Current nesting depth; pd() indents by two spaces per level.
 depth = 0
 # KSY text for the main class, appended to by pd().
 result = ''
 # Header of the generated .ksy file; {class_name} is filled in when writing.
 prefix = """meta:
  id: assembly_typetrees
  endian: le
  encoding: UTF-8
  title: Assembly TypeTrees Types Definitions
  file-extension: dat

 seq:
  - id: test_sequence
    type: {class_name}

 #instances:
 #    unity_version:
 #      value: 2020
 types:"""

 # Bit tested against a node's meta flag by align_stream()/parse_array()/
 # parse_map(); when set, a 4-byte alignment entry is emitted.
 kAlignBytes = 0x4000
 # Dictionary keys used to read a typetree node.  set_old_fields() swaps them
 # for the "m_"-prefixed spelling when a node lacks the plain "name" key.
 name_field = "name"
 type_field =  "type"
 align_field = "meta_flag"
 level_field = "level"
 # Set to True once set_old_fields() has switched the key names.
 fields_changed = False

 def set_old_fields():
    """Switch the node-key globals to their ``m_``-prefixed spelling.

    The switch happens at most once; later calls are no-ops because
    ``fields_changed`` is already set.
    """
    global name_field, type_field, align_field, level_field, fields_changed
    if not fields_changed:
        name_field, type_field = "m_Name", "m_Type"
        align_field, level_field = "m_MetaFlag", "m_Level"
        fields_changed = True

 # Pre-built entries for the KSY "types:" section, keyed by type name.
 # The parse_* helpers add further entries while walking the typetree; all
 # values are concatenated into the output file by the script's last step.
 appendixes = {"aligned_string":"""  aligned_string:
    seq:
      - id: len
        type: u4
      - id: string
        type: str
        encoding: utf-8
        size: len
      - size: (4 - _io.pos) % 4\n""",
 "unity_py_binary_blob":"""  unity_py_binary_blob:
    seq:
      - id: data
        type: u1
        repeat: eos\n""",
 "pptr":"""  pptr:
    seq:
      - id: file_id
        type: u4
      - id: path_id
        type: u8\n"""
 }

 # Typetree dump to read: first command-line argument, else the default name.
 input_path = argv[1] if len(argv) > 1 else "assembly_typetrees.json"
 with open(input_path, "r", encoding="utf-8-sig") as f:
    ASSEMBLY_TREES = json.load(f)

 # (class name, node list) pairs iterated by the conversion loop below.
 tree_items = ASSEMBLY_TREES.items()

 def sanitize(text:str):
    """Convert a name (or a whole output line) to lower-case snake form.

    Steps, in order: insert ``_`` between a lower-case letter and the
    upper-case letter that follows it; drop ``<`` before a word character,
    ``>`` after a word character and every backtick; lower-case the result;
    replace ``.`` and ``/`` with ``_``; remove ``[]``.
    """
    snake = re.sub(r'([a-z])([A-Z])', r'\1_\2', text)
    stripped = re.sub(r'<(?=\w)|(?<=\w)>|`', r'', snake)
    cleaned = stripped.lower()
    for old, new in (('.', '_'), ('/', '_'), ('[]', '')):
        cleaned = cleaned.replace(old, new)
    return cleaned

 def pd(text:str, dont_sanitize:bool=False, only_return:bool=False, use_depth:int=None):
    """Indent every line of ``text`` and either return it or append it to ``result``.

    Args:
        text: One or more lines of KSY text.
        dont_sanitize: When true, lines are emitted verbatim instead of
            being passed through ``sanitize``.
        only_return: When true, the text is returned and the global
            ``result`` is left untouched.
        use_depth: Indentation level (two spaces each).  Defaults to 0 when
            ``only_return`` is true and to the global ``depth`` otherwise.

    Returns:
        The processed text when ``only_return`` is true, otherwise ``''``.
    """
    global depth, result
    if use_depth is None:
        use_depth = 0 if only_return else depth
    lines = text.splitlines()
    if not dont_sanitize:
        lines = [sanitize(line) for line in lines]
    indent = '  ' * use_depth
    processed = ''.join(f"{indent}{line}\n" for line in lines)
    if only_return:
        return processed
    result += processed
    return ''

 def align_stream(ttype=None, only_return=False):
    """Emit a 4-byte alignment entry when the node asks for it.

    The entry is produced when ``ttype`` is ``None`` or when its
    ``align_field`` value has the ``kAlignBytes`` bit set.  The text is
    returned when ``only_return`` is true; otherwise ``pd`` appends it to
    the global output and ``''`` is returned.
    """
    needs_alignment = ttype is None or (ttype[align_field] & kAlignBytes) != 0
    if not needs_alignment:
        return ''
    padding = pd("- size: (4 - _io.pos) % 4", True, only_return)
    return padding if only_return else ''

 def get_subtree_at(nodes: list, item_n: c_uint32) -> list:
    """Return the node at ``item_n`` together with the nodes nested below it.

    Collection stops at the first later node whose level is not deeper than
    the starting node's level.  ``item_n.value`` is advanced in place to the
    index of the last node returned, so the caller's cursor skips the subtree.
    """
    start = item_n.value
    root_level = nodes[start][level_field]
    end = start + 1
    while end < len(nodes) and nodes[end][level_field] > root_level:
        end += 1
    subtree = nodes[start:end]
    item_n.value += len(subtree) - 1
    return subtree

 def is_subtree_same_level(nodes:list) -> bool:
    """Tell whether every node after the first one sits at the same level.

    ``nodes[0]`` is ignored.  Raises ``IndexError`` when ``nodes`` has fewer
    than two elements.
    """
    reference_level = nodes[1][level_field]
    for node in nodes[1:]:
        if node[level_field] != reference_level:
            return False
    return True

 # Lower-cased primitive type name -> Kaitai Struct built-in type.
 # NOTE(review): "b1" is Kaitai's 1-bit field type; confirm that is intended
 # for "bool" rather than a one-byte type.
 _BASE_TYPES = {
    "sint8": "s1",
    "uint8": "u1",
    "char": "u1",
    "short": "s2",
    "sint16": "s2",
    "uint16": "u2",
    "unsigned short": "u2",
    "int": "s4",
    "sint32": "s4",
    "uint32": "u4",
    "unsigned int": "u4",
    "type*": "u4",
    "long long": "s8",
    "sint64": "s8",
    "uint64": "u8",
    "unsigned long long": "u8",
    "filesize": "u8",
    "float": "f4",
    "double": "f8",
    "bool": "b1",
 }

 def match_base_type(_type: str):
    """Map a primitive type name to its Kaitai Struct type.

    The lookup is case-insensitive.  ``Type*`` is now recognised: the
    previous pattern kept its capital letter while the subject was
    lower-cased, so it could never match.

    Returns:
        The Kaitai type name, or ``None`` when ``_type`` is not a primitive.
    """
    return _BASE_TYPES.get(_type.lower())

 def replace_type(i:c_uint32, tlist:list, all_trees:dict, parent:str, return_instead:bool=False):
    """Emit the KSY declaration for the node at ``tlist[i.value]``.

    Primitive types become one ``id``/``type`` entry (plus an alignment
    entry when flagged).  Every other type is handed to the matching
    ``parse_*`` helper.  ``i`` is advanced in place past whatever nodes the
    chosen branch consumes.

    Returns:
        The generated text when ``return_instead`` is true; for a class
        with child nodes, the text when the node's level is above 1.
        In all other cases ``''`` (output then goes through ``pd`` to the
        global result).
    """
    node: dict = tlist[i.value]
    if name_field not in node:
        # The dump uses the other key spelling; switch the globals over.
        set_old_fields()
    field_id: str = sanitize(node[name_field])
    unity_type: str = node[type_field]
    node_level: int = node[level_field]

    base_type = match_base_type(unity_type)
    if base_type is not None:
        # The id is suffixed with the parent's name unless there is no parent
        # or all nodes after the first share one level.
        if is_subtree_same_level(tlist) or not parent:
            id_part = "{typ_id}"
        else:
            id_part = "{typ_id}_of_{parent}"
        template = "- id: " + id_part + "\n  type: {typ_name}"
        ret = pd(template.format(typ_id=field_id, typ_name=base_type, parent=parent), only_return=return_instead)
        ret += align_stream(node, only_return=return_instead)
        return ret if return_instead else ''

    if unity_type.startswith("PPtr<"):
        suffix = f"_of_{parent}" if len(parent) else ''
        ret = pd(f"- id: pptr_{field_id}{suffix}\n  type: pptr", only_return=return_instead)
        # Steps over the next two nodes (presumably the pointer's two children).
        i.value += 2
        return ret if return_instead else ''

    if unity_type == "string":
        ret = parse_string(i, tlist, node, field_id, return_instead)
    elif unity_type == "String[]":
        ret = parse_str_array(i, tlist, node, field_id, return_instead)
    elif unity_type == "map":  # map == MultiDict
        ret = parse_map(i, tlist, field_id, return_instead)
    elif unity_type == "TypelessData":
        ret = parse_typeless(i, tlist, node, field_id, return_instead)
    elif i.value < len(tlist) - 1 and tlist[i.value + 1][type_field] == "Array":
        ret = parse_array(i, tlist, field_id, unity_type, all_trees, return_instead)
    else:  # Class
        clz = get_subtree_at(tlist, i)
        if len(clz) != 1:
            ret = parse_complex_class(node, field_id, clz, all_trees)
            return ret if node_level > 1 else ''
        ret = parse_class(unity_type, clz, all_trees, return_instead)
    return ret if return_instead else ''

 def parse_class(cls_type, clz, all_trees, return_instead):
    """Declare a field whose type is a class with no child nodes.

    When ``cls_type`` ends in ``[]`` a ``u4`` length field is emitted first
    and the field itself is repeated that many times.  ``all_trees`` is not
    used.  Returns the text when ``return_instead`` is true, else ``''``.
    """
    is_array = cls_type.endswith("[]")
    cls_name = sanitize(clz[0][name_field])
    pieces = []
    if is_array:
        pieces.append(pd(f"- id: {cls_name}_len\n  type: u4", only_return=return_instead))
    pieces.append(pd(f"- id: {cls_name}\n  type: {sanitize(clz[0][type_field])}", only_return=return_instead))
    if is_array:
        pieces.append(pd(f"  repeat: expr\n  repeat-expr: {cls_name}_len\n  if: {cls_name}_len > 0", only_return=return_instead))
    return ''.join(pieces) if return_instead else ''

 def parse_complex_class(ttype, name, clz, all_trees):
    """Register a class with child nodes as a KSY type and reference it.

    The members (``clz[1:]``) are rendered through ``replace_type`` and
    stored in ``appendixes`` under the sanitized type name, unless that key
    already exists.  A ``class_<name>`` field referencing the type is then
    emitted: returned as text when the class's level is above 1, otherwise
    appended to the global output.  Nothing is emitted for a field named
    ``data``.
    """
    raw_type = clz[0][type_field]
    nested = clz[0][level_field] > 1
    cursor = c_uint32(1)
    members = []
    while cursor.value < len(clz):
        members.append(replace_type(cursor, clz, all_trees, parent=name, return_instead=True))
        cursor.value += 1
    members.append(align_stream(ttype, only_return=True))
    body = ''.join(members)
    type_id = sanitize(raw_type)
    if type_id not in appendixes:
        appendixes[type_id] = make_appendix(type_id, body)
    if name == "data":
        return ''
    return pd(f"- id: class_{name}\n  type: {type_id}", only_return=nested)

 def parse_array(i, tlist, name, var_type, all_trees, return_instead):
    """Declare an array field: a ``u4`` length followed by repeated elements.

    ``i`` is advanced past the array's whole subtree.  Nodes from index 3 of
    that subtree onwards are rendered through ``replace_type``.  The
    collected text is also stored in ``appendixes`` under the sanitized
    ``var_type`` unless that key exists or is ``list1``/``vector``.

    Returns the text when ``return_instead`` is true, else ``''``.
    """
    # The alignment flag is read from the node right after the field.
    align = (tlist[i.value + 1][align_field] & kAlignBytes) != 0
    _array = get_subtree_at(tlist, i)
    #name = name + var_type
    ret = pd(f"- id: {sanitize(name)}_array_len\n  type: u4", only_return=return_instead)
    # Assumes indices 0-2 are the field, its Array node and the size node,
    # so element data starts at index 3 -- TODO confirm against the dump.
    j = c_uint32(3)
    while j.value < len(_array):
        # Element declarations are always collected as text, never written
        # straight to the global output.
        ret += replace_type(j, _array, all_trees, parent=name, return_instead=True)
        j.value += 1
    if len(_array) == 7 and _array[3][type_field] == "string":
        _type = "aligned_string"
    else:
        # NOTE(review): the raw element type is only sanitized here, not
        # mapped through match_base_type -- confirm primitives are intended
        # to appear under their original names.
        _type = sanitize(_array[3][type_field])
    val = pd(f"- id: {name}\n  type: {_type}\n  repeat: expr\n  repeat-expr: {sanitize(name)}_array_len\n  if: {name}_array_len > 0", only_return=return_instead, dont_sanitize=True)
    if return_instead:
        ret += val
    if align:
        ret += align_stream(only_return=return_instead)
    global appendixes
    _type = sanitize(var_type)
    if ret and _type not in appendixes:
        global depth
        if _type not in ["list1", "vector"]:
            # NOTE(review): when return_instead is false, ret holds only the
            # element declarations at this point (length and repeat entries
            # went to the global output) -- confirm that is intended.
            appendixes[_type] = make_appendix(_type, ret)
    if return_instead:
        return ret
    return ''

 def make_appendix(var_type, declaration):
    """Wrap ``declaration`` as a named type entry for the ``types:`` section.

    The declaration lines are indented two levels under a ``seq:`` key and
    the whole entry one more level; nothing is sanitized.
    """
    seq_body = pd(declaration, dont_sanitize=True, only_return=True, use_depth=2)
    type_block = f"{var_type}:\n  seq:\n{seq_body}"
    return pd(type_block, dont_sanitize=True, only_return=True, use_depth=1)

 def parse_typeless(i, tlist, ttype, name, return_instead):
    """Declare a ``TypelessData`` field: a ``u4`` length and that many bytes.

    ``i`` is advanced past the node's children.  The alignment entry is now
    part of the returned text; previously the result of ``align_stream`` was
    discarded, so the alignment was lost whenever ``return_instead`` was true.

    Returns the text when ``return_instead`` is true, else ``''``.
    """
    get_subtree_at(tlist, i)  # called only to move ``i`` past the children
    ret = pd(f"- id: {sanitize(name)}_typeless_len\n  type: u4\n- id: {name}\n  type: u1\n  repeat: expr\n  repeat-expr: {sanitize(name)}_typeless_len\n  if: {name}_typeless_len > 0", only_return=return_instead)
    ret += align_stream(ttype, only_return=return_instead)
    return ret if return_instead else ''

 def parse_map(i, tlist, name, return_instead):
    """Declare a map field: a ``u4`` length and repeated key/value pairs.

    ``i`` is advanced past the map's subtree.  The pair type
    ``key_value_pair_of_<name>`` is registered in ``appendixes``.  Its key
    and value types are now translated to Kaitai names (primitives through
    ``match_base_type``, ``string`` to ``aligned_string``); other types are
    passed through unchanged as before.

    Returns the text when ``return_instead`` is true, else ``''``.
    """
    def ksy_type(raw):
        # Raw names such as "int" or "string" are not valid Kaitai types.
        if raw == "string":
            return "aligned_string"
        return match_base_type(raw) or raw

    align = (tlist[i.value + 1][align_field] & kAlignBytes) != 0
    map_ = get_subtree_at(tlist, i)
    # Assumes the key subtree starts at index 4 of the map's nodes and the
    # value subtree follows it directly -- TODO confirm against the dump.
    first = get_subtree_at(map_, c_uint32(4))
    second = get_subtree_at(map_, c_uint32(4 + len(first)))
    ret = pd("# (key,value) types defined at the end", only_return=return_instead)
    ret += pd(f"- id: {sanitize(name)}_map_len\n  type: u4", only_return=return_instead)
    _name = sanitize(name)
    key_type = ksy_type(first[0][type_field])
    value_type = ksy_type(second[0][type_field])
    # NOTE(review): ``i`` already points at the last node of the map here, and
    # a sanitized name can never equal "List`1" -- confirm both conditions.
    if tlist[i.value][level_field] > 1 and _name not in appendixes and _name != "List`1":
        appendixes[_name] = pd(f"  key_value_pair_of_{_name}:\n    seq:\n      - id: key\n        type: {key_type}\n      - id: value\n        type: {value_type}", only_return=True)
    ret += pd(f"- id: {name}\n  type: key_value_pair_of_{_name}\n  repeat: expr\n  repeat-expr: {_name}_map_len\n  if: {name}_map_len > 0", only_return=return_instead)
    if align:
        ret += align_stream(only_return=return_instead)
    return ret if return_instead else ''

 def parse_str_array(i, tlist, ttype, name, return_instead):
    """Declare a ``String[]`` field: a ``u4`` length and repeated aligned strings.

    ``i`` is advanced past the node's children.  Returns the text when
    ``return_instead`` is true, else ``''``.
    """
    get_subtree_at(tlist, i)  # called only to move ``i`` past the children
    declaration = f"- id: {name}_len\n  type: u4\n- id: {name}\n  type: aligned_string\n  repeat: expr\n  repeat-expr: {name}_len\n  if: {name}_len > 0"
    emitted = pd(declaration, only_return=return_instead) + align_stream(ttype, only_return=return_instead)
    return emitted if return_instead else ''

 def parse_string(i, tlist, ttype, name, return_instead):
    """Declare a string field using the ``aligned_string`` type.

    ``i`` is advanced past the node's children.  Returns the text when
    ``return_instead`` is true, else ``''``.
    """
    get_subtree_at(tlist, i)  # called only to move ``i`` past the children
    declaration = f"- id: {name}\n  type: aligned_string"
    emitted = pd(declaration, only_return=return_instead) + align_stream(ttype, only_return=return_instead)
    return emitted if return_instead else ''


 def process_class_typetree(ttree:list, i:c_uint32, tree_items:dict):
    """Render every node of ``ttree`` from index ``i.value`` onwards.

    ``replace_type`` may advance ``i`` past child nodes it consumed; the
    loop then steps to the next unprocessed node.
    """
    while True:
        if i.value >= len(ttree):
            return
        replace_type(i, ttree, tree_items, parent='')
        i.value += 1

 print("Converting TypeTrees to KSY format...")
 # Keep the requested filter separate from ``extract_class``: the loop
 # overwrites ``extract_class`` with the matched key, which used to make an
 # empty filter ("convert everything") stop after the first class.
 wanted_class = extract_class
 for class_name, ttree in tqdm(tree_items):
    if wanted_class and class_name.lower() != wanted_class.lower():
        continue
    extract_class = class_name
    depth += 1
    pd(sanitize(extract_class) + ":")
    depth += 1
    pd("seq:")
    depth += 1
    process_class_typetree(ttree, c_uint32(1), tree_items) # starts from 1 to go directly to the 1st level
    depth -= 3

 # Build the helper-type section as a separate string so ``appendixes`` stays
 # a dict and an empty dict no longer breaks the concatenation below.
 inner_classes = ''
 if len(appendixes) > 0:
    inner_classes = "\n  # Inner Classes\n" + ''.join(appendixes.values()) + "\n  # Main Class\n"
 cn = sanitize(extract_class)
 with open(f"{cn}.ksy", "w", encoding="utf-8-sig") as f:
    f.write(prefix.format(class_name=cn) + inner_classes + result)
	from sys import argv
	import json, re
	from ctypes import c_uint32
	from tqdm import tqdm

	# For usage with KaitaiStruct visualisers
	# (like https://userunknownfactor.github.io/ for example)

	# Name of the class to convert.  The conversion loop at the bottom of the
	# script compares it case-insensitively against the keys of the loaded dump.
	extract_class = "ClassName"

	# Parsed typetree dump; overwritten with the loaded JSON further down.
	ASSEMBLY_TREES: dict = dict()
	# Current nesting depth; pd() indents by two spaces per level.
	depth = 0
	# KSY text for the main class, appended to by pd().
	result = ''
	# Header of the generated .ksy file; {class_name} is filled in when writing.
	# Written line by line so the YAML indentation cannot be lost again.
	prefix = (
	    "meta:\n"
	    "  id: assembly_typetrees\n"
	    "  endian: le\n"
	    "  encoding: UTF-8\n"
	    "  title: Assembly TypeTrees Types Definitions\n"
	    "  file-extension: dat\n"
	    "\n"
	    "seq:\n"
	    "  - id: test_sequence\n"
	    "    type: {class_name}\n"
	    "\n"
	    "#instances:\n"
	    "#    unity_version:\n"
	    "#      value: 2020\n"
	    "types:"
	)

	# Bit tested against a node's meta flag; when set, a 4-byte alignment
	# entry is emitted.
	kAlignBytes = 0x4000
	# Dictionary keys used to read a typetree node.  set_old_fields() swaps them
	# for the "m_"-prefixed spelling when a node lacks the plain "name" key.
	name_field = "name"
	type_field = "type"
	align_field = "meta_flag"
	level_field = "level"
	# Set to True once set_old_fields() has switched the key names.
	fields_changed = False

	def set_old_fields():
	    """Switch the node-key globals to their ``m_``-prefixed spelling.

	    The switch happens at most once; later calls are no-ops because
	    ``fields_changed`` is already set.
	    """
	    global name_field, type_field, align_field, level_field, fields_changed
	    if not fields_changed:
	        name_field, type_field = "m_Name", "m_Type"
	        align_field, level_field = "m_MetaFlag", "m_Level"
	        fields_changed = True

	# Pre-built entries for the KSY "types:" section, keyed by type name.
	# The parse_* helpers add further entries while walking the typetree.
	# Written line by line so the YAML indentation cannot be lost again.
	appendixes = {
	    "aligned_string": (
	        "  aligned_string:\n"
	        "    seq:\n"
	        "      - id: len\n"
	        "        type: u4\n"
	        "      - id: string\n"
	        "        type: str\n"
	        "        encoding: utf-8\n"
	        "        size: len\n"
	        "      - size: (4 - _io.pos) % 4\n"
	    ),
	    "unity_py_binary_blob": (
	        "  unity_py_binary_blob:\n"
	        "    seq:\n"
	        "      - id: data\n"
	        "        type: u1\n"
	        "        repeat: eos\n"
	    ),
	    "pptr": (
	        "  pptr:\n"
	        "    seq:\n"
	        "      - id: file_id\n"
	        "        type: u4\n"
	        "      - id: path_id\n"
	        "        type: u8\n"
	    ),
	}

	# Typetree dump to read: first command-line argument, else the default name.
	input_path = argv[1] if len(argv) > 1 else "assembly_typetrees.json"
	with open(input_path, "r", encoding="utf-8-sig") as f:
	    ASSEMBLY_TREES = json.load(f)

	# (class name, node list) pairs iterated by the conversion loop below.
	tree_items = ASSEMBLY_TREES.items()

	def sanitize(text:str):
	    """Convert a name (or a whole output line) to lower-case snake form.

	    Steps, in order: insert ``_`` between a lower-case letter and the
	    upper-case letter that follows it; drop ``<`` before a word character,
	    ``>`` after a word character and every backtick; lower-case the result;
	    replace ``.`` and ``/`` with ``_``; remove ``[]``.
	    """
	    snake = re.sub(r'([a-z])([A-Z])', r'\1_\2', text)
	    # The alternation bars must be unescaped; ``\|`` would match a literal pipe.
	    stripped = re.sub(r'<(?=\w)|(?<=\w)>|`', r'', snake)
	    cleaned = stripped.lower()
	    for old, new in (('.', '_'), ('/', '_'), ('[]', '')):
	        cleaned = cleaned.replace(old, new)
	    return cleaned

	def pd(text:str, dont_sanitize:bool=False, only_return:bool=False, use_depth:int=None):
	    """Indent every line of ``text`` and either return it or append it to ``result``.

	    Args:
	        text: One or more lines of KSY text.
	        dont_sanitize: When true, lines are emitted verbatim instead of
	            being passed through ``sanitize``.
	        only_return: When true, the text is returned and the global
	            ``result`` is left untouched.
	        use_depth: Indentation level (two spaces each).  Defaults to 0 when
	            ``only_return`` is true and to the global ``depth`` otherwise.

	    Returns:
	        The processed text when ``only_return`` is true, otherwise ``''``.
	    """
	    global depth, result
	    if use_depth is None:
	        use_depth = 0 if only_return else depth
	    lines = text.splitlines()
	    if not dont_sanitize:
	        lines = [sanitize(line) for line in lines]
	    # Two spaces per level, matching the indentation of the YAML templates.
	    indent = '  ' * use_depth
	    processed = ''.join(f"{indent}{line}\n" for line in lines)
	    if only_return:
	        return processed
	    result += processed
	    return ''

	def align_stream(ttype=None, only_return=False):
	    """Emit a 4-byte alignment entry when the node asks for it.

	    The entry is produced when ``ttype`` is ``None`` or when its
	    ``align_field`` value has the ``kAlignBytes`` bit set.  The text is
	    returned when ``only_return`` is true; otherwise ``pd`` appends it to
	    the global output and ``''`` is returned.
	    """
	    needs_alignment = ttype is None or (ttype[align_field] & kAlignBytes) != 0
	    if not needs_alignment:
	        return ''
	    padding = pd("- size: (4 - _io.pos) % 4", True, only_return)
	    return padding if only_return else ''

	def get_subtree_at(nodes: list, item_n: c_uint32) -> list:
	    """Return the node at ``item_n`` together with the nodes nested below it.

	    Collection stops at the first later node whose level is not deeper than
	    the starting node's level.  ``item_n.value`` is advanced in place to the
	    index of the last node returned, so the caller's cursor skips the subtree.
	    """
	    start = item_n.value
	    root_level = nodes[start][level_field]
	    end = start + 1
	    while end < len(nodes) and nodes[end][level_field] > root_level:
	        end += 1
	    subtree = nodes[start:end]
	    item_n.value += len(subtree) - 1
	    return subtree

	def is_subtree_same_level(nodes:list) -> bool:
	    """Tell whether every node after the first one sits at the same level.

	    ``nodes[0]`` is ignored.  Raises ``IndexError`` when ``nodes`` has fewer
	    than two elements.
	    """
	    reference_level = nodes[1][level_field]
	    for node in nodes[1:]:
	        if node[level_field] != reference_level:
	            return False
	    return True

	# Lower-cased primitive type name -> Kaitai Struct built-in type.
	# NOTE(review): "b1" is Kaitai's 1-bit field type; confirm that is intended
	# for "bool" rather than a one-byte type.
	_BASE_TYPES = {
	    "sint8": "s1",
	    "uint8": "u1",
	    "char": "u1",
	    "short": "s2",
	    "sint16": "s2",
	    "uint16": "u2",
	    "unsigned short": "u2",
	    "int": "s4",
	    "sint32": "s4",
	    "uint32": "u4",
	    "unsigned int": "u4",
	    "type*": "u4",
	    "long long": "s8",
	    "sint64": "s8",
	    "uint64": "u8",
	    "unsigned long long": "u8",
	    "filesize": "u8",
	    "float": "f4",
	    "double": "f8",
	    "bool": "b1",
	}

	def match_base_type(_type: str):
	    """Map a primitive type name to its Kaitai Struct type.

	    The lookup is case-insensitive.  ``Type*`` is recognised: the previous
	    pattern kept its capital letter while the subject was lower-cased, so
	    it could never match.

	    Returns:
	        The Kaitai type name, or ``None`` when ``_type`` is not a primitive.
	    """
	    return _BASE_TYPES.get(_type.lower())

	def replace_type(i:c_uint32, tlist:list, all_trees:dict, parent:str, return_instead:bool=False):
	    """Emit the KSY declaration for the node at ``tlist[i.value]``.

	    Primitive types become one ``id``/``type`` entry (plus an alignment
	    entry when flagged).  Every other type is handed to the matching
	    ``parse_*`` helper.  ``i`` is advanced in place past whatever nodes the
	    chosen branch consumes.

	    Returns:
	        The generated text when ``return_instead`` is true; for a class
	        with child nodes, the text when the node's level is above 1.
	        In all other cases ``''`` (output then goes through ``pd`` to the
	        global result).
	    """
	    node: dict = tlist[i.value]
	    if name_field not in node:
	        # The dump uses the other key spelling; switch the globals over.
	        set_old_fields()
	    field_id: str = sanitize(node[name_field])
	    unity_type: str = node[type_field]
	    node_level: int = node[level_field]

	    base_type = match_base_type(unity_type)
	    if base_type is not None:
	        # The id is suffixed with the parent's name unless there is no parent
	        # or all nodes after the first share one level.
	        if is_subtree_same_level(tlist) or not parent:
	            id_part = "{typ_id}"
	        else:
	            id_part = "{typ_id}_of_{parent}"
	        template = "- id: " + id_part + "\n  type: {typ_name}"
	        ret = pd(template.format(typ_id=field_id, typ_name=base_type, parent=parent), only_return=return_instead)
	        ret += align_stream(node, only_return=return_instead)
	        return ret if return_instead else ''

	    if unity_type.startswith("PPtr<"):
	        suffix = f"_of_{parent}" if len(parent) else ''
	        ret = pd(f"- id: pptr_{field_id}{suffix}\n  type: pptr", only_return=return_instead)
	        # Steps over the next two nodes (presumably the pointer's two children).
	        i.value += 2
	        return ret if return_instead else ''

	    if unity_type == "string":
	        ret = parse_string(i, tlist, node, field_id, return_instead)
	    elif unity_type == "String[]":
	        ret = parse_str_array(i, tlist, node, field_id, return_instead)
	    elif unity_type == "map":  # map == MultiDict
	        ret = parse_map(i, tlist, field_id, return_instead)
	    elif unity_type == "TypelessData":
	        ret = parse_typeless(i, tlist, node, field_id, return_instead)
	    elif i.value < len(tlist) - 1 and tlist[i.value + 1][type_field] == "Array":
	        ret = parse_array(i, tlist, field_id, unity_type, all_trees, return_instead)
	    else:  # Class
	        clz = get_subtree_at(tlist, i)
	        if len(clz) != 1:
	            ret = parse_complex_class(node, field_id, clz, all_trees)
	            return ret if node_level > 1 else ''
	        ret = parse_class(unity_type, clz, all_trees, return_instead)
	    return ret if return_instead else ''

	def parse_class(cls_type, clz, all_trees, return_instead):
	    """Declare a field whose type is a class with no child nodes.

	    When ``cls_type`` ends in ``[]`` a ``u4`` length field is emitted first
	    and the field itself is repeated that many times.  ``all_trees`` is not
	    used.  Returns the text when ``return_instead`` is true, else ``''``.
	    """
	    is_array = cls_type.endswith("[]")
	    cls_name = sanitize(clz[0][name_field])
	    pieces = []
	    if is_array:
	        pieces.append(pd(f"- id: {cls_name}_len\n  type: u4", only_return=return_instead))
	    pieces.append(pd(f"- id: {cls_name}\n  type: {sanitize(clz[0][type_field])}", only_return=return_instead))
	    if is_array:
	        pieces.append(pd(f"  repeat: expr\n  repeat-expr: {cls_name}_len\n  if: {cls_name}_len > 0", only_return=return_instead))
	    return ''.join(pieces) if return_instead else ''

	def parse_complex_class(ttype, name, clz, all_trees):
	    """Register a class with child nodes as a KSY type and reference it.

	    The members (``clz[1:]``) are rendered through ``replace_type`` and
	    stored in ``appendixes`` under the sanitized type name, unless that key
	    already exists.  A ``class_<name>`` field referencing the type is then
	    emitted: returned as text when the class's level is above 1, otherwise
	    appended to the global output.  Nothing is emitted for a field named
	    ``data``.
	    """
	    raw_type = clz[0][type_field]
	    nested = clz[0][level_field] > 1
	    cursor = c_uint32(1)
	    members = []
	    while cursor.value < len(clz):
	        members.append(replace_type(cursor, clz, all_trees, parent=name, return_instead=True))
	        cursor.value += 1
	    members.append(align_stream(ttype, only_return=True))
	    body = ''.join(members)
	    type_id = sanitize(raw_type)
	    if type_id not in appendixes:
	        appendixes[type_id] = make_appendix(type_id, body)
	    if name == "data":
	        return ''
	    return pd(f"- id: class_{name}\n  type: {type_id}", only_return=nested)

	def parse_array(i, tlist, name, var_type, all_trees, return_instead):
	    """Declare an array field: a ``u4`` length followed by repeated elements.

	    ``i`` is advanced past the array's whole subtree.  Nodes from index 3 of
	    that subtree onwards are rendered through ``replace_type``.  The
	    collected text is also stored in ``appendixes`` under the sanitized
	    ``var_type`` unless that key exists or is ``list1``/``vector``.

	    Returns the text when ``return_instead`` is true, else ``''``.
	    """
	    # The alignment flag is read from the node right after the field.
	    align = (tlist[i.value + 1][align_field] & kAlignBytes) != 0
	    _array = get_subtree_at(tlist, i)
	    ret = pd(f"- id: {sanitize(name)}_array_len\n  type: u4", only_return=return_instead)
	    # Assumes indices 0-2 are the field, its Array node and the size node,
	    # so element data starts at index 3 -- TODO confirm against the dump.
	    j = c_uint32(3)
	    while j.value < len(_array):
	        # Element declarations are always collected as text, never written
	        # straight to the global output.
	        ret += replace_type(j, _array, all_trees, parent=name, return_instead=True)
	        j.value += 1
	    if len(_array) == 7 and _array[3][type_field] == "string":
	        _type = "aligned_string"
	    else:
	        # NOTE(review): the raw element type is only sanitized here, not
	        # mapped through match_base_type -- confirm that is intended.
	        _type = sanitize(_array[3][type_field])
	    val = pd(f"- id: {name}\n  type: {_type}\n  repeat: expr\n  repeat-expr: {sanitize(name)}_array_len\n  if: {name}_array_len > 0", only_return=return_instead, dont_sanitize=True)
	    if return_instead:
	        ret += val
	    if align:
	        ret += align_stream(only_return=return_instead)
	    _type = sanitize(var_type)
	    if ret and _type not in appendixes:
	        if _type not in ["list1", "vector"]:
	            # NOTE(review): when return_instead is false, ret holds only the
	            # element declarations at this point -- confirm that is intended.
	            appendixes[_type] = make_appendix(_type, ret)
	    if return_instead:
	        return ret
	    return ''

	def make_appendix(var_type, declaration):
	    """Wrap ``declaration`` as a named type entry for the ``types:`` section.

	    The declaration lines are indented two levels under a ``seq:`` key and
	    the whole entry one more level; nothing is sanitized.
	    """
	    seq_body = pd(declaration, dont_sanitize=True, only_return=True, use_depth=2)
	    type_block = f"{var_type}:\n  seq:\n{seq_body}"
	    return pd(type_block, dont_sanitize=True, only_return=True, use_depth=1)

	def parse_typeless(i, tlist, ttype, name, return_instead):
	    """Declare a ``TypelessData`` field: a ``u4`` length and that many bytes.

	    ``i`` is advanced past the node's children.  The alignment entry is
	    part of the returned text; previously the result of ``align_stream`` was
	    discarded, so the alignment was lost whenever ``return_instead`` was true.

	    Returns the text when ``return_instead`` is true, else ``''``.
	    """
	    get_subtree_at(tlist, i)  # called only to move ``i`` past the children
	    ret = pd(f"- id: {sanitize(name)}_typeless_len\n  type: u4\n- id: {name}\n  type: u1\n  repeat: expr\n  repeat-expr: {sanitize(name)}_typeless_len\n  if: {name}_typeless_len > 0", only_return=return_instead)
	    ret += align_stream(ttype, only_return=return_instead)
	    return ret if return_instead else ''

	def parse_map(i, tlist, name, return_instead):
	    """Declare a map field: a ``u4`` length and repeated key/value pairs.

	    ``i`` is advanced past the map's subtree.  The pair type
	    ``key_value_pair_of_<name>`` is registered in ``appendixes``.  Its key
	    and value types are translated to Kaitai names (primitives through
	    ``match_base_type``, ``string`` to ``aligned_string``); other types are
	    passed through unchanged.

	    Returns the text when ``return_instead`` is true, else ``''``.
	    """
	    def ksy_type(raw):
	        # Raw names such as "int" or "string" are not valid Kaitai types.
	        if raw == "string":
	            return "aligned_string"
	        return match_base_type(raw) or raw

	    align = (tlist[i.value + 1][align_field] & kAlignBytes) != 0
	    map_ = get_subtree_at(tlist, i)
	    # Assumes the key subtree starts at index 4 of the map's nodes and the
	    # value subtree follows it directly -- TODO confirm against the dump.
	    first = get_subtree_at(map_, c_uint32(4))
	    second = get_subtree_at(map_, c_uint32(4 + len(first)))
	    ret = pd("# (key,value) types defined at the end", only_return=return_instead)
	    ret += pd(f"- id: {sanitize(name)}_map_len\n  type: u4", only_return=return_instead)
	    _name = sanitize(name)
	    key_type = ksy_type(first[0][type_field])
	    value_type = ksy_type(second[0][type_field])
	    # NOTE(review): ``i`` already points at the last node of the map here, and
	    # a sanitized name can never equal "List`1" -- confirm both conditions.
	    if tlist[i.value][level_field] > 1 and _name not in appendixes and _name != "List`1":
	        appendixes[_name] = pd(f"  key_value_pair_of_{_name}:\n    seq:\n      - id: key\n        type: {key_type}\n      - id: value\n        type: {value_type}", only_return=True)
	    ret += pd(f"- id: {name}\n  type: key_value_pair_of_{_name}\n  repeat: expr\n  repeat-expr: {_name}_map_len\n  if: {name}_map_len > 0", only_return=return_instead)
	    if align:
	        ret += align_stream(only_return=return_instead)
	    return ret if return_instead else ''

	def parse_str_array(i, tlist, ttype, name, return_instead):
	    """Declare a ``String[]`` field: a ``u4`` length and repeated aligned strings.

	    ``i`` is advanced past the node's children.  Returns the text when
	    ``return_instead`` is true, else ``''``.
	    """
	    get_subtree_at(tlist, i)  # called only to move ``i`` past the children
	    declaration = f"- id: {name}_len\n  type: u4\n- id: {name}\n  type: aligned_string\n  repeat: expr\n  repeat-expr: {name}_len\n  if: {name}_len > 0"
	    emitted = pd(declaration, only_return=return_instead) + align_stream(ttype, only_return=return_instead)
	    return emitted if return_instead else ''

	def parse_string(i, tlist, ttype, name, return_instead):
	    """Declare a string field using the ``aligned_string`` type.

	    ``i`` is advanced past the node's children.  Returns the text when
	    ``return_instead`` is true, else ``''``.
	    """
	    get_subtree_at(tlist, i)  # called only to move ``i`` past the children
	    declaration = f"- id: {name}\n  type: aligned_string"
	    emitted = pd(declaration, only_return=return_instead) + align_stream(ttype, only_return=return_instead)
	    return emitted if return_instead else ''


	def process_class_typetree(ttree:list, i:c_uint32, tree_items:dict):
	    """Render every node of ``ttree`` from index ``i.value`` onwards.

	    ``replace_type`` may advance ``i`` past child nodes it consumed; the
	    loop then steps to the next unprocessed node.
	    """
	    while i.value < len(ttree):
	        replace_type(i, ttree, tree_items, parent='')
	        i.value += 1

	print("Converting TypeTrees to KSY format...")
	# Keep the requested filter separate from ``extract_class``: the loop
	# overwrites ``extract_class`` with the matched key, which used to make an
	# empty filter ("convert everything") stop after the first class.
	wanted_class = extract_class
	for class_name, ttree in tqdm(tree_items):
	    if wanted_class and class_name.lower() != wanted_class.lower():
	        continue
	    extract_class = class_name
	    depth += 1
	    pd(sanitize(extract_class) + ":")
	    depth += 1
	    pd("seq:")
	    depth += 1
	    process_class_typetree(ttree, c_uint32(1), tree_items) # starts from 1 to go directly to the 1st level
	    depth -= 3

	# Build the helper-type section as a separate string so ``appendixes`` stays
	# a dict and an empty dict no longer breaks the concatenation below.
	inner_classes = ''
	if len(appendixes) > 0:
	    inner_classes = "\n  # Inner Classes\n" + ''.join(appendixes.values()) + "\n  # Main Class\n"
	cn = sanitize(extract_class)
	with open(f"{cn}.ksy", "w", encoding="utf-8-sig") as f:
	    f.write(prefix.format(class_name=cn) + inner_classes + result)
No results found