dunhamsteve · August 7, 2025 14:08
diff --git a/framework2proto.py b/framework2proto.py
 #!/usr/local/bin/python3

 import re, subprocess, sys

 # Require the path to a Mach-O binary; with no argument, print usage and stop.
 if len(sys.argv) < 2:
 	print('''
 This utility generates a protobuf file from an Objective C framework. It was developed against 
 CloudKit/CloudKitDaemon, but may work with other libraries that are built the same way.
 ''')
 	print("Usage:\n\t",sys.argv[0],"MachOBinary\n")
 	# sys.exit rather than the bare exit(): exit is injected by the site module
 	# for interactive use and is not guaranteed to exist in every interpreter.
 	sys.exit(-1)

 fn = sys.argv[1]
 # Run objdump on the binary and keep its output as a list of bytes lines.
 lines = subprocess.check_output(['objdump', '-macho', '-objc-meta-data', '-disassemble', fn]).splitlines()
 # lines = open('CloudKitDaemon.s','rb').read().splitlines()
 it = iter(lines)
 # Default to an empty line so that empty objdump output does not raise
 # StopIteration here, outside the scanning loop's try block.
 line = next(it, b'')

 # When true, dprint() prints its arguments; when false it stays silent.
 DEBUG = False

 # Matches a movq whose source is an _OBJC_IVAR_$_<Class>._<ivar> symbol and
 # captures (class name, ivar name).
 r_field = r'.*\tmovq\t_OBJC_IVAR_\$_([A-Za-z]+)\._([A-Za-z]+)\('

 # PBDataWriter stub symbol -> proto type name. '-' is the placeholder used
 # for submessages.
 types = dict(
 	_PBDataWriterWriteInt='int32',
 	_PBDataWriterWriteStringField='string',
 	_PBDataWriterWriteSubmessage='-',
 	_PBDataWriterWriteDataField='bytes',
 	_PBDataWriterWriteBOOLField='bool',
 	_PBDataWriterWriteUint='uint32',
 	_PBDataWriterWriteFloatField='float',
 	_PBDataWriterWriteDoubleField='double',
 )

 def dprint(*args):
 	"""Forward ``args`` to print() when the module-level DEBUG flag is truthy; otherwise do nothing."""
 	if not DEBUG:
 		return
 	print(*args)

 # Results filled in by the scan below.
 #   schema: class name -> {field number (digit string): (ivar name, type, label)}
 #   objc:   class name -> {property name: class named in the property's T@"..." attribute}
 #   arrays: (class name, property name) -> class ref seen before the matching movq
 schema = {}
 objc = {}
 arrays = {}

 # One pass over the objdump lines, dispatching on three kinds of header line:
 #   -[Class writeTo:]:           writer disassembly  -> field numbers and types
 #   isa ... _OBJC_METACLASS_$_   class metadata      -> property classes
 #   _ClassReadFrom:              reader disassembly  -> array element classes
 # `line` is bytes every time control returns to the top of the outer loop.
 # The scan ends when the iterator is exhausted and next() raises StopIteration.
 try:
 	while True:
 		# Raw bytes pattern: no invalid-escape warning for \[ and \], and
 		# [A-Za-z] (instead of [A-z]) keeps [ \ ] ^ _ ` out of class names,
 		# matching every other pattern in this file.
 		m = re.match(rb'^-\[([A-Za-z]+) writeTo:\]:$',line)
 		if m:
 			typ = m.group(1).decode('ascii')
 			current = schema[typ] = {}
 			dprint(typ)
 			field,key = None,None
 			optional = "optional "
 			while True:
 				line = line.decode('utf8')
 				m = re.match(r_field,line)
 				if m:
 					# ivar load: remember the field name for the next writer call
 					a,field = m.groups()
 					assert a == typ
 					key = None
 					optional = "optional "
 				elif re.match(r'.*testq\t%rsi, %rsi',line):
 					optional = "optional "
 				# this is %esi for the float/double case
 				elif re.match(r'.*\tmovl\t\$(\d+), %(edx|esi)',line):
 					# immediate loaded into %edx/%esi is taken as the field number
 					m = re.match(r'.*\tmovl\t\$(\d+), %(edx|esi)',line)
 					key = m.group(1)
 				# this is jmp when there is a tail-call optimization
 				elif re.match(r'.*(callq|jmp)',line):
 					m = re.match(r'.* ## symbol stub for: (_PBDataWriter[A-Za-z]+)',line)
 					if m:
 						assert key not in current
 						# This is subtle, but for repeated ints and doubles (from a c array)
 						# there is a incq %rbx on the next line. This doesn't catch all NSMutableArray cases.
 						# Peek with a default so a writer call on the very last
 						# line is still recorded instead of being lost to StopIteration.
 						line = next(it, b'').strip()
 						if b"incq\t%rbx" in line:
 							optional = "repeated "
 						current[key] = (field,types[m.group(1)],optional)
 						dprint('XXX', typ,types[m.group(1)],field,'=',key,optional)
 						field,key = None,None
 						continue # we peeked at the next line so we skip back to the top
 					elif field:
 						dprint("STRAY",line)
 						# assert False
 				line = next(it).strip()
 				# a line starting with '-' is treated as the next method header
 				if line.startswith(b'-'):
 					break
 				#dprint(line)
 		elif re.match(rb' *isa .* _OBJC_METACLASS_\$_([A-Za-z]+)',line):
 			m = re.match(rb' *isa .* _OBJC_METACLASS_\$_([A-Za-z]+)',line)
 			typ = m.group(1).decode('utf8')
 			current = objc[typ] = {}
 			dprint('ZZZ',typ)
 			props = False
 			name = None
 			while True:
 				line = next(it)
 				parts = line.strip().decode('utf8').split()
 				# stop at a line whose first token starts with '0' or is 'Meta'
 				if parts[0].startswith('0') or parts[0] == 'Meta':
 					break
 				# dprint(parts)
 				if parts[0] == 'name':
 					name = parts[2]
 				if parts[0] == 'attributes':
 					# T@"Class" in the attribute string names the property's class
 					m = re.match(r'T@"(.*?)"',parts[2])
 					if m:
 						dprint(typ,name,m.group(1))
 						current[name] = m.group(1)
 		elif re.match(rb'_([A-Za-z]+)ReadFrom:',line):
 			# experimental - try to determine NSMutableArray types
 			typ = re.match(rb'_([A-Za-z]+)ReadFrom:',line).group(1).decode('utf8')
 			refs = {}   # movq destination operand -> property key from add<Name>:
 			clazz = None
 			while True:
 				line = next(it).rstrip()
 				# the function body is taken to be the run of lines starting with a space
 				if not line.startswith(b" "):
 					break
 				line = line.decode('utf8')
 				parts = line.split('\t')
 				m = re.match(r'.*%rax ## Objc selector ref: add([A-Za-z]+):',line)
 				if m:
 					# add<Name>: selector -> property key with a lower-cased first letter
 					key = m.group(1)
 					key = key[0].lower()+key[1:]
 					dprint('*',key)
 					# the following line is consumed here and also examined below
 					line = next(it).rstrip().decode('utf8')
 					parts = line.split('\t')
 					if parts[2] == 'movq':
 						dest = parts[3].split(', ')[1]
 						dprint(typ,key,dest)
 						refs[dest] = key

 				m = re.match(r'.* ## Objc class ref: (_OBJC_CLASS_\$_)?([A-Za-z]+)',line)
 				if m:
 					dprint('clazz',m.group(2))
 					clazz = m.group(2)
 				if parts[0].startswith('j'):
 					dprint('clear',parts)
 					clazz = None # naive basic block detection.
 				if parts[2] == 'movq':
 					a,b = parts[3].split(', ')
 					if a in refs:
 						# movq reading a recorded operand: pair that property
 						# with the most recent class ref (may be None)
 						dprint(line)
 						# assert clazz != None
 						arrays[(typ,refs[a])] = clazz
 						# some of these drop the 's'
 						if not refs[a].endswith('s'):
 							arrays[(typ,refs[a]+'s')] = clazz
 						dprint('ADD',typ,refs[a],clazz)
 		else:
 			line = next(it).rstrip()
 except StopIteration:
 	pass

 from pprint import pprint
 # Print one proto `message` per entry in schema, sorted by class name.
 # Each schema value maps field number -> (ivar name, type, label).
 for k,v in sorted(schema.items()):
 	print(f"message {k} {{")
 	for k2,v2 in v.items():
 		f,t,o = v2
 		if t == '-':
 			# Submessage placeholder: use the property's class from the parsed
 			# metadata. A class with no metadata entry keeps '-' instead of
 			# aborting the whole dump with KeyError.
 			oo = objc.get(k, {})
 			t = oo.get(f,t)
 		if t.startswith('NS'):
 			# NS* property types are emitted as repeated, with the element
 			# class from arrays when one was recorded for (class, field).
 			o = 'repeated '
 			t = arrays.get((k,f),t)
 		print(f'    {o}{t} {f} = {k2};')
 	print("}\n")
 dprint(arrays)
	#!/usr/local/bin/python3

	import re, subprocess, sys

	# Require the path to a Mach-O binary; with no argument, print usage and stop.
	# (The body of this `if` is re-indented; it had lost its nesting.)
	if len(sys.argv) < 2:
		print('''
	This utility generates a protobuf file from an Objective C framework. It was developed against
	CloudKit/CloudKitDaemon, but may work with other libraries that are built the same way.
	''')
		print("Usage:\n\t",sys.argv[0],"MachOBinary\n")
		# sys.exit rather than the bare exit(): exit is injected by the site module
		# for interactive use and is not guaranteed to exist in every interpreter.
		sys.exit(-1)

	fn = sys.argv[1]
	# Run objdump on the binary and keep its output as a list of bytes lines.
	lines = subprocess.check_output(['objdump', '-macho', '-objc-meta-data', '-disassemble', fn]).splitlines()
	# lines = open('CloudKitDaemon.s','rb').read().splitlines()
	it = iter(lines)
	# Default to an empty line so that empty objdump output does not raise
	# StopIteration here, outside the scanning loop's try block.
	line = next(it, b'')

	# When true, dprint() prints its arguments; when false it stays silent.
	DEBUG = False

	# Matches a movq whose source is an _OBJC_IVAR_$_<Class>._<ivar> symbol and
	# captures (class name, ivar name).
	r_field = r'.*\tmovq\t_OBJC_IVAR_\$_([A-Za-z]+)\._([A-Za-z]+)\('

	# PBDataWriter stub symbol -> proto type name. '-' is the placeholder used
	# for submessages.
	types = dict(
		_PBDataWriterWriteInt='int32',
		_PBDataWriterWriteStringField='string',
		_PBDataWriterWriteSubmessage='-',
		_PBDataWriterWriteDataField='bytes',
		_PBDataWriterWriteBOOLField='bool',
		_PBDataWriterWriteUint='uint32',
		_PBDataWriterWriteFloatField='float',
		_PBDataWriterWriteDoubleField='double',
	)

	def dprint(*args):
		"""Forward ``args`` to print() when the module-level DEBUG flag is truthy; otherwise do nothing."""
		# Body re-indented: without nesting the def had no body and did not parse.
		if DEBUG:
			print(*args)

	# Results filled in by the scan below.
	#   schema: class name -> {field number (digit string): (ivar name, type, label)}
	#   objc:   class name -> {property name: class named in the property's T@"..." attribute}
	#   arrays: (class name, property name) -> class ref seen before the matching movq
	schema = {}
	objc = {}
	arrays = {}

	# One pass over the objdump lines, dispatching on three kinds of header line:
	#   -[Class writeTo:]:           writer disassembly  -> field numbers and types
	#   isa ... _OBJC_METACLASS_$_   class metadata      -> property classes
	#   _ClassReadFrom:              reader disassembly  -> array element classes
	# `line` is bytes every time control returns to the top of the outer loop.
	# The scan ends when the iterator is exhausted and next() raises StopIteration.
	# Nesting is restored here, and the regex alternations use a plain `|`
	# (an escaped `\|` would only match a literal pipe character).
	try:
		while True:
			# Raw bytes pattern: no invalid-escape warning for \[ and \], and
			# [A-Za-z] (instead of [A-z]) keeps [ \ ] ^ _ ` out of class names,
			# matching every other pattern in this file.
			m = re.match(rb'^-\[([A-Za-z]+) writeTo:\]:$',line)
			if m:
				typ = m.group(1).decode('ascii')
				current = schema[typ] = {}
				dprint(typ)
				field,key = None,None
				optional = "optional "
				while True:
					line = line.decode('utf8')
					m = re.match(r_field,line)
					if m:
						# ivar load: remember the field name for the next writer call
						a,field = m.groups()
						assert a == typ
						key = None
						optional = "optional "
					elif re.match(r'.*testq\t%rsi, %rsi',line):
						optional = "optional "
					# this is %esi for the float/double case
					elif re.match(r'.*\tmovl\t\$(\d+), %(edx|esi)',line):
						# immediate loaded into %edx/%esi is taken as the field number
						m = re.match(r'.*\tmovl\t\$(\d+), %(edx|esi)',line)
						key = m.group(1)
					# this is jmp when there is a tail-call optimization
					elif re.match(r'.*(callq|jmp)',line):
						m = re.match(r'.* ## symbol stub for: (_PBDataWriter[A-Za-z]+)',line)
						if m:
							assert key not in current
							# This is subtle, but for repeated ints and doubles (from a c array)
							# there is a incq %rbx on the next line. This doesn't catch all NSMutableArray cases.
							# Peek with a default so a writer call on the very last
							# line is still recorded instead of being lost to StopIteration.
							line = next(it, b'').strip()
							if b"incq\t%rbx" in line:
								optional = "repeated "
							current[key] = (field,types[m.group(1)],optional)
							dprint('XXX', typ,types[m.group(1)],field,'=',key,optional)
							field,key = None,None
							continue # we peeked at the next line so we skip back to the top
						elif field:
							dprint("STRAY",line)
							# assert False
					line = next(it).strip()
					# a line starting with '-' is treated as the next method header
					if line.startswith(b'-'):
						break
					#dprint(line)
			elif re.match(rb' *isa .* _OBJC_METACLASS_\$_([A-Za-z]+)',line):
				m = re.match(rb' *isa .* _OBJC_METACLASS_\$_([A-Za-z]+)',line)
				typ = m.group(1).decode('utf8')
				current = objc[typ] = {}
				dprint('ZZZ',typ)
				props = False
				name = None
				while True:
					line = next(it)
					parts = line.strip().decode('utf8').split()
					# stop at a line whose first token starts with '0' or is 'Meta'
					if parts[0].startswith('0') or parts[0] == 'Meta':
						break
					# dprint(parts)
					if parts[0] == 'name':
						name = parts[2]
					if parts[0] == 'attributes':
						# T@"Class" in the attribute string names the property's class
						m = re.match(r'T@"(.*?)"',parts[2])
						if m:
							dprint(typ,name,m.group(1))
							current[name] = m.group(1)
			elif re.match(rb'_([A-Za-z]+)ReadFrom:',line):
				# experimental - try to determine NSMutableArray types
				typ = re.match(rb'_([A-Za-z]+)ReadFrom:',line).group(1).decode('utf8')
				refs = {}   # movq destination operand -> property key from add<Name>:
				clazz = None
				while True:
					line = next(it).rstrip()
					# the function body is taken to be the run of lines starting with a space
					if not line.startswith(b" "):
						break
					line = line.decode('utf8')
					parts = line.split('\t')
					m = re.match(r'.*%rax ## Objc selector ref: add([A-Za-z]+):',line)
					if m:
						# add<Name>: selector -> property key with a lower-cased first letter
						key = m.group(1)
						key = key[0].lower()+key[1:]
						dprint('*',key)
						# the following line is consumed here and also examined below
						line = next(it).rstrip().decode('utf8')
						parts = line.split('\t')
						if parts[2] == 'movq':
							dest = parts[3].split(', ')[1]
							dprint(typ,key,dest)
							refs[dest] = key

					m = re.match(r'.* ## Objc class ref: (_OBJC_CLASS_\$_)?([A-Za-z]+)',line)
					if m:
						dprint('clazz',m.group(2))
						clazz = m.group(2)
					if parts[0].startswith('j'):
						dprint('clear',parts)
						clazz = None # naive basic block detection.
					if parts[2] == 'movq':
						a,b = parts[3].split(', ')
						if a in refs:
							# movq reading a recorded operand: pair that property
							# with the most recent class ref (may be None)
							dprint(line)
							# assert clazz != None
							arrays[(typ,refs[a])] = clazz
							# some of these drop the 's'
							if not refs[a].endswith('s'):
								arrays[(typ,refs[a]+'s')] = clazz
							dprint('ADD',typ,refs[a],clazz)
			else:
				line = next(it).rstrip()
	except StopIteration:
		pass

	from pprint import pprint
	# Print one proto `message` per entry in schema, sorted by class name.
	# Each schema value maps field number -> (ivar name, type, label).
	# Loop nesting is restored here; without it the block did not parse.
	for k,v in sorted(schema.items()):
		print(f"message {k} {{")
		for k2,v2 in v.items():
			f,t,o = v2
			if t == '-':
				# Submessage placeholder: use the property's class from the parsed
				# metadata. A class with no metadata entry keeps '-' instead of
				# aborting the whole dump with KeyError.
				oo = objc.get(k, {})
				t = oo.get(f,t)
			if t.startswith('NS'):
				# NS* property types are emitted as repeated, with the element
				# class from arrays when one was recorded for (class, field).
				o = 'repeated '
				t = arrays.get((k,f),t)
			print(f' {o}{t} {f} = {k2};')
		print("}\n")
	dprint(arrays)
No results found