A tool for extracting strings from a Unity IL2CPP global-metadata.dat file and reinserting modified, variable-length replacements. It can keep the overall string table size unchanged by trimming or padding disposable strings (those marked with 1 in the 4th CSV column).
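Typical usage (a sketch; the script file name is a placeholder for whatever you saved the gist as; the commands and arguments themselves are defined by the argument parser in the script below). The script also depends on the filetranslate package for its CSV helpers:

python il2cpp_metadata_strings.py extract global-metadata.dat global-metadata_strings.csv
python il2cpp_metadata_strings.py reinsert global-metadata.dat global-metadata_strings.csv global-metadata.dat_patched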
#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys

from filetranslate.service_fn import read_csv_list, write_csv_list

MAGIC_BYTES = b'\xAF\x1B\xB1\xFA'
LOOKUP_TABLE_DEFINITION_OFFSET = 8
LOOKUP_TABLE_SIZE_DEFINITION_OFFSET = 12
STRINGLITERAL_DATA_DEFINITION_OFFSET = 16
STRINGLITERAL_DATA_SIZE_DEFINITION_OFFSET = 20
IGNORE_NON_UTF8 = False
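# Header layout assumed by this script (little-endian uint32 fields):
#   0x00  magic bytes AF 1B B1 FA
#   0x08  offset of the string literal lookup table
#   0x0C  size of the lookup table in bytes (8 bytes per entry)
#   0x10  offset of the raw string literal data
#   0x14  size of the string literal data (declared above but not read here)
# Each lookup entry is a (length, index) pair, where index is relative
# to the string literal data offset.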
class LookupTableEntry:
    length: int
    index: int

    def __init__(self, length: int, index: int):
        self.length = length
        self.index = index
class StringLiteral:
    index: int
    data: bytes
    disposable: bool
    valid: bool

    def __init__(self, index: int, data: bytes, disposable: bool = False, valid_utf8: bool = True):
        self.index = index
        self.data = data
        self.disposable = disposable
        self.valid = valid_utf8

    def to_dict(self):
        return {
            'index': self.index,
            'value': self.data.decode("utf-8", "ignore"),
            'disposable': self.disposable
        }

    @staticmethod
    def from_dict(d: dict):
        if 'index' not in d or 'value' not in d:
            raise ValueError('Invalid StringLiteral object')
        disposable = d.get('disposable', False)
        # Encode string to bytes when importing from JSON
        return StringLiteral(d['index'], d['value'].encode("utf-8"), disposable)

    def get_string_value(self):
        return self.data.decode("utf-8", "ignore")

    def set_string_value(self, string_value):
        self.data = string_value.encode("utf-8")

    # __iter__/__getitem__/__len__ let a StringLiteral act as a 3-column
    # (index, value, disposable) row wherever a plain list would do
    def __iter__(self):
        yield self.index
        yield self.get_string_value()
        yield self.disposable

    def __getitem__(self, key):
        if key in (0, 'index'):
            return self.index
        elif key in (1, 'value'):
            return self.get_string_value()
        elif key in (2, 'disposable'):
            return self.disposable
        elif key == 'valid':
            return self.valid
        else:
            raise IndexError(f"Invalid index: {key}")

    def __len__(self):
        return 3
class StringLiteralManager:
    def __init__(self, filepath=None):
        self.filepath = filepath
        self.lookup_table = []
        self.stringliterals = []
        self.original_sizes = []
        self.original_total_size = 0
        if filepath:
            self.extract()
    def extract(self, filepath=None):
        """Extract string literals from a global-metadata.dat file"""
        if filepath:
            self.filepath = filepath
        if not self.filepath:
            raise ValueError("No filepath specified")
        with open(self.filepath, "rb") as f:
            # Validate magic bytes
            if f.read(4) != MAGIC_BYTES:
                raise ValueError("Invalid global-metadata file")
            # Get offsets and sizes
            f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
            lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
            f.seek(LOOKUP_TABLE_SIZE_DEFINITION_OFFSET)
            lookup_table_size = int.from_bytes(f.read(4), byteorder="little")
            f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
            stringliteral_data_offset = int.from_bytes(f.read(4), byteorder="little")
            # Extract lookup table
            f.seek(lookup_table_offset)
            bytes_read = 0
            while bytes_read < lookup_table_size:
                length = int.from_bytes(f.read(4), byteorder="little")
                index = int.from_bytes(f.read(4), byteorder="little")
                self.lookup_table.append(LookupTableEntry(length, index))
                bytes_read += 8
            # Extract string literals
            for idx, entry in enumerate(self.lookup_table):
                f.seek(stringliteral_data_offset + entry.index)
                literal_data = f.read(entry.length)
                if IGNORE_NON_UTF8:
                    self.stringliterals.append(StringLiteral(idx, literal_data))
                else:
                    try:
                        # U+3000 is the ideographic space, common in CJK text
                        allowed_chars = {'\n', '\t', '\r', '\u3000'}
                        if any(not c.isprintable() and c not in allowed_chars for c in literal_data.decode("utf-8")):
                            raise ValueError("non-printable characters")
                        self.stringliterals.append(StringLiteral(idx, literal_data))
                    except Exception:
                        #print(f"Invalid text in literal {literal_data} @ {idx}")
                        self.stringliterals.append(StringLiteral(idx, literal_data, valid_utf8=False))
        # Save original string sizes for balancing
        self.original_sizes = [len(s.data) for s in self.stringliterals]
        self.original_total_size = sum(self.original_sizes)
        return self
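    # Note: extract() also runs automatically when a filepath is passed
    # to the StringLiteralManager constructor.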
    def dump_csv(self, output_path):
        """Export string literals to CSV file"""
        # CSV columns: original text, translation (empty), index, disposable flag
        array_form = []
        for sl in self.stringliterals:
            if not sl.valid:
                print("Invalid literal:", sl.data)
                continue
            array_form.append([sl.get_string_value(), '', sl.index, ''])
        write_csv_list(output_path, array_form)
        return output_path

    def dump_json(self, output_path):
        """Export string literals to JSON file"""
        with open(output_path, "w", encoding="utf-8") as f:
            string_data = [sl.to_dict() for sl in self.stringliterals if sl.valid]
            f.write(json.dumps(string_data, indent=2, ensure_ascii=False))
        return output_path
    def load_modified_strings(self, filepath):
        """Load modified strings from a CSV or JSON file"""
        # Track which indices were updated
        updated_indices = set()
        if filepath.endswith('.json'):
            with open(filepath, "r", encoding="utf-8") as f:
                data = json.load(f)
                for entry in data:
                    idx = entry['index']
                    if 0 <= idx < len(self.stringliterals):
                        value = entry['value']
                        disposable = entry.get('disposable', False)
                        self.stringliterals[idx].data = value.encode("utf-8")
                        self.stringliterals[idx].disposable = disposable
                        updated_indices.add(idx)
                    else:
                        print(f"Warning: Index {idx} out of range, skipping")
        elif filepath.endswith('.csv'):
            reader = read_csv_list(filepath)
            for row in reader:
                if len(row) < 3:
                    continue
                try:
                    idx = int(row[2])
                    if 0 <= idx < len(self.stringliterals):
                        new_value = row[1]
                        disposable = len(row) >= 4 and row[3] == '1'
                        self.stringliterals[idx].disposable = disposable
                        # Skip rows whose original text is commented out with //,
                        # unless the replacement is commented out too
                        if row[0].startswith("//") and not new_value.startswith("//"):
                            continue
                        if new_value:
                            self.stringliterals[idx].data = new_value.encode("utf-8")
                            updated_indices.add(idx)
                    else:
                        print(f"Warning: Index {idx} out of range, skipping")
                except (ValueError, KeyError) as e:
                    print(f"Warning: Invalid row in CSV, skipping: {e}")
        else:
            raise ValueError("Unsupported file format. Use .json or .csv")
        # Report modification stats
        print(f"Modified {len(updated_indices)} strings out of {len(self.stringliterals)} total")
        print(f"Kept {len(self.stringliterals) - len(updated_indices)} original strings")
        return updated_indices
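    # Example CSV row (hypothetical values, columns as written by dump_csv):
    #   "Hello","Bonjour",42,1
    # A "1" in the 4th column marks the string as disposable, i.e. safe to
    # trim or pad when balancing the table size.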
    def balance_string_sizes(self):
        """Balance string sizes to match the original total size"""
        # Total size before modifications
        original_size = self.original_total_size
        # Total size after modifications
        new_size = sum(len(s.data) for s in self.stringliterals)
        # If the new size is larger, trim disposable strings
        if new_size > original_size:
            excess_bytes = new_size - original_size
            print(f"New strings exceed original size by {excess_bytes} bytes")
            disposable_strings = []
            for idx, string_literal in enumerate(self.stringliterals):
                if string_literal.disposable:
                    disposable_strings.append((idx, len(string_literal.data)))
            # Sort disposable strings by length (descending) to trim larger strings first
            disposable_strings.sort(key=lambda x: x[1], reverse=True)
            bytes_trimmed = 0
            for idx, size in disposable_strings:
                if bytes_trimmed >= excess_bytes:
                    break
                string_literal = self.stringliterals[idx]
                max_trim = min(size - 1, excess_bytes - bytes_trimmed)  # Keep at least 1 byte
                if max_trim > 0:
                    new_length = size - max_trim
                    # Ensure we cut at valid UTF-8 boundaries
                    while new_length > 0:
                        try:
                            trimmed = string_literal.data[:new_length].decode("utf-8")
                            string_literal.data = trimmed.encode("utf-8")
                            break
                        except UnicodeDecodeError:
                            new_length -= 1
                    if new_length > 0:
                        bytes_trimmed += size - len(string_literal.data)
                        print(f"Trimmed string {idx} from {size} to {len(string_literal.data)} bytes")
            if bytes_trimmed < excess_bytes:
                print(f"Warning: Could only trim {bytes_trimmed} of {excess_bytes} excess bytes")
                print("The resulting file may be larger than the original")
        elif new_size < original_size:
            deficit_bytes = original_size - new_size
            print(f"New strings are {deficit_bytes} bytes smaller than the original")
            disposable_indices = [
                idx for idx, string_literal in enumerate(self.stringliterals)
                if string_literal.disposable
            ]
            if disposable_indices:
                # Pad the last disposable string with spaces to make up the deficit
                string_literal = self.stringliterals[disposable_indices[-1]]
                string_value = string_literal.get_string_value()
                string_literal.data = (string_value + ' ' * deficit_bytes).encode("utf-8")
                actual_padding = len(string_literal.data) - len(string_value.encode("utf-8"))
                print(f"Added a total of {actual_padding} bytes of padding")
            else:
                print("No disposable strings found for padding")
        return self
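    # Keeping the total string-table size unchanged lets patch() overwrite
    # the data in place; when the new strings do not fit, patch() falls back
    # to appending them at the end of the file and repointing the header
    # offset instead.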
    def update_string_offsets(self):
        """Update string lengths and offsets in the lookup table"""
        for idx, string_literal in enumerate(self.stringliterals):
            self.lookup_table[idx].length = len(string_literal.data)
        # Recalculate indices so the strings are packed back to back
        index = 0
        for entry in self.lookup_table:
            entry.index = index
            index += entry.length
        return self
    def patch(self, output_filepath):
        """Create a patched metadata file with modified strings"""
        # Work on a copy of the original file
        shutil.copy2(self.filepath, output_filepath)
        # Total size after modifications
        new_total_size = sum(len(s.data) for s in self.stringliterals)
        in_place_replacement = new_total_size <= self.original_total_size
        with open(output_filepath, "rb+") as f:
            f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
            stringliteral_data_offset = int.from_bytes(f.read(4), byteorder="little")
            if in_place_replacement:
                print("Performing in-place string replacement")
                f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
                lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
                f.seek(lookup_table_offset)
                for entry in self.lookup_table:
                    f.write(entry.length.to_bytes(4, byteorder="little"))
                    f.write(entry.index.to_bytes(4, byteorder="little"))
                for string_literal in self.stringliterals:
                    entry = self.lookup_table[string_literal.index]
                    f.seek(stringliteral_data_offset + entry.index)
                    f.write(string_literal.data)
            else:
                print("Appending strings to end of file (balancing was insufficient)")
                f.seek(0, os.SEEK_END)
                new_strings_offset = f.tell()
                for entry in self.stringliterals:
                    f.write(entry.data)
                f.seek(LOOKUP_TABLE_DEFINITION_OFFSET)
                lookup_table_offset = int.from_bytes(f.read(4), byteorder="little")
                f.seek(lookup_table_offset)
                for entry in self.lookup_table:
                    f.write(entry.length.to_bytes(4, byteorder="little"))
                    f.write(entry.index.to_bytes(4, byteorder="little"))
                # Update the string data offset in the header
                f.seek(STRINGLITERAL_DATA_DEFINITION_OFFSET)
                f.write(new_strings_offset.to_bytes(4, byteorder="little"))
        return output_filepath
def main():
    parser = argparse.ArgumentParser(
        description='Extract and reinsert strings in Unity IL2CPP global-metadata.dat'
    )
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')

    extract_parser = subparsers.add_parser('extract', help='Extract strings from global-metadata.dat')
    extract_parser.add_argument('input', nargs='?', default='global-metadata.dat', help='Path to global-metadata.dat file')
    extract_parser.add_argument('output', nargs='?', default='global-metadata_strings.csv', help='Path to output CSV or JSON file')

    reinsert_parser = subparsers.add_parser('reinsert', help='Reinsert strings into global-metadata.dat')
    reinsert_parser.add_argument('input_metadata', nargs='?', default='global-metadata.dat', help='Path to original global-metadata.dat file')
    reinsert_parser.add_argument('input_strings', nargs='?', default='global-metadata_strings.csv', help='Path to CSV or JSON file with modified strings')
    reinsert_parser.add_argument('output_metadata', nargs='?', default='global-metadata.dat_patched', help='Path to output modified global-metadata.dat file')

    args = parser.parse_args()
    if args.command == 'extract':
        manager = StringLiteralManager(args.input)
        if args.output.endswith('.json'):
            manager.dump_json(args.output)
        else:
            manager.dump_csv(args.output)
        print(f"Successfully extracted {len(manager.stringliterals)} strings to {args.output}")
    elif args.command == 'reinsert':
        manager = StringLiteralManager(args.input_metadata)
        manager.load_modified_strings(args.input_strings)
        manager.balance_string_sizes()
        manager.update_string_offsets()
        manager.patch(args.output_metadata)
        print(f"Successfully patched metadata file: {args.output_metadata}")
    else:
        parser.print_help()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
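# Example programmatic usage, mirroring the 'reinsert' command above
# (a minimal sketch; the file paths are placeholders):
#
#   manager = StringLiteralManager("global-metadata.dat")  # extracts on load
#   manager.dump_csv("global-metadata_strings.csv")        # edit this file;
#                                                          # put 1 in the 4th
#                                                          # column to mark a
#                                                          # row disposable
#   manager.load_modified_strings("global-metadata_strings.csv")
#   manager.balance_string_sizes()
#   manager.update_string_offsets()
#   manager.patch("global-metadata.dat_patched")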