robbiemu · June 6, 2026 18:12
diff --git a/make_synthetic_coreml_amp.py b/make_synthetic_coreml_amp.py
 #!/usr/bin/env python3
 from pathlib import Path

 import torch
 import torch.nn as nn
 import coremltools as ct

 # Directory that receives the exported .mlpackage; created eagerly when the
 # module is loaded so main() can save without further checks.
 OUT = Path("./models/synthetic")
 OUT.mkdir(parents=True, exist_ok=True)


 class AmplifiedConvTie(nn.Module):
    """Small conv classifier whose two logits are deliberately close to a tie.

    A five-layer Conv2d/SiLU stem produces 16 channels; the activations are
    multiplied by ``gain``, globally average-pooled and fed to a fixed linear
    head in which logit 0 reads only channel 0 and logit 1 reads only
    channel 1.

    Args:
        gain: Factor applied to the stem output before pooling. Defaults to
            300.0, the value originally hard-coded in ``forward``.
        class0_bias: Bias added to logit 0. Defaults to 14.6, the hand-tuned
            value that centres the decision boundary (see comment below).
    """

    def __init__(self, gain=300.0, class0_bias=14.6):
        super().__init__()

        # Plain Python float: it is baked into the traced graph as a constant
        # and does not appear in the state dict.
        self.gain = float(gain)

        self.stem = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 16, 1),
            nn.SiLU(),
        )

        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.head = nn.Linear(16, 2, bias=True)

        # Make the decision boundary intentionally fragile.
        #
        # Class 0 depends on channel 0.
        # Class 1 depends on channel 1.
        #
        # This makes CPU/GPU/ANE backend drift capable of moving the two logits
        # differently, instead of shifting both logits together.
        with torch.no_grad():
            self.head.weight.zero_()
            self.head.bias.zero_()

            self.head.weight[0, 0] = 1.0
            self.head.weight[1, 1] = 1.0

            # Center the decision boundary near the observed CPU/ANE split.
            #
            # Previous CPU margin was about -15.06 and ANE moved the margin
            # by about +1.16, so +14.6 should make CPU slightly prefer class 1
            # while ANE has a shot at flipping to class 0.
            self.head.bias[0] = float(class0_bias)
            self.head.bias[1] = 0.0

    def forward(self, x):
        """Return the two logits, shape (batch, 2), for an image batch ``x``."""
        y = self.stem(x)

        # Amplify backend numerical differences before pooling/head.
        y = y * self.gain

        y = self.pool(y)
        y = y.flatten(1)
        y = self.head(y)
        return y


 def main():
    """Trace the seeded model and export it as an FP16 ML Program package."""
    # Seed first so both the random conv weights and the trace input are
    # reproducible from run to run.
    torch.manual_seed(1234)

    net = AmplifiedConvTie()
    net.eval()
    sample = torch.randn(1, 3, 224, 224)

    scripted = torch.jit.trace(net, sample)

    input_spec = ct.TensorType(name="x", shape=sample.shape)
    output_spec = ct.TensorType(name="y")
    converted = ct.convert(
        scripted,
        inputs=[input_spec],
        outputs=[output_spec],
        convert_to="mlprogram",
        minimum_deployment_target=ct.target.macOS14,
        compute_precision=ct.precision.FLOAT16,
    )

    destination = OUT / "amplified_conv_tie_fp16.mlpackage"
    converted.save(destination)
    print("saved:", destination)


 if __name__ == "__main__":
    main()
diff --git a/make_synthetic_coreml_amp_w8a8.py b/make_synthetic_coreml_amp_w8a8.py
 #!/usr/bin/env python3
 from pathlib import Path

 import coremltools as ct
 import coremltools.optimize.coreml as cto
 import numpy as np
 import torch
 import torch.nn as nn

 # Directory that receives the quantized .mlpackage; created eagerly when the
 # module is loaded.
 OUT = Path("./models/synthetic")
 OUT.mkdir(parents=True, exist_ok=True)


 class AmplifiedConvTie(nn.Module):
    """Conv/SiLU stem, x300 amplification, global pooling and a fixed 2-way head.

    The head is overwritten after construction so that logit 0 reads only
    pooled channel 0 (plus a bias of 14.6) and logit 1 reads only pooled
    channel 1.
    """

    def __init__(self):
        super().__init__()

        # Layers are created in the same order as a literal listing, so the
        # Sequential indices (and therefore the state-dict keys) are 0..9.
        layers = [nn.Conv2d(3, 64, 3, padding=1), nn.SiLU()]
        for _ in range(3):
            layers += [nn.Conv2d(64, 64, 3, padding=1), nn.SiLU()]
        layers += [nn.Conv2d(64, 16, 1), nn.SiLU()]
        self.stem = nn.Sequential(*layers)

        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.head = nn.Linear(16, 2, bias=True)

        with torch.no_grad():
            # eye(2, 16) is all zeros except [0, 0] and [1, 1].
            self.head.weight.copy_(torch.eye(2, 16))
            self.head.bias.copy_(torch.tensor([14.6, 0.0]))

    def forward(self, x):
        """Return the two logits, shape (batch, 2), for an image batch ``x``."""
        amplified = self.stem(x) * 300.0
        pooled = self.pool(amplified).flatten(1)
        return self.head(pooled)


 def calibration_data(count=8):
    """Return ``count`` reproducible random feeds for activation calibration.

    Each feed is a dict mapping the input name ``"x"`` to a float32 array of
    shape (1, 3, 224, 224) drawn from a standard normal with a fixed seed.
    """
    rng = np.random.default_rng(10_000)
    shape = (1, 3, 224, 224)
    return [
        {"x": rng.standard_normal(shape).astype(np.float32)}
        for _ in range(count)
    ]


 def main():
    """Export the seeded model as FP16, then quantize activations and weights to int8."""
    torch.manual_seed(1234)

    net = AmplifiedConvTie()
    net.eval()
    sample = torch.randn(1, 3, 224, 224)
    scripted = torch.jit.trace(net, sample)

    input_spec = ct.TensorType(name="x", shape=sample.shape)
    output_spec = ct.TensorType(name="y")
    fp16 = ct.convert(
        scripted,
        inputs=[input_spec],
        outputs=[output_spec],
        convert_to="mlprogram",
        minimum_deployment_target=ct.target.macOS14,
        compute_precision=ct.precision.FLOAT16,
    )

    # Step 1: activation quantization, calibrated on the seeded random samples.
    activation_quantizer = cto.OpLinearQuantizerConfig(mode="linear_symmetric", dtype=np.int8)
    activation_config = cto.OptimizationConfig(global_config=activation_quantizer)
    a8 = cto.linear_quantize_activations(
        fp16,
        activation_config,
        calibration_data(),
        calibration_op_group_size=8,
    )

    # Step 2: per-channel weight quantization applied to the activation-quantized model.
    weight_quantizer = cto.OpLinearQuantizerConfig(
        mode="linear_symmetric",
        dtype=np.int8,
        granularity="per_channel",
        weight_threshold=0,
    )
    weight_config = cto.OptimizationConfig(global_config=weight_quantizer)
    w8a8 = cto.linear_quantize_weights(a8, weight_config)

    destination = OUT / "amplified_conv_tie_w8a8.mlpackage"
    w8a8.save(destination)
    print("saved:", destination)


 if __name__ == "__main__":
    main()
diff --git a/probe_2_plan.py b/probe_2_plan.py
 #!/usr/bin/env python3
 import argparse
 import shutil
 from pathlib import Path

 import coremltools as ct

 DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_fp16.mlpackage"

 # The model path is an optional positional argument.
 parser = argparse.ArgumentParser()
 parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
 args = parser.parse_args()

 MODEL = Path(args.model).resolve()

 OUT = Path("./compiled_models").resolve()
 OUT.mkdir(exist_ok=True)

 # Swap whatever suffix the source model has for ".mlmodelc". A plain string
 # replace of ".mlpackage" left other inputs (e.g. "foo.mlmodel", which the
 # neuralnetwork branch below is meant to handle) with their original suffix.
 STABLE_NAME = Path(MODEL.name).with_suffix(".mlmodelc").name
 STABLE_COMPILED = OUT / STABLE_NAME

 print("coremltools:", ct.__version__)
 print("model:", MODEL)

 if not MODEL.exists():
    raise FileNotFoundError(f"Model not found: {MODEL}")

 # Keep this object alive while copying the compiled temporary model.
 mlmodel = ct.models.MLModel(str(MODEL), compute_units=ct.ComputeUnit.ALL)

 tmp_compiled = Path(mlmodel.get_compiled_model_path())

 # copytree refuses to write into an existing directory, so drop any stale copy.
 if STABLE_COMPILED.exists():
    shutil.rmtree(STABLE_COMPILED)

 shutil.copytree(tmp_compiled, STABLE_COMPILED)

 print("tmp compiled:", tmp_compiled)
 print("stable compiled:", STABLE_COMPILED)
 print("stable exists:", STABLE_COMPILED.exists())

 plan = ct.models.compute_plan.MLComputePlan.load_from_path(
    path=str(STABLE_COMPILED),
    compute_units=ct.ComputeUnit.ALL,
 )

 print("\ncompute plan loaded")
 print("plan type:", type(plan))


 def op_type(op):
    """Return a label for ``op``: its ``operator_name``, else ``type``, else its class name."""
    for attribute in ("operator_name", "type"):
        label = getattr(op, attribute, None)
        if label:
            return label
    return type(op).__name__


 def device_name(device):
    """Return the class name of ``device``, or the string "None" when it is None."""
    return "None" if device is None else type(device).__name__


 def usage_summary(usage):
    """Return ``(preferred_device_name, [supported_device_names])`` for a usage record.

    A missing record (``None``) is reported as ``("None", [])``.
    """
    if usage is None:
        return "None", []

    supported_names = list(map(device_name, usage.supported_compute_devices))
    return device_name(usage.preferred_compute_device), supported_names


 def print_counts(title, counts):
    """Print a titled table of ``counts``, highest count first, ties ordered by str(key)."""
    print(f"\n== {title} ==")
    ordered = sorted(counts.items(), key=lambda item: (-item[1], str(item[0])))
    for label, total in ordered:
        print(total, label)


 def record_usage(
    typ,
    preferred,
    supported,
    preferred_counts,
    supported_counts,
    op_counts,
    op_preferred_counts,
    ne_hits,
    hit_payload,
 ):
    """Fold one op/layer's device usage into the running tallies (mutated in place).

    ``preferred_counts``, ``op_counts`` and ``op_preferred_counts`` are bumped
    for the preferred device, the op type and the (type, preferred) pair;
    ``supported_counts`` is bumped once per supported device. ``hit_payload``
    is appended to ``ne_hits`` when "NeuralEngine" occurs in the preferred
    name or in any supported name.
    """
    tallies = (
        (preferred_counts, preferred),
        (op_counts, typ),
        (op_preferred_counts, (typ, preferred)),
    )
    for table, key in tallies:
        table[key] = table.get(key, 0) + 1

    for device in supported:
        supported_counts[device] = supported_counts.get(device, 0) + 1

    touches_ne = "NeuralEngine" in preferred or any(
        "NeuralEngine" in device for device in supported
    )
    if touches_ne:
        ne_hits.append(hit_payload)


 # Report section: walk whichever structure the compute plan exposes (ML Program
 # or neural network), tally device usage per op/layer, and print the summaries.
 if plan.model_structure.program is not None:
    print("\nmodel kind: mlprogram")

    program = plan.model_structure.program
    print("functions:", list(program.functions.keys()))

    # Running tallies, filled in place by record_usage().
    total_ops = 0
    preferred_counts = {}
    supported_counts = {}
    op_counts = {}
    op_preferred_counts = {}
    ne_ops = []

    for fname, fn in program.functions.items():
        # Only the operations listed directly on the function's block are visited.
        ops = fn.block.operations
        print(f"\nfunction: {fname}")
        print("ops:", len(ops))

        for i, op in enumerate(ops):
            total_ops += 1

            typ = op_type(op)
            usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
            preferred, supported = usage_summary(usage)

            record_usage(
                typ,
                preferred,
                supported,
                preferred_counts,
                supported_counts,
                op_counts,
                op_preferred_counts,
                ne_ops,
                (fname, i, typ, preferred, supported),
            )

            # Per-op detail is capped at the first 80 ops of each function;
            # the tallies above still cover every op.
            if i < 80:
                print(f"\n[{i}]")
                print("op:", typ)
                print("name:", getattr(op, "name", None))
                print("preferred:", preferred)
                print("supported:", supported)

    print("\nTOTAL OPS:", total_ops)

    print_counts("PREFERRED DEVICE COUNTS", preferred_counts)
    print_counts("SUPPORTED DEVICE COUNTS", supported_counts)
    print_counts("OP TYPE COUNTS", op_counts)

    # Same ordering as print_counts: highest count first, ties by str(key).
    print("\n== OP x PREFERRED DEVICE ==")
    for (typ, pref), v in sorted(
        op_preferred_counts.items(),
        key=lambda kv: (-kv[1], str(kv[0])),
    ):
        print(v, typ, pref)

    # Ops whose preferred or supported device names mention "NeuralEngine";
    # at most the first 100 are listed.
    print("\n== NEURAL ENGINE OPS ==")
    if not ne_ops:
        print("NONE")
    else:
        print("count:", len(ne_ops))
        for fname, i, typ, preferred, supported in ne_ops[:100]:
            print(
                f"{fname}[{i}] op={typ} "
                f"preferred={preferred} supported={supported}"
            )

 elif plan.model_structure.neuralnetwork is not None:
    print("\nmodel kind: neuralnetwork")

    nn = plan.model_structure.neuralnetwork
    layers = nn.layers

    # Running tallies, filled in place by record_usage().
    preferred_counts = {}
    supported_counts = {}
    layer_type_counts = {}
    layer_preferred_counts = {}
    ne_layers = []

    print("layers:", len(layers))

    for i, layer in enumerate(layers):
        usage = plan.get_compute_device_usage_for_neuralnetwork_layer(layer)
        preferred, supported = usage_summary(usage)

        # Fall back to the Python class name when the layer has no usable "type".
        typ = getattr(layer, "type", None) or type(layer).__name__

        record_usage(
            typ,
            preferred,
            supported,
            preferred_counts,
            supported_counts,
            layer_type_counts,
            layer_preferred_counts,
            ne_layers,
            (i, layer.name, typ, preferred, supported),
        )

        # Per-layer detail is capped at the first 80 layers.
        if i < 80:
            print(f"\n[{i}] {layer.name}")
            print("type:", typ)
            print("preferred:", preferred)
            print("supported:", supported)

    print_counts("PREFERRED DEVICE COUNTS", preferred_counts)
    print_counts("SUPPORTED DEVICE COUNTS", supported_counts)
    print_counts("LAYER TYPE COUNTS", layer_type_counts)

    print("\n== LAYER x PREFERRED DEVICE ==")
    for (typ, pref), v in sorted(
        layer_preferred_counts.items(),
        key=lambda kv: (-kv[1], str(kv[0])),
    ):
        print(v, typ, pref)

    # Layers whose preferred or supported device names mention "NeuralEngine";
    # at most the first 100 are listed.
    print("\n== NEURAL ENGINE LAYERS ==")
    if not ne_layers:
        print("NONE")
    else:
        print("count:", len(ne_layers))
        for i, name, typ, preferred, supported in ne_layers[:100]:
            print(
                f"[{i}] {name} type={typ} "
                f"preferred={preferred} supported={supported}"
            )

 else:
    # Neither structure is populated on the plan.
    print("\nunknown model structure")
diff --git a/probe_3_drift.py b/probe_3_drift.py
 #!/usr/bin/env python3
 import argparse

 import numpy as np
 import coremltools as ct

 # Defaults, overridable from the command line below.
 DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_fp16.mlpackage"
 RUNS = 100

 parser = argparse.ArgumentParser()
 parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
 parser.add_argument("--runs", type=int, default=RUNS)
 args = parser.parse_args()

 # Module-level settings read by load() and probe().
 MODEL = args.model
 RUNS = args.runs

 # One fixed, seeded input shared by every prediction (read by run()), so any
 # output difference comes from the backend rather than from the data.
 rng = np.random.default_rng(1234)
 x = {
    "x": rng.standard_normal((1, 3, 224, 224)).astype(np.float32)
 }


 def load(cu):
    """Load the model at ``MODEL`` restricted to the compute units ``cu``."""
    mlmodel = ct.models.MLModel(MODEL, compute_units=cu)
    return mlmodel


 def run(model):
    """Predict once on the shared input ``x`` and return output "y" as an ndarray."""
    outputs = model.predict(x)
    logits = outputs["y"]
    return np.asarray(logits)


 def margin_01(arr):
    """Return element 0 minus element 1 of the flattened array, as a Python float."""
    logits = arr.reshape(-1)
    gap = logits[0] - logits[1]
    return float(gap)


 def print_logits(prefix, arr):
    """Print the flattened logits, the 0-minus-1 margin and the argmax, each tagged with ``prefix``."""
    flat_logits = arr.reshape(-1)
    print(f"{prefix}_logits:", flat_logits)

    margin = margin_01(arr)
    print(f"{prefix}_margin_0_minus_1:", margin)

    winner = int(arr.argmax())
    print(f"{prefix}_argmax:", winner)


 def probe(name, cu):
    """Check run-to-run stability of one backend and return its reference output.

    The model is loaded for compute units ``cu`` and predicted once to get a
    reference; it is then re-run up to ``RUNS`` times on the same input. The
    loop stops early at the first run whose argmax differs from the
    reference. A summary labelled ``name`` is printed.

    Returns:
        The reference output array from the first prediction.
    """
    model = load(cu)

    ref = run(model)
    ref_argmax = int(ref.argmax())

    raw_drift = False
    argmax_drift = False
    worst = 0.0
    # Tracked separately: previously an argmax flip overwrote the run index of
    # an earlier raw (bitwise) difference, so "first_drift_run" could report
    # a later run than the one where drift actually began.
    first = None
    first_argmax = None

    for i in range(1, RUNS + 1):
        cur = run(model)
        max_abs = float(np.max(np.abs(cur - ref)))
        worst = max(worst, max_abs)

        if not np.array_equal(cur, ref):
            raw_drift = True
            if first is None:
                first = i

        if int(cur.argmax()) != ref_argmax:
            argmax_drift = True
            first_argmax = i
            break

    print(f"\n== {name} ==")
    print("raw_drift_seen:", raw_drift)
    print("argmax_drift_seen:", argmax_drift)
    print("first_drift_run:", first)
    print("first_argmax_drift_run:", first_argmax)
    print("worst_abs_diff:", worst)
    print("shape:", ref.shape)
    print_logits("ref", ref)

    return ref


 # Reference output per backend; a backend that fails to load or run is
 # reported and simply left out.
 results = {}

 backends = (
    ("CPU_ONLY", ct.ComputeUnit.CPU_ONLY),
    ("CPU_AND_GPU", ct.ComputeUnit.CPU_AND_GPU),
    ("CPU_AND_NE", ct.ComputeUnit.CPU_AND_NE),
    ("ALL", ct.ComputeUnit.ALL),
 )

 for name, cu in backends:
    try:
        reference = probe(name, cu)
    except Exception as e:
        print(f"\n== {name} FAILED ==")
        print(type(e).__name__, e)
    else:
        results[name] = reference


 base = results.get("CPU_ONLY")

 # Cross-backend comparison is only possible when the CPU baseline succeeded.
 if base is not None:
    print("\n== CROSS-BACKEND VS CPU_ONLY ==")
    base_argmax = int(base.argmax())

    print_logits("cpu", base)

    for name, arr in results.items():
        if name != "CPU_ONLY":
            print(f"\n{name}")
            print("array_equal:", np.array_equal(arr, base))
            print("max_abs_diff:", float(np.max(np.abs(arr - base))))
            print("argmax_equal:", int(arr.argmax()) == base_argmax)
            print_logits("cur", arr)
            print("delta_vs_cpu:", arr.reshape(-1) - base.reshape(-1))
            print("margin_delta_vs_cpu:", margin_01(arr) - margin_01(base))
diff --git a/probe_4_quant_evidence.py b/probe_4_quant_evidence.py
 #!/usr/bin/env python3
 import argparse
 from collections import Counter, defaultdict
 from pathlib import Path

 import coremltools as ct

 DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_w8a8.mlpackage"
 # Lower-case substrings; an op is listed as a quantization-related hit when any
 # of them occurs in its type, name, output names or output dtype names.
 KEYWORDS = ("quant", "dequant", "constexpr", "affine", "cast", "int8", "lut")
 # Op types counted as "real" compute when checking what consumes
 # quantized / dequantized values.
 REAL_OPS = {"conv", "linear", "matmul", "add", "mul", "silu", "relu", "gelu", "reduce_mean", "reshape"}


 def dtype_name(tensor_type):
    """Return the enum name of ``tensor_type.dataType``, or the raw number as a string if unknown."""
    code = tensor_type.dataType
    enum_descriptor = tensor_type.DESCRIPTOR.fields_by_name["dataType"].enum_type
    entry = enum_descriptor.values_by_number.get(code)
    if entry is None:
        return str(code)
    return entry.name


 def op_name(op):
    """Return a display name for ``op``.

    The name is the first quoted string following ``values: "`` in the text
    form of the op's "name" attribute; failing that, the name of the first
    output; failing that, the empty string.
    """
    marker = 'values: "'
    attr = op.attributes.get("name") if hasattr(op, "attributes") else None
    if attr is not None:
        _, found, tail = str(attr).partition(marker)
        if found:
            return tail.partition('"')[0]

    outputs = op.outputs
    return outputs[0].name if outputs else ""


 def output_dtypes(op):
    """Return the dtype name of every output of ``op`` that has a ``tensorType`` set."""
    return [
        dtype_name(out.type.tensorType)
        for out in op.outputs
        if out.type.HasField("tensorType")
    ]


 def input_names(op):
    """Return the non-empty argument names across all inputs of ``op``, in order."""
    return [
        arg.name
        for binding in op.inputs.values()
        for arg in binding.arguments
        if arg.name
    ]


 def iter_mlprogram_ops(spec):
    """Yield ``(function_name, block_name, index, op)`` for every op of an mlProgram spec.

    Yields nothing when the spec's "Type" oneof is not "mlProgram".
    """
    if spec.WhichOneof("Type") == "mlProgram":
        for function_name, function in spec.mlProgram.functions.items():
            for block_name, block in function.block_specializations.items():
                yield from (
                    (function_name, block_name, position, operation)
                    for position, operation in enumerate(block.operations)
                )


 def main():
    """Inspect an ML Program for weight/activation quantization and print the evidence.

    Loads the model given on the command line (default ``DEFAULT_MODEL``),
    walks every op of its mlProgram spec, and prints op-type counts, output
    dtype counts, keyword hits (at most ``--hits`` of them), and a
    classification of the quantization evidence found.

    Raises:
        FileNotFoundError: if the model path does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
    parser.add_argument("--hits", type=int, default=120)
    args = parser.parse_args()

    model_path = Path(args.model).resolve()
    print("coremltools:", ct.__version__)
    print("model:", model_path)
    if not model_path.exists():
        raise FileNotFoundError(model_path)

    mlmodel = ct.models.MLModel(str(model_path), compute_units=ct.ComputeUnit.CPU_ONLY)
    spec = mlmodel.get_spec()
    kind = spec.WhichOneof("Type")
    print("model kind:", kind)

    if kind != "mlProgram":
        print("Unsupported for this probe: expected mlProgram")
        return

    op_counts = Counter()
    dtype_counts = Counter()
    hits = []
    quant_outputs = set()
    # Outputs of every dequantizing op, weight constexpr ops included.
    dequant_outputs = set()
    # Outputs of runtime "dequantize" ops only (the activation path).
    activation_dequant_outputs = set()
    real_ops_consuming_dequant = []
    real_ops_consuming_quant = []
    real_ops_consuming_activation_dequant = 0

    # First pass: count ops/dtypes, collect keyword hits and the names of
    # tensors produced by quantize / dequantize ops.
    ops = list(iter_mlprogram_ops(spec))
    for fname, bname, index, op in ops:
        typ = op.type
        name = op_name(op)
        dtypes = output_dtypes(op)
        out_names = [out.name for out in op.outputs]
        op_counts[typ] += 1
        dtype_counts.update(dtypes)

        haystack = " ".join([typ, name, *out_names, *dtypes]).lower()
        if any(k in haystack for k in KEYWORDS):
            hits.append((fname, bname, index, typ, name, dtypes, out_names))

        if typ == "quantize":
            quant_outputs.update(out_names)
        if typ == "dequantize":
            activation_dequant_outputs.update(out_names)
        if typ in {"dequantize", "constexpr_affine_dequantize", "constexpr_blockwise_shift_scale"}:
            dequant_outputs.update(out_names)

    # Second pass: find "real" ops fed by those tensors.
    for fname, bname, index, op in ops:
        if op.type not in REAL_OPS:
            continue
        consumed = input_names(op)
        if any(name in dequant_outputs for name in consumed):
            real_ops_consuming_dequant.append((fname, bname, index, op.type, op_name(op), consumed))
        if any(name in activation_dequant_outputs for name in consumed):
            real_ops_consuming_activation_dequant += 1
        if any(name in quant_outputs for name in consumed):
            real_ops_consuming_quant.append((fname, bname, index, op.type, op_name(op), consumed))

    print("\n== OP TYPE COUNTS ==")
    for typ, count in op_counts.most_common():
        print(count, typ)

    print("\n== OUTPUT DTYPE COUNTS ==")
    for typ, count in dtype_counts.most_common():
        print(count, typ)

    print("\n== QUANTIZATION-RELATED HITS ==")
    if not hits:
        print("NONE")
    else:
        print("count:", len(hits))
        for fname, bname, index, typ, name, dtypes, outputs in hits[: args.hits]:
            print(f"{fname}/{bname}[{index}] op={typ} name={name} dtypes={dtypes} outputs={outputs}")

    weight_evidence = op_counts["constexpr_affine_dequantize"] + op_counts["constexpr_blockwise_shift_scale"]
    activation_quant_evidence = op_counts["quantize"] + op_counts["dequantize"]

    print("\n== CLASSIFICATION ==")
    print("weight_quantization_evidence:", "yes" if weight_evidence else "no")
    print("weight_quantization_ops:", weight_evidence)
    print("activation_quantization_evidence:", "yes" if activation_quant_evidence else "no")
    print("activation_quantize_ops:", op_counts["quantize"])
    print("activation_dequantize_ops:", op_counts["dequantize"])
    print("real_ops_consuming_dequantized_values:", len(real_ops_consuming_dequant))
    print("real_ops_consuming_activation_dequantized_values:", real_ops_consuming_activation_dequant)
    print("real_ops_consuming_quantized_values:", len(real_ops_consuming_quant))

    # The first branch must look at activation dequantize outputs only: a real
    # op fed by a weight constexpr_* op says nothing about where the
    # activation quantize/dequantize pairs sit.
    if activation_quant_evidence and real_ops_consuming_activation_dequant:
        print("activation_quantization_interpretation: quantize/dequantize pairs are present before real ops")
    elif activation_quant_evidence:
        print("activation_quantization_interpretation: quantize/dequantize ops are present, but no real-op consumers were detected")
    elif weight_evidence:
        print("activation_quantization_interpretation: weight compression only; no activation quantize/dequantize ops detected")
    else:
        print("activation_quantization_interpretation: no quantization evidence detected")

    if real_ops_consuming_dequant:
        print("\n== REAL OPS CONSUMING DEQUANTIZED VALUES ==")
        by_type = Counter(entry[3] for entry in real_ops_consuming_dequant)
        for typ, count in sorted(by_type.items()):
            print(count, typ)
        for fname, bname, index, typ, name, consumed in real_ops_consuming_dequant[:40]:
            print(f"{fname}/{bname}[{index}] op={typ} name={name} inputs={consumed}")


 if __name__ == "__main__":
    main()
diff --git a/RESULTS_STAGE3.md b/RESULTS_STAGE3.md
	#!/usr/bin/env python3
	from pathlib import Path

	import torch
	import torch.nn as nn
	import coremltools as ct

	# Directory that receives the exported .mlpackage; created eagerly on load.
	OUT = Path("./models/synthetic")
	OUT.mkdir(parents=True, exist_ok=True)


	class AmplifiedConvTie(nn.Module):
	    """Conv/SiLU stem, x300 amplification, global pooling and a fixed 2-way head."""

	    def __init__(self):
	        super().__init__()

	        self.stem = nn.Sequential(
	            nn.Conv2d(3, 64, 3, padding=1),
	            nn.SiLU(),
	            nn.Conv2d(64, 64, 3, padding=1),
	            nn.SiLU(),
	            nn.Conv2d(64, 64, 3, padding=1),
	            nn.SiLU(),
	            nn.Conv2d(64, 64, 3, padding=1),
	            nn.SiLU(),
	            nn.Conv2d(64, 16, 1),
	            nn.SiLU(),
	        )

	        self.pool = nn.AdaptiveAvgPool2d((1, 1))
	        self.head = nn.Linear(16, 2, bias=True)

	        # Make the decision boundary intentionally fragile.
	        #
	        # Class 0 depends on channel 0.
	        # Class 1 depends on channel 1.
	        #
	        # This makes CPU/GPU/ANE backend drift capable of moving the two logits
	        # differently, instead of shifting both logits together.
	        with torch.no_grad():
	            self.head.weight.zero_()
	            self.head.bias.zero_()

	            self.head.weight[0, 0] = 1.0
	            self.head.weight[1, 1] = 1.0

	            # Center the decision boundary near the observed CPU/ANE split.
	            #
	            # Previous CPU margin was about -15.06 and ANE moved the margin
	            # by about +1.16, so +14.6 should make CPU slightly prefer class 1
	            # while ANE has a shot at flipping to class 0.
	            self.head.bias[0] = 14.6
	            self.head.bias[1] = 0.0

	    def forward(self, x):
	        """Return the two logits, shape (batch, 2), for an image batch ``x``."""
	        y = self.stem(x)

	        # Amplify backend numerical differences before pooling/head.
	        y = y * 300.0

	        y = self.pool(y)
	        y = y.flatten(1)
	        y = self.head(y)
	        return y


	def main():
	    """Trace the seeded model and export it as an FP16 ML Program package."""
	    torch.manual_seed(1234)

	    model = AmplifiedConvTie().eval()
	    example = torch.randn(1, 3, 224, 224)

	    traced = torch.jit.trace(model, example)

	    mlmodel = ct.convert(
	        traced,
	        inputs=[ct.TensorType(name="x", shape=example.shape)],
	        outputs=[ct.TensorType(name="y")],
	        convert_to="mlprogram",
	        minimum_deployment_target=ct.target.macOS14,
	        compute_precision=ct.precision.FLOAT16,
	    )

	    path = OUT / "amplified_conv_tie_fp16.mlpackage"
	    mlmodel.save(path)
	    print("saved:", path)


	if __name__ == "__main__":
	    main()
	#!/usr/bin/env python3
	import argparse
	import shutil
	from pathlib import Path

	import coremltools as ct

	DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_fp16.mlpackage"

	parser = argparse.ArgumentParser()
	parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
	args = parser.parse_args()

	MODEL = Path(args.model).resolve()

	OUT = Path("./compiled_models").resolve()
	OUT.mkdir(exist_ok=True)

	STABLE_NAME = MODEL.name.replace(".mlpackage", ".mlmodelc")
	STABLE_COMPILED = OUT / STABLE_NAME

	print("coremltools:", ct.__version__)
	print("model:", MODEL)

	if not MODEL.exists():
	    raise FileNotFoundError(f"Model not found: {MODEL}")

	# Keep this object alive while copying the compiled temporary model.
	mlmodel = ct.models.MLModel(str(MODEL), compute_units=ct.ComputeUnit.ALL)

	tmp_compiled = Path(mlmodel.get_compiled_model_path())

	# copytree refuses to write into an existing directory, so drop any stale copy.
	if STABLE_COMPILED.exists():
	    shutil.rmtree(STABLE_COMPILED)

	shutil.copytree(tmp_compiled, STABLE_COMPILED)

	print("tmp compiled:", tmp_compiled)
	print("stable compiled:", STABLE_COMPILED)
	print("stable exists:", STABLE_COMPILED.exists())

	plan = ct.models.compute_plan.MLComputePlan.load_from_path(
	    path=str(STABLE_COMPILED),
	    compute_units=ct.ComputeUnit.ALL,
	)

	print("\ncompute plan loaded")
	print("plan type:", type(plan))


	def op_type(op):
	    """Return a label for ``op``: its ``operator_name``, else ``type``, else its class name."""
	    return (
	        getattr(op, "operator_name", None)
	        or getattr(op, "type", None)
	        or type(op).__name__
	    )


	def device_name(device):
	    """Return the class name of ``device``, or the string "None" when it is None."""
	    if device is None:
	        return "None"
	    return type(device).__name__


	def usage_summary(usage):
	    """Return ``(preferred_device_name, [supported_device_names])``; ``("None", [])`` for None."""
	    if usage is None:
	        return "None", []

	    preferred = device_name(usage.preferred_compute_device)
	    supported = [device_name(d) for d in usage.supported_compute_devices]
	    return preferred, supported


	def print_counts(title, counts):
	    """Print a titled table of ``counts``, highest count first, ties ordered by str(key)."""
	    print(f"\n== {title} ==")
	    for k, v in sorted(counts.items(), key=lambda kv: (-kv[1], str(kv[0]))):
	        print(v, k)


	def record_usage(
	    typ,
	    preferred,
	    supported,
	    preferred_counts,
	    supported_counts,
	    op_counts,
	    op_preferred_counts,
	    ne_hits,
	    hit_payload,
	):
	    """Fold one op/layer's device usage into the running tallies (mutated in place)."""
	    preferred_counts[preferred] = preferred_counts.get(preferred, 0) + 1
	    op_counts[typ] = op_counts.get(typ, 0) + 1
	    op_preferred_counts[(typ, preferred)] = (
	        op_preferred_counts.get((typ, preferred), 0) + 1
	    )

	    for dev in supported:
	        supported_counts[dev] = supported_counts.get(dev, 0) + 1

	    # Record the payload when the Neural Engine is preferred or merely supported.
	    if "NeuralEngine" in preferred or any("NeuralEngine" in d for d in supported):
	        ne_hits.append(hit_payload)


	# Report section: walk whichever structure the compute plan exposes, tally
	# device usage per op/layer, and print the summaries.
	if plan.model_structure.program is not None:
	    print("\nmodel kind: mlprogram")

	    program = plan.model_structure.program
	    print("functions:", list(program.functions.keys()))

	    total_ops = 0
	    preferred_counts = {}
	    supported_counts = {}
	    op_counts = {}
	    op_preferred_counts = {}
	    ne_ops = []

	    for fname, fn in program.functions.items():
	        ops = fn.block.operations
	        print(f"\nfunction: {fname}")
	        print("ops:", len(ops))

	        for i, op in enumerate(ops):
	            total_ops += 1

	            typ = op_type(op)
	            usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
	            preferred, supported = usage_summary(usage)

	            record_usage(
	                typ,
	                preferred,
	                supported,
	                preferred_counts,
	                supported_counts,
	                op_counts,
	                op_preferred_counts,
	                ne_ops,
	                (fname, i, typ, preferred, supported),
	            )

	            # Per-op detail is capped at the first 80 ops of each function.
	            if i < 80:
	                print(f"\n[{i}]")
	                print("op:", typ)
	                print("name:", getattr(op, "name", None))
	                print("preferred:", preferred)
	                print("supported:", supported)

	    print("\nTOTAL OPS:", total_ops)

	    print_counts("PREFERRED DEVICE COUNTS", preferred_counts)
	    print_counts("SUPPORTED DEVICE COUNTS", supported_counts)
	    print_counts("OP TYPE COUNTS", op_counts)

	    print("\n== OP x PREFERRED DEVICE ==")
	    for (typ, pref), v in sorted(
	        op_preferred_counts.items(),
	        key=lambda kv: (-kv[1], str(kv[0])),
	    ):
	        print(v, typ, pref)

	    print("\n== NEURAL ENGINE OPS ==")
	    if not ne_ops:
	        print("NONE")
	    else:
	        print("count:", len(ne_ops))
	        for fname, i, typ, preferred, supported in ne_ops[:100]:
	            print(
	                f"{fname}[{i}] op={typ} "
	                f"preferred={preferred} supported={supported}"
	            )

	elif plan.model_structure.neuralnetwork is not None:
	    print("\nmodel kind: neuralnetwork")

	    nn = plan.model_structure.neuralnetwork
	    layers = nn.layers

	    preferred_counts = {}
	    supported_counts = {}
	    layer_type_counts = {}
	    layer_preferred_counts = {}
	    ne_layers = []

	    print("layers:", len(layers))

	    for i, layer in enumerate(layers):
	        usage = plan.get_compute_device_usage_for_neuralnetwork_layer(layer)
	        preferred, supported = usage_summary(usage)

	        typ = getattr(layer, "type", None) or type(layer).__name__

	        record_usage(
	            typ,
	            preferred,
	            supported,
	            preferred_counts,
	            supported_counts,
	            layer_type_counts,
	            layer_preferred_counts,
	            ne_layers,
	            (i, layer.name, typ, preferred, supported),
	        )

	        # Per-layer detail is capped at the first 80 layers.
	        if i < 80:
	            print(f"\n[{i}] {layer.name}")
	            print("type:", typ)
	            print("preferred:", preferred)
	            print("supported:", supported)

	    print_counts("PREFERRED DEVICE COUNTS", preferred_counts)
	    print_counts("SUPPORTED DEVICE COUNTS", supported_counts)
	    print_counts("LAYER TYPE COUNTS", layer_type_counts)

	    print("\n== LAYER x PREFERRED DEVICE ==")
	    for (typ, pref), v in sorted(
	        layer_preferred_counts.items(),
	        key=lambda kv: (-kv[1], str(kv[0])),
	    ):
	        print(v, typ, pref)

	    print("\n== NEURAL ENGINE LAYERS ==")
	    if not ne_layers:
	        print("NONE")
	    else:
	        print("count:", len(ne_layers))
	        for i, name, typ, preferred, supported in ne_layers[:100]:
	            print(
	                f"[{i}] {name} type={typ} "
	                f"preferred={preferred} supported={supported}"
	            )

	else:
	    print("\nunknown model structure")
	#!/usr/bin/env python3
	import argparse

	import numpy as np
	import coremltools as ct

	# Defaults, overridable from the command line below.
	DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_fp16.mlpackage"
	RUNS = 100

	parser = argparse.ArgumentParser()
	parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
	parser.add_argument("--runs", type=int, default=RUNS)
	args = parser.parse_args()

	MODEL = args.model
	RUNS = args.runs

	# One fixed, seeded input shared by every prediction.
	rng = np.random.default_rng(1234)
	x = {
	"x": rng.standard_normal((1, 3, 224, 224)).astype(np.float32)
	}


	def load(cu):
	    """Load the model at ``MODEL`` restricted to the compute units ``cu``."""
	    return ct.models.MLModel(MODEL, compute_units=cu)


	def run(model):
	    """Predict once on the shared input ``x`` and return output "y" as an ndarray."""
	    y = model.predict(x)
	    return np.asarray(y["y"])


	def margin_01(arr):
	    """Return element 0 minus element 1 of the flattened array, as a Python float."""
	    flat = arr.reshape(-1)
	    return float(flat[0] - flat[1])


	def print_logits(prefix, arr):
	    """Print the flattened logits, the 0-minus-1 margin and the argmax, tagged with ``prefix``."""
	    flat = arr.reshape(-1)
	    print(f"{prefix}_logits:", flat)
	    print(f"{prefix}_margin_0_minus_1:", margin_01(arr))
	    print(f"{prefix}_argmax:", int(arr.argmax()))


	def probe(name, cu):
	    """Check run-to-run stability of one backend and return its reference output.

	    The model is predicted once for a reference, then re-run up to ``RUNS``
	    times; the loop stops at the first run whose argmax differs.
	    """
	    model = load(cu)

	    ref = run(model)
	    ref_argmax = int(ref.argmax())

	    raw_drift = False
	    argmax_drift = False
	    worst = 0.0
	    # Tracked separately so an argmax flip no longer overwrites the run
	    # index of an earlier raw (bitwise) difference.
	    first = None
	    first_argmax = None

	    for i in range(1, RUNS + 1):
	        cur = run(model)
	        max_abs = float(np.max(np.abs(cur - ref)))
	        worst = max(worst, max_abs)

	        if not np.array_equal(cur, ref):
	            raw_drift = True
	            if first is None:
	                first = i

	        if int(cur.argmax()) != ref_argmax:
	            argmax_drift = True
	            first_argmax = i
	            break

	    print(f"\n== {name} ==")
	    print("raw_drift_seen:", raw_drift)
	    print("argmax_drift_seen:", argmax_drift)
	    print("first_drift_run:", first)
	    print("first_argmax_drift_run:", first_argmax)
	    print("worst_abs_diff:", worst)
	    print("shape:", ref.shape)
	    print_logits("ref", ref)

	    return ref


	# Reference output per backend; a failing backend is reported and left out.
	results = {}

	for name, cu in [
	    ("CPU_ONLY", ct.ComputeUnit.CPU_ONLY),
	    ("CPU_AND_GPU", ct.ComputeUnit.CPU_AND_GPU),
	    ("CPU_AND_NE", ct.ComputeUnit.CPU_AND_NE),
	    ("ALL", ct.ComputeUnit.ALL),
	]:
	    try:
	        results[name] = probe(name, cu)
	    except Exception as e:
	        print(f"\n== {name} FAILED ==")
	        print(type(e).__name__, e)


	base = results.get("CPU_ONLY")

	# Cross-backend comparison is only possible when the CPU baseline succeeded.
	if base is not None:
	    print("\n== CROSS-BACKEND VS CPU_ONLY ==")
	    base_argmax = int(base.argmax())

	    print_logits("cpu", base)

	    for name, arr in results.items():
	        if name == "CPU_ONLY":
	            continue

	        print(f"\n{name}")
	        print("array_equal:", np.array_equal(arr, base))
	        print("max_abs_diff:", float(np.max(np.abs(arr - base))))
	        print("argmax_equal:", int(arr.argmax()) == base_argmax)
	        print_logits("cur", arr)
	        print("delta_vs_cpu:", arr.reshape(-1) - base.reshape(-1))
	        print("margin_delta_vs_cpu:", margin_01(arr) - margin_01(base))
	#!/usr/bin/env python3
	import argparse
	from collections import Counter, defaultdict
	from pathlib import Path

	import coremltools as ct

	DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_w8a8.mlpackage"
	# Lower-case substrings that mark an op as a quantization-related hit.
	KEYWORDS = ("quant", "dequant", "constexpr", "affine", "cast", "int8", "lut")
	# Op types counted as "real" compute when checking consumers.
	REAL_OPS = {"conv", "linear", "matmul", "add", "mul", "silu", "relu", "gelu", "reduce_mean", "reshape"}


	def dtype_name(tensor_type):
	    """Return the enum name of ``tensor_type.dataType``, or the raw number as a string if unknown."""
	    value = tensor_type.dataType
	    enum = tensor_type.DESCRIPTOR.fields_by_name["dataType"].enum_type
	    enum_value = enum.values_by_number.get(value)
	    return enum_value.name if enum_value is not None else str(value)


	def op_name(op):
	    """Return the op's "name" attribute string, else its first output name, else ""."""
	    attr = op.attributes.get("name") if hasattr(op, "attributes") else None
	    if attr is not None:
	        # The name is pulled from the attribute's text form: the first
	        # quoted string following 'values: "'.
	        text = str(attr)
	        marker = 'values: "'
	        if marker in text:
	            return text.split(marker, 1)[1].split('"', 1)[0]
	    if op.outputs:
	        return op.outputs[0].name
	    return ""


	def output_dtypes(op):
	    """Return the dtype name of every output of ``op`` that has a ``tensorType`` set."""
	    dtypes = []
	    for out in op.outputs:
	        if out.type.HasField("tensorType"):
	            dtypes.append(dtype_name(out.type.tensorType))
	    return dtypes


	def input_names(op):
	    """Return the non-empty argument names bound to the inputs of ``op``.

	    Names are listed in iteration order of ``op.inputs`` and, within each
	    input, in the order of its ``arguments``. Arguments with an empty name
	    are skipped.
	    """
	    return [
	        arg.name
	        for bound in op.inputs.values()
	        for arg in bound.arguments
	        if arg.name
	    ]


	def iter_mlprogram_ops(spec):
	    """Yield ``(function_name, block_name, index, op)`` for each op in ``spec``.

	    ``index`` is the op's position within its block. Nothing is yielded when
	    the spec's ``Type`` oneof is not ``mlProgram``.
	    """
	    if spec.WhichOneof("Type") == "mlProgram":
	        for fname, fn in spec.mlProgram.functions.items():
	            for bname, block in fn.block_specializations.items():
	                yield from (
	                    (fname, bname, position, operation)
	                    for position, operation in enumerate(block.operations)
	                )


	def main():
	    """Report evidence of weight/activation quantization in a Core ML model.

	    Command line:
	        model   optional path to the model package (default: DEFAULT_MODEL)
	        --hits  maximum number of keyword hits to print (default: 120)

	    The model is loaded with CPU_ONLY compute units and its spec is scanned.
	    Printed sections: op-type counts, output-dtype counts, ops whose
	    type/name/output names/dtypes contain one of KEYWORDS, a coarse
	    classification derived from the op counts, and (when present) the ops in
	    REAL_OPS that consume a dequantized value.

	    Returns early, after printing a notice, when the spec is not an
	    ``mlProgram``.

	    Raises:
	        FileNotFoundError: if the resolved model path does not exist.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
	    parser.add_argument("--hits", type=int, default=120)
	    args = parser.parse_args()

	    model_path = Path(args.model).resolve()
	    print("coremltools:", ct.__version__)
	    print("model:", model_path)
	    if not model_path.exists():
	        raise FileNotFoundError(model_path)

	    mlmodel = ct.models.MLModel(str(model_path), compute_units=ct.ComputeUnit.CPU_ONLY)
	    spec = mlmodel.get_spec()
	    kind = spec.WhichOneof("Type")
	    print("model kind:", kind)

	    if kind != "mlProgram":
	        print("Unsupported for this probe: expected mlProgram")
	        return

	    op_counts = Counter()
	    dtype_counts = Counter()
	    hits = []
	    quant_outputs = set()
	    dequant_outputs = set()
	    real_ops_consuming_dequant = []
	    real_ops_consuming_quant = []
	    # Op types whose outputs are counted as dequantized values.
	    dequant_types = {"dequantize", "constexpr_affine_dequantize", "constexpr_blockwise_shift_scale"}

	    # First pass: histograms, keyword hits, and the names of every value
	    # produced by a quantize / dequantize op.
	    ops = list(iter_mlprogram_ops(spec))
	    for fname, bname, index, op in ops:
	        typ = op.type
	        name = op_name(op)
	        dtypes = output_dtypes(op)
	        output_names = [out.name for out in op.outputs]
	        op_counts[typ] += 1
	        dtype_counts.update(dtypes)

	        # str.join only accepts strings, so the output names and dtypes are
	        # unpacked into the sequence rather than passed as nested lists
	        # (which raises TypeError).
	        haystack = " ".join([typ, name, *output_names, *dtypes]).lower()
	        if any(keyword in haystack for keyword in KEYWORDS):
	            hits.append((fname, bname, index, typ, name, dtypes, output_names))

	        if typ == "quantize":
	            quant_outputs.update(output_names)
	        if typ in dequant_types:
	            dequant_outputs.update(output_names)

	    # Second pass: needs the complete output sets from the first pass to find
	    # the REAL_OPS that take a quantized or dequantized value as input.
	    for fname, bname, index, op in ops:
	        if op.type not in REAL_OPS:
	            continue
	        consumed = input_names(op)
	        if any(name in dequant_outputs for name in consumed):
	            real_ops_consuming_dequant.append((fname, bname, index, op.type, op_name(op), consumed))
	        if any(name in quant_outputs for name in consumed):
	            real_ops_consuming_quant.append((fname, bname, index, op.type, op_name(op), consumed))

	    print("\n== OP TYPE COUNTS ==")
	    for typ, count in op_counts.most_common():
	        print(count, typ)

	    print("\n== OUTPUT DTYPE COUNTS ==")
	    for typ, count in dtype_counts.most_common():
	        print(count, typ)

	    print("\n== QUANTIZATION-RELATED HITS ==")
	    if not hits:
	        print("NONE")
	    else:
	        print("count:", len(hits))
	        for fname, bname, index, typ, name, dtypes, outputs in hits[: args.hits]:
	            print(f"{fname}/{bname}[{index}] op={typ} name={name} dtypes={dtypes} outputs={outputs}")

	    weight_evidence = op_counts["constexpr_affine_dequantize"] + op_counts["constexpr_blockwise_shift_scale"]
	    activation_quant_evidence = op_counts["quantize"] + op_counts["dequantize"]

	    print("\n== CLASSIFICATION ==")
	    print("weight_quantization_evidence:", "yes" if weight_evidence else "no")
	    print("weight_quantization_ops:", weight_evidence)
	    print("activation_quantization_evidence:", "yes" if activation_quant_evidence else "no")
	    print("activation_quantize_ops:", op_counts["quantize"])
	    print("activation_dequantize_ops:", op_counts["dequantize"])
	    print("real_ops_consuming_dequantized_values:", len(real_ops_consuming_dequant))
	    print("real_ops_consuming_quantized_values:", len(real_ops_consuming_quant))

	    if activation_quant_evidence and real_ops_consuming_dequant:
	        print("activation_quantization_interpretation: quantize/dequantize pairs are present before real ops")
	    elif activation_quant_evidence:
	        print("activation_quantization_interpretation: quantize/dequantize ops are present, but no real-op consumers were detected")
	    elif weight_evidence:
	        print("activation_quantization_interpretation: weight compression only; no activation quantize/dequantize ops detected")
	    else:
	        print("activation_quantization_interpretation: no quantization evidence detected")

	    if real_ops_consuming_dequant:
	        print("\n== REAL OPS CONSUMING DEQUANTIZED VALUES ==")
	        by_type = Counter(entry[3] for entry in real_ops_consuming_dequant)
	        for typ, count in sorted(by_type.items()):
	            print(count, typ)
	        # Only the first 40 consumers are listed individually.
	        for fname, bname, index, typ, name, consumed in real_ops_consuming_dequant[:40]:
	            print(f"{fname}/{bname}[{index}] op={typ} name={name} inputs={consumed}")


	# Run the probe only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()