Skip to content

Instantly share code, notes, and snippets.

@robbiemu
Created June 6, 2026 18:12
Show Gist options
  • Select an option

  • Save robbiemu/34132e313ecdff63a06c3d3e0c624f3d to your computer and use it in GitHub Desktop.

Select an option

Save robbiemu/34132e313ecdff63a06c3d3e0c624f3d to your computer and use it in GitHub Desktop.
Core ML ANE backend-dependent argmax flip reproducer
#!/usr/bin/env python3
from pathlib import Path
import torch
import torch.nn as nn
import coremltools as ct
# Directory that receives the generated synthetic Core ML packages.
# It is created at import time, including any missing parent directories.
OUT = Path("./models/synthetic")
OUT.mkdir(parents=True, exist_ok=True)
class AmplifiedConvTie(nn.Module):
    """Synthetic conv/SiLU classifier with an intentionally fragile 2-way head.

    The network is a stack of five convolutions, each followed by SiLU, then a
    constant amplification, global average pooling, and a 16 -> 2 linear head.
    The head is overwritten so that logit 0 reads only pooled channel 0 (plus
    a bias) and logit 1 reads only pooled channel 1.

    Args:
        gain: Factor the stem output is multiplied by before pooling. Larger
            values amplify backend numerical differences. Defaults to the
            300.0 used by the original reproducer.
        class0_bias: Bias added to logit 0, used to centre the decision
            boundary. Defaults to the 14.6 used by the original reproducer.
    """

    def __init__(self, *, gain=300.0, class0_bias=14.6):
        super().__init__()
        self.gain = float(gain)
        self.stem = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 16, 1),
            nn.SiLU(),
        )
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.head = nn.Linear(16, 2, bias=True)

        # Make the decision boundary intentionally fragile.
        #
        # Class 0 depends on channel 0.
        # Class 1 depends on channel 1.
        #
        # This makes CPU/GPU/ANE backend drift capable of moving the two logits
        # differently, instead of shifting both logits together.
        with torch.no_grad():
            self.head.weight.zero_()
            self.head.bias.zero_()
            self.head.weight[0, 0] = 1.0
            self.head.weight[1, 1] = 1.0

            # Center the decision boundary near the observed CPU/ANE split.
            #
            # Previous CPU margin was about -15.06 and ANE moved the margin
            # by about +1.16, so the default +14.6 should make CPU slightly
            # prefer class 1 while ANE has a shot at flipping to class 0.
            self.head.bias[0] = class0_bias
            self.head.bias[1] = 0.0

    def forward(self, x):
        """Return the two logits for input batch ``x``."""
        y = self.stem(x)
        # Amplify backend numerical differences before pooling/head.
        y = y * self.gain
        y = self.pool(y)
        y = y.flatten(1)
        y = self.head(y)
        return y
def main():
    """Trace the synthetic model and save it as an FP16 Core ML ML Program."""
    # Seed first: both the model weights and the trace input draw from
    # torch's global RNG, in that order.
    torch.manual_seed(1234)
    net = AmplifiedConvTie()
    net.eval()
    sample = torch.randn(1, 3, 224, 224)
    scripted = torch.jit.trace(net, sample)

    conversion_options = dict(
        inputs=[ct.TensorType(name="x", shape=sample.shape)],
        outputs=[ct.TensorType(name="y")],
        convert_to="mlprogram",
        minimum_deployment_target=ct.target.macOS14,
        compute_precision=ct.precision.FLOAT16,
    )
    package = ct.convert(scripted, **conversion_options)

    destination = OUT / "amplified_conv_tie_fp16.mlpackage"
    package.save(destination)
    print("saved:", destination)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
from pathlib import Path
import coremltools as ct
import coremltools.optimize.coreml as cto
import numpy as np
import torch
import torch.nn as nn
# Directory that receives the generated synthetic Core ML packages.
# It is created at import time, including any missing parent directories.
OUT = Path("./models/synthetic")
OUT.mkdir(parents=True, exist_ok=True)
class AmplifiedConvTie(nn.Module):
    """Synthetic conv/SiLU classifier with an intentionally fragile 2-way head.

    The network is a stack of five convolutions, each followed by SiLU, then a
    constant amplification, global average pooling, and a 16 -> 2 linear head.
    The head is overwritten so that logit 0 reads only pooled channel 0 (plus
    a bias) and logit 1 reads only pooled channel 1, so the two logits can
    move independently of each other.

    Args:
        gain: Factor the stem output is multiplied by before pooling.
            Defaults to the 300.0 used by the original reproducer.
        class0_bias: Bias added to logit 0, used to centre the decision
            boundary. Defaults to the 14.6 used by the original reproducer.
    """

    def __init__(self, *, gain=300.0, class0_bias=14.6):
        super().__init__()
        self.gain = float(gain)
        self.stem = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.SiLU(),
            nn.Conv2d(64, 16, 1),
            nn.SiLU(),
        )
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.head = nn.Linear(16, 2, bias=True)

        # Replace the random head with a hand-built one: each class reads a
        # single pooled channel, and only class 0 carries a bias.
        with torch.no_grad():
            self.head.weight.zero_()
            self.head.bias.zero_()
            self.head.weight[0, 0] = 1.0
            self.head.weight[1, 1] = 1.0
            self.head.bias[0] = class0_bias
            self.head.bias[1] = 0.0

    def forward(self, x):
        """Return the two logits for input batch ``x``."""
        y = self.stem(x)
        # Scale the activations before pooling and the head.
        y = y * self.gain
        y = self.pool(y)
        y = y.flatten(1)
        y = self.head(y)
        return y
def calibration_data(count=8):
    """Return ``count`` deterministic calibration samples.

    Each sample is a dict mapping the input name ``"x"`` to a float32 array of
    shape (1, 3, 224, 224) drawn from a standard normal distribution. The
    generator is seeded with a fixed value, so repeated calls return the same
    data.
    """
    generator = np.random.default_rng(10_000)
    return [
        {"x": generator.standard_normal((1, 3, 224, 224)).astype(np.float32)}
        for _ in range(count)
    ]
def main():
    """Export the synthetic model as a W8A8 Core ML ML Program.

    The model is converted to FP16 first; activations are then quantized to
    int8 using the deterministic calibration samples, and finally the weights
    are quantized to per-channel int8.
    """
    # Seed first: both the model weights and the trace input draw from
    # torch's global RNG, in that order.
    torch.manual_seed(1234)
    net = AmplifiedConvTie()
    net.eval()
    sample = torch.randn(1, 3, 224, 224)
    scripted = torch.jit.trace(net, sample)

    fp16 = ct.convert(
        scripted,
        inputs=[ct.TensorType(name="x", shape=sample.shape)],
        outputs=[ct.TensorType(name="y")],
        convert_to="mlprogram",
        minimum_deployment_target=ct.target.macOS14,
        compute_precision=ct.precision.FLOAT16,
    )

    # Settings shared by the activation and weight quantizers.
    int8_symmetric = {"mode": "linear_symmetric", "dtype": np.int8}

    activation_quantizer = cto.OpLinearQuantizerConfig(**int8_symmetric)
    activation_config = cto.OptimizationConfig(global_config=activation_quantizer)
    a8 = cto.linear_quantize_activations(
        fp16,
        activation_config,
        calibration_data(),
        calibration_op_group_size=8,
    )

    weight_quantizer = cto.OpLinearQuantizerConfig(
        granularity="per_channel",
        weight_threshold=0,
        **int8_symmetric,
    )
    weight_config = cto.OptimizationConfig(global_config=weight_quantizer)
    w8a8 = cto.linear_quantize_weights(a8, weight_config)

    destination = OUT / "amplified_conv_tie_w8a8.mlpackage"
    w8a8.save(destination)
    print("saved:", destination)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
import argparse
import shutil
from pathlib import Path
import coremltools as ct
# Model inspected when no path is given on the command line.
DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_fp16.mlpackage"

parser = argparse.ArgumentParser()
parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
args = parser.parse_args()

MODEL = Path(args.model).resolve()
OUT = Path("./compiled_models").resolve()
OUT.mkdir(exist_ok=True)

# Derive the compiled-model directory name by swapping the final suffix.
# A plain str.replace(".mlpackage", ...) would leave names such as
# "model.mlmodel" untouched, so the stable copy would not end in ".mlmodelc".
STABLE_NAME = MODEL.with_suffix(".mlmodelc").name
STABLE_COMPILED = OUT / STABLE_NAME

print("coremltools:", ct.__version__)
print("model:", MODEL)

if not MODEL.exists():
    raise FileNotFoundError(f"Model not found: {MODEL}")

# Keep this object alive while copying the compiled temporary model.
mlmodel = ct.models.MLModel(str(MODEL), compute_units=ct.ComputeUnit.ALL)
tmp_compiled = Path(mlmodel.get_compiled_model_path())

# Replace any stale copy from a previous run with the fresh compilation.
if STABLE_COMPILED.exists():
    shutil.rmtree(STABLE_COMPILED)
shutil.copytree(tmp_compiled, STABLE_COMPILED)

print("tmp compiled:", tmp_compiled)
print("stable compiled:", STABLE_COMPILED)
print("stable exists:", STABLE_COMPILED.exists())

plan = ct.models.compute_plan.MLComputePlan.load_from_path(
    path=str(STABLE_COMPILED),
    compute_units=ct.ComputeUnit.ALL,
)
print("\ncompute plan loaded")
print("plan type:", type(plan))
def op_type(op):
    """Return a display name for a compute-plan operation.

    The first truthy value of ``op.operator_name`` and ``op.type`` is used;
    if neither is available, the name of the object's class is returned.
    """
    for attribute in ("operator_name", "type"):
        label = getattr(op, attribute, None)
        if label:
            return label
    return type(op).__name__
def device_name(device):
    """Return the class name of ``device``, or the string "None" if it is None."""
    return "None" if device is None else type(device).__name__
def usage_summary(usage):
    """Summarise a compute-device usage record as ``(preferred, supported)``.

    ``preferred`` is the device name of ``usage.preferred_compute_device`` and
    ``supported`` is the list of device names in
    ``usage.supported_compute_devices``. A missing record yields
    ``("None", [])``.
    """
    if usage is None:
        return "None", []
    supported_names = []
    for device in usage.supported_compute_devices:
        supported_names.append(device_name(device))
    preferred_name = device_name(usage.preferred_compute_device)
    return preferred_name, supported_names
def print_counts(title, counts):
    """Print a titled table of counts, largest first.

    Entries with equal counts are ordered by the string form of their key.
    """
    print(f"\n== {title} ==")
    ordered_keys = sorted(counts, key=lambda key: (-counts[key], str(key)))
    for key in ordered_keys:
        print(counts[key], key)
def record_usage(
    typ,
    preferred,
    supported,
    preferred_counts,
    supported_counts,
    op_counts,
    op_preferred_counts,
    ne_hits,
    hit_payload,
):
    """Fold one op/layer placement into the running tallies.

    The count dictionaries are updated in place. ``hit_payload`` is appended
    to ``ne_hits`` when the preferred device name or any supported device name
    contains "NeuralEngine".
    """

    def bump(table, key):
        # Increment a plain-dict counter, starting from zero.
        table[key] = table.get(key, 0) + 1

    bump(preferred_counts, preferred)
    bump(op_counts, typ)
    bump(op_preferred_counts, (typ, preferred))
    for dev in supported:
        bump(supported_counts, dev)

    touches_ne = any("NeuralEngine" in label for label in (preferred, *supported))
    if touches_ne:
        ne_hits.append(hit_payload)
def _print_pair_counts(title, pair_counts):
    """Print ``(type, preferred device)`` counts, largest first."""
    print(f"\n== {title} ==")
    ordered = sorted(pair_counts.items(), key=lambda kv: (-kv[1], str(kv[0])))
    for (typ, pref), count in ordered:
        print(count, typ, pref)


def _report_mlprogram(plan, program):
    """Print per-op placement and summary tables for an ML Program."""
    print("\nmodel kind: mlprogram")
    print("functions:", list(program.functions.keys()))

    preferred_counts = {}
    supported_counts = {}
    op_counts = {}
    op_preferred_counts = {}
    ne_ops = []

    for fname, fn in program.functions.items():
        ops = fn.block.operations
        print(f"\nfunction: {fname}")
        print("ops:", len(ops))
        for position, op in enumerate(ops):
            typ = op_type(op)
            usage = plan.get_compute_device_usage_for_mlprogram_operation(op)
            preferred, supported = usage_summary(usage)
            record_usage(
                typ,
                preferred,
                supported,
                preferred_counts,
                supported_counts,
                op_counts,
                op_preferred_counts,
                ne_ops,
                (fname, position, typ, preferred, supported),
            )
            # Only the first 80 ops of each function are listed in detail.
            if position >= 80:
                continue
            print(f"\n[{position}]")
            print("op:", typ)
            print("name:", getattr(op, "name", None))
            print("preferred:", preferred)
            print("supported:", supported)

    # Every op was tallied exactly once in op_counts, so its total is the
    # number of ops visited.
    print("\nTOTAL OPS:", sum(op_counts.values()))
    print_counts("PREFERRED DEVICE COUNTS", preferred_counts)
    print_counts("SUPPORTED DEVICE COUNTS", supported_counts)
    print_counts("OP TYPE COUNTS", op_counts)
    _print_pair_counts("OP x PREFERRED DEVICE", op_preferred_counts)

    print("\n== NEURAL ENGINE OPS ==")
    if ne_ops:
        print("count:", len(ne_ops))
        for fname, position, typ, preferred, supported in ne_ops[:100]:
            print(
                f"{fname}[{position}] op={typ} "
                f"preferred={preferred} supported={supported}"
            )
    else:
        print("NONE")


def _report_neuralnetwork(plan, network):
    """Print per-layer placement and summary tables for a neural network."""
    print("\nmodel kind: neuralnetwork")
    layers = network.layers

    preferred_counts = {}
    supported_counts = {}
    layer_type_counts = {}
    layer_preferred_counts = {}
    ne_layers = []

    print("layers:", len(layers))
    for position, layer in enumerate(layers):
        usage = plan.get_compute_device_usage_for_neuralnetwork_layer(layer)
        preferred, supported = usage_summary(usage)
        typ = getattr(layer, "type", None) or type(layer).__name__
        record_usage(
            typ,
            preferred,
            supported,
            preferred_counts,
            supported_counts,
            layer_type_counts,
            layer_preferred_counts,
            ne_layers,
            (position, layer.name, typ, preferred, supported),
        )
        # Only the first 80 layers are listed in detail.
        if position >= 80:
            continue
        print(f"\n[{position}] {layer.name}")
        print("type:", typ)
        print("preferred:", preferred)
        print("supported:", supported)

    print_counts("PREFERRED DEVICE COUNTS", preferred_counts)
    print_counts("SUPPORTED DEVICE COUNTS", supported_counts)
    print_counts("LAYER TYPE COUNTS", layer_type_counts)
    _print_pair_counts("LAYER x PREFERRED DEVICE", layer_preferred_counts)

    print("\n== NEURAL ENGINE LAYERS ==")
    if ne_layers:
        print("count:", len(ne_layers))
        for position, name, typ, preferred, supported in ne_layers[:100]:
            print(
                f"[{position}] {name} type={typ} "
                f"preferred={preferred} supported={supported}"
            )
    else:
        print("NONE")


# Dispatch on whichever model structure the loaded compute plan exposes.
structure = plan.model_structure
if structure.program is not None:
    _report_mlprogram(plan, structure.program)
elif structure.neuralnetwork is not None:
    _report_neuralnetwork(plan, structure.neuralnetwork)
else:
    print("\nunknown model structure")
#!/usr/bin/env python3
import argparse
import numpy as np
import coremltools as ct
# Model probed when no path is given on the command line.
DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_fp16.mlpackage"
# Default number of repeated predictions per compute unit (see --runs).
RUNS = 100
parser = argparse.ArgumentParser()
parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
parser.add_argument("--runs", type=int, default=RUNS)
args = parser.parse_args()
# From here on MODEL and RUNS hold the values chosen on the command line.
MODEL = args.model
RUNS = args.runs
# Fixed seed: every backend and every repeated run sees the same input.
rng = np.random.default_rng(1234)
# Single prediction input, keyed by the model's input name "x".
x = {
"x": rng.standard_normal((1, 3, 224, 224)).astype(np.float32)
}
def load(cu):
    """Load the model at ``MODEL`` restricted to the compute units ``cu``."""
    loaded = ct.models.MLModel(MODEL, compute_units=cu)
    return loaded
def run(model):
    """Run one prediction on the shared input and return output "y" as an array."""
    outputs = model.predict(x)
    logits = outputs["y"]
    return np.asarray(logits)
def margin_01(arr):
    """Return element 0 minus element 1 of the flattened array, as a float."""
    values = arr.reshape(-1)
    # Subtract in the array's own dtype, then convert the result to float.
    difference = values[0] - values[1]
    return float(difference)
def print_logits(prefix, arr):
    """Print the flattened logits, the 0-minus-1 margin and the argmax.

    Each line is labelled with ``prefix``.
    """
    print(f"{prefix}_logits:", arr.reshape(-1))
    margin = margin_01(arr)
    print(f"{prefix}_margin_0_minus_1:", margin)
    winner = int(arr.argmax())
    print(f"{prefix}_argmax:", winner)
def probe(name, cu):
    """Check run-to-run stability of one compute-unit configuration.

    The model is loaded once, a reference prediction is taken, and the
    prediction is repeated up to ``RUNS`` times. Each repeat is compared with
    the reference both element-wise and by argmax. The loop stops at the first
    argmax change.

    Args:
        name: Label printed in the section header.
        cu: Compute-unit selection passed to ``load``.

    Returns:
        The reference output array from the first prediction.
    """
    model = load(cu)
    ref = run(model)
    ref_argmax = int(ref.argmax())

    raw_drift = False
    argmax_drift = False
    worst = 0.0
    # Tracked separately: raw drift can appear several runs before the argmax
    # flips, and the first raw-drift run must not be overwritten by the later
    # argmax-flip run.
    first = None
    first_argmax = None

    for i in range(1, RUNS + 1):
        cur = run(model)
        max_abs = float(np.max(np.abs(cur - ref)))
        worst = max(worst, max_abs)
        if not np.array_equal(cur, ref):
            raw_drift = True
            if first is None:
                first = i
        if int(cur.argmax()) != ref_argmax:
            argmax_drift = True
            first_argmax = i
            break

    print(f"\n== {name} ==")
    print("raw_drift_seen:", raw_drift)
    print("argmax_drift_seen:", argmax_drift)
    print("first_drift_run:", first)
    print("first_argmax_drift_run:", first_argmax)
    print("worst_abs_diff:", worst)
    print("shape:", ref.shape)
    print_logits("ref", ref)
    return ref
# Probe every compute-unit configuration; a failing backend is reported and
# skipped so the remaining ones still run.
results = {}
backends = (
    ("CPU_ONLY", ct.ComputeUnit.CPU_ONLY),
    ("CPU_AND_GPU", ct.ComputeUnit.CPU_AND_GPU),
    ("CPU_AND_NE", ct.ComputeUnit.CPU_AND_NE),
    ("ALL", ct.ComputeUnit.ALL),
)
for label, unit in backends:
    try:
        results[label] = probe(label, unit)
    except Exception as exc:
        print(f"\n== {label} FAILED ==")
        print(type(exc).__name__, exc)

# Compare each successful backend against the CPU-only reference, if present.
base = results.get("CPU_ONLY")
if base is not None:
    print("\n== CROSS-BACKEND VS CPU_ONLY ==")
    base_argmax = int(base.argmax())
    print_logits("cpu", base)
    for label, arr in results.items():
        if label != "CPU_ONLY":
            print(f"\n{label}")
            print("array_equal:", np.array_equal(arr, base))
            print("max_abs_diff:", float(np.max(np.abs(arr - base))))
            print("argmax_equal:", int(arr.argmax()) == base_argmax)
            print_logits("cur", arr)
            print("delta_vs_cpu:", arr.reshape(-1) - base.reshape(-1))
            print("margin_delta_vs_cpu:", margin_01(arr) - margin_01(base))
#!/usr/bin/env python3
import argparse
from collections import Counter, defaultdict
from pathlib import Path
import coremltools as ct
# Model inspected when no path is given on the command line.
DEFAULT_MODEL = "./models/synthetic/amplified_conv_tie_w8a8.mlpackage"
# Lower-case substrings that mark an op as quantization-related when found in
# its type, name, output names or output dtypes.
KEYWORDS = ("quant", "dequant", "constexpr", "affine", "cast", "int8", "lut")
# Op types treated as "real" compute ops when looking for consumers of
# quantized or dequantized values.
REAL_OPS = {"conv", "linear", "matmul", "add", "mul", "silu", "relu", "gelu", "reduce_mean", "reshape"}
def dtype_name(tensor_type):
    """Return the enum name of ``tensor_type.dataType``.

    The name is looked up through the message descriptor; if the number is
    not a known enum value, its string form is returned instead.
    """
    number = tensor_type.dataType
    field = tensor_type.DESCRIPTOR.fields_by_name["dataType"]
    entry = field.enum_type.values_by_number.get(number)
    if entry is None:
        return str(number)
    return entry.name
def op_name(op):
    """Return a readable name for an ML Program operation.

    The quoted value following ``values: "`` in the text form of the op's
    "name" attribute is used when present. Otherwise the name of the first
    output is returned, or an empty string if the op has no outputs.
    """
    name_attr = None
    if hasattr(op, "attributes"):
        name_attr = op.attributes.get("name")

    if name_attr is not None:
        _, marker_found, remainder = str(name_attr).partition('values: "')
        if marker_found:
            # Keep everything up to the closing quote.
            return remainder.partition('"')[0]

    return op.outputs[0].name if op.outputs else ""
def output_dtypes(op):
    """Return the dtype names of the op's outputs that have a tensor type."""
    return [
        dtype_name(result.type.tensorType)
        for result in op.outputs
        if result.type.HasField("tensorType")
    ]
def input_names(op):
    """Return the non-empty argument names across all of the op's inputs."""
    return [
        argument.name
        for binding in op.inputs.values()
        for argument in binding.arguments
        if argument.name
    ]
def iter_mlprogram_ops(spec):
    """Yield ``(function, block, index, op)`` for every op in an ML Program.

    Nothing is yielded when the spec's "Type" oneof is not "mlProgram".
    ``index`` is the position of the op within its block specialization.
    """
    if spec.WhichOneof("Type") != "mlProgram":
        return
    for function_name, function in spec.mlProgram.functions.items():
        for block_name, block in function.block_specializations.items():
            yield from (
                (function_name, block_name, position, operation)
                for position, operation in enumerate(block.operations)
            )
def main():
    """Inspect a saved ML Program for weight and activation quantization.

    The model spec is loaded, every op is tallied by type and output dtype,
    ops matching ``KEYWORDS`` are listed, and the "real" ops (``REAL_OPS``)
    that consume quantize or dequantize outputs are reported.

    Raises:
        FileNotFoundError: If the model path does not exist.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("model", nargs="?", default=DEFAULT_MODEL)
    parser.add_argument("--hits", type=int, default=120)
    args = parser.parse_args()

    model_path = Path(args.model).resolve()
    print("coremltools:", ct.__version__)
    print("model:", model_path)
    if not model_path.exists():
        raise FileNotFoundError(model_path)

    mlmodel = ct.models.MLModel(str(model_path), compute_units=ct.ComputeUnit.CPU_ONLY)
    spec = mlmodel.get_spec()
    kind = spec.WhichOneof("Type")
    print("model kind:", kind)
    if kind != "mlProgram":
        print("Unsupported for this probe: expected mlProgram")
        return

    # Op types whose outputs count as dequantized values.
    dequant_types = {"dequantize", "constexpr_affine_dequantize", "constexpr_blockwise_shift_scale"}

    op_counts = Counter()
    dtype_counts = Counter()
    hits = []
    quant_outputs = set()
    dequant_outputs = set()

    # First pass: tally ops and dtypes, and remember which values are
    # produced by quantize / dequantize ops.
    ops = list(iter_mlprogram_ops(spec))
    for fname, bname, index, op in ops:
        typ = op.type
        name = op_name(op)
        dtypes = output_dtypes(op)
        out_names = [out.name for out in op.outputs]

        op_counts[typ] += 1
        dtype_counts.update(dtypes)

        haystack = " ".join([typ, name, *out_names, *dtypes]).lower()
        if any(k in haystack for k in KEYWORDS):
            hits.append((fname, bname, index, typ, name, dtypes, out_names))
        if typ == "quantize":
            quant_outputs.update(out_names)
        if typ in dequant_types:
            dequant_outputs.update(out_names)

    # Second pass: find real ops fed by those values. This needs the complete
    # output sets, so it cannot be merged into the first pass.
    real_ops_consuming_dequant = []
    real_ops_consuming_quant = []
    for fname, bname, index, op in ops:
        if op.type not in REAL_OPS:
            continue
        consumed = input_names(op)
        if any(name in dequant_outputs for name in consumed):
            real_ops_consuming_dequant.append((fname, bname, index, op.type, op_name(op), consumed))
        if any(name in quant_outputs for name in consumed):
            real_ops_consuming_quant.append((fname, bname, index, op.type, op_name(op), consumed))

    print("\n== OP TYPE COUNTS ==")
    for typ, count in op_counts.most_common():
        print(count, typ)

    print("\n== OUTPUT DTYPE COUNTS ==")
    for typ, count in dtype_counts.most_common():
        print(count, typ)

    print("\n== QUANTIZATION-RELATED HITS ==")
    if not hits:
        print("NONE")
    else:
        print("count:", len(hits))
        for fname, bname, index, typ, name, dtypes, outputs in hits[: args.hits]:
            print(f"{fname}/{bname}[{index}] op={typ} name={name} dtypes={dtypes} outputs={outputs}")

    weight_evidence = op_counts["constexpr_affine_dequantize"] + op_counts["constexpr_blockwise_shift_scale"]
    activation_quant_evidence = op_counts["quantize"] + op_counts["dequantize"]

    print("\n== CLASSIFICATION ==")
    print("weight_quantization_evidence:", "yes" if weight_evidence else "no")
    print("weight_quantization_ops:", weight_evidence)
    print("activation_quantization_evidence:", "yes" if activation_quant_evidence else "no")
    print("activation_quantize_ops:", op_counts["quantize"])
    print("activation_dequantize_ops:", op_counts["dequantize"])
    print("real_ops_consuming_dequantized_values:", len(real_ops_consuming_dequant))
    print("real_ops_consuming_quantized_values:", len(real_ops_consuming_quant))

    if activation_quant_evidence and real_ops_consuming_dequant:
        print("activation_quantization_interpretation: quantize/dequantize pairs are present before real ops")
    elif activation_quant_evidence:
        print("activation_quantization_interpretation: quantize/dequantize ops are present, but no real-op consumers were detected")
    elif weight_evidence:
        print("activation_quantization_interpretation: weight compression only; no activation quantize/dequantize ops detected")
    else:
        print("activation_quantization_interpretation: no quantization evidence detected")

    if real_ops_consuming_dequant:
        print("\n== REAL OPS CONSUMING DEQUANTIZED VALUES ==")
        by_type = Counter(entry[3] for entry in real_ops_consuming_dequant)
        for typ, count in sorted(by_type.items()):
            print(count, typ)
        for fname, bname, index, typ, name, consumed in real_ops_consuming_dequant[:40]:
            print(f"{fname}/{bname}[{index}] op={typ} name={name} inputs={consumed}")


if __name__ == "__main__":
    main()

Stage 3 Results: W8A8 / Activation Quantization Probe

Date

2026-06-06

Model

./models/synthetic/amplified_conv_tie_w8a8.mlpackage

This W8A8 package is built from the synthetic amplified conv/SiLU near-tie model. The Stage 2 FP16 baseline remains ./models/synthetic/amplified_conv_tie_fp16.mlpackage.

What Was Attempted

  1. Converted the PyTorch synthetic conv/SILU model to a Core ML MLProgram with FP16 compute precision.
  2. Applied Core ML-side activation quantization with coremltools.optimize.coreml.linear_quantize_activations using deterministic calibration data.
  3. Applied Core ML-side int8 weight quantization with coremltools.optimize.coreml.linear_quantize_weights.
  4. Inspected the saved MLProgram for quantization evidence.
  5. Inspected Core ML compute placement for the generated model.
  6. Compared repeated CPU/GPU/ANE/ALL predictions.

Commands Run

uv run python make_synthetic_coreml_amp_w8a8.py
uv run python probe_4_quant_evidence.py ./models/synthetic/amplified_conv_tie_w8a8.mlpackage --hits 30
gtimeout 60 uv run python probe_2_plan.py ./models/synthetic/amplified_conv_tie_w8a8.mlpackage
uv run python probe_3_drift.py ./models/synthetic/amplified_conv_tie_w8a8.mlpackage
uv run python -m py_compile make_synthetic_coreml_amp_w8a8.py probe_2_plan.py probe_3_drift.py probe_4_quant_evidence.py

All commands completed successfully.

Quantization Evidence

probe_4_quant_evidence.py found:

== OP TYPE COUNTS ==
68 const
11 quantize
11 dequantize
6 constexpr_affine_dequantize
5 conv
5 silu
1 mul
1 reduce_mean
1 reshape
1 linear

== OUTPUT DTYPE COUNTS ==
60 FLOAT16
22 INT32
16 STRING
11 INT8
1 BOOL

== CLASSIFICATION ==
weight_quantization_evidence: yes
weight_quantization_ops: 6
activation_quantization_evidence: yes
activation_quantize_ops: 11
activation_dequantize_ops: 11
real_ops_consuming_dequantized_values: 11
real_ops_consuming_quantized_values: 0
activation_quantization_interpretation: quantize/dequantize pairs are present before real ops

== REAL OPS CONSUMING DEQUANTIZED VALUES ==
5 conv
1 linear
5 silu

Interpretation: this is more than filename evidence, and more than weight-only compression. The saved MLProgram contains int8 activation quantize ops, matching dequantize ops, and weight constexpr_affine_dequantize ops. The real conv/SiLU/linear ops consume dequantized values, so this is evidence of an activation-quantized graph. It is not proof that every arithmetic kernel internally executes as integer MACs.

Compute Placement Evidence

probe_2_plan.py found 110 total MLProgram ops. Relevant output:

== PREFERRED DEVICE COUNTS ==
74 None
36 MLNeuralEngineComputeDevice

== OP TYPE COUNTS ==
68 const
11 ios17.dequantize
11 ios17.quantize
6 ios16.constexpr_affine_dequantize
5 ios16.silu
5 ios17.conv
1 ios16.reduce_mean
1 ios17.linear
1 ios17.mul
1 ios17.reshape

== OP x PREFERRED DEVICE ==
11 ios17.dequantize MLNeuralEngineComputeDevice
11 ios17.quantize MLNeuralEngineComputeDevice
5 ios16.silu MLNeuralEngineComputeDevice
5 ios17.conv MLNeuralEngineComputeDevice
1 ios16.reduce_mean MLNeuralEngineComputeDevice
1 ios17.linear MLNeuralEngineComputeDevice
1 ios17.mul MLNeuralEngineComputeDevice
1 ios17.reshape MLNeuralEngineComputeDevice

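Interpretation: activation quantize/dequantize ops and the real conv/SiLU/mul/reduce/reshape/linear ops are NE-preferred and CPU/GPU/NE-supported in the compute plan. The 74 ops with no preferred device are the 68 const ops and the 6 constexpr_affine_dequantize ops, which do not appear in the OP x PREFERRED DEVICE table.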

Runtime Stability And Backend Comparison

probe_3_drift.py found stable same-backend repeated runs for all tested compute units:

== CPU_ONLY ==
raw_drift_seen: False
argmax_drift_seen: False
worst_abs_diff: 0.0
ref_logits: [3.1015625 3.5722656]
ref_margin_0_minus_1: -0.470703125
ref_argmax: 1

== CPU_AND_GPU ==
raw_drift_seen: False
argmax_drift_seen: False
worst_abs_diff: 0.0
ref_logits: [3.1015625 3.5722656]
ref_margin_0_minus_1: -0.470703125
ref_argmax: 1

== CPU_AND_NE ==
raw_drift_seen: False
argmax_drift_seen: False
worst_abs_diff: 0.0
ref_logits: [4.4414062 3.6835938]
ref_margin_0_minus_1: 0.7578125
ref_argmax: 0

== ALL ==
raw_drift_seen: False
argmax_drift_seen: False
worst_abs_diff: 0.0
ref_logits: [4.4414062 3.6835938]
ref_margin_0_minus_1: 0.7578125
ref_argmax: 0

Cross-backend comparison versus CPU:

CPU_AND_GPU
array_equal: True
max_abs_diff: 0.0
argmax_equal: True
margin_delta_vs_cpu: 0.0

CPU_AND_NE
array_equal: False
max_abs_diff: 1.33984375
argmax_equal: False
delta_vs_cpu: [1.3398438  0.11132812]
margin_delta_vs_cpu: 1.228515625

ALL
array_equal: False
max_abs_diff: 1.33984375
argmax_equal: False
delta_vs_cpu: [1.3398438  0.11132812]
margin_delta_vs_cpu: 1.228515625

Conclusion

Stage 3 passes the planned gates:

  • Activation quantization evidence: yes, 11 quantize and 11 dequantize ops with INT8 quantize outputs.
  • Weight quantization evidence: yes, 6 constexpr_affine_dequantize ops.
  • NE placement: yes, quantize/dequantize and real ops are NE-preferred in the compute plan.
  • Same-backend stability: yes, repeated outputs were stable for CPU_ONLY, CPU_AND_GPU, CPU_AND_NE, and ALL.
  • CPU-vs-ANE delta: max absolute logit delta was 1.33984375; margin delta was 1.228515625.
  • CPU-vs-ANE argmax flip: yes. CPU_ONLY/CPU_AND_GPU returned class 1; CPU_AND_NE/ALL returned class 0.

Honest claim: a Core ML MLProgram with activation quantize/dequantize ops, int8 activation tensors, compressed int8 weights, and NE-preferred placement can be stable within each backend while producing CPU-vs-ANE logit differences large enough to flip a near-tie argmax. This does not show run-to-run nondeterminism, and it does not by itself prove the exact low-level arithmetic kernel implementation inside ANE.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment