Demonstrates that the Apple Neural Engine (ANE) achieves roughly 1.9x higher throughput with INT8 W8A8 quantization than with FP16, consistent with a native INT8 datapath.
| Method | FP16 | INT8 W8A8 | Ratio |
|---|---|---|---|
| anemll-profile (CoreML) | 17.99 TFLOPS, 15.3 ms | 33.86 TOPS, 8.2 ms | 1.88x |
| Private API (wall-clock) | 18.22 TFLOPS, 15.1 ms | 34.22 TOPS, 8.0 ms | 1.88x |
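As a quick arithmetic check (a sketch using only figures from the profiler logs below), throughput is total work divided by latency, and the ratio column follows from the two throughputs:

```python
# Sanity-check the table: GFLOP / ms == TFLOP/s (or GOP / ms == TOP/s),
# using the work and latency figures from the CostModel reports below.
def tera_ops_per_s(gflop: float, ms: float) -> float:
    return gflop / ms

fp16 = tera_ops_per_s(274.816, 15.275)  # conv work / measured latency, FP16 report
print(round(fp16, 2))                   # 17.99 -> matches "17.99 TFLOPS"
print(round(33.8624 / 17.9910, 2))      # 1.88  -> the INT8 W8A8 : FP16 ratio
```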
```
$ OS_ACTIVITY_DT_MODE=YES anemll-profile /tmp/int8_compute_fp16_512.mlpackage
═══════════════════════════════════════════════════════════════
ANE CostModel Report: int8_compute_fp16_512.mlpackage
═══════════════════════════════════════════════════════════════
Model size: 64.1 MB
Format: ML Program
Compute: CPU+ANE
Total ops: 896
ANE ops: 128 (100.0% of cost)
Op Type                          Count      ms/op   Total ms    GFLOP     GB/s  Share  Bound
──────────────────────────────── ───── ────────── ────────── ──────── ──────── ────── ──────
conv                               128   1.800458    230.459 274.8160     4.72 100.0%   Comp
Measured: 15.275 ms/prediction (65.5 iter/s, 10 runs)
Compute: 17990.99 GFLOP/s (17.9910 TFLOPS)
Speedup: 15.1x vs sequential estimate
```
```
$ OS_ACTIVITY_DT_MODE=YES anemll-profile /tmp/int8_compute_w8a8_512.mlpackage
═══════════════════════════════════════════════════════════════
ANE CostModel Report: int8_compute_w8a8_512.mlpackage
═══════════════════════════════════════════════════════════════
Model size: 32.2 MB
Format: ML Program
Compute: CPU+ANE
Total ops: 1531
ANE ops: 382 (100.0% of cost)
Op Type                          Count      ms/op   Total ms    GFLOP     GB/s  Share  Bound
──────────────────────────────── ───── ────────── ────────── ──────── ──────── ────── ──────
conv                               128   1.800458    230.459 274.8160     4.72  89.2%   Comp
quantize                           127   0.103401     13.132   0.5326    58.03   5.1%    Mem
dequantize                         127   0.103401     13.132   0.5326    58.03   5.1%    Mem
ANE Op Types: ios18.conv (128), ios18.quantize (127), ios18.dequantize (127)
Measured: 8.151 ms/prediction (122.7 iter/s, 10 runs)
Compute: 33862.37 GFLOP/s (33.8624 TOPS)
Speedup: 31.7x vs sequential estimate
```
```
$ ./inmem_peak 1 64
=== Programmatic MIL → In-Memory ANE Peak (FP16, batch=1, sp=64x64) ===
ANE hw: type=h17, numANEs=1, numCores=2, QoS=0 (RealTime)
IP clock=2448 MHz, MAC clock=306.0 MHz
20.05 TFLOPS/cluster × 1 = 20.05 TFLOPS system
Config                  W(MB)    GFLOP    ms/eval     TFLOPS  %peak  Est.MHz
-------------------------------------------------------------------------------
128x conv 512ch 64x64    64.0   274.88  15.083 ms      18.22  90.9%      278
96x conv 512ch 64x64     48.0   206.16  11.401 ms      18.08  90.2%      276
64x conv 512ch 64x64     32.0   137.44   7.749 ms      17.74  88.4%      271
128x conv 384ch 64x64    36.0   154.62   8.727 ms      17.72  88.3%      270
256x conv 256ch 64x64    32.0   137.44   7.837 ms      17.54  87.4%      268
128x conv 256ch 64x64    16.0    68.72   4.012 ms      17.13  85.4%      261
```
```
$ ./inmem_peak_int8
=== In-Memory ANE Peak INT8 W8A8 (batch=1, sp=64x64) ===
ANE hw: type=h17, numANEs=1, numCores=2, QoS=0 (RealTime)
IP clock=2448 MHz, MAC clock=306.0 MHz
FP16 peak: 20.05 TFLOPS/cluster
INT8 peak (if native): 40.11 TOPS/cluster
Data: fp16(in) → [conv(W8) → quant → dequant] × N → conv(W8) → fp16(out)
Config                  W(MB)      GOP    ms/eval       TOPS   %fp16  %int8  Est.MHz
------------------------------------------------------------------------------------
128x conv 512ch 64x64    32.0   274.88   8.032 ms      34.22  170.6%  85.3%      522
96x conv 512ch 64x64     24.0   206.16   6.012 ms      34.29  171.0%  85.5%      523
64x conv 512ch 64x64     16.0   137.44   4.111 ms      33.44  166.7%  83.4%      510
128x conv 384ch 64x64    18.0   154.62   4.663 ms      33.16  165.3%  82.7%      506
128x conv 256ch 64x64     8.0    68.72   2.235 ms      30.74  153.3%  76.7%      469
```
Note: the `%fp16`, `%int8`, and `Est.MHz` columns assume 16 MAC arrays × 2048 MACs per cluster at the estimated clock. These are derived estimates, not hardware-measured counters.
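That derivation can be reproduced in a few lines. A sketch, assuming (per the note) 16 MAC arrays × 2048 MACs per cluster and 2 ops per MAC per cycle:

```python
MACS = 16 * 2048   # MAC units per cluster (assumption from the note above)
OPS = 2            # one multiply + one accumulate per MAC per cycle

def peak_tflops(mac_mhz: float) -> float:
    # fp16 peak if every MAC retires one fp16 MAC per cycle
    return MACS * OPS * mac_mhz * 1e6 / 1e12

def est_mhz(achieved_tops: float) -> float:
    # implied MAC clock if the achieved rate came from fully busy MAC arrays
    return achieved_tops * 1e12 / (MACS * OPS) / 1e6

print(round(peak_tflops(306.0), 2))      # 20.05 -> "20.05 TFLOPS/cluster"
print(round(2 * peak_tflops(306.0), 2))  # 40.11 -> INT8 peak (if native, 2x rate)
print(round(est_mhz(18.22)))             # 278   -> Est.MHz for the 18.22 TFLOPS row
print(round(est_mhz(34.22)))             # 522   -> Est.MHz for the 34.22 TOPS row
```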
The speedup comes from halving L2 SRAM bandwidth for activations between tiles:
```
FP16 path: L2 --[2B/elem]--> MAC --[2B/elem]--> L2 (bottleneck)
INT8 path: L2 --[1B/elem]--> MAC --[1B/elem]--> L2 (2x bandwidth)
```
The `quantize`/`dequantize` ops between layers tell the ANE compiler to store activations as int8 in L2. Without them, activations stay fp16, doubling L2 traffic and lowering throughput.
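To put numbers on that, a sketch of the per-layer activation traffic for the benchmark's 512-channel, 64×64 configuration (tensor shape inferred from the configs above):

```python
C, H, W = 512, 64, 64        # activation tensor handed between two convs
elems = C * H * W

fp16_mib = elems * 2 / 2**20  # 2 bytes per element
int8_mib = elems * 1 / 2**20  # 1 byte per element
print(fp16_mib, int8_mib)     # 4.0 2.0 -> MiB per inter-layer transfer
print(fp16_mib / int8_mib)    # 2.0 -> the 2x L2 bandwidth saving
```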
`gen_int8_bench.py` creates FP16 and W8A8 `.mlpackage` files for benchmarking with `anemll-profile`. Both variants use the same `opset_version=ct.target.iOS18` and fp16 I/O for a fair comparison.
```sh
pip install coremltools numpy
python gen_int8_bench.py
# Creates: /tmp/int8_compute_fp16_512.mlpackage
#          /tmp/int8_compute_w8a8_512.mlpackage

# Then profile:
brew tap anemll/tap && brew install anemll-profile
OS_ACTIVITY_DT_MODE=YES anemll-profile /tmp/int8_compute_fp16_512.mlpackage
OS_ACTIVITY_DT_MODE=YES anemll-profile /tmp/int8_compute_w8a8_512.mlpackage
```

`inmem_peak_int8.m` generates MIL programmatically, then compiles and runs it directly on the ANE via `_ANEInMemoryModel`.
```sh
xcrun clang -O2 -Wall -fobjc-arc -framework Foundation -framework IOSurface -ldl \
    -o inmem_peak_int8 inmem_peak_int8.m
./inmem_peak_int8              # default: batch=1, sp=64x64
./inmem_peak_int8 1 64 --relu  # with relu (fused, free)
```

Data path:

```
fp16(in) → conv(int8_weights) → quantize(fp16→int8) → int8 in L2 → dequantize(int8→fp16) → next conv
```
- Weights: `constexpr_affine_dequantize` — int8 stored, dequantized at compile time
- Activations: `quantize`/`dequantize` between layers — stored as int8 in L2 SRAM
- Conv: MAC arrays operate on fp16 activations × int8 weights (or native int8×int8)
- ReLU: fused by the ANE post-processor, zero cost (use the `--relu` flag)
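For intuition, the activation `quantize`/`dequantize` pair is a plain affine int8 round trip. A minimal NumPy sketch (zero_point of 0 and a scale of 0.0625 are illustrative choices — 0.0625 is exactly representable in fp16, so this round trip is lossless; the real models pick per-tensor scales):

```python
import numpy as np

def quantize(x, scale):
    # fp16 -> int8: round to nearest, saturate to the int8 range, zero_point = 0
    return np.clip(np.round(x / scale), -128, 127).astype(np.int8)

def dequantize(q, scale):
    # int8 -> fp16
    return q.astype(np.float16) * np.float16(scale)

x = np.float16([0.5, -1.25, 3.0])
q = quantize(x, 0.0625)
print(q.tolist())                      # [8, -20, 48]
print(dequantize(q, 0.0625).tolist())  # [0.5, -1.25, 3.0] (exact for this scale)
```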
- macOS 15+ (Sequoia) with Apple Silicon (M1+)
- Xcode command line tools
- Python 3.10–3.12 with
coremltools >= 9.0andnumpy(for model generation)- Note:
coremltools 9.0does not yet support Python 3.14; use 3.12 if on Homebrew Python
- Note:
anemll-profilevia Homebrew (for CoreML profiling)
```
Mac NEO
Computer : MacBook Neo
CPU      : Apple A18 Pro
CoreML   : FP16 15.86 TOPS   INT8 30.53 TOPS
```