linalg fusions
// Attention mask + max-reduce using named linalg ops.
// 3 kernels: broadcast mask → add to scores → max-reduce over j.
// After generalize + fuse: broadcast and add are inlined into the reduction.
func.func @fuse_mask_into_max_reduce(
    %scores: tensor<4x512x512xf32>,
    %mask: tensor<512x512xf32>,
    %neg_inf_init: tensor<4x512xf32>) -> tensor<4x512xf32> {
  %init3d = tensor.empty() : tensor<4x512x512xf32>
  // Kernel 1: broadcast mask [512,512] -> [4,512,512] (replicate over batch)
  %mask_3d = linalg.broadcast
      ins(%mask : tensor<512x512xf32>)
      outs(%init3d : tensor<4x512x512xf32>) dimensions = [0]
  // Kernel 2: elementwise add scores + mask
  %masked = linalg.add
      ins(%scores, %mask_3d : tensor<4x512x512xf32>, tensor<4x512x512xf32>)
      outs(%init3d : tensor<4x512x512xf32>) -> tensor<4x512x512xf32>
  // Kernel 3: max-reduce over j (dim 2)
  %max = linalg.reduce ins(%masked : tensor<4x512x512xf32>)
      outs(%neg_inf_init : tensor<4x512xf32>) dimensions = [2]
      (%in: f32, %acc: f32) {
        %m = arith.maximumf %in, %acc : f32
        linalg.yield %m : f32
      }
  return %max : tensor<4x512xf32>
}
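The second file below drives the fusion from a lit RUN line. Assuming this first file is saved on its own (the file name here is illustrative), the same two passes can be applied directly:

mlir-opt mask_max_reduce.mlir --linalg-generalize-named-ops --linalg-fuse-elementwise-ops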
// RUN: mlir-opt %s --linalg-generalize-named-ops --linalg-fuse-elementwise-ops --split-input-file | FileCheck %s

func.func @fuse_square_into_reduce(
    %x: tensor<4x1024xf32>) -> tensor<4xf32> {
  %cst = arith.constant 0.0 : f32
  %init = tensor.empty() : tensor<4x1024xf32>
  %zero = tensor.empty() : tensor<4xf32>
  %fill = linalg.fill ins(%cst : f32) outs(%zero : tensor<4xf32>) -> tensor<4xf32>
  // Kernel 1: elementwise x^2
  %sq = linalg.square
      ins(%x : tensor<4x1024xf32>)
      outs(%init : tensor<4x1024xf32>) -> tensor<4x1024xf32>
  // Kernel 2: sum-reduce over dim 1
  %sum = linalg.reduce ins(%sq : tensor<4x1024xf32>)
      outs(%fill : tensor<4xf32>) dimensions = [1]
      (%in: f32, %acc: f32) {
        %add = arith.addf %in, %acc : f32
        linalg.yield %add : f32
      }
  return %sum : tensor<4xf32>
}

// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0) -> (d0)>
// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1) -> (d0, d1)>
// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1) -> (d0)>

// CHECK-LABEL: func.func @fuse_square_into_reduce(
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x1024xf32>) -> tensor<4xf32> {
// CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[EMPTY_0:.*]] = tensor.empty() : tensor<4xf32>
// CHECK: %[[GENERIC_0:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_0]]], iterator_types = ["parallel"]} outs(%[[EMPTY_0]] : tensor<4xf32>) {
// CHECK: ^bb0(%[[VAL_0:.*]]: f32):
// CHECK: linalg.yield %[[CONSTANT_0]] : f32
// CHECK: } -> tensor<4xf32>
// CHECK: %[[GENERIC_1:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "reduction"]} ins(%[[ARG0]] : tensor<4x1024xf32>) outs(%[[GENERIC_0]] : tensor<4xf32>) {
// CHECK: ^bb0(%[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: f32):
// CHECK: %[[MULF_0:.*]] = arith.mulf %[[VAL_1]], %[[VAL_1]] : f32
// CHECK: %[[ADDF_0:.*]] = arith.addf %[[MULF_0]], %[[VAL_2]] : f32
// CHECK: linalg.yield %[[ADDF_0]] : f32
// CHECK: } -> tensor<4xf32>
// CHECK: return %[[GENERIC_1]] : tensor<4xf32>
// CHECK: }

// -----

// NLL loss: loss = -Σ_c q[c] * log(p[c])
// Three kernels: log (elementwise) -> mul (elementwise) -> reduce (sum)
// After generalize + fuse: log and mul are inlined into the reduction body.
func.func @fuse_nll_into_reduce(
    %p: tensor<1024xf32>,
    %q: tensor<1024xf32>) -> tensor<f32> {
  %cst = arith.constant 0.0 : f32
  %init = tensor.empty() : tensor<1024xf32>
  %zero = tensor.empty() : tensor<f32>
  %fill = linalg.fill ins(%cst : f32) outs(%zero : tensor<f32>) -> tensor<f32>
  // Kernel 1: elementwise log(p)
  %log_p = linalg.log
      ins(%p : tensor<1024xf32>)
      outs(%init : tensor<1024xf32>) -> tensor<1024xf32>
  // Kernel 2: elementwise q * log(p)
  %qlogp = linalg.mul
      ins(%q, %log_p : tensor<1024xf32>, tensor<1024xf32>)
      outs(%init : tensor<1024xf32>) -> tensor<1024xf32>
  // Kernel 3: sum-reduce over classes
  %sum = linalg.reduce ins(%qlogp : tensor<1024xf32>)
      outs(%fill : tensor<f32>) dimensions = [0]
      (%in: f32, %acc: f32) {
        %add = arith.addf %in, %acc : f32
        linalg.yield %add : f32
      }
  return %sum : tensor<f32>
}
// CHECK: #[[$ATTR_3:.+]] = affine_map<() -> ()>
// CHECK: #[[$ATTR_4:.+]] = affine_map<(d0) -> ()>

// CHECK-LABEL: func.func @fuse_nll_into_reduce(
// CHECK-SAME: %[[ARG0:.*]]: tensor<1024xf32>,
// CHECK-SAME: %[[ARG1:.*]]: tensor<1024xf32>) -> tensor<f32> {
// CHECK: %[[CONSTANT_0:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[EMPTY_0:.*]] = tensor.empty() : tensor<f32>
// CHECK: %[[GENERIC_0:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_3]]], iterator_types = []} outs(%[[EMPTY_0]] : tensor<f32>) {
// CHECK: ^bb0(%[[VAL_0:.*]]: f32):
// CHECK: linalg.yield %[[CONSTANT_0]] : f32
// CHECK: } -> tensor<f32>
// CHECK: %[[GENERIC_1:.*]] = linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_4]]], iterator_types = ["reduction"]} ins(%[[ARG1]], %[[ARG0]] : tensor<1024xf32>, tensor<1024xf32>) outs(%[[GENERIC_0]] : tensor<f32>) {
// CHECK: ^bb0(%[[VAL_1:.*]]: f32, %[[VAL_2:.*]]: f32, %[[VAL_3:.*]]: f32):
// CHECK: %[[LOG_0:.*]] = math.log %[[VAL_2]] : f32
// CHECK: %[[MULF_0:.*]] = arith.mulf %[[VAL_1]], %[[LOG_0]] : f32
// CHECK: %[[ADDF_0:.*]] = arith.addf %[[MULF_0]], %[[VAL_3]] : f32
// CHECK: linalg.yield %[[ADDF_0]] : f32
// CHECK: } -> tensor<f32>
// CHECK: return %[[GENERIC_1]] : tensor<f32>
// CHECK: }
❯ what does this mean "Attention mask + max-reduce (first pass of softmax)."
⏺ Softmax is computed in two passes for numerical stability:
Pass 1 (this test): Find the max across each row
max_j = max_over_j(scores[b, i, j] + mask[i, j])
Pass 2: Subtract max, exponentiate, sum, divide
softmax[b,i,j] = exp(scores[b,i,j] + mask[i,j] - max_j[b,i]) / Σ_j exp(...)
The mask is typically a causal mask (upper triangle = -inf, lower triangle = 0.0) so that attention to future tokens is zeroed out after exponentiation. You add the mask before taking the max so that masked positions (now -inf) can never set the row max; the max subtracted in pass 2 then reflects only positions that actually contribute, which avoids precision problems when exponentiating.
This test shows pass 1: applying the mask and computing the row-wise max, fused into one kernel.
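A small NumPy sketch of the two passes, using the same shapes as the gist's first file; the random scores and the causal-mask construction are illustrative assumptions, not taken from the gist:

import numpy as np

B, S = 4, 512
scores = np.random.randn(B, S, S).astype(np.float32)           # stand-in for Q @ K^T / sqrt(d)
mask = np.triu(np.full((S, S), -np.inf, dtype=np.float32), 1)   # causal: -inf above the diagonal, 0 elsewhere

masked = scores + mask                          # kernel 2 in the gist
row_max = masked.max(axis=-1, keepdims=True)    # pass 1 / kernel 3: max-reduce over j
exps = np.exp(masked - row_max)                 # pass 2: subtract max, exponentiate
softmax = exps / exps.sum(axis=-1, keepdims=True)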
❯ "Softmax is computed in two passes for numerical stability" where does this usually happen?
⏺ In a transformer's self-attention block, right after computing the attention scores Q @ K^T / sqrt(d) and before multiplying by V:
scores = Q @ K^T / sqrt(d_k) # [B, heads, S, S]
scores = scores + mask # causal mask
attn_weights = softmax(scores) # along last dim
output = attn_weights @ V
The two-pass softmax (usually called "safe softmax") happens inside that softmax call. Every deep learning framework does it: PyTorch's F.softmax, JAX's jax.nn.softmax, etc. If you computed exp(x) / sum(exp(x)) naively, large values in x would overflow exp() to inf. Subtracting the max first keeps everything in a safe numerical range:
pass 1: m = max(x)
pass 2: softmax = exp(x - m) / Σ exp(x - m)
Flash Attention (Dao et al.) fuses both passes and the matmuls into a single tiled kernel by maintaining running max/sum statistics — that's the "online" trick that avoids materializing the full S x S attention matrix.
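A tiny numeric illustration of why the max is subtracted; the input values are made up for the example:

import numpy as np

x = np.array([1000.0, 1001.0, 1002.0], dtype=np.float32)

naive = np.exp(x) / np.exp(x).sum()               # exp overflows to inf, and inf/inf gives nan
safe = np.exp(x - x.max()) / np.exp(x - x.max()).sum()

print(naive)   # [nan nan nan]
print(safe)    # roughly [0.09, 0.245, 0.665]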
❯ why is the mask added instead of multiplied - i thought attention was multiplicative
⏺ The attention mechanism itself is multiplicative (Q times K^T), but the mask is additive. They work in different spaces:
softmax(scores + (-inf)) = exp(-inf) / Σ... = 0
softmax(scores + 0) = exp(scores) / Σ... = unchanged
So mask = 0 means "keep" and mask = -inf means "zero out". If you multiplied the scores by 0 instead, a masked score would become 0 and exponentiate to exp(0) = 1, so that position would still get nonzero attention weight (zeroing every score gives the uniform 1/S) rather than being excluded.
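A short NumPy comparison of the two masking styles; the score and mask values are assumptions chosen for the example:

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

scores = np.array([2.0, 1.0, 3.0])

additive = softmax(scores + np.array([0.0, 0.0, -np.inf]))   # last position masked out
multiplied = softmax(scores * np.array([1.0, 1.0, 0.0]))     # last score forced to 0

print(additive)    # last weight is exactly 0.0
print(multiplied)  # last weight is exp(0)/Z > 0, so it still attends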