# import random
from memory import UnsafePointer
from compile.reflection import get_type_name
from sys.intrinsics import _type_is_eq
from sys.info import _current_target, is_triple
from gpu import *
from gpu.warp import *
from gpu.host import DeviceContext
from sys import has_accelerator, is_nvidia_gpu, llvm_intrinsic, is_amd_gpu
from layout import Layout, LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb


fn _amdgcn_dpp[
    dtype: DType,
    width: Int, //,
    dpp_ctrl: UInt32,
    row_mask: UInt32 = 0b1111,
    bank_mask: UInt32 = 0b1111,
](old: SIMD[dtype, width], src: SIMD[dtype, width]) -> SIMD[dtype, width]:
    """Thin wrapper around the `llvm.amdgcn.update.dpp` intrinsic.

    All public helpers in this file funnel through this function; they only
    differ in the `dpp_ctrl` value they select.

    Parameters:
        dtype: Element type of the operands (inferred). Must be 32 or 64 bits
            wide; this is enforced at compile time.
        width: SIMD width of the operands (inferred).
        dpp_ctrl: DPP control word forwarded to the intrinsic.
        row_mask: Row mask forwarded to the intrinsic (defaults to 0b1111).
        bank_mask: Bank mask forwarded to the intrinsic (defaults to 0b1111).

    Args:
        old: Forwarded as the intrinsic's first ("old") operand.
        src: Forwarded as the intrinsic's second ("src") operand.

    Returns:
        The value produced by `llvm.amdgcn.update.dpp`.
    """
    # constrained[is_amd_gpu()]()
    constrained[
        dtype.bitwidth() in (32, 64), "Can only use DPP with 32/64-bit dtypes"
    ]()
    # bound_ctrl is always passed as False from this wrapper.
    bound_ctrl = False
    return llvm_intrinsic["llvm.amdgcn.update.dpp", SIMD[dtype, width]](
        old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl
    )


fn amdgcn_row_mirror[
    dtype: DType, width: Int, //
](old: SIMD[dtype, width], src: SIMD[dtype, width]) -> SIMD[dtype, width]:
    """Applies DPP control 0x140 (DPP_ROW_MIRROR in the AMD ISA table).

    NOTE(review): unlike every other helper in this file, the argument order
    here is `(old, src)` and `old` has no default. It is kept as-is so that
    existing callers keep working.

    Args:
        old: Forwarded as the DPP "old" operand.
        src: Forwarded as the DPP "src" operand.

    Returns:
        The result of the DPP operation.
    """
    return _amdgcn_dpp[dpp_ctrl=0x140](old, src)


fn amdgcn_row_shift_left[
    dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies DPP control `0x100 + offset` (DPP_ROW_SL in the AMD ISA table).

    Parameters:
        dtype: Element type of the operands (inferred).
        width: SIMD width of the operands (inferred).
        offset: Shift distance; must satisfy 0 < offset < 16.

    Args:
        src: Forwarded as the DPP "src" operand.
        old: Forwarded as the DPP "old" operand (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        offset > 0 and offset < 16, "Can only shift row by up to 15 positions"
    ]()
    return _amdgcn_dpp[dpp_ctrl = 0x100 + offset](old, src)


fn amdgcn_row_rotate_left[
    dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Rotates each 16-lane row left by `offset` lanes using DPP.

    Parameters:
        dtype: Element type of the operands (inferred).
        width: SIMD width of the operands (inferred).
        offset: Rotation distance; must satisfy 0 < offset < 16.

    Args:
        src: Forwarded as the DPP "src" operand.
        old: Forwarded as the DPP "old" operand (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        offset > 0 and offset < 16, "Can only rotate row by up to 15 positions"
    ]()
    # The DPP_CTRL table only has a row rotate *right* (DPP_ROW_RR,
    # 0x120 + n). Within a 16-lane row, rotating left by `offset` is the same
    # permutation as rotating right by `16 - offset`, which stays in 1..15
    # thanks to the constraint above. Note that 0x110 + n would be DPP_ROW_SR
    # (row shift right), which is not a rotate at all.
    return _amdgcn_dpp[dpp_ctrl = 0x120 + (16 - offset)](old, src)


fn amdgcn_shift_left[
    dtype: DType, width: Int, //
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies DPP control 0x130 (DPP_WF_SL1 in the AMD ISA table).

    NOTE(review): the wavefront-wide DPP controls are not available on every
    AMD architecture; confirm against the targets this file is built for.

    Args:
        src: Forwarded as the DPP "src" operand.
        old: Forwarded as the DPP "old" operand (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    return _amdgcn_dpp[dpp_ctrl=0x130](old, src)


fn amdgcn_rotate_left[
    dtype: DType, width: Int, //
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies DPP control 0x134 (DPP_WF_RL1 in the AMD ISA table).

    NOTE(review): the wavefront-wide DPP controls are not available on every
    AMD architecture; confirm against the targets this file is built for.

    Args:
        src: Forwarded as the DPP "src" operand.
        old: Forwarded as the DPP "old" operand (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    return _amdgcn_dpp[dpp_ctrl=0x134](old, src)


fn amdgcn_row_read_lane[
    dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies DPP control `0x150 + offset`.

    NOTE(review): 0x150..0x15F is the row-share/row-broadcast range in the
    LLVM AMDGPU encoding; it is presumably only valid on newer targets —
    confirm for the architectures in use.

    Parameters:
        dtype: Element type of the operands (inferred).
        width: SIMD width of the operands (inferred).
        offset: Lane index within the row; must satisfy 0 <= offset < 16.

    Args:
        src: Forwarded as the DPP "src" operand.
        old: Forwarded as the DPP "old" operand (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        offset >= 0 and offset < 16, "Can only broadcast within each row (0-15)"
    ]()
    return _amdgcn_dpp[dpp_ctrl = 0x150 + offset](old, src)


fn amdgcn_quad_perm[
    dtype: DType, width: Int, //, perm: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies a DPP quad permutation; `perm` is used directly as `dpp_ctrl`.

    Parameters:
        dtype: Element type of the operands (inferred).
        width: SIMD width of the operands (inferred).
        perm: Quad-permute control value; must satisfy 0 <= perm <= 0xFF.

    Args:
        src: Forwarded as the DPP "src" operand.
        old: Forwarded as the DPP "old" operand (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        perm >= 0 and perm <= 0xFF, "DPP_QUAD_PERM must be between 0 and 0xFF"
    ]()
    return _amdgcn_dpp[dpp_ctrl = 0x0 + perm](old, src)


fn amdgcn_quad_shuffle_xor[
    dtype: DType, width: Int, //, mask: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Quad permutation in which position `i` selects position `i ^ mask`.

    The permutation is encoded at compile time and delegated to
    `amdgcn_quad_perm`.

    Parameters:
        dtype: Element type of the operands (inferred).
        width: SIMD width of the operands (inferred).
        mask: XOR mask applied to the in-quad index; must be in 0..3.

    Args:
        src: Forwarded to `amdgcn_quad_perm` as `src`.
        old: Forwarded to `amdgcn_quad_perm` as `old` (defaults to 0).

    Returns:
        The result of the quad permutation.
    """
    constrained[
        mask >= 0 and mask <= 3, "Quad shuffle mask must be between 0 and 3"
    ]()

    fn calculate_bitmask(xor: Int) -> Int:
        # Pack the selected index for each of the four quad positions into
        # two bits apiece: position i occupies bits [2*i, 2*i + 1].
        packed = 0
        for i in range(4):
            packed |= (i ^ xor) << (2 * i)
        return packed

    alias bitmask = calculate_bitmask(mask)
    return amdgcn_quad_perm[bitmask](src, old)