# import random
from memory import UnsafePointer
from compile.reflection import get_type_name
from sys.intrinsics import _type_is_eq
from sys.info import _current_target, is_triple
from gpu import *
from gpu.warp import *
from gpu.host import DeviceContext
from sys import has_accelerator, is_nvidia_gpu, llvm_intrinsic, is_amd_gpu
from layout import Layout, LayoutTensor
from layout.tensor_builder import LayoutTensorBuild as tb

fn _amdgcn_dpp[
    dtype: DType,
    width: Int, //,
    dpp_ctrl: UInt32,
    row_mask: UInt32 = 0b1111,
    bank_mask: UInt32 = 0b1111,
](old: SIMD[dtype, width], src: SIMD[dtype, width]) -> SIMD[dtype, width]:
    """Thin wrapper around the `llvm.amdgcn.update.dpp` intrinsic.

    The intrinsic is always invoked with `bound_ctrl` set to `False`.

    Parameters:
        dtype: Element type of the operands; must be 32 or 64 bits wide.
        width: SIMD width of the operands.
        dpp_ctrl: DPP control code forwarded to the intrinsic.
        row_mask: Row mask forwarded to the intrinsic (defaults to 0b1111).
        bank_mask: Bank mask forwarded to the intrinsic (defaults to 0b1111).

    Args:
        old: First operand of the intrinsic. Presumably the value a lane
            keeps when it has no valid source lane - confirm against the
            LLVM intrinsic documentation.
        src: Second operand of the intrinsic (the value being moved across
            lanes).

    Returns:
        The result of the intrinsic, with the same type as the operands.
    """
    # NOTE: the AMD-GPU-only target check is intentionally left disabled here.
    alias bits = dtype.bitwidth()
    constrained[
        bits == 32 or bits == 64, "Can only use DPP with 32/64-bit dtypes"
    ]()

    var bound_ctrl_off: Bool = False
    return llvm_intrinsic["llvm.amdgcn.update.dpp", SIMD[dtype, width]](
        old, src, dpp_ctrl, row_mask, bank_mask, bound_ctrl_off
    )


fn amdgcn_row_mirror[
    dtype: DType, width: Int, //
](old: SIMD[dtype, width], src: SIMD[dtype, width]) -> SIMD[dtype, width]:
    """Applies the DPP control code 0x140 (row mirror) to `src`.

    Note that, unlike the other helpers in this file, `old` comes first and
    has no default value.

    Args:
        old: Forwarded as the `old` operand of the DPP intrinsic.
        src: Forwarded as the `src` operand of the DPP intrinsic.

    Returns:
        The result of the DPP operation.
    """
    alias ROW_MIRROR: UInt32 = 0x140
    return _amdgcn_dpp[dpp_ctrl=ROW_MIRROR](old, src)


fn amdgcn_row_shift_left[
    dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies the DPP row-shift-left control code (0x100 + `offset`).

    Parameters:
        dtype: Element type of the operands.
        width: SIMD width of the operands.
        offset: Shift distance; must be in the range 1..15.

    Args:
        src: Forwarded as the `src` operand of the DPP intrinsic.
        old: Forwarded as the `old` operand of the DPP intrinsic
            (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        1 <= offset and offset <= 15, "Can only shift row by up to 15 positions"
    ]()
    alias ctrl = 0x100 + offset
    return _amdgcn_dpp[dpp_ctrl=ctrl](old, src)


fn amdgcn_row_rotate_left[
    dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Rotates each 16-lane row left by `offset` lanes using DPP.

    The DPP control encoding has no row-rotate-left code: 0x101-0x10F is row
    shift left, 0x111-0x11F is row shift right and 0x121-0x12F is row rotate
    right. A left rotation by `offset` within a 16-lane row is therefore
    issued as a right rotation by `16 - offset`.

    Parameters:
        dtype: Element type of the operands.
        width: SIMD width of the operands.
        offset: Rotation distance; must be in the range 1..15.

    Args:
        src: Forwarded as the `src` operand of the DPP intrinsic.
        old: Forwarded as the `old` operand of the DPP intrinsic
            (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        offset > 0 and offset < 16, "Can only rotate row by up to 15 positions"
    ]()
    # 0x120 is the row-rotate-right base; (16 - offset) stays within 1..15.
    return _amdgcn_dpp[dpp_ctrl = 0x120 + (16 - offset)](old, src)


fn amdgcn_shift_left[
    dtype: DType, width: Int, //
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies the DPP control code 0x130 to `src`.

    NOTE(review): 0x130 is the wavefront shift-left-by-one code in the DPP
    encoding, which is presumably not available on every GPU generation -
    confirm for the targets in use.

    Args:
        src: Forwarded as the `src` operand of the DPP intrinsic.
        old: Forwarded as the `old` operand of the DPP intrinsic
            (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    alias WAVE_SHIFT_LEFT_1: UInt32 = 0x130
    return _amdgcn_dpp[dpp_ctrl=WAVE_SHIFT_LEFT_1](old, src)


fn amdgcn_rotate_left[
    dtype: DType, width: Int, //
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies the DPP control code 0x134 to `src`.

    NOTE(review): 0x134 is the wavefront rotate-left-by-one code in the DPP
    encoding, which is presumably not available on every GPU generation -
    confirm for the targets in use.

    Args:
        src: Forwarded as the `src` operand of the DPP intrinsic.
        old: Forwarded as the `old` operand of the DPP intrinsic
            (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    alias WAVE_ROTATE_LEFT_1: UInt32 = 0x134
    return _amdgcn_dpp[dpp_ctrl=WAVE_ROTATE_LEFT_1](old, src)


fn amdgcn_row_read_lane[
    dtype: DType, width: Int, //, offset: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies the DPP control code 0x150 + `offset` to `src`.

    Per the constraint message, this broadcasts within each row, with
    `offset` selecting the lane (0-15).

    NOTE(review): the 0x150-0x15F control codes appear to be specific to
    certain GPU generations - confirm for the targets in use.

    Parameters:
        dtype: Element type of the operands.
        width: SIMD width of the operands.
        offset: Lane index within the row; must be in the range 0..15.

    Args:
        src: Forwarded as the `src` operand of the DPP intrinsic.
        old: Forwarded as the `old` operand of the DPP intrinsic
            (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        0 <= offset and offset <= 15, "Can only broadcast within each row (0-15)"
    ]()
    alias ctrl = 0x150 + offset
    return _amdgcn_dpp[dpp_ctrl=ctrl](old, src)


fn amdgcn_quad_perm[
    dtype: DType, width: Int, //, perm: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Applies a DPP quad permutation, using `perm` as the control code.

    Parameters:
        dtype: Element type of the operands.
        width: SIMD width of the operands.
        perm: Quad-permutation control code; must be in the range 0..0xFF.
            `amdgcn_quad_shuffle_xor` builds it as four 2-bit source-lane
            indices, with destination lane `i` in bits `2*i` and `2*i + 1`.

    Args:
        src: Forwarded as the `src` operand of the DPP intrinsic.
        old: Forwarded as the `old` operand of the DPP intrinsic
            (defaults to 0).

    Returns:
        The result of the DPP operation.
    """
    constrained[
        0 <= perm and perm < 0x100, "DPP_QUAD_PERM must be between 0 and 0xFF"
    ]()
    # Quad-perm control codes start at 0, so `perm` is the code itself.
    return _amdgcn_dpp[dpp_ctrl=perm](old, src)


fn amdgcn_quad_shuffle_xor[
    dtype: DType, width: Int, //, mask: Int
](src: SIMD[dtype, width], old: SIMD[dtype, width] = 0) -> SIMD[dtype, width]:
    """Shuffles values within each quad so lane `i` reads lane `i ^ mask`.

    The XOR pattern is turned into a quad-permutation control code at compile
    time and passed to `amdgcn_quad_perm`.

    Parameters:
        dtype: Element type of the operands.
        width: SIMD width of the operands.
        mask: XOR mask applied to the lane index within a quad; must be in
            the range 0..3.

    Args:
        src: Forwarded as the `src` argument of `amdgcn_quad_perm`.
        old: Forwarded as the `old` argument of `amdgcn_quad_perm`
            (defaults to 0).

    Returns:
        The result of the quad permutation.
    """
    constrained[
        0 <= mask and mask < 4, "Quad shuffle mask must be between 0 and 3"
    ]()

    # Two bits per destination lane: lane i's source index (i ^ mask) is
    # stored at bit position 2 * i.
    alias quad_perm_ctrl = (
        (0 ^ mask)
        | ((1 ^ mask) << 2)
        | ((2 ^ mask) << 4)
        | ((3 ^ mask) << 6)
    )
    return amdgcn_quad_perm[quad_perm_ctrl](src, old)