@kostakoff
Last active March 20, 2026 04:31
Alibaba MNN benchmark on a DGX Spark

A step-by-step guide to building MNN with the CUDA backend (this assumes git, CMake, and the CUDA 13.0 toolkit are installed):

mkdir mnn && cd mnn
# Get the code
git clone https://github.com/alibaba/MNN.git
cd MNN

# Reset repo to a specific commit
git reset --hard b1d06d68b3366183d157f0703d7b8a8b61ae55b3

# Save the diff at the end of this gist as mnn/my_changes.patch,
# then apply it (fixes the build for CUDA 13.0)
git apply ../my_changes.patch

mkdir build && cd build
# Configure the project: enable the CUDA backend, the LLM engine,
# and fused transformer ops
cmake .. \
  -DMNN_CUDA=ON \
  -DMNN_BUILD_LLM=ON \
  -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \
  -DCMAKE_BUILD_TYPE=Release

# Build libraries and executable binaries
cmake --build . --config Release -j$(nproc)

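Before benchmarking, a quick sanity check is worthwhile. A minimal sketch, run from the build directory; the library name follows from the MNN_Cuda_Main target in the patched CMakeLists.txt:

# The CUDA backend library and the benchmark binary should both exist
ls -lh libMNN_Cuda_Main.so llm_bench
# The GPU should be visible to the driver
nvidia-smi
# llm_bench prints its usage text, including the cuda backend option
./llm_bench -h
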
How to run the benchmark: -m points at the model's config.json, -a selects the backend, -c 2 requests low precision (2 = Low in the usage text), -p 512 sets the prompt length, and -n 128 the number of tokens to generate:

./MNN/build/llm_bench -m /path/to/qwen/config.json -a cuda -c 2 -p 512 -n 128 -kv true -rep 3
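The patched usage string in llm_bench.cpp (see below) advertises a comma-separated backend list, so CPU and CUDA results can be collected in a single invocation. A minimal sketch, assuming the same model path as above:

# Benchmark the CPU and CUDA backends back to back for comparison
./MNN/build/llm_bench -m /path/to/qwen/config.json -a cpu,cuda -c 2 -p 512 -n 128 -kv true -rep 3

The remainder of this gist is my_changes.patch itself. It adds the Blackwell architectures (sm_120/sm_121; the DGX Spark's GB10 GPU reports compute capability 12.1) to the CUDA build, stops emitting the Pascal and Volta gencode flags that CUDA 13 no longer accepts, enables separable compilation for the CUDA target, moves the softmax kernels into a shared SoftmaxKernels.cuh so both SoftmaxExecution.cu and AttentionExecution.cu can use them, swaps the C++11 make_unique polyfill for std::make_unique, and teaches llm_bench to recognize and report the CUDA backend: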
diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt
index 2295f2ad..d1b2f508 100644
--- a/source/backend/cuda/CMakeLists.txt
+++ b/source/backend/cuda/CMakeLists.txt
@@ -29,20 +29,20 @@ if(CUDA_FOUND)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -allow-unsupported-compiler")
ENDIF()
IF (${arch_count} EQUAL 1)
- set(support_archs 60 61 62 70 72 75 80 86 87 89)
+ set(support_archs 60 61 62 70 72 75 80 86 87 89 90 120 121)
list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index)
IF (${list_index} EQUAL -1)
message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!")
ENDIF()
ENDIF()
- IF ((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0"))
+ IF (((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0")) AND (CUDA_VERSION VERSION_LESS "13.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()
- IF ((CUDA_VERSION VERSION_GREATER "10.1") OR (CUDA_VERSION VERSION_EQUAL "10.1"))
+ IF (((CUDA_VERSION VERSION_GREATER "10.1") OR (CUDA_VERSION VERSION_EQUAL "10.1")) AND (CUDA_VERSION VERSION_LESS "13.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_72,code=sm_72")
ENDIF()
@@ -68,6 +68,12 @@ if(CUDA_FOUND)
add_definitions(-DMNN_CUDA_ENABLE_SM89 -DMNN_CUDA_ENABLE_SM89)
ENDIF()
+ IF ((CUDA_VERSION VERSION_GREATER "13.0") OR (CUDA_VERSION VERSION_EQUAL "13.0"))
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_120,code=sm_120")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_121,code=sm_121")
+ add_definitions(-DMNN_CUDA_ENABLE_SM120)
+ ENDIF()
+
# Limit minimum cuda version for each archs
IF (${arch_count} EQUAL 1)
@@ -172,6 +178,7 @@ if(WIN32)
else()
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC})
+ set_property(TARGET MNN_Cuda_Main PROPERTY CUDA_SEPARABLE_COMPILATION ON)
if(MNN_CUDA_PROFILE)
target_compile_options(MNN_Cuda_Main PRIVATE -DMNN_CUDA_PROFILE)
target_link_libraries(MNN_Cuda_Main ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so)
diff --git a/source/backend/cuda/SelectCudaComputeArch.cmake b/source/backend/cuda/SelectCudaComputeArch.cmake
index 889e7af8..6d48f046 100755
--- a/source/backend/cuda/SelectCudaComputeArch.cmake
+++ b/source/backend/cuda/SelectCudaComputeArch.cmake
@@ -133,6 +133,13 @@ if(NOT CUDA_VERSION VERSION_LESS "11.8")
endif()
endif()
+if(NOT CUDA_VERSION VERSION_LESS "13.0")
+ list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Blackwell")
+ list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "12.0" "12.1")
+ list(APPEND CUDA_ALL_GPU_ARCHITECTURES "12.0" "12.1")
+ list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "12.0+PTX")
+endif()
+
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
@@ -240,7 +247,7 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
- if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
+ if(arch_name MATCHES "^([0-9]+\\.[0-9]+(\\([0-9]+\\.[0-9]+\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
@@ -277,6 +284,9 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
elseif(${arch_name} STREQUAL "Hopper")
set(arch_bin 9.0)
set(arch_ptx 9.0)
+ elseif(${arch_name} STREQUAL "Blackwell")
+ set(arch_bin 12.0 12.1)
+ set(arch_ptx 12.0)
else()
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
endif()
diff --git a/source/backend/cuda/execution/AttentionExecution.cu b/source/backend/cuda/execution/AttentionExecution.cu
index 593afc56..80cad8c2 100644
--- a/source/backend/cuda/execution/AttentionExecution.cu
+++ b/source/backend/cuda/execution/AttentionExecution.cu
@@ -1,3 +1,4 @@
+#include "SoftmaxKernels.cuh"
#include "AttentionExecution.hpp"
#include "core/TensorUtils.hpp"
#include "SoftmaxExecution.hpp"
diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu
index 02077763..0e2f8093 100644
--- a/source/backend/cuda/execution/SoftmaxExecution.cu
+++ b/source/backend/cuda/execution/SoftmaxExecution.cu
@@ -1,3 +1,4 @@
+#include "SoftmaxKernels.cuh"
#include "SoftmaxExecution.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
@@ -35,7 +36,7 @@ __global__ void SOFTMAX(const T *input, T *output,
}
}
}
-
+/**
template <typename T>
__global__ void SOFTMAX_WARP_32(const T *input, T *output,
const int inside,
@@ -136,7 +137,7 @@ __global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output,
dst[(tid + i * per_block_size) * inside] = (T)(tmp_exp * divSumValue);
}
}
-}
+}*/
SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) {
mAxis = axis;
diff --git a/source/backend/cuda/execution/SoftmaxKernels.cuh b/source/backend/cuda/execution/SoftmaxKernels.cuh
new file mode 100644
index 00000000..57773aef
--- /dev/null
+++ b/source/backend/cuda/execution/SoftmaxKernels.cuh
@@ -0,0 +1,102 @@
+// SoftmaxKernels.cuh
+#pragma once
+#include <cuda_fp16.h>
+#include <float.h>
+
+namespace MNN {
+namespace CUDA {
+
+template <typename T>
+__global__ void SOFTMAX_WARP_32(const T *input, T *output,
+ const int inside, const int axis, const int outside, const int count)
+{
+ int idx_outside = blockIdx.x / inside;
+ int idx_inside = blockIdx.x - idx_outside * inside;
+ auto src = input + idx_outside * axis * inside + idx_inside;
+
+ float local_src = -FLT_MAX;
+ __shared__ float maxValue;
+ __shared__ float sumValue;
+ int tid = threadIdx.x;
+ if (tid < axis) local_src = (float)(src[tid * inside]);
+
+ // warp reduce max
+ for (int offset = 16; offset > 0; offset >>= 1)
+ local_src = max(local_src, __shfl_down_sync(0xffffffff, local_src, offset));
+ if (tid == 0) maxValue = local_src;
+ __syncthreads();
+
+ float local_exp = 0.0f;
+ if (tid < axis) {
+ float tmp = local_src - maxValue;
+ tmp = (tmp < -87.0f) ? -87.0f : tmp;
+ local_exp = expf(tmp);
+ }
+
+ // warp reduce sum
+ float s = local_exp;
+ for (int offset = 16; offset > 0; offset >>= 1)
+ s += __shfl_down_sync(0xffffffff, s, offset);
+ if (tid == 0) sumValue = s;
+ __syncthreads();
+
+ if (tid < axis)
+ output[(idx_outside * axis + tid) * inside + idx_inside] = (T)(local_exp / sumValue);
+}
+
+template <typename T>
+__global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output,
+ const int inside, const int axis,
+ const int per_block_size, const int calc_multi_num,
+ const int outside, const int count)
+{
+ int idx_outside = blockIdx.x / inside;
+ int idx_inside = blockIdx.x - idx_outside * inside;
+ auto src = input + idx_outside * axis * inside + idx_inside;
+ auto dst = output + idx_outside * axis * inside + idx_inside;
+
+ int tid = threadIdx.x;
+ float local_max = -FLT_MAX;
+ for (int i = 0; i < calc_multi_num; i++) {
+ int pos = tid + i * per_block_size;
+ if (pos < axis) local_max = max(local_max, (float)(src[pos * inside]));
+ }
+ // block reduce max via shared mem
+ __shared__ float smem[256];
+ smem[tid] = local_max;
+ __syncthreads();
+ for (int s = blockDim.x/2; s > 0; s >>= 1) {
+ if (tid < s) smem[tid] = max(smem[tid], smem[tid+s]);
+ __syncthreads();
+ }
+ float maxValue = smem[0];
+
+ float local_sum = 0.0f;
+ for (int i = 0; i < calc_multi_num; i++) {
+ int pos = tid + i * per_block_size;
+ if (pos < axis) {
+ float tmp = (float)(src[pos * inside]) - maxValue;
+ tmp = (tmp < -87.0f) ? -87.0f : tmp;
+ local_sum += expf(tmp);
+ }
+ }
+ smem[tid] = local_sum;
+ __syncthreads();
+ for (int s = blockDim.x/2; s > 0; s >>= 1) {
+ if (tid < s) smem[tid] += smem[tid+s];
+ __syncthreads();
+ }
+ float divSum = 1.0f / smem[0];
+
+ for (int i = 0; i < calc_multi_num; i++) {
+ int pos = tid + i * per_block_size;
+ if (pos < axis) {
+ float tmp = (float)(src[pos * inside]) - maxValue;
+ tmp = (tmp < -87.0f) ? -87.0f : tmp;
+ dst[pos * inside] = (T)(expf(tmp) * divSum);
+ }
+ }
+}
+
+} // namespace CUDA
+} // namespace MNN
\ No newline at end of file
diff --git a/transformers/llm/engine/src/tokenizer/jinja.hpp b/transformers/llm/engine/src/tokenizer/jinja.hpp
index f1ca3f88..55d20bb4 100644
--- a/transformers/llm/engine/src/tokenizer/jinja.hpp
+++ b/transformers/llm/engine/src/tokenizer/jinja.hpp
@@ -123,12 +123,13 @@ private:
namespace jinja {
-
+/*
// C++14 make_unique polyfill for C++11
template<typename T, typename... Args>
std::unique_ptr<T> make_unique(Args&&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
+}*/
+using std::make_unique;
inline std::string to_python_string(const json& val);
diff --git a/transformers/llm/engine/tools/llm_bench.cpp b/transformers/llm/engine/tools/llm_bench.cpp
index af5da9ea..93d9806b 100644
--- a/transformers/llm/engine/tools/llm_bench.cpp
+++ b/transformers/llm/engine/tools/llm_bench.cpp
@@ -381,6 +381,7 @@ struct markdownPrinter : public Printer {
} else if (field == "backend") {
if (t.backend == 1) value = "METAL";
else if (t.backend == 3) value = "OPENCL";
+ else if (t.backend == 4) value = "CUDA";
else value = "CPU";
} else if (field == "test") {
if (t.nPrompt > 0 && t.nGenerate == 0) {
@@ -486,6 +487,7 @@ struct jsonAggregator : public Printer {
writer.Key("backend");
if (t.backend == 1) writer.String("METAL");
else if (t.backend == 3) writer.String("OPENCL");
+ else if (t.backend == 4) writer.String("CUDA");
else writer.String("CPU");
writer.Key("threads");
@@ -801,7 +803,7 @@ static void printUsage(int /* argc */, char ** argv) {
printf("options:\n");
printf(" -h, --help\n");
printf(" -m, --model <filename> (default: ./Qwen2.5-1.5B-Instruct/config.json)\n");
- printf(" -a, --backends <cpu,opencl,metal> (default: %s)\n", "cpu");
+ printf(" -a, --backends <cpu,opencl,metal,cuda> (default: %s)\n", "cpu");
printf(" -c, --precision <n> (default: %s) | Note: (0:Normal(for cpu bakend, 'Normal' is 'High'),1:High,2:Low)\n", join(runtimeParamsDefaults.precision, ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(runtimeParamsDefaults.threads, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(testParamsDefaults.nPrompt, ",").c_str());
@@ -884,6 +886,8 @@ static bool parseCmdParams(int argc, char ** argv, RuntimeParameters & runtimePa
p.emplace_back(1);
} else if (type == "opencl") {
p.emplace_back(3);
+ } else if (type == "cuda") {
+ p.emplace_back(4);
} else {
p.emplace_back(0);
}
@@ -1061,7 +1065,7 @@ static Llm* buildLLM(const std::string& config_path, int backend, int memory, in
// Otherwise, mContext->history_tokens retains data after the first run, skewing true prefill performance metrics."
llmPtr->set_config(R"({"reuse_kv":false})");
std::map<int, std::string> lever = {{0,"normal"}, {1, "high"}, {2, "low"}};
- std::map<int, std::string> backend_type = {{0, "cpu"}, {1, "metal"}, {3, "opencl"}};
+ std::map<int, std::string> backend_type = {{0, "cpu"}, {1, "metal"}, {3, "opencl"}, {4, "cuda"}};
std::map<bool, std::string> mmap = {{true,"true"}, {false, "false"}};
bool setSuccess = true;