@kostakoff
Last active March 20, 2026 04:31
Alibaba MNN benchmark on a DGX Spark

A step-by-step guide to building MNN with the CUDA backend (this assumes git, CMake, and the CUDA 13.0 toolkit are installed):

mkdir mnn && cd mnn
# Get the code
git clone https://github.com/alibaba/MNN.git
cd MNN

# Reset repo to a specific commit
git reset --hard b1d06d68b3366183d157f0703d7b8a8b61ae55b3

# Save the diff at the end of this gist as mnn/my_changes.patch,
# then apply it (fixes the build for CUDA 13.0)
git apply ../my_changes.patch

mkdir build && cd build
# Configure the project: enable the CUDA backend, the LLM engine,
# and fused transformer ops
cmake .. \
  -DMNN_CUDA=ON \
  -DMNN_BUILD_LLM=ON \
  -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \
  -DCMAKE_BUILD_TYPE=Release

# Build libraries and executable binaries
cmake --build . --config Release -j$(nproc)

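Before benchmarking, a quick sanity check is worthwhile. A minimal sketch, run from the build directory; the library name follows from the MNN_Cuda_Main target in the patched CMakeLists.txt:

# The CUDA backend library and the benchmark binary should both exist
ls -lh libMNN_Cuda_Main.so llm_bench
# The GPU should be visible to the driver
nvidia-smi
# llm_bench prints its usage text, including the cuda backend option
./llm_bench -h
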
How to run the benchmark: -m points at the model's config.json, -a selects the backend, -c 2 requests low precision (2 = Low in the usage text), -p 512 sets the prompt length, and -n 128 the number of tokens to generate:

./MNN/build/llm_bench -m /path/to/qwen/config.json -a cuda -c 2 -p 512 -n 128 -kv true -rep 3
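The patched usage string in llm_bench.cpp (see below) advertises a comma-separated backend list, so CPU and CUDA results can be collected in a single invocation. A minimal sketch, assuming the same model path as above:

# Benchmark the CPU and CUDA backends back to back for comparison
./MNN/build/llm_bench -m /path/to/qwen/config.json -a cpu,cuda -c 2 -p 512 -n 128 -kv true -rep 3

The remainder of this gist is my_changes.patch itself. It adds the Blackwell architectures (sm_120/sm_121; the DGX Spark's GB10 GPU reports compute capability 12.1) to the CUDA build, stops emitting the Pascal and Volta gencode flags that CUDA 13 no longer accepts, enables separable compilation for the CUDA target, moves the softmax kernels into a shared SoftmaxKernels.cuh so both SoftmaxExecution.cu and AttentionExecution.cu can use them, swaps the C++11 make_unique polyfill for std::make_unique, and teaches llm_bench to recognize and report the CUDA backend: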
diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt
index 2295f2ad..d1b2f508 100644
--- a/source/backend/cuda/CMakeLists.txt
+++ b/source/backend/cuda/CMakeLists.txt
@@ -29,20 +29,20 @@ if(CUDA_FOUND)
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -allow-unsupported-compiler")
ENDIF()
IF (${arch_count} EQUAL 1)
- set(support_archs 60 61 62 70 72 75 80 86 87 89)
+ set(support_archs 60 61 62 70 72 75 80 86 87 89 90 120 121)
list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index)
IF (${list_index} EQUAL -1)
message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!")
ENDIF()
ENDIF()
- IF ((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0"))
+ IF (((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0")) AND (CUDA_VERSION VERSION_LESS "13.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62")
ENDIF()
- IF ((CUDA_VERSION VERSION_GREATER "10.1") OR (CUDA_VERSION VERSION_EQUAL "10.1"))
+ IF (((CUDA_VERSION VERSION_GREATER "10.1") OR (CUDA_VERSION VERSION_EQUAL "10.1")) AND (CUDA_VERSION VERSION_LESS "13.0"))
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70")
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_72,code=sm_72")
ENDIF()
@@ -68,6 +68,12 @@ if(CUDA_FOUND)
add_definitions(-DMNN_CUDA_ENABLE_SM89 -DMNN_CUDA_ENABLE_SM89)
ENDIF()
+ IF ((CUDA_VERSION VERSION_GREATER "13.0") OR (CUDA_VERSION VERSION_EQUAL "13.0"))
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_120,code=sm_120")
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_121,code=sm_121")
+ add_definitions(-DMNN_CUDA_ENABLE_SM120)
+ ENDIF()
+
# Limit minimum cuda version for each archs
IF (${arch_count} EQUAL 1)
@@ -172,6 +178,7 @@ if(WIN32)
else()
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC})
+ set_property(TARGET MNN_Cuda_Main PROPERTY CUDA_SEPARABLE_COMPILATION ON)
if(MNN_CUDA_PROFILE)
target_compile_options(MNN_Cuda_Main PRIVATE -DMNN_CUDA_PROFILE)
target_link_libraries(MNN_Cuda_Main ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so)
diff --git a/source/backend/cuda/SelectCudaComputeArch.cmake b/source/backend/cuda/SelectCudaComputeArch.cmake
index 889e7af8..6d48f046 100755
--- a/source/backend/cuda/SelectCudaComputeArch.cmake
+++ b/source/backend/cuda/SelectCudaComputeArch.cmake
@@ -133,6 +133,13 @@ if(NOT CUDA_VERSION VERSION_LESS "11.8")
endif()
endif()
+if(NOT CUDA_VERSION VERSION_LESS "13.0")
+ list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Blackwell")
+ list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "12.0" "12.1")
+ list(APPEND CUDA_ALL_GPU_ARCHITECTURES "12.0" "12.1")
+ list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "12.0+PTX")
+endif()
+
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
@@ -240,7 +247,7 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
set(add_ptx TRUE)
set(arch_name ${CMAKE_MATCH_1})
endif()
- if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$")
+ if(arch_name MATCHES "^([0-9]+\\.[0-9]+(\\([0-9]+\\.[0-9]+\\))?)$")
set(arch_bin ${CMAKE_MATCH_1})
set(arch_ptx ${arch_bin})
else()
@@ -277,6 +284,9 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable)
elseif(${arch_name} STREQUAL "Hopper")
set(arch_bin 9.0)
set(arch_ptx 9.0)
+ elseif(${arch_name} STREQUAL "Blackwell")
+ set(arch_bin 12.0 12.1)
+ set(arch_ptx 12.0)
else()
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS")
endif()
diff --git a/source/backend/cuda/execution/AttentionExecution.cu b/source/backend/cuda/execution/AttentionExecution.cu
index 593afc56..80cad8c2 100644
--- a/source/backend/cuda/execution/AttentionExecution.cu
+++ b/source/backend/cuda/execution/AttentionExecution.cu
@@ -1,3 +1,4 @@
+#include "SoftmaxKernels.cuh"
#include "AttentionExecution.hpp"
#include "core/TensorUtils.hpp"
#include "SoftmaxExecution.hpp"
diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu
index 02077763..0e2f8093 100644
--- a/source/backend/cuda/execution/SoftmaxExecution.cu
+++ b/source/backend/cuda/execution/SoftmaxExecution.cu
@@ -1,3 +1,4 @@
+#include "SoftmaxKernels.cuh"
#include "SoftmaxExecution.hpp"
#include "core/TensorUtils.hpp"
namespace MNN {
@@ -35,7 +36,7 @@ __global__ void SOFTMAX(const T *input, T *output,
}
}
}
-
+/**
template <typename T>
__global__ void SOFTMAX_WARP_32(const T *input, T *output,
const int inside,
@@ -136,7 +137,7 @@ __global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output,
dst[(tid + i * per_block_size) * inside] = (T)(tmp_exp * divSumValue);
}
}
-}
+}*/
SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) {
mAxis = axis;
diff --git a/source/backend/cuda/execution/SoftmaxKernels.cuh b/source/backend/cuda/execution/SoftmaxKernels.cuh
new file mode 100644
index 00000000..57773aef
--- /dev/null
+++ b/source/backend/cuda/execution/SoftmaxKernels.cuh
@@ -0,0 +1,102 @@
+// SoftmaxKernels.cuh
+#pragma once
+#include <cuda_fp16.h>
+#include <float.h>
+
+namespace MNN {
+namespace CUDA {
+
+template <typename T>
+__global__ void SOFTMAX_WARP_32(const T *input, T *output,
+ const int inside, const int axis, const int outside, const int count)
+{
+ int idx_outside = blockIdx.x / inside;
+ int idx_inside = blockIdx.x - idx_outside * inside;
+ auto src = input + idx_outside * axis * inside + idx_inside;
+
+ float local_src = -FLT_MAX;
+ __shared__ float maxValue;
+ __shared__ float sumValue;
+ int tid = threadIdx.x;
+ if (tid < axis) local_src = (float)(src[tid * inside]);
+
+ // warp reduce max
+ for (int offset = 16; offset > 0; offset >>= 1)
+ local_src = max(local_src, __shfl_down_sync(0xffffffff, local_src, offset));
+ if (tid == 0) maxValue = local_src;
+ __syncthreads();
+
+ float local_exp = 0.0f;
+ if (tid < axis) {
+ float tmp = local_src - maxValue;
+ tmp = (tmp < -87.0f) ? -87.0f : tmp;
+ local_exp = expf(tmp);
+ }
+
+ // warp reduce sum
+ float s = local_exp;
+ for (int offset = 16; offset > 0; offset >>= 1)
+ s += __shfl_down_sync(0xffffffff, s, offset);
+ if (tid == 0) sumValue = s;
+ __syncthreads();
+
+ if (tid < axis)
+ output[(idx_outside * axis + tid) * inside + idx_inside] = (T)(local_exp / sumValue);
+}
+
+template <typename T>
+__global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output,
+ const int inside, const int axis,
+ const int per_block_size, const int calc_multi_num,
+ const int outside, const int count)
+{
+ int idx_outside = blockIdx.x / inside;
+ int idx_inside = blockIdx.x - idx_outside * inside;
+ auto src = input + idx_outside * axis * inside + idx_inside;
+ auto dst = output + idx_outside * axis * inside + idx_inside;
+
+ int tid = threadIdx.x;
+ float local_max = -FLT_MAX;
+ for (int i = 0; i < calc_multi_num; i++) {
+ int pos = tid + i * per_block_size;
+ if (pos < axis) local_max = max(local_max, (float)(src[pos * inside]));
+ }
+ // block reduce max via shared mem
+ __shared__ float smem[256];
+ smem[tid] = local_max;
+ __syncthreads();
+ for (int s = blockDim.x/2; s > 0; s >>= 1) {
+ if (tid < s) smem[tid] = max(smem[tid], smem[tid+s]);
+ __syncthreads();
+ }
+ float maxValue = smem[0];
+
+ float local_sum = 0.0f;
+ for (int i = 0; i < calc_multi_num; i++) {
+ int pos = tid + i * per_block_size;
+ if (pos < axis) {
+ float tmp = (float)(src[pos * inside]) - maxValue;
+ tmp = (tmp < -87.0f) ? -87.0f : tmp;
+ local_sum += expf(tmp);
+ }
+ }
+ smem[tid] = local_sum;
+ __syncthreads();
+ for (int s = blockDim.x/2; s > 0; s >>= 1) {
+ if (tid < s) smem[tid] += smem[tid+s];
+ __syncthreads();
+ }
+ float divSum = 1.0f / smem[0];
+
+ for (int i = 0; i < calc_multi_num; i++) {
+ int pos = tid + i * per_block_size;
+ if (pos < axis) {
+ float tmp = (float)(src[pos * inside]) - maxValue;
+ tmp = (tmp < -87.0f) ? -87.0f : tmp;
+ dst[pos * inside] = (T)(expf(tmp) * divSum);
+ }
+ }
+}
+
+} // namespace CUDA
+} // namespace MNN
\ No newline at end of file
diff --git a/transformers/llm/engine/src/tokenizer/jinja.hpp b/transformers/llm/engine/src/tokenizer/jinja.hpp
index f1ca3f88..55d20bb4 100644
--- a/transformers/llm/engine/src/tokenizer/jinja.hpp
+++ b/transformers/llm/engine/src/tokenizer/jinja.hpp
@@ -123,12 +123,13 @@ private:
namespace jinja {
-
+/*
// C++14 make_unique polyfill for C++11
template<typename T, typename... Args>
std::unique_ptr<T> make_unique(Args&&... args) {
return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
+}*/
+using std::make_unique;
inline std::string to_python_string(const json& val);
diff --git a/transformers/llm/engine/tools/llm_bench.cpp b/transformers/llm/engine/tools/llm_bench.cpp
index af5da9ea..93d9806b 100644
--- a/transformers/llm/engine/tools/llm_bench.cpp
+++ b/transformers/llm/engine/tools/llm_bench.cpp
@@ -381,6 +381,7 @@ struct markdownPrinter : public Printer {
} else if (field == "backend") {
if (t.backend == 1) value = "METAL";
else if (t.backend == 3) value = "OPENCL";
+ else if (t.backend == 4) value = "CUDA";
else value = "CPU";
} else if (field == "test") {
if (t.nPrompt > 0 && t.nGenerate == 0) {
@@ -486,6 +487,7 @@ struct jsonAggregator : public Printer {
writer.Key("backend");
if (t.backend == 1) writer.String("METAL");
else if (t.backend == 3) writer.String("OPENCL");
+ else if (t.backend == 4) writer.String("CUDA");
else writer.String("CPU");
writer.Key("threads");
@@ -801,7 +803,7 @@ static void printUsage(int /* argc */, char ** argv) {
printf("options:\n");
printf(" -h, --help\n");
printf(" -m, --model <filename> (default: ./Qwen2.5-1.5B-Instruct/config.json)\n");
- printf(" -a, --backends <cpu,opencl,metal> (default: %s)\n", "cpu");
+ printf(" -a, --backends <cpu,opencl,metal,cuda> (default: %s)\n", "cpu");
printf(" -c, --precision <n> (default: %s) | Note: (0:Normal(for cpu bakend, 'Normal' is 'High'),1:High,2:Low)\n", join(runtimeParamsDefaults.precision, ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(runtimeParamsDefaults.threads, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", join(testParamsDefaults.nPrompt, ",").c_str());
@@ -884,6 +886,8 @@ static bool parseCmdParams(int argc, char ** argv, RuntimeParameters & runtimePa
p.emplace_back(1);
} else if (type == "opencl") {
p.emplace_back(3);
+ } else if (type == "cuda") {
+ p.emplace_back(4);
} else {
p.emplace_back(0);
}
@@ -1061,7 +1065,7 @@ static Llm* buildLLM(const std::string& config_path, int backend, int memory, in
// Otherwise, mContext->history_tokens retains data after the first run, skewing true prefill performance metrics."
llmPtr->set_config(R"({"reuse_kv":false})");
std::map<int, std::string> lever = {{0,"normal"}, {1, "high"}, {2, "low"}};
- std::map<int, std::string> backend_type = {{0, "cpu"}, {1, "metal"}, {3, "opencl"}};
+ std::map<int, std::string> backend_type = {{0, "cpu"}, {1, "metal"}, {3, "opencl"}, {4, "cuda"}};
std::map<bool, std::string> mmap = {{true,"true"}, {false, "false"}};
bool setSuccess = true;