|
diff --git a/source/backend/cuda/CMakeLists.txt b/source/backend/cuda/CMakeLists.txt |
|
index 2295f2ad..d1b2f508 100644 |
|
--- a/source/backend/cuda/CMakeLists.txt |
|
+++ b/source/backend/cuda/CMakeLists.txt |
|
@@ -29,20 +29,20 @@ if(CUDA_FOUND) |
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -allow-unsupported-compiler") |
|
ENDIF() |
|
IF (${arch_count} EQUAL 1) |
|
- set(support_archs 60 61 62 70 72 75 80 86 87 89) |
|
+ set(support_archs 60 61 62 70 72 75 80 86 87 89 90 120 121) |
|
list(FIND support_archs ${CUDA_ARCH_FLAGS_readable_code} list_index) |
|
IF (${list_index} EQUAL -1) |
|
message(FATAL_ERROR "Please add your own sm arch ${CUDA_ARCH_FLAGS_readable_code} to CmakeLists.txt!") |
|
ENDIF() |
|
ENDIF() |
|
|
|
- IF ((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0")) |
|
+ IF (((CUDA_VERSION VERSION_GREATER "8.0") OR (CUDA_VERSION VERSION_EQUAL "8.0")) AND (CUDA_VERSION VERSION_LESS "13.0")) |
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") |
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") |
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") |
|
ENDIF() |
|
|
|
- IF ((CUDA_VERSION VERSION_GREATER "10.1") OR (CUDA_VERSION VERSION_EQUAL "10.1")) |
|
+ IF (((CUDA_VERSION VERSION_GREATER "10.1") OR (CUDA_VERSION VERSION_EQUAL "10.1")) AND (CUDA_VERSION VERSION_LESS "13.0")) |
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_70,code=sm_70") |
|
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_72,code=sm_72") |
|
ENDIF() |
|
@@ -68,6 +68,12 @@ if(CUDA_FOUND) |
|
add_definitions(-DMNN_CUDA_ENABLE_SM89 -DMNN_CUDA_ENABLE_SM89) |
|
ENDIF() |
|
|
|
+ IF ((CUDA_VERSION VERSION_GREATER "13.0") OR (CUDA_VERSION VERSION_EQUAL "13.0")) |
|
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_120,code=sm_120") |
|
+ set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_121,code=sm_121") |
|
+ add_definitions(-DMNN_CUDA_ENABLE_SM120) |
|
+ ENDIF() |
|
+ |
|
# Limit minimum cuda version for each archs |
|
|
|
IF (${arch_count} EQUAL 1) |
|
@@ -172,6 +178,7 @@ if(WIN32) |
|
else() |
|
|
|
cuda_add_library(MNN_Cuda_Main SHARED ${MNN_CUDA_SRC}) |
|
+ set_property(TARGET MNN_Cuda_Main PROPERTY CUDA_SEPARABLE_COMPILATION ON) |
|
if(MNN_CUDA_PROFILE) |
|
target_compile_options(MNN_Cuda_Main PRIVATE -DMNN_CUDA_PROFILE) |
|
target_link_libraries(MNN_Cuda_Main ${CUDA_INCLUDE_DIRS}/../lib/libnvToolsExt.so) |
|
diff --git a/source/backend/cuda/SelectCudaComputeArch.cmake b/source/backend/cuda/SelectCudaComputeArch.cmake |
|
index 889e7af8..6d48f046 100755 |
|
--- a/source/backend/cuda/SelectCudaComputeArch.cmake |
|
+++ b/source/backend/cuda/SelectCudaComputeArch.cmake |
|
@@ -133,6 +133,13 @@ if(NOT CUDA_VERSION VERSION_LESS "11.8") |
|
endif() |
|
endif() |
|
|
|
+if(NOT CUDA_VERSION VERSION_LESS "13.0") |
|
+ list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Blackwell") |
|
+ list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "12.0" "12.1") |
|
+ list(APPEND CUDA_ALL_GPU_ARCHITECTURES "12.0" "12.1") |
|
+ list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "12.0+PTX") |
|
+endif() |
|
+ |
|
################################################################################################ |
|
# A function for automatic detection of GPUs installed (if autodetection is enabled) |
|
# Usage: |
|
@@ -240,7 +247,7 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) |
|
set(add_ptx TRUE) |
|
set(arch_name ${CMAKE_MATCH_1}) |
|
endif() |
|
- if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") |
|
+ if(arch_name MATCHES "^([0-9]+\\.[0-9]+(\\([0-9]+\\.[0-9]+\\))?)$") |
|
set(arch_bin ${CMAKE_MATCH_1}) |
|
set(arch_ptx ${arch_bin}) |
|
else() |
|
@@ -277,6 +284,9 @@ function(CUDA_SELECT_NVCC_ARCH_FLAGS out_variable) |
|
elseif(${arch_name} STREQUAL "Hopper") |
|
set(arch_bin 9.0) |
|
set(arch_ptx 9.0) |
|
+ elseif(${arch_name} STREQUAL "Blackwell") |
|
+ set(arch_bin 12.0 12.1) |
|
+ set(arch_ptx 12.0) |
|
else() |
|
message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") |
|
endif() |
|
diff --git a/source/backend/cuda/execution/AttentionExecution.cu b/source/backend/cuda/execution/AttentionExecution.cu |
|
index 593afc56..80cad8c2 100644 |
|
--- a/source/backend/cuda/execution/AttentionExecution.cu |
|
+++ b/source/backend/cuda/execution/AttentionExecution.cu |
|
@@ -1,3 +1,4 @@ |
|
+#include "SoftmaxKernels.cuh" |
|
#include "AttentionExecution.hpp" |
|
#include "core/TensorUtils.hpp" |
|
#include "SoftmaxExecution.hpp" |
|
diff --git a/source/backend/cuda/execution/SoftmaxExecution.cu b/source/backend/cuda/execution/SoftmaxExecution.cu |
|
index 02077763..0e2f8093 100644 |
|
--- a/source/backend/cuda/execution/SoftmaxExecution.cu |
|
+++ b/source/backend/cuda/execution/SoftmaxExecution.cu |
|
@@ -1,3 +1,4 @@ |
|
+#include "SoftmaxKernels.cuh" |
|
#include "SoftmaxExecution.hpp" |
|
#include "core/TensorUtils.hpp" |
|
namespace MNN { |
|
@@ -35,7 +36,7 @@ __global__ void SOFTMAX(const T *input, T *output, |
|
} |
|
} |
|
} |
|
- |
|
+/** |
|
template <typename T> |
|
__global__ void SOFTMAX_WARP_32(const T *input, T *output, |
|
const int inside, |
|
@@ -136,7 +137,7 @@ __global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output, |
|
dst[(tid + i * per_block_size) * inside] = (T)(tmp_exp * divSumValue); |
|
} |
|
} |
|
-} |
|
+}*/ |
|
|
|
SoftmaxExecution::SoftmaxExecution(int axis, Backend *backend) : Execution(backend) { |
|
mAxis = axis; |
|
diff --git a/source/backend/cuda/execution/SoftmaxKernels.cuh b/source/backend/cuda/execution/SoftmaxKernels.cuh |
|
new file mode 100644 |
|
index 00000000..57773aef |
|
--- /dev/null |
|
+++ b/source/backend/cuda/execution/SoftmaxKernels.cuh |
|
@@ -0,0 +1,104 @@ |
|
+// SoftmaxKernels.cuh |
|
+#pragma once |
|
+#include <cuda_fp16.h> |
|
+#include <float.h> |
|
+ |
|
+namespace MNN { |
|
+namespace CUDA { |
|
+ |
|
+template <typename T> |
|
+__global__ void SOFTMAX_WARP_32(const T *input, T *output, |
|
+    const int inside, const int axis, const int outside, const int count) |
|
+{ |
|
+    int idx_outside = blockIdx.x / inside; |
|
+    int idx_inside = blockIdx.x - idx_outside * inside; |
|
+    auto src = input + idx_outside * axis * inside + idx_inside; |
|
+ |
|
+    float local_src = -FLT_MAX; |
|
+    __shared__ float maxValue; |
|
+    __shared__ float sumValue; |
|
+    int tid = threadIdx.x; |
|
+    if (tid < axis) local_src = (float)(src[tid * inside]); |
|
+ |
|
+    // warp reduce max on a copy, so local_src keeps this thread's element |
|
+    float local_max = local_src; |
|
+    for (int offset = 16; offset > 0; offset >>= 1) |
|
+        local_max = max(local_max, __shfl_down_sync(0xffffffff, local_max, offset)); |
|
+    if (tid == 0) maxValue = local_max; |
|
+    __syncthreads(); |
|
+ |
|
+    float local_exp = 0.0f; |
|
+    if (tid < axis) { |
|
+        float tmp = local_src - maxValue; |
|
+        tmp = (tmp < -87.0f) ? -87.0f : tmp; |
|
+        local_exp = expf(tmp); |
|
+    } |
|
+ |
|
+    // warp reduce sum |
|
+    float s = local_exp; |
|
+    for (int offset = 16; offset > 0; offset >>= 1) |
|
+        s += __shfl_down_sync(0xffffffff, s, offset); |
|
+    if (tid == 0) sumValue = s; |
|
+    __syncthreads(); |
|
+ |
|
+    if (tid < axis) |
|
+        output[(idx_outside * axis + tid) * inside + idx_inside] = (T)(local_exp / sumValue); |
|
+} |
|
+ |
|
+template <typename T> |
|
+__global__ void SOFTMAX_AXIS_REDUCE(const T *input, T *output, |
|
+    const int inside, const int axis, |
|
+    const int per_block_size, const int calc_multi_num, |
|
+    const int outside, const int count) |
|
+{ |
|
+    int idx_outside = blockIdx.x / inside; |
|
+    int idx_inside = blockIdx.x - idx_outside * inside; |
|
+    auto src = input + idx_outside * axis * inside + idx_inside; |
|
+    auto dst = output + idx_outside * axis * inside + idx_inside; |
|
+ |
|
+    int tid = threadIdx.x; |
|
+    float local_max = -FLT_MAX; |
|
+    for (int i = 0; i < calc_multi_num; i++) { |
|
+        int pos = tid + i * per_block_size; |
|
+        if (pos < axis) local_max = max(local_max, (float)(src[pos * inside])); |
|
+    } |
|
+    // block reduce max via shared mem |
|
+    __shared__ float smem[256]; |
|
+    smem[tid] = local_max; |
|
+    __syncthreads(); |
|
+    for (int s = blockDim.x/2; s > 0; s >>= 1) { |
|
+        if (tid < s) smem[tid] = max(smem[tid], smem[tid+s]); |
|
+        __syncthreads(); |
|
+    } |
|
+    float maxValue = smem[0]; |
|
+    __syncthreads(); // every thread must read smem[0] before smem is reused for the sum |
|
+ |
|
+    float local_sum = 0.0f; |
|
+    for (int i = 0; i < calc_multi_num; i++) { |
|
+        int pos = tid + i * per_block_size; |
|
+        if (pos < axis) { |
|
+            float tmp = (float)(src[pos * inside]) - maxValue; |
|
+            tmp = (tmp < -87.0f) ? -87.0f : tmp; |
|
+            local_sum += expf(tmp); |
|
+        } |
|
+    } |
|
+    smem[tid] = local_sum; |
|
+    __syncthreads(); |
|
+    for (int s = blockDim.x/2; s > 0; s >>= 1) { |
|
+        if (tid < s) smem[tid] += smem[tid+s]; |
|
+        __syncthreads(); |
|
+    } |
|
+    float divSum = 1.0f / smem[0]; |
|
+ |
|
+    for (int i = 0; i < calc_multi_num; i++) { |
|
+        int pos = tid + i * per_block_size; |
|
+        if (pos < axis) { |
|
+            float tmp = (float)(src[pos * inside]) - maxValue; |
|
+            tmp = (tmp < -87.0f) ? -87.0f : tmp; |
|
+            dst[pos * inside] = (T)(expf(tmp) * divSum); |
|
+        } |
|
+    } |
|
+} |
|
+ |
|
+} // namespace CUDA |
|
+} // namespace MNN |
|
diff --git a/transformers/llm/engine/src/tokenizer/jinja.hpp b/transformers/llm/engine/src/tokenizer/jinja.hpp |
|
index f1ca3f88..55d20bb4 100644 |
|
--- a/transformers/llm/engine/src/tokenizer/jinja.hpp |
|
+++ b/transformers/llm/engine/src/tokenizer/jinja.hpp |
|
@@ -123,12 +123,13 @@ private: |
|
|
|
namespace jinja { |
|
|
|
- |
|
+/* |
|
// C++14 make_unique polyfill for C++11 |
|
template<typename T, typename... Args> |
|
std::unique_ptr<T> make_unique(Args&&... args) { |
|
return std::unique_ptr<T>(new T(std::forward<Args>(args)...)); |
|
-} |
|
+}*/ |
|
+using std::make_unique; |
|
|
|
inline std::string to_python_string(const json& val); |
|
|
|
diff --git a/transformers/llm/engine/tools/llm_bench.cpp b/transformers/llm/engine/tools/llm_bench.cpp |
|
index af5da9ea..93d9806b 100644 |
|
--- a/transformers/llm/engine/tools/llm_bench.cpp |
|
+++ b/transformers/llm/engine/tools/llm_bench.cpp |
|
@@ -381,6 +381,7 @@ struct markdownPrinter : public Printer { |
|
} else if (field == "backend") { |
|
if (t.backend == 1) value = "METAL"; |
|
else if (t.backend == 3) value = "OPENCL"; |
|
+ else if (t.backend == 4) value = "CUDA"; |
|
else value = "CPU"; |
|
} else if (field == "test") { |
|
if (t.nPrompt > 0 && t.nGenerate == 0) { |
|
@@ -486,6 +487,7 @@ struct jsonAggregator : public Printer { |
|
writer.Key("backend"); |
|
if (t.backend == 1) writer.String("METAL"); |
|
else if (t.backend == 3) writer.String("OPENCL"); |
|
+ else if (t.backend == 4) writer.String("CUDA"); |
|
else writer.String("CPU"); |
|
|
|
writer.Key("threads"); |
|
@@ -801,7 +803,7 @@ static void printUsage(int /* argc */, char ** argv) { |
|
printf("options:\n"); |
|
printf(" -h, --help\n"); |
|
printf(" -m, --model <filename> (default: ./Qwen2.5-1.5B-Instruct/config.json)\n"); |
|
- printf(" -a, --backends <cpu,opencl,metal> (default: %s)\n", "cpu"); |
|
+ printf(" -a, --backends <cpu,opencl,metal,cuda> (default: %s)\n", "cpu"); |
|
printf(" -c, --precision <n> (default: %s) | Note: (0:Normal(for cpu bakend, 'Normal' is 'High'),1:High,2:Low)\n", join(runtimeParamsDefaults.precision, ",").c_str()); |
|
printf(" -t, --threads <n> (default: %s)\n", join(runtimeParamsDefaults.threads, ",").c_str()); |
|
printf(" -p, --n-prompt <n> (default: %s)\n", join(testParamsDefaults.nPrompt, ",").c_str()); |
|
@@ -884,6 +886,8 @@ static bool parseCmdParams(int argc, char ** argv, RuntimeParameters & runtimePa |
|
p.emplace_back(1); |
|
} else if (type == "opencl") { |
|
p.emplace_back(3); |
|
+ } else if (type == "cuda") { |
|
+ p.emplace_back(4); |
|
} else { |
|
p.emplace_back(0); |
|
} |
|
@@ -1061,7 +1065,7 @@ static Llm* buildLLM(const std::string& config_path, int backend, int memory, in |
|
// Otherwise, mContext->history_tokens retains data after the first run, skewing true prefill performance metrics." |
|
llmPtr->set_config(R"({"reuse_kv":false})"); |
|
std::map<int, std::string> lever = {{0,"normal"}, {1, "high"}, {2, "low"}}; |
|
- std::map<int, std::string> backend_type = {{0, "cpu"}, {1, "metal"}, {3, "opencl"}}; |
|
+ std::map<int, std::string> backend_type = {{0, "cpu"}, {1, "metal"}, {3, "opencl"}, {4, "cuda"}}; |
|
std::map<bool, std::string> mmap = {{true,"true"}, {false, "false"}}; |
|
|
|
bool setSuccess = true; |