// Single-operator Conv2D sample for Ascend ACL.
//
// Loads the input, weight and bias tensors from .npy files, runs one Conv2D
// through aclopCompileAndExecute and prints the result.
//
// NOTE(review): this file was recovered from a copy in which everything between
// '<' and '>' had been stripped (standard includes, std::vector element types,
// the "<f4" dtype check). Those parts are reconstructed from the ACL/OpenCV
// signatures the code calls - please confirm against the original.

#include "acl/acl.h"
#include "acl/ops/acl_cblas.h"
#include "acl/acl_op_compiler.h"

#include "opencv2/imgproc.hpp"

#include <algorithm>   // std::replace, std::min
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

namespace {

// Status code the ACL calls below return on success (ACL_SUCCESS / ACL_ERROR_NONE).
const int kAclOk = 0;

// Runs the stored callable when the enclosing scope ends, so that every early
// `return` in main() still releases what was acquired before it.
class ScopeExit
{
public:
    explicit ScopeExit(std::function<void()> fn) : fn_(std::move(fn)) {}
    ~ScopeExit() { if (fn_) fn_(); }

    ScopeExit(const ScopeExit&) = delete;
    ScopeExit& operator=(const ScopeExit&) = delete;

private:
    std::function<void()> fn_;
};

// Prints "<label><status>" on stdout (the format the sample has always used)
// and returns true when the status signals success.
bool report(const char* label, int status)
{
    std::cout << label << status << std::endl;
    return status == kAclOk;
}

// Prints "<what> failed: <status>" on stderr for steps that were previously
// not reported at all; returns true when the status signals success.
bool checked(const char* what, int status)
{
    if (status == kAclOk)
        return true;
    std::cerr << what << " failed: " << status << std::endl;
    return false;
}

// Number of bytes occupied by a float tensor with the given dimensions.
std::size_t floatBytes(const std::vector<int64_t>& dims)
{
    std::size_t count = 1;
    for (std::size_t i = 0; i < dims.size(); ++i)
        count *= static_cast<std::size_t>(dims[i]);
    return count * sizeof(float);
}

// Reports a blobFromNPY failure on stderr and yields the empty matrix that
// signals the failure to the caller.
cv::Mat npyError(const std::string& path, const char* reason)
{
    std::cerr << "blobFromNPY(" << path << "): " << reason << std::endl;
    return cv::Mat();
}

// Allocates `bytes` of device memory and copies the host blob into it.
// Prints "<name> malloc status: N" / "<name> memcpy status: N".
// Returns nullptr (with nothing left allocated) on any failure, including a
// host blob that does not hold exactly `bytes` bytes.
void* uploadToDevice(const cv::Mat& host, std::size_t bytes, const std::string& name)
{
    if (host.empty() || host.total() * host.elemSize() != bytes)
    {
        std::cerr << name << ": host blob is missing or does not hold "
                  << bytes << " bytes" << std::endl;
        return nullptr;
    }

    void* device = nullptr;
    int ret = aclrtMalloc(&device, bytes, ACL_MEM_MALLOC_NORMAL_ONLY);
    std::cout << name << " malloc status: " << ret << std::endl;
    if (ret != kAclOk)
        return nullptr;

    ret = aclrtMemcpy(device, bytes, host.data, bytes, ACL_MEMCPY_HOST_TO_DEVICE);
    std::cout << name << " memcpy status: " << ret << std::endl;
    if (ret != kAclOk)
    {
        aclrtFree(device);
        return nullptr;
    }
    return device;
}

}  // namespace

// Returns the value of the 'descr' entry of an NPY header dictionary (the text
// between the quotes, e.g. "<f4"), or an empty string if it cannot be found.
static std::string getType(const std::string& header)
{
    const std::string field = "'descr':";
    const std::size_t idx = header.find(field);
    if (idx == std::string::npos)
        return std::string();

    const std::size_t quote = header.find('\'', idx + field.size());
    if (quote == std::string::npos)
        return std::string();

    const std::size_t from = quote + 1;
    const std::size_t to = header.find('\'', from);
    if (to == std::string::npos)
        return std::string();
    return header.substr(from, to - from);
}

// Returns the value of the 'fortran_order' entry ("True" / "False"), or an
// empty string if it cannot be found.
static std::string getFortranOrder(const std::string& header)
{
    const std::string field = "'fortran_order':";
    const std::size_t idx = header.find(field);
    if (idx == std::string::npos)
        return std::string();

    // Skip the blanks after the colon instead of searching backwards for a
    // space, which picks an unrelated position when no space follows the colon.
    const std::size_t from = header.find_first_not_of(' ', idx + field.size());
    if (from == std::string::npos)
        return std::string();

    const std::size_t to = header.find_first_of(",}", from);
    if (to == std::string::npos)
        return std::string();
    return header.substr(from, to - from);
}

// Returns the dimensions listed in the 'shape' tuple. An empty tuple "()"
// yields {1}; a missing or malformed entry yields an empty vector.
static std::vector<int> getShape(const std::string& header)
{
    const std::string field = "'shape':";
    const std::size_t idx = header.find(field);
    if (idx == std::string::npos)
        return std::vector<int>();

    const std::size_t open = header.find('(', idx + field.size());
    if (open == std::string::npos)
        return std::vector<int>();

    const std::size_t from = open + 1;
    const std::size_t to = header.find(')', from);
    if (to == std::string::npos)
        return std::vector<int>();

    std::string shapeStr = header.substr(from, to - from);
    if (shapeStr.empty())
        return std::vector<int>(1, 1);

    // Turn commas into blanks (rather than deleting them) so that a compact
    // tuple such as "(3,4)" is not read as the single number 34.
    std::replace(shapeStr.begin(), shapeStr.end(), ',', ' ');

    std::istringstream ss(shapeStr);
    int value;
    std::vector<int> shape;
    while (ss >> value)
    {
        shape.push_back(value);
    }
    return shape;
}

// Loads a little-endian float32, C-ordered array from an .npy file into a
// CV_32F matrix with the stored shape.
//
// Returns an empty cv::Mat (and prints the reason on stderr) when the file
// cannot be opened, is not an NPY file, has another dtype/order, or is
// truncated.
cv::Mat blobFromNPY(const std::string& path)
{
    std::ifstream ifs(path.c_str(), std::ios::binary);
    if (!ifs.is_open())
        return npyError(path, "cannot open file");

    std::string magic(6, '*');
    ifs.read(&magic[0], magic.size());
    if (!ifs || magic != "\x93NUMPY")
        return npyError(path, "missing NPY magic string");

    char version[2] = {0, 0};  // major, minor
    ifs.read(version, sizeof(version));
    if (!ifs)
        return npyError(path, "truncated version field");

    // The header length is stored little-endian: 2 bytes in format version 1,
    // 4 bytes from version 2 on. Decode it byte by byte so the result does not
    // depend on the host byte order.
    const std::size_t lengthBytes = (static_cast<unsigned char>(version[0]) >= 2) ? 4 : 2;
    unsigned char rawLength[4] = {0, 0, 0, 0};
    ifs.read(reinterpret_cast<char*>(rawLength), lengthBytes);
    if (!ifs)
        return npyError(path, "truncated header length");

    std::size_t headerSize = 0;
    for (std::size_t i = lengthBytes; i-- > 0;)
        headerSize = (headerSize << 8) | rawLength[i];

    std::string header(headerSize, '*');
    ifs.read(&header[0], header.size());
    if (!ifs)
        return npyError(path, "truncated header");

    // The payload is copied verbatim into a CV_32F matrix, so any other dtype
    // or memory order would be silently misinterpreted.
    if (getType(header) != "<f4")
        return npyError(path, "dtype is not little-endian float32 ('<f4')");
    if (getFortranOrder(header) != "False")
        return npyError(path, "Fortran-ordered arrays are not supported");

    const std::vector<int> shape = getShape(header);
    if (shape.empty())
        return npyError(path, "cannot parse shape");
    for (std::size_t i = 0; i < shape.size(); ++i)
    {
        if (shape[i] < 0)
            return npyError(path, "negative dimension in shape");
    }

    cv::Mat blob(shape, CV_32F);
    const std::size_t byteCount = blob.total() * blob.elemSize();
    ifs.read(reinterpret_cast<char*>(blob.data), static_cast<std::streamsize>(byteCount));
    if (static_cast<std::size_t>(ifs.gcount()) != byteCount)
        return npyError(path, "file holds fewer elements than its shape announces");

    return blob;
}

// Prints the first `end` float values of `m` on one line, separated by blanks.
// `end` is clamped to the number of elements in `m`.
void printBlob(const cv::Mat& m, int end)
{
    const float* mptr = reinterpret_cast<const float*>(m.data);
    const int count = std::min(end, static_cast<int>(m.total()));
    for (int i = 0; i < count; i++)
        std::cout << mptr[i] << " ";
    std::cout << std::endl;
}

// Conv2D
//  * input shape  [1, 3, 10, 10]
//  * kernel shape [5, 3, 5, 5]
//  * output shape [1, 5, 4, 4]
//
// Returns 0 on success and 1 as soon as any step fails. Every resource that
// was acquired up to that point is released by the ScopeExit guards, in the
// reverse order of acquisition.
int main()
{
    // Ascend resource initialization
    //  * init ascend
    if (!checked("aclInit", aclInit(nullptr)))
        return 1;
    ScopeExit finalizeAcl([] { aclFinalize(); });

    //  * set device
    const int deviceID = 0;
    if (!checked("aclrtSetDevice", aclrtSetDevice(deviceID)))
        return 1;
    ScopeExit resetDevice([deviceID] { aclrtResetDevice(deviceID); });

    //  * create context
    aclrtContext context = nullptr;
    if (!checked("aclrtCreateContext", aclrtCreateContext(&context, deviceID)))
        return 1;
    ScopeExit destroyContext([context] { aclrtDestroyContext(context); });

    //  * create stream
    aclrtStream stream = nullptr;
    if (!checked("aclrtCreateStream", aclrtCreateStream(&stream)))
        return 1;
    ScopeExit destroyStream([stream] { aclrtDestroyStream(stream); });

    int ret;

    // Inputs: load on host, then copy to the device
    const std::vector<int64_t> shape = {1, 3, 10, 10};
    const std::size_t inputSizeInByte = floatBytes(shape);
    const cv::Mat inputMat = blobFromNPY("./input_convolution.npy");
    //printBlob(inputMat, 300);
    void* inputOnDevice = uploadToDevice(inputMat, inputSizeInByte, "inputOnDevice");
    if (inputOnDevice == nullptr)
        return 1;
    ScopeExit freeInput([inputOnDevice] { aclrtFree(inputOnDevice); });

    // Model parameters
    //  * w
    const std::vector<int64_t> w_shape = {5, 3, 5, 5};
    const std::size_t wSizeInByte = floatBytes(w_shape);
    const cv::Mat wMat = blobFromNPY("./convolution_w.npy");
    //printBlob(wMat, 25);
    void* wOnDevice = uploadToDevice(wMat, wSizeInByte, "wOnDevice");
    if (wOnDevice == nullptr)
        return 1;
    ScopeExit freeW([wOnDevice] { aclrtFree(wOnDevice); });

    //  * b
    const std::vector<int64_t> b_shape = {5};
    const std::size_t bSizeInByte = floatBytes(b_shape);
    const cv::Mat bMat = blobFromNPY("./convolution_b.npy");
    //printBlob(bMat, 5);
    void* bOnDevice = uploadToDevice(bMat, bSizeInByte, "bOnDevice");
    if (bOnDevice == nullptr)
        return 1;
    ScopeExit freeB([bOnDevice] { aclrtFree(bOnDevice); });

    // Model output
    //  * output shape
    const std::vector<int64_t> output_shape = {1, 5, 4, 4};
    const std::size_t outputSizeInByte = floatBytes(output_shape);
    //  * alloc buffer for output on device
    void* outputOnDevice = nullptr;
    ret = aclrtMalloc(&outputOnDevice, outputSizeInByte, ACL_MEM_MALLOC_HUGE_FIRST);
    if (!report("outputonDevice malloc status: ", ret))
        return 1;
    ScopeExit freeOutput([outputOnDevice] { aclrtFree(outputOnDevice); });

    // Create model
    const std::string opName("Conv2D");
    //  * set attr: strides, pads, dilations, groups, offset_x
    aclopAttr* opAttr = aclopCreateAttr();
    if (opAttr == nullptr)
    {
        std::cerr << "aclopCreateAttr failed" << std::endl;
        return 1;
    }
    ScopeExit destroyAttr([opAttr] { aclopDestroyAttr(opAttr); });

    const std::vector<int64_t> stridesValue = {1, 1, 2, 2};  // strides
    ret = aclopSetAttrListInt(opAttr, "strides", static_cast<int>(stridesValue.size()), stridesValue.data());
    if (!report("attr set strides: ", ret))
        return 1;

    const std::vector<int64_t> padsValue = {1, 1, 1, 1};  // pads
    ret = aclopSetAttrListInt(opAttr, "pads", static_cast<int>(padsValue.size()), padsValue.data());
    if (!report("attr set pads: ", ret))
        return 1;

    const std::vector<int64_t> dilationsValue = {1, 1, 1, 1};  // dilations
    ret = aclopSetAttrListInt(opAttr, "dilations", static_cast<int>(dilationsValue.size()), dilationsValue.data());
    if (!report("attr set dilations: ", ret))
        return 1;

    const int groups = 1;
    ret = aclopSetAttrInt(opAttr, "groups", groups);
    if (!report("attr set groups: ", ret))
        return 1;

    const int offset_x = 0;
    ret = aclopSetAttrInt(opAttr, "offset_x", offset_x);
    if (!report("attr set offset_x: ", ret))
        return 1;

    //  * input tensor descriptions (data type, rank, dims, format)
    std::vector<aclTensorDesc*> inputTensorDesc;
    ScopeExit destroyInputDescs([&inputTensorDesc] {
        for (std::size_t i = 0; i < inputTensorDesc.size(); ++i)
            if (inputTensorDesc[i] != nullptr)
                aclDestroyTensorDesc(inputTensorDesc[i]);
    });
    inputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
                                                  static_cast<int>(shape.size()),
                                                  shape.data(),
                                                  ACL_FORMAT_NCHW));
    inputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
                                                  static_cast<int>(w_shape.size()),
                                                  w_shape.data(),
                                                  ACL_FORMAT_NCHW));
    inputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
                                                  static_cast<int>(b_shape.size()),
                                                  b_shape.data(),
                                                  ACL_FORMAT_ND));

    //  * output tensor description
    std::vector<aclTensorDesc*> outputTensorDesc;
    ScopeExit destroyOutputDescs([&outputTensorDesc] {
        for (std::size_t i = 0; i < outputTensorDesc.size(); ++i)
            if (outputTensorDesc[i] != nullptr)
                aclDestroyTensorDesc(outputTensorDesc[i]);
    });
    outputTensorDesc.push_back(aclCreateTensorDesc(ACL_FLOAT,
                                                   static_cast<int>(output_shape.size()),
                                                   output_shape.data(),
                                                   ACL_FORMAT_NCHW));

    // Inference
    //  * data buffers wrapping the device memory of the inputs
    std::vector<aclDataBuffer*> inputBuffers;
    ScopeExit destroyInputBuffers([&inputBuffers] {
        for (std::size_t i = 0; i < inputBuffers.size(); ++i)
            if (inputBuffers[i] != nullptr)
                aclDestroyDataBuffer(inputBuffers[i]);
    });
    inputBuffers.push_back(aclCreateDataBuffer(inputOnDevice, inputSizeInByte));
    inputBuffers.push_back(aclCreateDataBuffer(wOnDevice, wSizeInByte));
    inputBuffers.push_back(aclCreateDataBuffer(bOnDevice, bSizeInByte));

    //  * data buffer wrapping the device memory of the output
    std::vector<aclDataBuffer*> outputBuffers;
    ScopeExit destroyOutputBuffers([&outputBuffers] {
        for (std::size_t i = 0; i < outputBuffers.size(); ++i)
            if (outputBuffers[i] != nullptr)
                aclDestroyDataBuffer(outputBuffers[i]);
    });
    outputBuffers.push_back(aclCreateDataBuffer(outputOnDevice, outputSizeInByte));

    // A failed create call yields a null handle; do not hand those to the operator.
    const bool handlesValid =
        std::find(inputTensorDesc.begin(), inputTensorDesc.end(), nullptr) == inputTensorDesc.end() &&
        std::find(outputTensorDesc.begin(), outputTensorDesc.end(), nullptr) == outputTensorDesc.end() &&
        std::find(inputBuffers.begin(), inputBuffers.end(), nullptr) == inputBuffers.end() &&
        std::find(outputBuffers.begin(), outputBuffers.end(), nullptr) == outputBuffers.end();
    if (!handlesValid)
    {
        std::cerr << "failed to create a tensor description or data buffer" << std::endl;
        return 1;
    }

    //  * forward: compile and launch the operator on the stream
    ret = aclopCompileAndExecute(opName.c_str(),
                                 static_cast<int>(inputTensorDesc.size()), inputTensorDesc.data(), inputBuffers.data(),
                                 static_cast<int>(outputTensorDesc.size()), outputTensorDesc.data(), outputBuffers.data(),
                                 opAttr, ACL_ENGINE_SYS, ACL_COMPILE_SYS, nullptr, stream);
    if (!report("op execute: ", ret))
        return 1;

    //  * wait until the work queued on the stream has finished
    if (!checked("aclrtSynchronizeStream", aclrtSynchronizeStream(stream)))
        return 1;

    // Get output - move from device to host
    void* outputOnHost = nullptr;
    if (!checked("aclrtMallocHost", aclrtMallocHost(&outputOnHost, outputSizeInByte)))
        return 1;
    ScopeExit freeHostOutput([outputOnHost] { aclrtFreeHost(outputOnHost); });

    ret = aclrtMemcpy(outputOnHost, outputSizeInByte, outputOnDevice, outputSizeInByte, ACL_MEMCPY_DEVICE_TO_HOST);
    if (!checked("output device-to-host aclrtMemcpy", ret))
        return 1;

    //  * construct outputMat: `tmp` only wraps the host buffer, copyTo() makes
    //    outputMat own its data
    const std::vector<int> output_shape_int = {1, 5, 4, 4};
    cv::Mat tmp(output_shape_int, CV_32FC1, outputOnHost);
    cv::Mat outputMat;
    tmp.copyTo(outputMat);

    //  * print
    std::cout << outputMat.size << std::endl;
    printBlob(outputMat, 5 * 4 * 4);

    //  * write to file: not implemented (the original snippet was an unfinished,
    //    commented-out std::ofstream write to "res.out")

    // Stream, context, device and the ACL runtime are released by the guards
    // declared at the top, after all buffers and handles above.
    return 0;
}