
Calling a CUDA C++ Dynamic Library Wrapped with pybind11 from Python


pybind11 is a lightweight, header-only library that lets C++ and Python programs call each other; it can compile C++ code into a dynamic library that Python can import and call.

pybind11 can automatically convert C++ containers such as std::vector and std::list to and from Python lists, and it can expose C++ multi-dimensional arrays as numpy.ndarray objects.
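For example (a minimal sketch, not from the article; the module name demo and the function scale_vector are hypothetical), including pybind11/stl.h is enough for a function that takes and returns std::vector<float> to be called with plain Python lists:

#include <pybind11/pybind11.h>
#include <pybind11/stl.h>  // enables automatic std::vector <-> Python list conversion
#include <vector>

namespace py = pybind11;

// Receives a Python list (converted to std::vector<float>) and returns a new list.
std::vector<float> scale_vector(const std::vector<float> &v, float s)
{
    std::vector<float> out(v.size());
    for (size_t i = 0; i < v.size(); ++i)
        out[i] = v[i] * s;
    return out;
}

PYBIND11_MODULE(demo, m)
{
    m.def("scale_vector", &scale_vector, "Scale a list of floats");
}

From Python this would be called as demo.scale_vector([1.0, 2.0, 3.0], 2.0) and would return an ordinary Python list.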

In pybind11, a numpy.ndarray is represented on the C++ side as py::array_t<T>:

py::array_t<float> np_multiply_Cublas(py::array_t<float> &inLeft, py::array_t<float> &inRight)
{
    // A numpy.ndarray passed in from Python arrives here as a py::array_t<T>.
    py::buffer_info bufLeft = inLeft.request(), bufRight = inRight.request();
    // request() returns a py::buffer_info bound to the py::array_t<T>, exposing its ndim, data pointer, size, shape, etc.
    assert(bufLeft.ndim == bufRight.ndim);
    assert(bufLeft.ndim == 2);
    assert(bufLeft.shape[1] == bufRight.shape[0]);
    const int M = bufLeft.shape[0], K = bufLeft.shape[1], N = bufRight.shape[1];
    // Matrix multiplication: C = A * B
    // M, K, N are the number of rows of A, columns of A (= rows of B), and columns of B; C has shape {M, N}
    auto result = py::array_t<float>(M * N);
    result.resize({M, N});

    py::buffer_info bufResult = result.request();
    float *ptrLeft = (float *)bufLeft.ptr,
          *ptrRight = (float *)bufRight.ptr,
          *ptrResult = (float *)bufResult.ptr;  // raw data pointers
    Gpu_mul(ptrLeft, ptrRight, ptrResult, M, K, N, 0); // forward the pointers to the CUDA C++ matrix-multiply routine (flag==0 selects the cuBLAS path); the result is written into ptrResult
    return result;
    // result is itself a py::array_t<T>, i.e. a numpy.ndarray on the Python side
}

Below is a simple pybind11 module definition exposing the Python-callable interface:

PYBIND11_MODULE(mylib, m)
{
   // m.doc() = "use cuda and demo";  // optional module docstring
    // mylib is the name of the generated Python module
    m.def("np_sum", &np_sum, "Add two numpy arrays using CUDA");
    m.def("Gpu_mul", &np_multiply, "Multiply two arrays using CUDA; flag==1 uses shared memory, flag==2 uses global memory");
    
    m.def("Gpu_Cublas", &np_multiply_Cublas, "Multuply tow arrays use cublas");
    //"Gpu_Cublas"代表python中对应的函数,&np_multiply_Cublas是对应的C++函数指针,之后的字符串是python中的函数doc
}

Below is an example of calling the module from Python:

import os
os.sys.path.append("../build/bin")
os.sys.path.append("build/bin")
os.sys.path.append("build/bin/Release")#这三个路径是 mylib.cp37-win_amd64.pyd 动态库可能存在的路径
import mylib as my # 载入mylib模块
import time
import numpy as np

Ga = np.random.rand(50* 100, 31 * 100).astype(np.float32)
Gb = np.random.rand(31 * 100, 50 * 100).astype(np.float32)
# generate random numpy.ndarray inputs in float32
Gstart = time.time()
Ghc = Ga @ Gb  # numpy's built-in matrix multiplication
Gpaush = time.time()
Gc = my.Gpu_Cublas(Ga, Gb)  # call the cuBLAS-based matrix multiplication on the C++ side
Gend = time.time()


print("the numpy cost", (Gpaush - Gstart)) # 比较两者耗时
print("the cublas cost time ", (Gend - Gpaush))
print("the two result are same?", (Ghc == Gc).any()) # 比较两个答案是否相等

Below is the C++ code that implements matrix multiplication with cuBLAS:


void useCublas(float *_A, float *_B, float *_C, int M, int K, int N)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1, beta = 0;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, _B, N, _A, K, &beta, _C, N); // C = alpha*A*B + beta*C
    // cuBLAS stores matrices column-major while C++ uses row-major, so _B is passed before _A: C^T = B^T * A^T
    cublasDestroy(handle);
}
void Gpu_mul(float *ptrLeft, float *ptrRight, float *ptrResult, int M, int K, int N, int flag)
{
    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, M * K * sizeof(float));
    cudaMalloc((void **)&d_b, K * N * sizeof(float));
    cudaMalloc((void **)&d_c, M * N * sizeof(float));
    CHECK(cudaMemcpy(d_a, ptrLeft, M * K * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, ptrRight, K * N * sizeof(float), cudaMemcpyHostToDevice));
    constexpr const int TP = 16;
    dim3 threadsPer(TP, TP);
    dim3 blocksPer((N + TP - 1) / TP, (M + TP - 1) / TP); // grid x covers the N columns, grid y covers the M rows
    if (flag == 1)
    {
        matrix_MulG<TP><<<blocksPer, threadsPer>>>(d_a, d_b, d_c, M, K, N);
    }
    else if (flag == 0)
    {
        useCublas(d_a, d_b, d_c, M, K, N); // flag==0: use the cuBLAS matrix multiplication; otherwise use the hand-written CUDA kernels
    }
    else if (flag == 2)
    {
        matrix_glbal_mul<<<blocksPer, threadsPer>>>(d_a, d_b, d_c, M, K, N);
    }
    
    CHECK(cudaMemcpy(ptrResult, d_c, M * N * sizeof(float), cudaMemcpyDeviceToHost));
    
    CHECK(cudaFree(d_a));
    CHECK(cudaFree(d_b));
    CHECK(cudaFree(d_c));
    
}
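
The CHECK macro used above is not defined in the article; a common definition (an assumption, not the author's original code) wraps each CUDA runtime call and aborts with a message on failure:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Report the file/line and the CUDA error string, then exit, whenever a runtime call fails.
#define CHECK(call)                                                           \
    do                                                                        \
    {                                                                         \
        const cudaError_t err = (call);                                       \
        if (err != cudaSuccess)                                               \
        {                                                                     \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err));                                 \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)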

CUDA kernel version of matrix multiplication. This is the most direct version, using only global memory:

__global__ void matrix_glbal_mul(float*_A ,float* _B,float* _C, int M,int K,int N)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x; // column index of _C
    int y = threadIdx.y + blockIdx.y * blockDim.y; // row index of _C
    if(y>=M || x>=N)
        return; // bounds check
    float sum = 0.;
    for (int i = 0;i<K; ++i)
    {
        sum += _A[y * K + i] * _B[i * N + x]; // accumulate A[y][i] * B[i][x] (B is K x N row-major, so row i starts at i * N)
    }
    _C[y * N + x] = sum;
}
// In this version every element of A and B is read from global memory many times, and global-memory bandwidth is far lower than that of each block's shared memory, so the kernel is memory-bound.

Using shared memory to cache tiles of A and B, each thread block reads every element it needs from global memory only once instead of once per output element:


template <int TILE_WIDTH>
__global__ void matrix_MulG(float *_A, float *_B, float *_C, int M, int K, int N)
{
    __shared__ float subM[TILE_WIDTH][TILE_WIDTH];
    __shared__ float subN[TILE_WIDTH][TILE_WIDTH]; // all threads in a block share the same shared memory
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y; // y is the row index and x the column index of the output C
    int tx = threadIdx.x;
    int ty = threadIdx.y; // indices of this thread within its block
    
    float tmp = 0.;
    for (int i = 0; i < ((K + TILE_WIDTH - 1) / TILE_WIDTH); ++i)
    {
        if ((tx + i * TILE_WIDTH) < K && (y < M))
            subM[ty][tx] = _A[y * K + tx + i * TILE_WIDTH]; // load a tile of A into shared memory
        else
        {
            subM[ty][tx] = 0.;
        }
        if ((ty + i * TILE_WIDTH) < K && (x < N))
            subN[ty][tx] = _B[(ty + i * TILE_WIDTH) * N + x];
        else
        {
            subN[ty][tx] = 0.;
        }
        __syncthreads(); // synchronize threads within the block so the tiles are fully loaded
        for (int j = 0; j < TILE_WIDTH; ++j)
        {
            tmp += subM[ty][j] * subN[j][tx];
        }
        __syncthreads();
    }
    if (y < M && x < N) // bounds check
        _C[y * N + x] = tmp;
}

Finally, the output of the Python test; results will differ from machine to machine:

the numpy cost 4.054656028747559
the cublas cost time 2.555680513381958
the two result are same? True
the Gpu with shared memory cost time 4.681894779205322
the two result are same? True
the Gpu with global memory cost time 4.836901903152466
the two result are same? True

In fact, if you only need to call CUDA kernels from Python code, you can also use the numba.cuda module from the Numba acceleration library.
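
As an illustration (a minimal sketch, not from the article), a matrix-multiply kernel written and launched entirely from Python with numba.cuda might look like this:

import math
import numpy as np
from numba import cuda

@cuda.jit
def matmul_kernel(A, B, C):
    # each thread computes one element C[i, j]
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        tmp = 0.0
        for k in range(A.shape[1]):
            tmp += A[i, k] * B[k, j]
        C[i, j] = tmp

A = np.random.rand(512, 256).astype(np.float32)
B = np.random.rand(256, 384).astype(np.float32)
C = np.zeros((512, 384), dtype=np.float32)
threadsperblock = (16, 16)
blockspergrid = (math.ceil(C.shape[0] / threadsperblock[0]),
                 math.ceil(C.shape[1] / threadsperblock[1]))
matmul_kernel[blockspergrid, threadsperblock](A, B, C)  # Numba copies the arrays to and from the device automatically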