Python 找不到Ctypes函数

Python 找不到Ctypes函数,python,numpy,cuda,ctypes,Python,Numpy,Cuda,Ctypes,我尝试使用ctypes在python中运行一些cuda代码。编译并加载.so文件后,我遇到一个错误,告诉我cuda函数不存在。我以前试过在plainc中使用一个例子,结果成功了。我的编译有什么问题吗 Cuda代码 #include <stdio.h> #include <stdlib.h> #define BLOCK_SIZE 16 struct Matrix { int width; int height; float *elements; }

我尝试使用
ctypes
在python中运行一些cuda代码。编译并加载
.so
文件后,我遇到一个错误,告诉我
cuda
函数不存在。我以前试过在plain
c
中使用一个例子,结果成功了。我的编译有什么问题吗

Cuda代码

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

// Row-major matrix descriptor shared between host and device code.
// Field order and types must match the Python ctypes.Structure mirror
// (width: c_int, height: c_int, elements: POINTER(c_float)) exactly.
struct Matrix {
    int width;       // number of columns
    int height;      // number of rows
    float *elements; // row-major buffer of width*height floats
};

// Computes C = A * B with one thread per (row, col) element of C.
// Expects a 2D launch where gridDim*blockDim covers at least
// B.width columns and A.height rows; extra threads exit early.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Bounds guard: the original kernel indexed unconditionally, so any
    // launch whose grid over-covers the matrices (dims not multiples of
    // the block size) read and wrote out of bounds.
    if (row >= A.height || col >= B.width)
        return;

    // Dot product of row `row` of A with column `col` of B.
    float tmpVal = 0.0f;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[row * C.width + col] = tmpVal;
}

// Host wrapper: copies A and B to the device, launches MatMulKernel to
// compute C = A * B, and copies the result back into C->elements.
// All three Matrix structs must be fully initialized by the caller
// (width, height, and a host `elements` buffer of width*height floats).
//
// extern "C" is the fix for the ctypes `undefined symbol: mMul` error:
// nvcc compiles this file as C++, and without it the symbol is
// name-mangled, so dlopen/ctypes cannot find `mMul` by name.
extern "C" void mMul( Matrix *A, Matrix *B, Matrix *C ){

    Matrix d_A, d_B, d_C;

    // Matrix d_A: device copy of A (dims by value, elements on the GPU).
    d_A.width    =   A->width;
    d_A.height   =   A->height;
    size_t sizeA =   (size_t)A->width * A->height * sizeof(float);
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B: device copy of B.
    d_B.width    =   B->width;
    d_B.height   =   B->height;
    size_t sizeB =   (size_t)B->width * B->height * sizeof(float);
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C: device buffer for the result (no copy-in needed).
    d_C.width    =   C->width;
    d_C.height   =   C->height;
    size_t sizeC =   (size_t)C->width * C->height * sizeof(float);
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);

    // Blocks per grid. NOTE(review): integer division silently drops the
    // tail when B->width or A->height is not a multiple of BLOCK_SIZE —
    // those rows/columns of C are never computed. Kept as-is to preserve
    // the original launch behavior; a ceil-div here requires a bounds
    // guard inside the kernel.
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

    // Launch the kernel, then surface any launch-configuration error —
    // kernel launches do not return a status directly.
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "MatMulKernel launch failed: %s\n",
                cudaGetErrorString(err));

    // Blocking copy-back: also synchronizes with the kernel.
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // Free the device memory.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
python-ctypes代码:

import numpy as np
from numpy.ctypeslib import ndpointer
from ctypes import *

class Matrix(Structure):
    """ctypes mirror of the C `struct Matrix` exported by the .so.

    Field names, order, and types must match the C struct layout
    exactly: int width, int height, float* elements.
    """
    _fields_ = [("width", c_int),
                ("height", c_int),
                ("elements", POINTER(c_float))]

# Load the compiled CUDA shared library. Looking up `libc.mMul` only
# works if the .so exports an unmangled C symbol named exactly "mMul" —
# i.e. the CUDA source must declare the function extern "C"; otherwise
# ctypes raises AttributeError: undefined symbol: mMul.
libc = CDLL("./Sequential_Cuda_Python.so")

# Declare the C prototype: void mMul(Matrix*, Matrix*, Matrix*).
libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
错误,似乎函数尚未找到

Traceback (most recent call last):
  File "cuda_arr.py", line 17, in <module>
    libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
  File "/usr/lib/python3.8/ctypes/__init__.py", line 386, in __getattr__
    func = self.__getitem__(name)
  File "/usr/lib/python3.8/ctypes/__init__.py", line 391, in __getitem__
    func = self._FuncPtr((name_or_ordinal, self))
AttributeError: ... /Sequential_Cuda_Python.so: undefined symbol: mMul
回溯（最近一次调用在最后）：
  文件 "cuda_arr.py"，第 17 行，在 <module>
    libc.mMul.argtypes = [POINTER(Matrix), POINTER(Matrix), POINTER(Matrix)]
  文件 "/usr/lib/python3.8/ctypes/__init__.py"，第 386 行，在 __getattr__
    func = self.__getitem__(name)
  文件 "/usr/lib/python3.8/ctypes/__init__.py"，第 391 行，在 __getitem__
    func = self._FuncPtr((name_or_ordinal, self))
AttributeError: .../Sequential_Cuda_Python.so: 未定义符号: mMul

根据评论，您需要 `extern "C"`。

C++（CUDA 以 C++ 方式编译）会对符号进行名称修饰（name mangling）。

可以用下面的命令检查 `mMul` 符号是否被修饰（即是否缺少 `extern "C"`）：

readelf --symbols Sequential_Cuda_Python.so | grep mMul
修正后的 CUDA 代码（用 `extern "C"` 包裹 `mMul`，防止名称修饰）：

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

struct Matrix {
    int width;
    int height;
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    // runs for each col - row pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[ row * C.width + col ] = tmpVal;
}

extern "C" {
void mMul( Matrix *A, Matrix *B, Matrix *C ){

    Matrix d_A, d_B, d_C;

    // Matrix d_A
    d_A.width    =   A->width;
    d_A.height   =   A->height;
    size_t sizeA =   A->width * A->height * sizeof(float);
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B
    d_B.width    =   B->width;
    d_B.height   =   B->height;
    size_t sizeB =   B->width * B->height * sizeof(float);
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C
    d_C.width    =   C->width;
    d_C.height   =   C->height;
    size_t sizeC =   C->width * C->height * sizeof(float);
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    // Blocks per grid
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

    // calling the Kernel
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // copy results from result matrix C to the host again
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // free the cuda memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
}

根据评论，您需要 `extern "C"`。

C++（CUDA 以 C++ 方式编译）会对符号进行名称修饰（name mangling）。

可以用下面的命令检查 `mMul` 符号是否被修饰（即是否缺少 `extern "C"`）：

readelf --symbols Sequential_Cuda_Python.so | grep mMul
修正后的 CUDA 代码（用 `extern "C"` 包裹 `mMul`，防止名称修饰）：

#include <stdio.h>
#include <stdlib.h>
#define BLOCK_SIZE 16

struct Matrix {
    int width;
    int height;
    float *elements;
};

__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C){

    // runs for each col - row pair
    float tmpVal = 0;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    for (int i = 0; i < A.width; ++i)
        tmpVal += A.elements[row * A.width + i] *
                  B.elements[i * B.width + col];
    C.elements[ row * C.width + col ] = tmpVal;
}

extern "C" {
void mMul( Matrix *A, Matrix *B, Matrix *C ){

    Matrix d_A, d_B, d_C;

    // Matrix d_A
    d_A.width    =   A->width;
    d_A.height   =   A->height;
    size_t sizeA =   A->width * A->height * sizeof(float);
    cudaMalloc(&d_A.elements, sizeA);
    cudaMemcpy(d_A.elements, A->elements, sizeA, cudaMemcpyHostToDevice);

    // Matrix d_B
    d_B.width    =   B->width;
    d_B.height   =   B->height;
    size_t sizeB =   B->width * B->height * sizeof(float);
    cudaMalloc(&d_B.elements, sizeB);
    cudaMemcpy(d_B.elements, B->elements, sizeB, cudaMemcpyHostToDevice);

    // Matrix d_C
    d_C.width    =   C->width;
    d_C.height   =   C->height;
    size_t sizeC =   C->width * C->height * sizeof(float);
    cudaMalloc(&d_C.elements, sizeC);

    // 16 * 16 = 256 threads per block
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    // Blocks per grid
    dim3 dimGrid(B->width / dimBlock.x, A->height / dimBlock.y);

    // calling the Kernel
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // copy results from result matrix C to the host again
    cudaMemcpy(C->elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    // free the cuda memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
}

你可能忽略的一点是：CUDA 代码是用 C++ 编译器编译并按 C++ 规则链接的。事实上，你之前能让一个纯 C 的例子正常工作，与这个问题无关。谢谢，这个方法有效！
Traceback (most recent call last):
  File "cuda_arr.py", line 17, in <module>
    libc.mMul.argtypes = [ POINTER(Matrix), POINTER(Matrix), POINTER(Matrix) ]
  File "/usr/lib/python3.8/ctypes/__init__.py", line 386, in __getattr__
    func = self.__getitem__(name)
  File "/usr/lib/python3.8/ctypes/__init__.py", line 391, in __getitem__
    func = self._FuncPtr((name_or_ordinal, self))
AttributeError: ... /Sequential_Cuda_Python.so: undefined symbol: mMul