C++ 如何在CUDA中调整YUV图像的大小_C++_Cuda

C++ 如何在CUDA中调整YUV图像的大小

c++ cuda

C++ 如何在CUDA中调整YUV图像的大小,c++,cuda,C++,Cuda,如何在CUDA中调整YUV图像的大小？我尝试将Libyov的缩放代码转换为CUDA，但性能非常差 void ScalePlaneSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const Npp8u* src_ptr, Npp8u* dst_ptr) { int i; // Initial source x/y coordina

如何在CUDA中调整YUV图像的大小？我尝试将libyuv的缩放代码移植到CUDA，但性能非常差：

/**
 * Scales one 8-bit plane row by row without filtering (kFilterNone).
 *
 * ScaleSlope() fills in the starting source coordinate (x, y) and the
 * per-pixel / per-row steps (dx, dy), all in 16.16 fixed point. For every
 * destination row the source row is selected with (y >> 16) and the row is
 * resampled horizontally by one of two column scalers.
 *
 * @param src_width   source width; replaced by Abs(src_width) after the
 *                    slope computation
 * @param src_height  source height
 * @param dst_width   destination width in pixels
 * @param dst_height  destination height in rows
 * @param src_stride  distance between consecutive source rows, in elements
 * @param dst_stride  distance between consecutive destination rows, in elements
 * @param src_ptr     first element of the source plane
 * @param dst_ptr     first element of the destination plane
 *
 * NOTE(review): the general path launches ScaleCols_C with <<<1, 1>>>, i.e.
 * one block of one thread, and does so once per destination row. Each row is
 * therefore processed by a single GPU thread.
 * NOTE(review): ScaleColsUp2_C is invoked as an ordinary host call on the same
 * pointers that the other branch hands to a kernel - confirm which memory
 * space src_ptr / dst_ptr live in.
 */
void ScalePlaneSimple(int src_width, int src_height,
                      int dst_width, int dst_height,
                      int src_stride, int dst_stride,
                      const Npp8u* src_ptr, Npp8u* dst_ptr) {
    // Source start position and step sizes, 16.16 fixed point.
    int x = 0;
    int y = 0;
    int dx = 0;
    int dy = 0;
    ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
               &x, &y, &dx, &dy);
    src_width = Abs(src_width);

    Npp8u* dst_row = dst_ptr;
    if (src_width * 2 == dst_width && x < 0x8000) {
        // Exact 2x horizontal upscale: use the dedicated column routine.
        for (int row = 0; row < dst_height; ++row) {
            const Npp8u* src_row = src_ptr + (y >> 16) * src_stride;
            ScaleColsUp2_C(dst_row, src_row, dst_width, x, dx);
            dst_row += dst_stride;
            y += dy;
        }
    } else {
        // General ratio: one single-thread kernel launch per destination row.
        for (int row = 0; row < dst_height; ++row) {
            const Npp8u* src_row = src_ptr + (y >> 16) * src_stride;
            ScaleCols_C<<<1, 1>>>(dst_row, src_row, dst_width, x, dx);
            dst_row += dst_stride;
            y += dy;
        }
    }
}
/**
 * Point-samples one row: dst_ptr[j] = src_ptr[(x + j * dx) >> 16] for every
 * j in [0, dst_width).
 *
 * The work is distributed with a grid-stride loop over the x dimension of the
 * launch, so the kernel is correct for any 1-D launch configuration:
 *   - launched as <<<1, 1>>> a single thread walks the whole row and writes
 *     the same values as a plain serial loop;
 *   - launched as e.g. <<<(dst_width + 255) / 256, 256>>> each thread writes
 *     one (or a few) destination pixels.
 *
 * @param dst_ptr    destination row; must be writable from device code
 * @param src_ptr    source row; must be readable from device code
 * @param dst_width  number of destination pixels; nothing is written when
 *                   it is <= 0
 * @param x          source coordinate of destination pixel 0, 16.16 fixed point
 * @param dx         source step per destination pixel, 16.16 fixed point
 */
__global__ void ScaleCols_C(Npp8u* dst_ptr, const Npp8u* src_ptr,
    int dst_width, int x, int dx) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = gridDim.x * blockDim.x;

    for (int j = first; j < dst_width; j += step) {
        // Compute the coordinate directly from j instead of accumulating
        // x += dx, so threads are independent of each other. 64-bit math
        // keeps j * dx from overflowing int on wide rows.
        const long long src_x = (long long)x + (long long)j * dx;
        dst_ptr[j] = src_ptr[src_x >> 16];
    }
}

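void ScalePlaneSimple(int src_width, int src_height,
                      int dst_width, int dst_height,
                      int src_stride, int dst_stride,
                      const Npp8u* src_ptr, Npp8u* dst_ptr) {
    int i;
    // 初始源x/y坐标和步长值，采用16.16定点格式。
    int x = 0;
    int y = 0;
    int dx = 0;
    int dy = 0;

    ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
               &x, &y, &dx, &dy);
    src_width = Abs(src_width);
    if (src_width * 2 == dst_width && x < 0x8000) {
        for (i = 0; i < dst_height; ++i) {
            ScaleColsUp2_C(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
            dst_ptr += dst_stride;
            y += dy;
        }
    }
    else
    {
        for (i = 0; i < dst_height; ++i) {
            ScaleCols_C<<<1,1>>>(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
            dst_ptr += dst_stride;
            y += dy;
        }
    }
}
__global__ void ScaleCols_C(Npp8u* dst_ptr, const Npp8u* src_ptr,
                            int dst_width, int x, int dx) {
    int j;
    for (j = 0; j < dst_width - 1; j += 2) {
        dst_ptr[0] = src_ptr[x >> 16];
        x += dx;
        dst_ptr[1] = src_ptr[x >> 16];
        x += dx;
        dst_ptr += 2;
    }
    if (dst_width & 1) {
        dst_ptr[0] = src_ptr[x >> 16];
    }
}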

也许我应该使用并行计算？任何建议都是受欢迎的。

如果你想使用CUDA，请看看NVIDIA Performance Primitives（NPP）库，其中提供了图像缩放函数。（NPP的接口仿照Intel Performance Primitives（IPP）设计；如果你不想使用GPU，可以直接使用IPP。）

我在这里看到了很多循环，这与典型的CUDA实现不同。既然你使用的是NPP的数据类型，为什么不直接使用NPP函数呢？虽然NPP不提供YUV缩放，必须先转换为RGB才能调整大小，而且nppiResizeSqrPixel函数在缩小图像时效果不佳。你的内核只运行了1个线程。当你故意让GPU 99.99%的计算能力闲置时，抱怨性能还有什么意义？这是一个汇编问题吗？谢谢，我在nvidia_video_sdk（CNvEncoderLowLatency::ScaleNV12Image）中找到了示例代码，希望我也能找到混合（blend）NV12图像的示例代码。