如何在android上更快地将RGB565转换为YUV420SP?

如何在android上更快地将RGB565转换为YUV420SP?,android,arm,jpeg,rgb,yuv,Android,Arm,Jpeg,Rgb,Yuv,我需要显示一个jpeg图片,并将其转换为YUV420SP。首先我使用SkBitmap解析jpeg并显示它,然后我使用下面的代码在android上将RGB565转换为YUV420SP,但转换640*480 RGB565图片需要75毫秒,所以有人知道在android上将RGB565转换为YUV420SP的更快方法吗?还是在android上更快地将jpeg文件转换为YUV420SP // Convert from RGB to YUV420 int RGB2YUV_YR[256], RGB2YUV_Y

我需要显示一个jpeg图片,并将其转换为YUV420SP。首先我使用SkBitmap解析jpeg并显示它,然后我使用下面的代码在android上将RGB565转换为YUV420SP,但转换640*480 RGB565图片需要75毫秒,所以有人知道在android上将RGB565转换为YUV420SP的更快方法吗?还是在android上更快地将jpeg文件转换为YUV420SP

// RGB -> YUV conversion tables, each indexed by an 8-bit channel value.
// Entry [i] holds (coefficient * (i << 8)) truncated to int. The tables are
// zero until InitLookupTable() has filled them.
int RGB2YUV_YR[256], RGB2YUV_YG[256], RGB2YUV_YB[256];
int RGB2YUV_UR[256], RGB2YUV_UG[256], RGB2YUV_UBVR[256];
int RGB2YUV_VG[256], RGB2YUV_VB[256];

//
// Fills the RGB -> YUV lookup tables above.
//
// Only the first call does any work; later calls return immediately.
// NOTE(review): the "already initialised" flag is raised before the tables are
// written and is not synchronised, so concurrent first calls are not safe.
//
void InitLookupTable()
{
    static bool tablesReady = false;

    if (!tablesReady) {
        tablesReady = true;

        for (int level = 0; level < 256; level++) {
            // Channel value pre-shifted by 8 bits before scaling.
            const int scaled = level << 8;

            // Luma weights.
            RGB2YUV_YR[level] = (float) 65.481 * scaled;
            RGB2YUV_YG[level] = (float) 128.553 * scaled;
            RGB2YUV_YB[level] = (float) 24.966 * scaled;

            // Chroma weights; UBVR is shared by the B term of U and the
            // R term of V (both use the coefficient 112).
            RGB2YUV_UR[level] = (float) 37.797 * scaled;
            RGB2YUV_UG[level] = (float) 74.203 * scaled;
            RGB2YUV_VG[level] = (float) 93.786 * scaled;
            RGB2YUV_VB[level] = (float) 18.214 * scaled;
            RGB2YUV_UBVR[level] = (float) 112 * scaled;
        }
    }
}

// Returns the time from *from to *to in milliseconds.
static float ElapsedMs(const struct timeval *from, const struct timeval *to)
{
    float usec = 1000000 * (to->tv_sec - from->tv_sec) + to->tv_usec - from->tv_usec;
    return usec / 1000;
}

// Splits one RGB565 pixel into 8-bit channels. The top bits of each channel
// are replicated into the low bits so that full-scale 5/6-bit values map to 255.
static void ExpandRGB565(uint16_t color, unsigned char *r8, unsigned char *g8, unsigned char *b8)
{
    const unsigned int r = (color >> 11) & 0x1f;
    const unsigned int g = (color >> 5) & 0x3f;
    const unsigned int b = color & 0x1f;

    *r8 = (unsigned char)((r << 3) | (r >> 2));
    *g8 = (unsigned char)((g << 2) | (g >> 4));
    *b8 = (unsigned char)((b << 3) | (b >> 2));
}

//
// Converts a tightly packed RGB565 image to semi-planar YUV 4:2:0.
//
// Output layout in yuv: w*h luma bytes, followed by w*h/2 chroma bytes stored
// as interleaved U,V pairs (U first). Each U/V pair is the average of the
// chroma of one 2x2 pixel quad.
//
// Parameters:
//   w, h - image size in pixels; both must be positive and even.
//   bmp  - w*h RGB565 pixels. It is read through a uint16_t pointer, so it
//          must be suitably aligned for 16-bit access.
//   yuv  - destination, at least w*h*3/2 bytes.
//
// Returns 1 on success, 0 if an argument is invalid (nothing is written).
//
// Progress and per-phase timings are printed to stdout.
//
int ConvertRGB5652YUV420SP(int w, int h, unsigned char *bmp, unsigned char *yuv)
{
    struct timeval tpstart, tpend;
    unsigned char rValue = 0, gValue = 0, bValue = 0;

    printf("ConvertRGB5652YUV420SP begin,w=%d,h=%d,bmp=%p,yuv=%p\n", w, h, bmp, yuv);

    gettimeofday(&tpstart, NULL);
    InitLookupTable();
    gettimeofday(&tpend, NULL);
    printf("InitLookupTable used time=%f\n", ElapsedMs(&tpstart, &tpend));

    // Reject inputs the 2x2 subsampling cannot handle: odd sizes would make
    // the quad reads run past the source and the chroma writes run past a
    // w*h*3/2 destination.
    if (bmp == NULL || yuv == NULL || w <= 0 || h <= 0 || (w & 1) || (h & 1))
        return 0;

    const uint16_t *src = (const uint16_t *)bmp;
    const size_t pixels = (size_t)w * (size_t)h;

    // Phase 1: luma for every pixel.
    gettimeofday(&tpstart, NULL);
    for (size_t n = 0; n < pixels; n++) {
        ExpandRGB565(src[n], &rValue, &gValue, &bValue);
        // 1048576 == 16 << 16: the luma offset, applied before dropping the fraction.
        yuv[n] = (unsigned char)((RGB2YUV_YR[rValue] + RGB2YUV_YG[gValue] +
                                  RGB2YUV_YB[bValue] + 1048576) >> 16);
    }
    gettimeofday(&tpend, NULL);
    printf("Get YUV values used  time=%f\n", ElapsedMs(&tpstart, &tpend));

    // Phase 2: chroma, averaged over each 2x2 quad and written straight into
    // the interleaved plane. No scratch planes are needed, which removes the
    // two w*h heap buffers (and their mismatched new[]/delete) entirely.
    gettimeofday(&tpstart, NULL);
    unsigned char *uv = yuv + pixels;
    for (int row = 0; row < h; row += 2) {
        const uint16_t *top = src + (size_t)row * (size_t)w;
        const uint16_t *bottom = top + w;

        for (int col = 0; col < w; col += 2) {
            const uint16_t quad[4] = { top[col], top[col + 1], bottom[col], bottom[col + 1] };
            unsigned int uSum = 0, vSum = 0;

            for (int k = 0; k < 4; k++) {
                ExpandRGB565(quad[k], &rValue, &gValue, &bValue);
                // 8388608 == 128 << 16: centres the signed chroma on 128.
                uSum += (unsigned char)((-RGB2YUV_UR[rValue] - RGB2YUV_UG[gValue] +
                                         RGB2YUV_UBVR[bValue] + 8388608) >> 16);
                vSum += (unsigned char)((RGB2YUV_UBVR[rValue] - RGB2YUV_VG[gValue] -
                                         RGB2YUV_VB[bValue] + 8388608) >> 16);
            }

            *uv++ = (unsigned char)(uSum >> 2);
            *uv++ = (unsigned char)(vSum >> 2);
        }
    }
    gettimeofday(&tpend, NULL);
    printf("Do sampling used time=%f\n", ElapsedMs(&tpstart, &tpend));

    return 1;
}

// Benchmark driver: converts one all-zero 640x480 RGB565 frame and prints the
// elapsed wall-clock time in milliseconds.
// Returns 0 when the conversion succeeded, 1 otherwise.
int main(int argc, char **argv) {
    // Static storage (zero-initialised): the two buffers total about 1 MB,
    // which is too large to place on the stack safely.
    static unsigned char bmp[640*480*2];
    static unsigned char yuv[(640*480*3)/2];
    struct timeval tpstart,tpend;

    (void)argc;
    (void)argv;

    gettimeofday(&tpstart,NULL);

    // The converter reports success as a non-zero return value.
    const int converted = ConvertRGB5652YUV420SP(640, 480, bmp, yuv);

    gettimeofday(&tpend,NULL);
    float timeuse=1000000*(tpend.tv_sec-tpstart.tv_sec)+tpend.tv_usec-tpstart.tv_usec;
    timeuse/=1000;
    // Message names the function that was actually timed.
    printf("ConvertRGB5652YUV420SP used time=%f\n", timeuse);
    return converted ? 0 : 1;
}
cpu信息:

$ cat /proc/cpuinfo
cat /proc/cpuinfo
Processor       : ARMv6-compatible processor rev 5 (v6l)
BogoMIPS        : 791.34
Features        : swp half thumb fastmult vfp edsp java
CPU implementer : 0x41
CPU architecture: 6TEJ
CPU variant     : 0x1
CPU part        : 0xb36
CPU revision    : 5

Hardware        : IMAPX200
Revision        : 0000
Serial          : 0000000000000000

在ARMv7上,请使用NEON指令集。对于VGA分辨率的图像,它可以在不到1毫秒的时间内完成转换。

如果您只能使用ARMv6,请用ARM汇编对其进行优化(VGA分辨率下约8毫秒)。

使用定点运算代替查找表,把查找表去掉。

准备两个掩码:

  • 0x001f001f:mask1
  • 0x003f003f:mask2
然后一次将两个像素加载到32位寄存器中(这比16位读取快得多)

现在有三个寄存器,每个寄存器包含两个值——一个在低位,另一个在高位16位

从现在起,smulxy指令将创造奇迹。(16位乘法)

祝你好运

PS:你的查找表也不太理想。为什么它们的长度都是256?可以把与r、b相关的表缩减到32项,把与g相关的表缩减到64项,这样可以提高缓存命中率。仅凭这一点,也许无需借助汇编就能达到40毫秒的目标。
是的,缓存未命中非常痛苦。

我在skia中找到了一种更快的方法,它大约运行40毫秒

#include "SkColorPriv.h"
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkStream.h"

using namespace android;

// taken from jcolor.c in libjpeg
//
// Fixed-point RGB -> YUV weights. Each constant is the real coefficient shown
// in its trailing comment multiplied by 2^CSHIFT and rounded, so a weighted
// sum is turned back into a channel value with ">> CSHIFT".
//   CY* - weights of R, G, B in luma (Y)
//   CU* - weights of R, G, B in U
//   CV* - weights of R, G, B in V
// The "#if 0" selects the 8-bit set; change it to "#if 1" for the 16-bit set.
#if 0   // 16bit - precise but slow
    #define CYR     19595   // 0.299
    #define CYG     38470   // 0.587
    #define CYB      7471   // 0.114

    #define CUR    -11059   // -0.16874
    #define CUG    -21709   // -0.33126
    #define CUB     32768   // 0.5

    #define CVR     32768   // 0.5
    #define CVG    -27439   // -0.41869
    #define CVB     -5329   // -0.08131

    // Number of fractional bits in the constants above.
    #define CSHIFT  16
#else      // 8bit - fast, slightly less precise
    #define CYR     77    // 0.299
    #define CYG     150    // 0.587
    #define CYB      29    // 0.114

    #define CUR     -43    // -0.16874
    #define CUG    -85    // -0.33126
    #define CUB     128    // 0.5

    #define CVR      128   // 0.5
    #define CVG     -107   // -0.41869
    #define CVB      -21   // -0.08131

    // Number of fractional bits in the constants above.
    #define CSHIFT  8
#endif

// Converts one packed 32-bit colour to a {Y, U, V} byte triple.
//   dst - receives Y in dst[0], U in dst[1], V in dst[2]
//   c   - packed colour; channels are read with SkGetPackedR32/G32/B32
// U and V are biased by +128 before being stored.
static void rgb2yuv_32(uint8_t dst[], SkPMColor c) {
    const int red   = SkGetPackedR32(c);
    const int green = SkGetPackedG32(c);
    const int blue  = SkGetPackedB32(c);

    // Fixed-point weighted sums; ">> CSHIFT" drops the fractional bits.
    const int luma = ( CYR*red + CYG*green + CYB*blue ) >> CSHIFT;
    const int cb   = ( CUR*red + CUG*green + CUB*blue ) >> CSHIFT;
    const int cr   = ( CVR*red + CVG*green + CVB*blue ) >> CSHIFT;

    dst[0] = SkToU8(luma);
    dst[1] = SkToU8(cb + 128);
    dst[2] = SkToU8(cr + 128);
}

// Converts one packed 32-bit colour to Y, U and/or V, writing only the
// components whose destination pointer is non-NULL.
//   py, pu, pv - optional destinations for Y, U (biased by +128), V (biased by +128)
//   c          - packed colour; channels are read with SkGetPackedR32/G32/B32
static void rgb2yuv_32_x(uint8_t *py, uint8_t *pu, uint8_t *pv, SkPMColor c) {
    const int red   = SkGetPackedR32(c);
    const int green = SkGetPackedG32(c);
    const int blue  = SkGetPackedB32(c);

    if (py) {
        const int luma = ( CYR*red + CYG*green + CYB*blue ) >> CSHIFT;
        *py = SkToU8(luma);
    }

    if (pu) {
        const int cb = ( CUR*red + CUG*green + CUB*blue ) >> CSHIFT;
        *pu = SkToU8(cb + 128);
    }

    if (pv) {
        const int cr = ( CVR*red + CVG*green + CVB*blue ) >> CSHIFT;
        *pv = SkToU8(cr + 128);
    }
}

// Converts one packed 4444 colour to a {Y, U, V} byte triple.
//   dst - receives Y in dst[0], U in dst[1], V in dst[2]
//   c   - packed colour; channels are read with SkGetPackedR4444/G4444/B4444
// The shift is 4 bits shorter than CSHIFT, i.e. the result is scaled up by 16
// (presumably because the 4444 channels are 4 bits wide - confirm against Skia).
static void rgb2yuv_4444(uint8_t dst[], U16CPU c) {
    const int red   = SkGetPackedR4444(c);
    const int green = SkGetPackedG4444(c);
    const int blue  = SkGetPackedB4444(c);

    const int shift = CSHIFT - 4;
    const int luma = ( CYR*red + CYG*green + CYB*blue ) >> shift;
    const int cb   = ( CUR*red + CUG*green + CUB*blue ) >> shift;
    const int cr   = ( CVR*red + CVG*green + CVB*blue ) >> shift;

    dst[0] = SkToU8(luma);
    dst[1] = SkToU8(cb + 128);
    dst[2] = SkToU8(cr + 128);
}

// Converts one packed 4444 colour to Y, U and/or V, writing only the
// components whose destination pointer is non-NULL.
//   py, pu, pv - optional destinations for Y, U (biased by +128), V (biased by +128)
//   c          - packed colour; channels are read with SkGetPackedR4444/G4444/B4444
// The shift is 4 bits shorter than CSHIFT, i.e. the result is scaled up by 16
// (presumably because the 4444 channels are 4 bits wide - confirm against Skia).
static void rgb2yuv_4444_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
    const int red   = SkGetPackedR4444(c);
    const int green = SkGetPackedG4444(c);
    const int blue  = SkGetPackedB4444(c);
    const int shift = CSHIFT - 4;

    if (py) {
        const int luma = ( CYR*red + CYG*green + CYB*blue ) >> shift;
        *py = SkToU8(luma);
    }

    if (pu) {
        const int cb = ( CUR*red + CUG*green + CUB*blue ) >> shift;
        *pu = SkToU8(cb + 128);
    }

    if (pv) {
        const int cr = ( CVR*red + CVG*green + CVB*blue ) >> shift;
        *pv = SkToU8(cr + 128);
    }
}

// Converts one packed 16-bit colour to a {Y, U, V} byte triple.
//   dst - receives Y in dst[0], U in dst[1], V in dst[2]
//   c   - packed colour; channels are read with SkGetPackedR16/G16/B16
// The red and blue weights are doubled and the shift is 2 bits shorter than
// CSHIFT (presumably because R/B carry one bit less than G in this format -
// confirm against Skia).
static void rgb2yuv_16(uint8_t dst[], U16CPU c) {
    const int red   = SkGetPackedR16(c);
    const int green = SkGetPackedG16(c);
    const int blue  = SkGetPackedB16(c);

    const int shift = CSHIFT - 2;
    const int luma = ( 2*CYR*red + CYG*green + 2*CYB*blue ) >> shift;
    const int cb   = ( 2*CUR*red + CUG*green + 2*CUB*blue ) >> shift;
    const int cr   = ( 2*CVR*red + CVG*green + 2*CVB*blue ) >> shift;

    dst[0] = SkToU8(luma);
    dst[1] = SkToU8(cb + 128);
    dst[2] = SkToU8(cr + 128);
}

// Converts one packed 16-bit colour to Y, U and/or V, writing only the
// components whose destination pointer is non-NULL.
//   py, pu, pv - optional destinations for Y, U (biased by +128), V (biased by +128)
//   c          - packed colour; channels are read with SkGetPackedR16/G16/B16
// The red and blue weights are doubled and the shift is 2 bits shorter than
// CSHIFT (presumably because R/B carry one bit less than G in this format -
// confirm against Skia).
static void rgb2yuv_16_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
    const int red   = SkGetPackedR16(c);
    const int green = SkGetPackedG16(c);
    const int blue  = SkGetPackedB16(c);
    const int shift = CSHIFT - 2;

    if (py) {
        const int luma = ( 2*CYR*red + CYG*green + 2*CYB*blue ) >> shift;
        *py = SkToU8(luma);
    }

    if (pu) {
        const int cb = ( 2*CUR*red + CUG*green + 2*CUB*blue ) >> shift;
        *pu = SkToU8(cb + 128);
    }

    if (pv) {
        const int cr = ( 2*CVR*red + CVG*green + 2*CVB*blue ) >> shift;
        *pv = SkToU8(cr + 128);
    }
}


// Converts an RGB565 SkBitmap to semi-planar YUV 4:2:0.
//
// Output layout in dst: width*height luma bytes, then one chroma row of
// `width` bytes per pair of image rows, stored as interleaved Cb,Cr bytes.
// Chroma is point-sampled, not averaged: Cb comes from the even-column pixel
// of the even row of each pair, Cr from the even-column pixel of the odd row.
//
// Parameters:
//   bmp - source bitmap; must be non-NULL, use kRGB_565_Config and have
//         accessible pixels (getPixels() != NULL).
//   dst - destination buffer.
//
// Returns 0 on success, -1 if an argument is unusable.
//
// NOTE(review): for an odd width or height the chroma writes extend beyond
// width*height*3/2 bytes - confirm callers only pass even sizes or allocate
// a larger buffer.
int ConvertRGB5652YUV420SPBySkia(SkBitmap* bmp, unsigned char* dst) {
    if(!bmp || !dst || bmp->getConfig() != SkBitmap::kRGB_565_Config)
         return -1;

    const int width = bmp->width();
    const int height = bmp->height();
    const uint8_t *src = (const uint8_t *)bmp->getPixels();
    if(!src)
         return -1;   // no pixel memory to read from

    const int src_rowbytes = bmp->rowBytes();
    const int stride = width;

    // Plain pointer arithmetic: the previous casts through `unsigned int`
    // truncate addresses on targets where pointers are wider than 32 bits.
    uint8_t *y_base = (uint8_t *)dst;
    uint8_t *cb_base = y_base + stride * height;
    uint8_t *cb = NULL, *cr = NULL;

    for(int i = 0; i < height; i++){
        const uint16_t *rgb = (const uint16_t *)(src + i * src_rowbytes);
        uint8_t *y = y_base + i * stride;

        // Each even row starts a new chroma row; the following odd row
        // keeps using the same one for its Cr samples.
        if((i & 0x1) == 0){
            cb = cb_base + (i >> 1) * stride;
            cr = cb + 1;
        }

        for(int j = 0; j < width; j++){
            uint8_t *pu = NULL, *pv = NULL;

            // Chroma is only taken at even columns: Cb on even rows,
            // Cr on odd rows. Samples are 2 bytes apart (interleaved).
            if((j & 0x01) == 0){
                if(i & 0x1){
                    pv = cr;
                    cr += 2;
                }else{
                    pu = cb;
                    cb += 2;
                }
            }

            rgb2yuv_16_x(y++, pu, pv, *rgb++);
        }
    }
    return 0;
}
#包括“SkColorPriv.h”
#包括“SkBitmap.h”
#包括“SkCanvas.h”
#包括“SkStream.h”
使用android名称空间;
//取自libjpeg中的jcolor.c
#如果0//16位-精确但速度慢
#定义CYR 19595//0.299
#定义CYG 38470//0.587
#定义CYB 7471//0.114
#定义CUR-11059/-0.16874
#定义CUG-21709//-0.33126
#定义CUB 32768//0.5
#定义CVR 32768//0.5
#定义CVG-27439/-0.41869
#定义CVB-5329//-0.08131
#定义CSShift 16
#else//8位-速度快,精确度稍低
#定义CYR 77//0.299
#定义CYG 150//0.587
#定义CYB 29//0.114
#定义CUR-43//-0.16874
#定义CUG-85//-0.33126
#定义CUB 128//0.5
#定义CVR 128//0.5
#定义CVG-107//-0.41869
#定义CVB-21//-0.08131
#定义CSShift 8
#恩迪夫
静态无效rgb2yuv_32(uint8_t dst[],SKPMC颜色){
int r=SkGetPackedR32(c);
int g=SkGetPackedG32(c);
intb=SkGetPackedB32(c);
int y=(CYR*r+CYG*g+CYB*b)>>CSHIFT;
INTU=(电流*r+CUG*g+CUB*b)>>CSHIFT;
INTV=(CVR*r+CVG*g+CVB*b)>>C换档;
dst[0]=SkToU8(y);
dst[1]=SkToU8(u+128);
dst[2]=SkToU8(v+128);
}
静态空隙rgb2yuv_32_x(uint8_t*py、uint8_t*pu、uint8_t*pv、SKPMC颜色){
int r=SkGetPackedR32(c);
int g=SkGetPackedG32(c);
intb=SkGetPackedB32(c);
如果(py!=NULL){
int y=(CYR*r+CYG*g+CYB*b)>>CSHIFT;
*py=SkToU8(y);
}
如果(pu!=NULL){
INTU=(电流*r+CUG*g+CUB*b)>>CSHIFT;
*pu=SkToU8(u+128);
}
如果(pv!=NULL){
INTV=(CVR*r+CVG*g+CVB*b)>>C换档;
*pv=SkToU8(v+128);
}
}
静态无效rgb2yuv_4444(uint8_t dst[],U16CPU c){
int r=SkGetPackedR4444(c);
int g=SkGetPackedG4444(c);
intb=SkGetPackedB4444(c);
int y=(CYR*r+CYG*g+CYB*b)>>(CSHIFT-4);
INTU=(电流*r+CUG*g+CUB*b)>>(CSHIFT-4);
INTV=(CVR*r+CVG*g+CVB*b)>>(CSHIFT-4);
dst[0]=SkToU8(y);
dst[1]=SkToU8(u+128);
dst[2]=SkToU8(v+128);
}
静态空隙rgb2yuv_4444_x(uint8_t*py、uint8_t*pu、uint8_t*pv、u16c){
int r=SkGetPackedR4444(c);
int g=SkGetPackedG4444(c);
intb=SkGetPackedB4444(c);
如果(py!=NULL){
int y=(CYR*r+CYG*g+CYB*b)>>(CSHIFT-4);
*py=SkToU8(y);
}
如果(pu!=NULL){
INTU=(电流*r+CUG*g+CUB*b)>>(CSHIFT-4);
*pu=SkToU8(u+128);
}
如果(pv!=NULL){
INTV=(CVR*r+CVG*g+CVB*b)>>(CSHIFT-4);
*pv=SkToU8(v+128);
}
}
静态无效rgb2yuv_16(uint8_t dst[],U16CPU c){
int r=SkGetPackedR16(c);
int g=SkGetPackedG16(c);
intb=SkGetPackedB16(c);
int y=(2*CYR*r+CYG*g+2*CYB*b)>>(CSHIFT-2);
intu=(2*CUR*r+CUG*g+2*CUB*b)>>(CSHIFT-2);
INTV=(2*CVR*r+CVG*g+2*CVB*b)>>(CSHIFT-2);
dst[0]=SkToU8(y);
dst[1]=SkToU8(u+128);
dst[2]=SkToU8(v+128);
}
静态空隙rgb2yuv_16_x(uint8_t*py、uint8_t*pu、uint8_t*pv、u16c){
int r=SkGetPackedR16(c);
int g=SkGetPackedG16(c);
intb=SkGetPackedB16(c);
如果(py!=NULL){
int y=(2*CYR*r+CYG*g+2*CYB*b)>>(CSHIFT-2);
*py=SkToU8(y);
}
如果(pu!=NULL){
intu=(2*CUR*r+CUG*g+2*CUB*b)>>(CSHIFT-2);
*pu=SkToU8(u+128);
}
如果(pv!=NULL){
INTV=(2*CVR*r+CVG*g+2*CVB*b)>>(CSHIFT-2);
*pv=SkToU8(v+128);
}
}
int-convertRGB5652YUV420SPYSKIA(SkBitmap*bmp,无符号字符*dst){
如果(!bmp | |!dst | | bmp->getConfig()!=SkBitmap::kRGB_565_Config)
返回-1;
int width=bmp->width();
int height=bmp->height();
void*src=bmp->getPixels();
int src_rowbytes=bmp->rowbytes();
int步长=宽度;
高度=高度;
int i,j;
uint8_t*y_base=(uint8_t*)dst;
uint8_t*cb_base=(uint8_t*)((无符号整数)y_base+stride*dstheight);
uint8_t*cr_base=cb_base+1;
uint8_t yuv[3];
uint8_t*y=NULL,*cb=NULL,*cr=NULL;
uint16_t*rgb=(uint16_t*)src;
对于(i=0;i>1)*步幅);
cr=cb+1;
}

对于(j=0;jbe),因为我需要将图像处理为>24fps,所以我需要在
and red, mask1, pixel, lsr #11
and grn, mask2, pixel, lsr #5
and blu, mask1, pixel
#include "SkColorPriv.h"
#include "SkBitmap.h"
#include "SkCanvas.h"
#include "SkStream.h"

using namespace android;

// taken from jcolor.c in libjpeg
//
// Fixed-point RGB -> YUV weights. Each constant is the real coefficient shown
// in its trailing comment multiplied by 2^CSHIFT and rounded, so a weighted
// sum is turned back into a channel value with ">> CSHIFT".
//   CY* - weights of R, G, B in luma (Y)
//   CU* - weights of R, G, B in U
//   CV* - weights of R, G, B in V
// The "#if 0" selects the 8-bit set; change it to "#if 1" for the 16-bit set.
#if 0   // 16bit - precise but slow
    #define CYR     19595   // 0.299
    #define CYG     38470   // 0.587
    #define CYB      7471   // 0.114

    #define CUR    -11059   // -0.16874
    #define CUG    -21709   // -0.33126
    #define CUB     32768   // 0.5

    #define CVR     32768   // 0.5
    #define CVG    -27439   // -0.41869
    #define CVB     -5329   // -0.08131

    // Number of fractional bits in the constants above.
    #define CSHIFT  16
#else      // 8bit - fast, slightly less precise
    #define CYR     77    // 0.299
    #define CYG     150    // 0.587
    #define CYB      29    // 0.114

    #define CUR     -43    // -0.16874
    #define CUG    -85    // -0.33126
    #define CUB     128    // 0.5

    #define CVR      128   // 0.5
    #define CVG     -107   // -0.41869
    #define CVB      -21   // -0.08131

    // Number of fractional bits in the constants above.
    #define CSHIFT  8
#endif

// Converts one packed 32-bit colour to a {Y, U, V} byte triple.
//   dst - receives Y in dst[0], U in dst[1], V in dst[2]
//   c   - packed colour; channels are read with SkGetPackedR32/G32/B32
// U and V are biased by +128 before being stored.
static void rgb2yuv_32(uint8_t dst[], SkPMColor c) {
    const int red   = SkGetPackedR32(c);
    const int green = SkGetPackedG32(c);
    const int blue  = SkGetPackedB32(c);

    // Fixed-point weighted sums; ">> CSHIFT" drops the fractional bits.
    const int luma = ( CYR*red + CYG*green + CYB*blue ) >> CSHIFT;
    const int cb   = ( CUR*red + CUG*green + CUB*blue ) >> CSHIFT;
    const int cr   = ( CVR*red + CVG*green + CVB*blue ) >> CSHIFT;

    dst[0] = SkToU8(luma);
    dst[1] = SkToU8(cb + 128);
    dst[2] = SkToU8(cr + 128);
}

// Converts one packed 32-bit colour to Y, U and/or V, writing only the
// components whose destination pointer is non-NULL.
//   py, pu, pv - optional destinations for Y, U (biased by +128), V (biased by +128)
//   c          - packed colour; channels are read with SkGetPackedR32/G32/B32
static void rgb2yuv_32_x(uint8_t *py, uint8_t *pu, uint8_t *pv, SkPMColor c) {
    const int red   = SkGetPackedR32(c);
    const int green = SkGetPackedG32(c);
    const int blue  = SkGetPackedB32(c);

    if (py) {
        const int luma = ( CYR*red + CYG*green + CYB*blue ) >> CSHIFT;
        *py = SkToU8(luma);
    }

    if (pu) {
        const int cb = ( CUR*red + CUG*green + CUB*blue ) >> CSHIFT;
        *pu = SkToU8(cb + 128);
    }

    if (pv) {
        const int cr = ( CVR*red + CVG*green + CVB*blue ) >> CSHIFT;
        *pv = SkToU8(cr + 128);
    }
}

// Converts one packed 4444 colour to a {Y, U, V} byte triple.
//   dst - receives Y in dst[0], U in dst[1], V in dst[2]
//   c   - packed colour; channels are read with SkGetPackedR4444/G4444/B4444
// The shift is 4 bits shorter than CSHIFT, i.e. the result is scaled up by 16
// (presumably because the 4444 channels are 4 bits wide - confirm against Skia).
static void rgb2yuv_4444(uint8_t dst[], U16CPU c) {
    const int red   = SkGetPackedR4444(c);
    const int green = SkGetPackedG4444(c);
    const int blue  = SkGetPackedB4444(c);

    const int shift = CSHIFT - 4;
    const int luma = ( CYR*red + CYG*green + CYB*blue ) >> shift;
    const int cb   = ( CUR*red + CUG*green + CUB*blue ) >> shift;
    const int cr   = ( CVR*red + CVG*green + CVB*blue ) >> shift;

    dst[0] = SkToU8(luma);
    dst[1] = SkToU8(cb + 128);
    dst[2] = SkToU8(cr + 128);
}

// Converts one packed 4444 colour to Y, U and/or V, writing only the
// components whose destination pointer is non-NULL.
//   py, pu, pv - optional destinations for Y, U (biased by +128), V (biased by +128)
//   c          - packed colour; channels are read with SkGetPackedR4444/G4444/B4444
// The shift is 4 bits shorter than CSHIFT, i.e. the result is scaled up by 16
// (presumably because the 4444 channels are 4 bits wide - confirm against Skia).
static void rgb2yuv_4444_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
    const int red   = SkGetPackedR4444(c);
    const int green = SkGetPackedG4444(c);
    const int blue  = SkGetPackedB4444(c);
    const int shift = CSHIFT - 4;

    if (py) {
        const int luma = ( CYR*red + CYG*green + CYB*blue ) >> shift;
        *py = SkToU8(luma);
    }

    if (pu) {
        const int cb = ( CUR*red + CUG*green + CUB*blue ) >> shift;
        *pu = SkToU8(cb + 128);
    }

    if (pv) {
        const int cr = ( CVR*red + CVG*green + CVB*blue ) >> shift;
        *pv = SkToU8(cr + 128);
    }
}

// Converts one packed 16-bit colour to a {Y, U, V} byte triple.
//   dst - receives Y in dst[0], U in dst[1], V in dst[2]
//   c   - packed colour; channels are read with SkGetPackedR16/G16/B16
// The red and blue weights are doubled and the shift is 2 bits shorter than
// CSHIFT (presumably because R/B carry one bit less than G in this format -
// confirm against Skia).
static void rgb2yuv_16(uint8_t dst[], U16CPU c) {
    const int red   = SkGetPackedR16(c);
    const int green = SkGetPackedG16(c);
    const int blue  = SkGetPackedB16(c);

    const int shift = CSHIFT - 2;
    const int luma = ( 2*CYR*red + CYG*green + 2*CYB*blue ) >> shift;
    const int cb   = ( 2*CUR*red + CUG*green + 2*CUB*blue ) >> shift;
    const int cr   = ( 2*CVR*red + CVG*green + 2*CVB*blue ) >> shift;

    dst[0] = SkToU8(luma);
    dst[1] = SkToU8(cb + 128);
    dst[2] = SkToU8(cr + 128);
}

// Converts one packed 16-bit colour to Y, U and/or V, writing only the
// components whose destination pointer is non-NULL.
//   py, pu, pv - optional destinations for Y, U (biased by +128), V (biased by +128)
//   c          - packed colour; channels are read with SkGetPackedR16/G16/B16
// The red and blue weights are doubled and the shift is 2 bits shorter than
// CSHIFT (presumably because R/B carry one bit less than G in this format -
// confirm against Skia).
static void rgb2yuv_16_x(uint8_t *py, uint8_t *pu, uint8_t *pv, U16CPU c) {
    const int red   = SkGetPackedR16(c);
    const int green = SkGetPackedG16(c);
    const int blue  = SkGetPackedB16(c);
    const int shift = CSHIFT - 2;

    if (py) {
        const int luma = ( 2*CYR*red + CYG*green + 2*CYB*blue ) >> shift;
        *py = SkToU8(luma);
    }

    if (pu) {
        const int cb = ( 2*CUR*red + CUG*green + 2*CUB*blue ) >> shift;
        *pu = SkToU8(cb + 128);
    }

    if (pv) {
        const int cr = ( 2*CVR*red + CVG*green + 2*CVB*blue ) >> shift;
        *pv = SkToU8(cr + 128);
    }
}


// Converts an RGB565 SkBitmap to semi-planar YUV 4:2:0.
//
// Output layout in dst: width*height luma bytes, then one chroma row of
// `width` bytes per pair of image rows, stored as interleaved Cb,Cr bytes.
// Chroma is point-sampled, not averaged: Cb comes from the even-column pixel
// of the even row of each pair, Cr from the even-column pixel of the odd row.
//
// Parameters:
//   bmp - source bitmap; must be non-NULL, use kRGB_565_Config and have
//         accessible pixels (getPixels() != NULL).
//   dst - destination buffer.
//
// Returns 0 on success, -1 if an argument is unusable.
//
// NOTE(review): for an odd width or height the chroma writes extend beyond
// width*height*3/2 bytes - confirm callers only pass even sizes or allocate
// a larger buffer.
int ConvertRGB5652YUV420SPBySkia(SkBitmap* bmp, unsigned char* dst) {
    if(!bmp || !dst || bmp->getConfig() != SkBitmap::kRGB_565_Config)
         return -1;

    const int width = bmp->width();
    const int height = bmp->height();
    const uint8_t *src = (const uint8_t *)bmp->getPixels();
    if(!src)
         return -1;   // no pixel memory to read from

    const int src_rowbytes = bmp->rowBytes();
    const int stride = width;

    // Plain pointer arithmetic: the previous casts through `unsigned int`
    // truncate addresses on targets where pointers are wider than 32 bits.
    uint8_t *y_base = (uint8_t *)dst;
    uint8_t *cb_base = y_base + stride * height;
    uint8_t *cb = NULL, *cr = NULL;

    for(int i = 0; i < height; i++){
        const uint16_t *rgb = (const uint16_t *)(src + i * src_rowbytes);
        uint8_t *y = y_base + i * stride;

        // Each even row starts a new chroma row; the following odd row
        // keeps using the same one for its Cr samples.
        if((i & 0x1) == 0){
            cb = cb_base + (i >> 1) * stride;
            cr = cb + 1;
        }

        for(int j = 0; j < width; j++){
            uint8_t *pu = NULL, *pv = NULL;

            // Chroma is only taken at even columns: Cb on even rows,
            // Cr on odd rows. Samples are 2 bytes apart (interleaved).
            if((j & 0x01) == 0){
                if(i & 0x1){
                    pv = cr;
                    cr += 2;
                }else{
                    pu = cb;
                    cb += 2;
                }
            }

            rgb2yuv_16_x(y++, pu, pv, *rgb++);
        }
    }
    return 0;
}