Я только что попытался оптимизировать конвертер RGB в YUV420. Использование таблицы поиска дало прирост скорости, как и арифметика с фиксированной точкой. Однако настоящего выигрыша я ожидал от инструкций SSE. После первой попытки код стал медленнее, а после объединения всех операций он работает примерно с той же скоростью, что и исходный код. Что-то не так в моей реализации, или инструкции SSE просто не подходят для этой задачи?
Ниже приведен раздел исходного кода:
/// RGB -> YUV weighting coefficients, named RRGB24YUVCI2_<row><col>.
/// In the scalar Convert(), row 0 produces Y, row 1 produces U and row 2
/// produces V; columns 0, 1 and 2 multiply the R, G and B samples respectively.
/// The literals are unsuffixed, so every expression using them is evaluated
/// in double precision.

/// Row 0: Y = 0.299*R + 0.587*G + 0.114*B.
#define RRGB24YUVCI2_00 0.299
#define RRGB24YUVCI2_01 0.587
#define RRGB24YUVCI2_02 0.114
/// Row 1: U = -0.147*R - 0.289*G + 0.436*B.
#define RRGB24YUVCI2_10 -0.147
#define RRGB24YUVCI2_11 -0.289
#define RRGB24YUVCI2_12 0.436
/// Row 2: V = 0.615*R - 0.515*G - 0.100*B.
#define RRGB24YUVCI2_20 0.615
#define RRGB24YUVCI2_21 -0.515
#define RRGB24YUVCI2_22 -0.100
void RealRGB24toYUV420Converter::Convert(void* pRgb, void* pY, void* pU, void* pV)
{
yuvType* py = (yuvType *)pY;
yuvType* pu = (yuvType *)pU;
yuvType* pv = (yuvType *)pV;
unsigned char* src = (unsigned char *)pRgb;
/// Y have range 0..255, U & V have range -128..127.
double u,v;
double r,g,b;
/// Step in 2x2 pel blocks. (4 pels per block).
int xBlks = _width >> 1;
int yBlks = _height >> 1;
for(int yb = 0; yb < yBlks; yb++)
for(int xb = 0; xb < xBlks; xb++)
{
int chrOff = yb*xBlks + xb;
int lumOff = (yb*_width + xb) << 1;
unsigned char* t = src + lumOff*3;
/// Top left pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u = RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v = RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Top right pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
lumOff += _width;
t = t + _width*3 - 6;
/// Bottom left pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Bottom right pel.
b = (double)(*t++);
g = (double)(*t++);
r = (double)(*t++);
py[lumOff+1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255((int)(0.5 + RRGB24YUVCI2_00*r + RRGB24YUVCI2_01*g + RRGB24YUVCI2_02*b));
u += RRGB24YUVCI2_10*r + RRGB24YUVCI2_11*g + RRGB24YUVCI2_12*b;
v += RRGB24YUVCI2_20*r + RRGB24YUVCI2_21*g + RRGB24YUVCI2_22*b;
/// Average the 4 chr values.
int iu = (int)u;
int iv = (int)v;
if(iu < 0) ///< Rounding.
iu -= 2;
else
iu += 2;
if(iv < 0) ///< Rounding.
iv -= 2;
else
iv += 2;
pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu/4) );
pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv/4) );
}//end for xb & yb...
}//end Convert.
А вот версия с использованием SSE:
/// Single-precision RGB -> YUV coefficients, named fRRGB24YUVCI2_<row><col>
/// with the same numeric values as the RRGB24YUVCI2_<row><col> macros.
/// Each initializer is a double literal that is narrowed to float.
/// NOTE(review): the SSE Convert() in this listing loads fCOEFF_0/1/2 rather
/// than these names; presumably those arrays are built from these constants,
/// one row per array -- confirm against the rest of the file.
const float fRRGB24YUVCI2_00 = 0.299;
const float fRRGB24YUVCI2_01 = 0.587;
const float fRRGB24YUVCI2_02 = 0.114;
const float fRRGB24YUVCI2_10 = -0.147;
const float fRRGB24YUVCI2_11 = -0.289;
const float fRRGB24YUVCI2_12 = 0.436;
const float fRRGB24YUVCI2_20 = 0.615;
const float fRRGB24YUVCI2_21 = -0.515;
const float fRRGB24YUVCI2_22 = -0.100;
/// Builds the vector [R, G, B, 0] (lane 0 first) from one packed pel whose
/// bytes are stored in B, G, R order.
static inline __m128 RRGB24YUVCI2_LoadPel(const unsigned char* pel)
{
    // _mm_set_ps takes its arguments from the highest lane down to lane 0.
    return _mm_set_ps(0.0f, (float)pel[0], (float)pel[1], (float)pel[2]);
}

/// Returns lane0 + lane1 + lane2 of v; lane 3 is deliberately ignored.
static inline float RRGB24YUVCI2_SumLanes012(__m128 v)
{
    // Going through memory avoids the MSVC-only .m128_f32 union member.
    float lanes[4];
    _mm_storeu_ps(lanes, v);
    return lanes[0] + lanes[1] + lanes[2];
}

/// Weighted sum of one [R, G, B, 0] pel with a coefficient row, rounded to
/// the nearest integer by adding 0.5 before truncation.
static inline int RRGB24YUVCI2_RoundedDot(__m128 pel, __m128 coeff)
{
    return (int)(0.5f + RRGB24YUVCI2_SumLanes012(_mm_mul_ps(pel, coeff)));
}

/// SSE variant: converts a packed 3-byte-per-pel image (bytes in B, G, R
/// order) into separate Y, U and V planes with 2x2 chroma sub-sampling.
///
/// The image is walked in 2x2 pel blocks (_width >> 1 by _height >> 1, so a
/// trailing odd column or row is never visited). Every pel produces one
/// rounded, range-checked luma sample; the four pels of a block are averaged
/// into one U and one V sample. Rounding follows the scalar implementation:
/// luma adds 0.5 before truncation, chroma adds +/-2 and divides by 4 with
/// truncation toward zero.
///
/// fCOEFF_0, fCOEFF_1 and fCOEFF_2 are defined outside this block; each is
/// loaded as four floats whose lanes 0, 1 and 2 weight R, G and B for the
/// Y, U and V outputs respectively. Lane 3 never contributes to a result.
///
/// @param pRgb  Source pels, rows of _width pels, 3 bytes per pel.
/// @param pY    Destination luma plane (yuvType), _width samples per row.
/// @param pU    Destination U plane (yuvType), one sample per 2x2 block.
/// @param pV    Destination V plane (yuvType), one sample per 2x2 block.
void RealRGB24toYUV420Converter::Convert(void* pRgb, void* pY, void* pU, void* pV)
{
    const __m128 xmm_y = _mm_loadu_ps(fCOEFF_0);
    const __m128 xmm_u = _mm_loadu_ps(fCOEFF_1);
    const __m128 xmm_v = _mm_loadu_ps(fCOEFF_2);

    yuvType* py = (yuvType *)pY;
    yuvType* pu = (yuvType *)pU;
    yuvType* pv = (yuvType *)pV;
    const unsigned char* src = (const unsigned char *)pRgb;

    /// Y have range 0..255, U & V have range -128..127.

    /// Step in 2x2 pel blocks. (4 pels per block).
    const int xBlks = _width >> 1;
    const int yBlks = _height >> 1;
    const int rowBytes = _width*3;

    for(int yb = 0; yb < yBlks; yb++)
    {
        for(int xb = 0; xb < xBlks; xb++)
        {
            const int chrOff = yb*xBlks + xb;
            const int lumOff = (yb*_width + xb) << 1;
            const unsigned char* top = src + lumOff*3;
            const unsigned char* bottom = top + rowBytes;

            const __m128 xmm1 = RRGB24YUVCI2_LoadPel(top);        // Top left.
            const __m128 xmm2 = RRGB24YUVCI2_LoadPel(top + 3);    // Top right.
            const __m128 xmm3 = RRGB24YUVCI2_LoadPel(bottom);     // Bottom left.
            const __m128 xmm4 = RRGB24YUVCI2_LoadPel(bottom + 3); // Bottom right.

            // Luma. The rounded value is computed into a local first so the
            // range-check macro only ever sees a plain int operand.
            const int y1 = RRGB24YUVCI2_RoundedDot(xmm1, xmm_y);
            const int y2 = RRGB24YUVCI2_RoundedDot(xmm2, xmm_y);
            const int y3 = RRGB24YUVCI2_RoundedDot(xmm3, xmm_y);
            const int y4 = RRGB24YUVCI2_RoundedDot(xmm4, xmm_y);
            py[lumOff]              = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y1);
            py[lumOff + 1]          = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y2);
            py[lumOff + _width]     = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y3);
            py[lumOff + _width + 1] = (yuvType)RRGB24YUVCI2_RANGECHECK_0TO255(y4);

            // Chroma. The coefficients are the same for all four pels, so sum
            // the pels first (exact in float: integers no larger than 1020)
            // and multiply once per plane instead of four times.
            const __m128 blockSum = _mm_add_ps(_mm_add_ps(xmm1, xmm2),
                                               _mm_add_ps(xmm3, xmm4));
            const float fU = RRGB24YUVCI2_SumLanes012(_mm_mul_ps(blockSum, xmm_u));
            const float fV = RRGB24YUVCI2_SumLanes012(_mm_mul_ps(blockSum, xmm_v));

            /// Average the 4 chr values.
            int iu = (int)fU;
            int iv = (int)fV;
            iu += (iu < 0) ? -2 : 2;   ///< Rounding.
            iv += (iv < 0) ? -2 : 2;   ///< Rounding.

            // Divide rather than shift: '/' truncates toward zero, which is
            // what the symmetric +/-2 offset above relies on. '>> 2' on a
            // negative value rounds toward minus infinity instead.
            pu[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iu/4) );
            pv[chrOff] = (yuvType)( _chrOff + RRGB24YUVCI2_RANGECHECK_N128TO127(iv/4) );
        }
    }//end for xb & yb...
}
Это одна из моих первых попыток использовать SSE2, так что, возможно, я что-то упустил? К вашему сведению: я работаю на платформе Windows и использую Visual Studio 2008.