Я это исправил! Это ядро не оптимально с точки зрения производительности, но функционально корректно.
Пожалуйста, используйте такие параметры для enqueueNDRange:
// Bind the kernel arguments in the order the kernel declares them:
// (int scale, read_only image2d_t signal, write_only image2d_t upscale).
kernelBicubic.getKernel().setArgs(scaleFactor, inImage, imageOut);
// Launch a 2-D range of (width + 1) x (height + 1) work-items. The kernel
// subtracts 2 from each global id, so the extra work-item per dimension lets
// the 4x4 sample window slide from two texels before the image to its far edge.
// NOTE(review): lastEvent is presumably the wait-list/chaining event of the
// host binding -- confirm against the binding's enqueueNDRange documentation.
lastEvent=kernelBicubic.getKernel().enqueueNDRange(queue,
new int[]{(int) inImage.getWidth()+1,(int) inImage.getHeight()+1},lastEvent);
Код ядра:
/*
 * Sampler used for every texel fetch in this program.
 * The helpers read with integer (int2) coordinates; for such reads the OpenCL
 * specification only defines the result when the sampler uses unnormalized
 * coordinates and CLK_FILTER_NEAREST, so nearest filtering is required here
 * (the cubic interpolation itself is done in the kernel code).
 * CLK_ADDRESS_CLAMP_TO_EDGE makes reads outside the image return the border
 * texel, which the kernel relies on for its edge windows.
 */
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP_TO_EDGE;

/*
 * Catmull-Rom basis matrix, stored row-major as 4x4.
 * Row r holds the weights applied to four consecutive samples to obtain the
 * coefficient of t^(3-r) of the interpolating cubic (row 3 = constant term).
 * Program-scope data must live in the __constant address space to be portable
 * across OpenCL 1.x compilers.
 */
__constant float CATMULL_ROM[16] = {
    -0.5F,  1.5F, -1.5F,  0.5F,
     1.0F, -2.5F,  2.0F, -0.5F,
    -0.5F,  0.0F,  0.5F,  0.0F,
     0.0F,  1.0F,  0.0F,  0.0F
};
/*
 * Horizontal pass of the coefficient computation.
 *
 * Reads the first channel (.x) of the four horizontally adjacent texels
 * (x, y+i) .. (x+3, y+i) and combines them with row j of CATMULL_ROM.
 *
 * signal : source image
 * x, y   : top-left texel of the 4x4 sample window
 * i      : row offset inside the window (0..3)
 * j      : CATMULL_ROM row to apply (0..3)
 * returns: weighted sum of the four texels of window row i
 */
inline float calcT(image2d_t signal,int x,int y,int i,int j){
    return read_imagef(signal, sampler, (int2)(x    , y + i)).x * CATMULL_ROM[4*j    ]
         + read_imagef(signal, sampler, (int2)(x + 1, y + i)).x * CATMULL_ROM[4*j + 1]
         + read_imagef(signal, sampler, (int2)(x + 2, y + i)).x * CATMULL_ROM[4*j + 2]
         + read_imagef(signal, sampler, (int2)(x + 3, y + i)).x * CATMULL_ROM[4*j + 3];
}
/*
 * Vertical pass of the coefficient computation.
 *
 * Combines the four horizontal results calcT(signal, x, y, r, j), r = 0..3,
 * with row i of CATMULL_ROM, yielding one of the 16 coefficients of the
 * bicubic patch whose 4x4 sample window starts at texel (x, y).
 *
 * signal : source image
 * x, y   : top-left texel of the 4x4 sample window
 * i      : CATMULL_ROM row used for the vertical weights (0..3)
 * j      : CATMULL_ROM row forwarded to calcT for the horizontal weights (0..3)
 */
inline float C(image2d_t signal,int x,int y,int i,int j){
    const int rowBase = 4 * i;
    /* Seed with the first term so the summation order matches a plain
       left-to-right sum of the four products. */
    float acc = CATMULL_ROM[rowBase] * calcT(signal, x, y, 0, j);
    for (int r = 1; r < 4; ++r) {
        acc += CATMULL_ROM[rowBase + r] * calcT(signal, x, y, r, j);
    }
    return acc;
}
/* Evaluates the cubic a*t^3 + b*t^2 + c*t + d in Horner form. */
inline float cubicHorner(float a, float b, float c, float d, float t) {
    return t * (t * (t * a + b) + c) + d;
}

/*
 * Upscales `signal` by an integer factor using a bicubic patch per work-item.
 *
 * Each work-item takes the 4x4 texel window whose top-left corner is
 * (get_global_id(0) - 2, get_global_id(1) - 2), derives the 16 coefficients of
 * the patch via C(), and fills the scale x scale tile of `upscale` whose
 * origin is floor(scale * (x + 1.5)), floor(scale * (y + 1.5)).
 *
 * scale   : integer magnification factor; values <= 0 write nothing
 * signal  : source image (only its first channel is sampled by the helpers)
 * upscale : destination image; the interpolated value is replicated into all
 *           four channels
 *
 * Work-items at the image borders own tiles that extend beyond the
 * destination; those pixels are skipped because write_imagef with
 * out-of-range coordinates is undefined.
 */
__kernel void bicubicUpscale(int scale,read_only image2d_t signal, write_only image2d_t upscale) {
    const int x = (int)get_global_id(0) - 2;
    const int y = (int)get_global_id(1) - 2;

    /* coef[4*r + c] multiplies ty^(3-r) * tx^(3-c). */
    float coef[16];
    for (int r = 0; r < 4; ++r) {
        for (int c = 0; c < 4; ++c) {
            coef[4 * r + c] = C(signal, x, y, r, c);
        }
    }

    /* Floor once so that negative tile origins do not collapse two samples
       onto pixel 0 the way per-pixel truncation toward zero would. */
    const int baseX = (int)floor(scale * 1.5F + x * scale);
    const int baseY = (int)floor(scale * 1.5F + y * scale);

    const int outW = get_image_width(upscale);
    const int outH = get_image_height(upscale);

    for (int i = 0; i < scale; i++)
    {
        const int outY = baseY + i;
        if (outY < 0 || outY >= outH) {
            continue;
        }
        /* Vertical position inside the patch, in [0, 1). */
        const float ty = (float)i / (float)scale;

        for (int j = 0; j < scale; j++)
        {
            const int outX = baseX + j;
            if (outX < 0 || outX >= outW) {
                continue;
            }
            /* Horizontal position inside the patch, in [0, 1). */
            const float tx = (float)j / (float)scale;

            /* Collapse each coefficient row along x, then combine along y. */
            const float r0 = cubicHorner(coef[0],  coef[1],  coef[2],  coef[3],  tx);
            const float r1 = cubicHorner(coef[4],  coef[5],  coef[6],  coef[7],  tx);
            const float r2 = cubicHorner(coef[8],  coef[9],  coef[10], coef[11], tx);
            const float r3 = cubicHorner(coef[12], coef[13], coef[14], coef[15], tx);
            const float val = cubicHorner(r0, r1, r2, r3, ty);

            write_imagef(upscale, (int2)(outX, outY), (float4)(val));
        }
    }
}