CSDN首页>

基于OpenCL的mean filter性能

发表于2012-09-06 13:39| 来源未知| 作者csdn

摘要:对于一个标准的3*3 均值滤波,kernel代码如下:使用buffer/image缓冲对象

1.对于一个标准的3*3 均值滤波,kernel代码如下:

使用buffer/image缓冲对象

/*
 * Mean (box) filter over a buffer of RGBA pixels.
 *
 * Each work-item produces one output pixel. The image width and height are
 * taken from the 2-D global work size, so the NDRange is expected to match
 * the image dimensions.
 *
 * inputImage  - source pixels, row-major, one uchar4 per pixel
 * outputImage - destination pixels, same layout as inputImage
 * N           - requested window edge length; the radius is k = (N-1)/2 and
 *               the window actually sampled is (2k+1) x (2k+1)
 *
 * Pixels closer than k to any image edge are copied through unfiltered.
 */
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int width = get_global_size(0);
    int height = get_global_size(1);

    int k = (N - 1) / 2;

    /* Border pixels do not have a full window: pass them through. */
    if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
    {
        outputImage[x + y * width] = inputImage[x + y * width];
        return;
    }

    /*
     * Divide by the number of samples actually summed. For odd N this equals
     * N*N; for even N the window is only (N-1) x (N-1), so dividing by N*N
     * would darken the result.
     */
    uint side = 2u * (uint)k + 1u;
    uint count = side * side;

    /* Accumulate in 32 bits per channel so the sum cannot overflow a uchar. */
    uint4 sum = (uint4)(0);

    for (int j = y - k; j <= y + k; j++)
    {
        for (int i = x - k; i <= x + k; i++)
        {
            sum += convert_uint4(inputImage[i + j * width]);
        }
    }

    outputImage[x + y * width] = convert_uchar4(sum / count);
}

/*
 * Mean (box) filter over an image object.
 *
 * Each work-item produces one output pixel. The image width and height are
 * taken from the 2-D global work size, so the NDRange is expected to match
 * the image dimensions.
 *
 * inputImage  - source image, read with read_imageui
 * outputImage - destination image, written with write_imageui
 * N           - requested window edge length; the radius is k = (N-1)/2 and
 *               the window actually sampled is (2k+1) x (2k+1)
 *
 * Pixels closer than k to any image edge are copied through unfiltered.
 *
 * NOTE: imageSampler is referenced but not declared in this kernel; it must
 * be defined elsewhere in the program source.
 */
__kernel void filterImg( image2d_t inputImage, __write_only image2d_t outputImage, uint N)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int width = get_global_size(0);
    int height = get_global_size(1);

    int k = (N - 1) / 2;

    /* Border pixels do not have a full window: pass them through. */
    if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
    {
        uint4 center = read_imageui(inputImage, imageSampler, (int2)(x, y));
        write_imageui(outputImage, (int2)(x, y), center);
        return;
    }

    /*
     * Divide by the number of samples actually summed. For odd N this equals
     * N*N; for even N the window is only (N-1) x (N-1), so dividing by N*N
     * would darken the result.
     */
    uint side = 2u * (uint)k + 1u;
    uint count = side * side;

    /* Sum every pixel of the (2k+1) x (2k+1) window centred on (x, y). */
    uint4 sum = (uint4)(0);

    for (int j = y - k; j <= y + k; j++)
    {
        for (int i = x - k; i <= x + k; i++)
        {
            sum += read_imageui(inputImage, imageSampler, (int2)(i, j));
        }
    }

    write_imageui(outputImage, (int2)(x, y), sum / count);
}

对一个2048*2048的图像执行filter操作

image

image

image

image

global work size = {2048, 2048, 1},group work size = {16, 16}(即每个work-group有256个work-item)。一般group work size应该为64的倍数,因为对于AMD显卡,wavefront(由64个work-item组成)是基本的硬件线程调度单位。

使用了6个GPRs,没有使用ScratchRegs。ScratchRegs是指用video memory(显存)来模拟GPR,这样做会使线程执行的速度大大降低,因此应尽量减少ScratchRegs的数量。

可以看到,使用image对象kernel执行时间要短,但奇怪的是各项性能参数都是buffer对象领先,除了alu busy和alu指令数目。

改为下面的kernel代码,性能会有所提高

/*
 * 3x3 mean filter over a buffer of RGBA pixels, with the window written out
 * by hand instead of looped.
 *
 * Each work-item produces one output pixel; the image dimensions are taken
 * from the 2-D global work size.
 *
 * inputImage  - source pixels, row-major, one uchar4 per pixel
 * outputImage - destination pixels, same layout as inputImage
 * N           - unused here; the window is fixed at 3x3
 *
 * The outermost one-pixel border is copied through unfiltered.
 */
__kernel void filter(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const int cols = get_global_size(0);
    const int rows = get_global_size(1);

    /* Linear index of this work-item's pixel. */
    const int here = col + row * cols;

    const bool interior = (col >= 1) && (row >= 1) &&
                          (col <= cols - 2) && (row <= rows - 2);
    if (!interior)
    {
        outputImage[here] = inputImage[here];
        return;
    }

    /* Linear indices of the pixels directly above and below. */
    const int up = here - cols;
    const int down = here + cols;

    /* Sum the nine neighbours in 32 bits per channel. */
    const uint4 total =
          convert_uint4(inputImage[up - 1])
        + convert_uint4(inputImage[up])
        + convert_uint4(inputImage[up + 1])
        + convert_uint4(inputImage[here - 1])
        + convert_uint4(inputImage[here])
        + convert_uint4(inputImage[here + 1])
        + convert_uint4(inputImage[down - 1])
        + convert_uint4(inputImage[down])
        + convert_uint4(inputImage[down + 1]);

    outputImage[here] = convert_uchar4(total / 9);
}

/*
 * Mean (box) filter over a buffer of RGBA pixels, generic window size.
 *
 * Each work-item produces one output pixel. The image width and height are
 * taken from the 2-D global work size, so the NDRange is expected to match
 * the image dimensions.
 *
 * inputImage  - source pixels, row-major, one uchar4 per pixel
 * outputImage - destination pixels, same layout as inputImage
 * N           - requested window edge length; the radius is k = (N-1)/2 and
 *               the window actually sampled is (2k+1) x (2k+1)
 *
 * Pixels closer than k to any image edge are copied through unfiltered.
 */
__kernel void filter1(__global uchar4* inputImage, __global uchar4* outputImage, uint N)
{
    int x = get_global_id(0);
    int y = get_global_id(1);
    int width = get_global_size(0);
    int height = get_global_size(1);

    int k = (N - 1) / 2;

    /* Border pixels do not have a full window: pass them through. */
    if (x < k || y < k || x > width - k - 1 || y > height - k - 1)
    {
        outputImage[x + y * width] = inputImage[x + y * width];
        return;
    }

    /*
     * Divide by the number of samples actually summed. For odd N this equals
     * N*N; for even N the window is only (N-1) x (N-1), so dividing by N*N
     * would darken the result.
     */
    uint side = 2u * (uint)k + 1u;
    uint count = side * side;

    /* Accumulate in 32 bits per channel so the sum cannot overflow a uchar. */
    uint4 sum = (uint4)(0);

    for (int j = y - k; j <= y + k; j++)
    {
        for (int i = x - k; i <= x + k; i++)
        {
            sum += convert_uint4(inputImage[i + j * width]);
        }
    }

    outputImage[x + y * width] = convert_uchar4(sum / count);
}

/*
 * 3x3 mean filter over an image object, with the window written out by hand
 * instead of looped.
 *
 * Each work-item produces one output pixel; the image dimensions are taken
 * from the 2-D global work size.
 *
 * inputImage  - source image, read with read_imageui
 * outputImage - destination image, written with write_imageui
 * N           - unused here; the window is fixed at 3x3
 *
 * The outermost one-pixel border is copied through unfiltered.
 *
 * NOTE: imageSampler is referenced but not declared in this kernel; it must
 * be defined elsewhere in the program source.
 */
__kernel void filterImg( image2d_t inputImage, __write_only image2d_t outputImage, uint N)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);
    const int cols = get_global_size(0);
    const int rows = get_global_size(1);

    const int2 pos = (int2)(col, row);

    const bool interior = (col >= 1) && (row >= 1) &&
                          (col <= cols - 2) && (row <= rows - 2);
    if (!interior)
    {
        write_imageui(outputImage, pos, read_imageui(inputImage, imageSampler, pos));
        return;
    }

    /* Sum the nine neighbours, top row first, left to right. */
    uint4 total = (uint4)(0);
    total += read_imageui(inputImage, imageSampler, pos + (int2)(-1, -1));
    total += read_imageui(inputImage, imageSampler, pos + (int2)( 0, -1));
    total += read_imageui(inputImage, imageSampler, pos + (int2)( 1, -1));
    total += read_imageui(inputImage, imageSampler, pos + (int2)(-1,  0));
    total += read_imageui(inputImage, imageSampler, pos);
    total += read_imageui(inputImage, imageSampler, pos + (int2)( 1,  0));
    total += read_imageui(inputImage, imageSampler, pos + (int2)(-1,  1));
    total += read_imageui(inputImage, imageSampler, pos + (int2)( 0,  1));
    total += read_imageui(inputImage, imageSampler, pos + (int2)( 1,  1));

    write_imageui(outputImage, pos, total / 9);
}

image

image

image

image

原文作者:迈克老狼

0
0
基于OpenCL的mean filter性能
  • CSDN官方微信
  • 扫描二维码,向CSDN吐槽
  • 微信号:CSDNnews
程序员移动端订阅下载

微博关注

相关热门文章