RK3588使用openCL

一、OpenCL简介

    OpenCL（全称Open Computing Language，开放运算语言）是第一个面向异构系统通用目的并行编程的开放式、免费标准，也是一个统一的编程环境，便于软件开发人员为高性能计算服务器、桌面计算系统、手持设备编写高效轻便的代码，而且广泛适用于多核心处理器(CPU)、图形处理器(GPU)、Cell类型架构以及数字信号处理器（DSP)等其他并行处理器，在游戏、娱乐、科研、医疗等各种领域都有广阔的发展前景。（抄自百度百科）

    简单地理解，OpenCL既是一种规范，也是一门语言：使用它可以调用GPU、FPGA、CPU等处理器来运行代码，而这些代码就是用OpenCL语言（在C语言的基础上增加了一些特性）编写的。本文使用OpenCL调用GPU运行代码。

二、rk3588搭建openCL环境

    笔者使用的rk3588固件是RK官网的ubuntu固件，名字为：ROC-RK3588S-PC_Ubuntu20.04-Gnome-r2202_v1.0.4b_221118.7z。使用官方提供的下载工具 RKDevTool_Release_v2.84 把固件下载到板子里面。

    在终端里面使用find指令，可以看到openCL对应的库文件和头文件。

笔者的板子里面有多个OpenCL库。Windows下面指定的库名字是OpenCL，但我这里实际用到的OpenCL库名字是mali（链接选项为-lmali）。笔者使用makefile编译程序，在makefile里面添加对应的库路径和库名字即可（g++可以直接找到OpenCL的头文件）。


# Library to link for OpenCL. Per the surrounding article, on this board the
# OpenCL entry points come from the Mali driver library (-lmali), not -lOpenCL.
OPENCL_LDLIBS = -lmali
# Linker search path for the library named above.
OPENCL_LDLIBS_PATH = -L/usr/lib/aarch64-linux-gnu

三、使用OpenCL

    OpenCL的环境还是比较容易搭建的。下面用一个简单的demo展示一下rk3588上的Mali-G610 GPU的效果。

    之前做图像处理时，写了灰度世界算法（自动白平衡算法的一种），功能就是对摄像头的图像进行白平衡处理，原理请参考其他的博客。代码写得比较简单，用OpenCV的库打开图像，CPU版本代码如下。

/*
    Gray-world automatic white balance (CPU reference implementation).

    Each channel is scaled by (overall mean / channel mean) so that the three
    channel means become equal; results above 255 are clamped to 255.

    dst_img_buffer: output image, packed 24-bit BGR, img_w * img_h * 3 bytes
    src_img_buffer: input image in the same layout
    img_w:  image width in pixels
    img_h:  image height in pixels
    No return value. The call does nothing when either buffer is NULL or the
    image is empty. dst_img_buffer may be the same buffer as src_img_buffer.
*/
void GrayWorldMethod(unsigned char* dst_img_buffer, const unsigned char* src_img_buffer, const int img_w, const int img_h)
{
    /* An empty image would make the pixel count zero and the means 0/0. */
    if (dst_img_buffer == NULL || src_img_buffer == NULL || img_w <= 0 || img_h <= 0) {
        return;
    }

    /* Multiply in size_t so large images cannot overflow int. */
    const size_t pixel_count = (size_t)img_w * (size_t)img_h;

    /* Pass 1: per-channel sums, read straight from the packed BGR input
       (no temporary channel planes are needed). */
    unsigned long long sum_r = 0, sum_g = 0, sum_b = 0;
    for (size_t i = 0; i < pixel_count; ++i) {
        const unsigned char *px = src_img_buffer + i * 3;
        sum_b += px[0];
        sum_g += px[1];
        sum_r += px[2];
    }

    const float mean_r = sum_r * 1.0 / pixel_count;
    const float mean_g = sum_g * 1.0 / pixel_count;
    const float mean_b = sum_b * 1.0 / pixel_count;

    /* Per-channel gains. A channel whose mean is 0 contains only zeros, so a
       gain of 0 keeps it at 0 and avoids dividing by zero. */
    const float gray = (mean_r + mean_g + mean_b) / 3.0f;
    const float gain_r = mean_r > 0.0f ? gray / mean_r : 0.0f;
    const float gain_g = mean_g > 0.0f ? gray / mean_g : 0.0f;
    const float gain_b = mean_b > 0.0f ? gray / mean_b : 0.0f;

    /* Pass 2: scale and clamp. All three components of a pixel are read
       before any is written, so in-place operation stays correct. */
    for (size_t i = 0; i < pixel_count; ++i) {
        const unsigned char *in = src_img_buffer + i * 3;
        unsigned char *out = dst_img_buffer + i * 3;

        const int b = (int)(in[0] * gain_b);
        const int g = (int)(in[1] * gain_g);
        const int r = (int)(in[2] * gain_r);

        out[0] = (unsigned char)(b > 255 ? 255 : b);
        out[1] = (unsigned char)(g > 255 ? 255 : g);
        out[2] = (unsigned char)(r > 255 ? 255 : r);
    }
}

GPU版本代码如下。参数含义参考CPU版本。

// Per-channel running sums, written by MeanRGB and read by GrayWorldMethod.
// NOTE(review): nothing in the visible code resets these between launches, so
// repeated frames would keep accumulating (and eventually wrap the 32-bit
// counters) -- confirm how the host re-initialises them.
unsigned int m_R = 0, m_G = 0, m_B = 0;

/*
    Add one pixel's R, G and B values to the shared channel sums.

    Each work-item handles the pixel at column get_global_id(0) and row
    get_global_id(1) of a packed 3-byte-per-pixel image (byte order B, G, R).

    src_img_buffer: input image
    img_w:  image width in pixels (row stride is img_w * 3 bytes)

    NOTE(review): there is no bounds check; this presumably relies on the host
    launching exactly width x height work-items -- confirm.
*/
__kernel void MeanRGB(
    __global unsigned char* src_img_buffer,
    const int img_w)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    // First byte (blue) of this work-item's pixel.
    __global unsigned char* px = src_img_buffer + (row * img_w + col) * 3;

    const int red   = px[2];
    const int green = px[1];
    const int blue  = px[0];

    // Many work-items update the same counters, so the additions are atomic.
    atomic_add(&m_R, red);
    atomic_add(&m_G, green);
    atomic_add(&m_B, blue);
}

/*
    Gray-world white balance, one work-item per pixel.

    Reads the channel sums from the program-scope variables m_R / m_G / m_B
    (filled by the MeanRGB kernel, which must have finished beforehand),
    derives the per-channel gains, and writes the scaled pixel, clamped to 255.

    dst_img_buffer: output image, packed 3 bytes per pixel (B, G, R)
    src_img_buffer: input image in the same layout
    img_w:  image width in pixels
    img_h:  image height in pixels

    Arithmetic is done in float, like the CPU reference, so the kernel does
    not depend on optional double-precision support.
*/
__kernel void GrayWorldMethod(
    __global unsigned char* dst_img_buffer,
    __global const unsigned char* src_img_buffer,
    const int img_w,
    const int img_h)
{
    const int w = get_global_id(0);
    const int h = get_global_id(1);

    // Skip work-items outside the image (e.g. a rounded-up global size);
    // this also rules out a zero pixel count below.
    if (w < 0 || h < 0 || w >= img_w || h >= img_h)
        return;

    const float img_size = (float)img_w * (float)img_h;

    // Channel means from the accumulated sums.
    const float R = (float)m_R / img_size;
    const float G = (float)m_G / img_size;
    const float B = (float)m_B / img_size;

    // Per-channel gains; a channel whose sum is 0 gets gain 0 rather than a
    // division by zero.
    const float K  = (R + G + B) / 3.0f;
    const float Kr = R > 0.0f ? K / R : 0.0f;
    const float Kg = G > 0.0f ? K / G : 0.0f;
    const float Kb = B > 0.0f ? K / B : 0.0f;

    // Offset of this pixel's first (blue) byte.
    const int idx = (h * img_w + w) * 3;

    // Scale each channel and clamp to the 8-bit range before storing.
    const int Tr = (int)(src_img_buffer[idx + 2] * Kr);
    const int Tg = (int)(src_img_buffer[idx + 1] * Kg);
    const int Tb = (int)(src_img_buffer[idx + 0] * Kb);

    dst_img_buffer[idx + 2] = Tr > 255 ? 255 : Tr;
    dst_img_buffer[idx + 1] = Tg > 255 ? 255 : Tg;
    dst_img_buffer[idx + 0] = Tb > 255 ? 255 : Tb;
}

OpenCL参考代码。

// Host-side fragment: run both kernels over the image, then read back the result.
// status, cmd_queue, the two kernel objects, memObjects, img_size_byte,
// src_img_w/src_img_h and dst_img_buffer are declared outside this excerpt.
size_t global[2];
cl_event prof_event;
global[0] = (size_t)src_img_w;  // one work-item per pixel: global size = image width x height
global[1] = (size_t)src_img_h;

// Pass 1: accumulate the channel sums. The local work size is NULL, leaving the
// work-group size to the OpenCL runtime.
status = clEnqueueNDRangeKernel(cmd_queue, MeanRGB_kernel, 2, NULL, global, NULL, 0, NULL, &prof_event);  // launch on the GPU
if (status)
    cout << "执行内核时错误" << endl;
// Wait for the sums to be complete before the second kernel reads them.
clFinish(cmd_queue);
// Pass 2: apply the gains.
// NOTE(review): prof_event is overwritten here without a visible clReleaseEvent
// for the first event -- looks like a leaked event handle; confirm.
status = clEnqueueNDRangeKernel(cmd_queue, GrayWorldMethod_kernel, 2, NULL, global, NULL, 0, NULL, &prof_event);  // launch on the GPU
if (status)
    cout << "执行内核时错误" << endl;
clFinish(cmd_queue);

// Blocking read (CL_TRUE) of img_size_byte bytes from memObjects[0] into host memory.
status = clEnqueueReadBuffer(cmd_queue, memObjects[0], CL_TRUE, 0, img_size_byte, dst_img_buffer, 0, NULL, NULL);// copy the data back to host memory
// NOTE(review): perror() reports errno, which OpenCL calls presumably do not
// set; printing the status code would be more informative -- confirm.
if (status)
    perror("读回数据的时候发生错误\n");

这里创建了两个内核函数：先把图像数据传输到显存，再依次执行这两个内核，最后把结果读回主机内存。结果如图。

图像的分辨率为1024*768，CPU执行时间是10.6ms，GPU用了1.59ms。执行时间取十次运行的平均值，速度还是有非常大的提升的。图像处理结果如下。

原始图像

           灰度世界算法处理之后

标签： linux 图像处理嵌入式硬件

本文转载自: https://blog.csdn.net/zichuanning520/article/details/129411886
版权归原作者 紫川宁520 所有，如有侵权，请联系我们删除。

发表评论

“RK3588使用openCL”的评论:

关于作者

overfit同步小助手

相关阅读

文章导航