图像处理用openmp_gdal使用openmp并行读图像 - CSDN
精华内容
参与话题
  • 大部分图像处理都是串行的(即该函数的输入来自于另一个函数的输出),OpenMP只能适用于图像独立处理的场合,比如对某文件夹中的图片进行相同的图像增强处理,或这里要说的对两张图像分别进行特征提取。 OpenCV中...

            大部分图像处理都是串行的(即该函数的输入来自于另一个函数的输出),OpenMP只能适用于图像独立处理的场合,比如对某文件夹中的图片进行相同的图像增强处理,或这里要说的对两张图像分别进行特征提取。

            OpenCV中使用Sift或者Surf特征进行图像拼接的算法,需要分别对两幅或多幅图像进行特征提取和特征描述,之后再进行图像特征点的配对,图像变换等操作。不同图像的特征提取和描述的工作是整个过程中最耗费时间的,也是独立 运行的,可以使用OpenMP进行加速。

            将#pragma omp注释掉即可对比没有采用OpenMP时的运行效率,在该程序中相当于是两个线程分别执行两幅图像的特征提取和描述操作。使用OpenMP后速度差不多提升了一倍。

    #include "highgui/highgui.hpp"      
    #include "opencv2/nonfree/nonfree.hpp"      
    #include "opencv2/legacy/legacy.hpp"     
    #include "omp.h"  
    
    using namespace cv;
    
    //计算原始图像点位在经过矩阵变换后在目标图像上对应位置    
    Point2f getTransformPoint(const Point2f originalPoint, const Mat &transformMaxtri);
    
    int main(int argc, char *argv[])
    {
    	float startTime = omp_get_wtime();
    
    	Mat image01, image02;
    	Mat image1, image2;
    	vector<KeyPoint> keyPoint1, keyPoint2;
    	Mat imageDesc1, imageDesc2;
    	SiftFeatureDetector siftDetector(800);  // 海塞矩阵阈值    
    	SiftDescriptorExtrwww.tt951.comactor siftDescriptor;
    	//使用OpenMP的sections制导指令开启多线程  
    #pragma omp parallel sections    
    	{
    #pragma omp section    
    		{
    			image01 = imread("Test01.jpg");
    			imshow("拼接图像1", image01);
    			//灰度图转换   
    			cvtColor(image01, image1, CV_RGB2GRAY);
    			//提取特征点    
    			siftDetector.detect(image1, keyPoint1);
    			//特征点描述,为下边的特征点匹配做准备      
    			siftDescriptor.compute(image1, keyPoint1, imageDesc1);
    		}
    #pragma omp section    
    		{
    			image02 = imread("Test02.jpg");
    			imshow("拼接图像2", image02);
    			cvtColor(image02, image2, CV_RGB2GRAY);
    			siftDetector.detect(image2, keyPoint2);
    			siftDescriptor.compute(image2, keyPoint2, imageDesc2);
    		}
    	}
    	float endTime = omp_get_wtime();
    	std::cout << "使用OpenMP加速消耗时间: " << endTime - startTime << std::endl;
    
    	//获得匹配特征点,并提取最优配对       
    	FlannBasedMatcher matcher;
    	vector<DMatch> matchePoints;
    	matcher.match(imageDesc1, imageDesc2, matchePoints, Mat());
    	sort(matchePoints.begin(), matchePoints.end()); //特征点排序      
    	//获取排在前N个的最优匹配特征点    
    	vector<Point2f> imagePoints1, imagePoints2;
    	for (int i = 0; i < 10; i++)
    	{
    		imagePoints1.push_back(keyPoint1[matchePoints[i].queryIdx].pt);
    		imagePoints2.push_back(keyPoint2[matchePoints[i].trainIdx].pt);
    	}
    
    	//获取图像1到图像2的投影映射矩阵,尺寸为3*3    
    	Mat homo = findHomography(imagePoints1, imagePoints2, CV_RANSAC);
    	Mat adjustMat = (Mat_<double>(3, 3) << 1.0, 0, image01.cols, 0, 1.0, 0, 0, 0, 1.0);
    	Mat adjustHomo = adjustMat * homo;
    
    	//获取最强配对点在原始图像和矩阵变换后图像上的对应位置,用于图像拼接点的定位    
    	Point2f originalLinkPoint, targetLinkPoint, basedImagePoint;
    	originalLinkPoint = keyPoint1[matchePoints[0].queryIdx].pt;
    	targetLinkPoint = getTransformPoint(originalLinkPoint, adjustHomo);
    	basedImagePoint = keyPoint2[matchePoints[0].trainIdx].pt;
    
    	//图像配准    
    	Mat imageTransform1;
    	warpPerspective(image01, imageTransform1, adjustMat*homo, Size(image02.cols + image01.cols + 110, image02.rows));
    
    	//在最强匹配点左侧的重叠区域进行累加,是衔接稳定过渡,消除突变    
    	Mat image1Overlap, image2Overlap; //图1和图2的重叠部分       
    	image1Overlap = imageTransform1(Rect(Point(targetLinkPoint.x - basedImagePoint.x, 0), Point(targetLinkPoint.x, image02.rows)));
    	image2Overlap = image02(Rect(0, 0, image1Overlap.cols, image1Overlap.rows));
    	Mat image1ROICopy = image1Overlap.clone();  //复制一份图1的重叠部分   
    	for (int i = 0; i < image1Overlap.rows; i++)
    	{
    		for (int j = 0; j < image1Overlap.cols; j++)
    		{
    			double weight;
    			weight = (double)j / image1Overlap.cols;  //随距离改变而改变的叠加系数    
    			image1Overlap.at<Vec3b>(i, j)[0] = (1 - weight)*image1ROICopy.at<Vec3b>(i, j)[0] + weight * image2Overlap.at<Vec3b>(i, j)[0];
    			image1Overlap.at<Vec3b>(i, j)[1] = (1 - weight)*image1ROICopy.at<Vec3b>(i, j)[1] + weight * image2Overlap.at<Vec3b>(i, j)[1];
    			image1Overlap.at<Vec3b>(i, j)[2] = (1 - weight)*image1ROICopy.at<Vec3b>(i, j)[2] + weight * image2Overlap.at<Vec3b>(i, j)[2];
    		}
    	}
    	Mat ROIMat = image02(Rect(Point(image1Overlap.cols, 0), Point(image02.cols, image02.rows)));  //图2中不重合的部分    
    	ROIMat.copyTo(Mat(imageTransform1, Rect(targetLinkPoint.x, 0, ROIMat.cols, image02.rows))); //不重合的部分直接衔接上去    
    	namedWindow("拼接结果", 0);
    	imshow("拼接结果", imageTransform1);
    	imwrite("D:\\拼接结果.jpg", imageTransform1);
    	waitKey();
    	return 0;
    }
    
    // Map a source-image point through a 3x3 homography (homogeneous coordinates).
    Point2f getTransformPoint(const Point2f originalPoint, const Mat &transformMaxtri)
    {
    	Mat originelP, targetP;
    	originelP = (Mat_<double>(3, 1) << originalPoint.x, originalPoint.y, 1.0);
    	targetP = transformMaxtri*originelP; // fixed: identifier was corrupted by a spam URL
    	// Perspective divide by the homogeneous coordinate w.
    	float x = targetP.at<double>(0, 0) / targetP.at<double>(2, 0);
    	float y = targetP.at<double>(1, 0) / targetP.at<double>(2, 0);
    	return Point2f(x, y);
    }

     

    展开全文
  • 本文首发于个人博客... speed up opencv image processing with openmp Series Part 1: compile opencv on ubuntu 16.04 Part 2: compile opencv with CUDA support on windows 10 Pa...

    本文首发于个人博客https://kezunlin.me/post/7a6ba82e/,欢迎阅读!

    speed up opencv image processing with openmp

    Series

    Guide

    config

    • linux/window: cmake with CXX_FLAGS=-fopenmp
    • window VS: VS also supports openmp, C/C++ | Language | /openmp

    usage

    #include <omp.h>
    
    #pragma omp parallel for
        for loop ...

    code

    #include <iostream>
    #include <omp.h>
    
    int main()
    {
        // Request a 4-thread team, then split the 8 iterations among them.
        omp_set_num_threads(4);
    #pragma omp parallel for
        for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped ("i  ")
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }
        printf("\n");    
    
        return 0;
    }
    
    /*
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    i = 4, I am Thread 2
    i = 5, I am Thread 2
    i = 6, I am Thread 3
    i = 7, I am Thread 3
    i = 2, I am Thread 1
    i = 3, I am Thread 1
    */
    

    CMakeLists.txt

    use CXX_FLAGS=-fopenmp in CMakeLists.txt

    cmake_minimum_required(VERSION 3.0.0)
    
    project(hello)
    
    # Locate the compiler's OpenMP support and append its flags globally.
    find_package(OpenMP REQUIRED)
    if(OPENMP_FOUND)
        message("OPENMP FOUND")
    
        message([main] " OpenMP_C_FLAGS=${OpenMP_C_FLAGS}") # -fopenmp
        message([main] " OpenMP_CXX_FLAGS=${OpenMP_CXX_FLAGS}") # -fopenmp (fixed stray '}')
        message([main] " OpenMP_EXE_LINKER_FLAGS=${OpenMP_EXE_LINKER_FLAGS}") # ***
    
        # no use for xxx_INCLUDE_DIRS and xxx_libraries for OpenMP
        message([main] " OpenMP_INCLUDE_DIRS=${OpenMP_INCLUDE_DIRS}") # ***
        message([main] " OpenMP_LIBRARIES=${OpenMP_LIBRARIES}") # ***
    
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
    endif()
    
    add_executable(hello hello.cpp)
    #target_link_libraries(hello xxx)

    VS project options: C/C++ | Language | OpenMP Support (/openmp)

    or use g++ hello.cpp -fopenmp to compile

    view demo

    list dynamic dependencies (ldd)

        ldd hello 
            linux-vdso.so.1 =>  (0x00007ffd71365000)
            libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f8ea7f00000)
            libgomp.so.1 => /usr/lib/x86_64-linux-gnu/libgomp.so.1 (0x00007f8ea7cde000)
            libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f8ea7914000)
            libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f8ea760b000)
            /lib64/ld-linux-x86-64.so.2 (0x00007f8ea8282000)
            libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f8ea73f5000)
            libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f8ea71f1000)
            libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f8ea6fd4000)

    libgomp.so.1 => /usr/lib/x86_64-linux-gnu/libgomp.so.1

    list names (nm)

        nm hello 
        0000000000602080 B __bss_start
        0000000000602190 b completed.7594
                         U __cxa_atexit@@GLIBC_2.2.5
        0000000000602070 D __data_start
        0000000000602070 W data_start
        0000000000400b00 t deregister_tm_clones
        0000000000400b80 t __do_global_dtors_aux
        0000000000601df8 t __do_global_dtors_aux_fini_array_entry
        0000000000602078 d __dso_handle
        0000000000601e08 d _DYNAMIC
        0000000000602080 D _edata
        0000000000602198 B _end
        0000000000400d44 T _fini
        0000000000400ba0 t frame_dummy
        0000000000601de8 t __frame_dummy_init_array_entry
        0000000000400f18 r __FRAME_END__
        0000000000602000 d _GLOBAL_OFFSET_TABLE_
        0000000000400c28 t _GLOBAL__sub_I_main
                         w __gmon_start__
        0000000000400d54 r __GNU_EH_FRAME_HDR
                         U GOMP_parallel@@GOMP_4.0
                         U __gxx_personality_v0@@CXXABI_1.3
        00000000004009e0 T _init
        0000000000601df8 t __init_array_end
        0000000000601de8 t __init_array_start
        0000000000400d50 R _IO_stdin_used
                         w _ITM_deregisterTMCloneTable
                         w _ITM_registerTMCloneTable
        0000000000601e00 d __JCR_END__
        0000000000601e00 d __JCR_LIST__
                         w _Jv_RegisterClasses
        0000000000400d40 T __libc_csu_fini
        0000000000400cd0 T __libc_csu_init
                         U __libc_start_main@@GLIBC_2.2.5
        0000000000400bc6 T main
        0000000000400c3d t main._omp_fn.0
                         U omp_get_num_threads@@OMP_1.0
                         U omp_get_thread_num@@OMP_1.0
        0000000000400b40 t register_tm_clones
        0000000000400ad0 T _start
        0000000000602080 d __TMC_END__
        0000000000400bea t _Z41__static_initialization_and_destruction_0ii
                         U _ZNSolsEPFRSoS_E@@GLIBCXX_3.4
                         U _ZNSt8ios_base4InitC1Ev@@GLIBCXX_3.4
                         U _ZNSt8ios_base4InitD1Ev@@GLIBCXX_3.4
        0000000000602080 B _ZSt4cout@@GLIBCXX_3.4
                         U _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_@@GLIBCXX_3.4
        0000000000602191 b _ZStL8__ioinit
                         U _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c@@GLIBCXX_3.4

    omp_get_num_threads, omp_get_thread_num

    OpenMP Introduction

    OpenMP的指令格式

        #pragma omp directive [clause[clause]…]
        #pragma omp parallel private(i, j)

    parallel is directive, private is clause

    directive

    • parallel,用在一个代码段之前,表示这段代码将被多个线程并行执行
    • for,用于for循环之前,将循环分配到多个线程中并行执行,必须保证每次循环之间无相关性。
    • parallel for, parallel 和 for语句的结合,也是用在一个for循环之前,表示for循环的代码将被多个线程并行执行。
    • sections,用在可能会被并行执行的代码段之前
    • parallel sections,parallel和sections两个语句的结合
    • critical,用在一段代码临界区之前
    • single,用在一段只被单个线程执行的代码段之前,表示后面的代码段将被单线程执行。
    • flush,
    • barrier,用于并行区内代码的线程同步,所有线程执行到barrier时要停止,直到所有线程都执行到barrier时才继续往下执行。
    • atomic,用于指定一块内存区域被制动更新
    • master,用于指定一段代码块由主线程执行
    • ordered, 用于指定并行区域的循环按顺序执行
    • threadprivate, 用于指定一个变量是线程私有的。

    parallel for

    OpenMP 对可以多线程化的循环有如下五个要求:

    • 循环变量(就是i)必须是有符号整型,其他的都不行。
    • 循环的比较条件必须是< <= > >=中的一种
    • 循环的增量部分必须是增减一个不变的值(即每次循环是不变的)。
    • 如果比较符号是< <=,那每次循环i应该增加,反之应该减小
    • 循环必须是没有奇奇怪怪的东西,不能从内部循环跳到外部循环,goto和break只能在循环内部跳转,异常必须在循环内部被捕获。

    如果你的循环不符合这些条件,那就只好改写了.

    avoid race condition

    当一个循环满足以上五个条件时,依然可能因为数据依赖而不能够合理的并行化。当两个不同的迭代之间的数据存在依赖关系时,就会发生这种情况。

    // Assume the array has been initialized to 1.
    #pragma omp parallel for
    for (int i = 2; i < 10; i++) {   // fixed: increment operator was stripped
        factorial[i] = i * factorial[i-1];   // loop-carried dependence: iteration i reads factorial[i-1] written by i-1
    }

    ERROR.

    omp_set_num_threads(4);
    #pragma omp parallel
        {
            #pragma omp for
            for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped
            {
                printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
            }
        }

    same as

    omp_set_num_threads(4);
    #pragma omp parallel for
        for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }

    parallel sections

    #pragma omp parallel sections // parallel + sections
    {
        #pragma omp section // thread-1
        {
            function1();
        }
        #pragma omp section // thread-2
        {
            function2();
        }
    }

    parallel sections里面的内容要并行执行,具体分工上,每个线程执行其中的一个section

    clause

    • private, 指定每个线程都有它自己的变量私有副本。
    • firstprivate,指定每个线程都有它自己的变量私有副本,并且变量要被继承主线程中的初值。
    • lastprivate,主要是用来指定将线程中的私有变量的值在并行处理结束后复制回主线程中的对应变量。
    • reduction,用来指定一个或多个变量是私有的,并且在并行处理结束后这些变量要执行指定的运算。
    • nowait,忽略指定中暗含的等待
    • num_threads,指定线程的个数
    • schedule,指定如何调度for循环迭代
    • shared,指定一个或多个变量为多个线程间的共享变量
    • ordered,用来指定for循环的执行要按顺序执行
    • copyprivate,用于single指令中的指定变量为多个线程的共享变量
    • copyin,用来指定一个threadprivate的变量的值要用主线程的值进行初始化。
    • default,用来指定并行处理区域内的变量的使用方式,缺省是shared

    private

    #pragma omp parallel
    {
        int x; // private to each thread ? YES
    }
    
    #pragma omp parallel for
    for (int i = 0; i < 1000;   i)
    {
        int x; // private to each thread ? YES
    }

    local variables are automatically private to each thread.

    The reason for the existence of the private clause is so that you don't have to change your code.

    see here

    The only way to parallelize the following code without the private clause

    int i,j;
    #pragma omp parallel for private(j)   // j gets a per-thread copy
    for(i = 0; i < n; i++) {   // fixed: increment operators were stripped
        for(j = 0; j < n; j++) {
            //do something
        }
    }

    is to change the code. For example like this:

    int i;
    #pragma omp parallel for
    for(i = 0; i < n; i++) {   // fixed: increment operators were stripped
        int j; // mark j as local variable to worker thread
        for(j = 0; j < n; j++) {
            //do something
        }
    }

    reduction

    例如累加

    int sum = 0;
    for (int i = 0; i < 100; i++) {   // fixed: increment operator was stripped
        sum += array[i]; // sum must be private for parallelism yet shared for the correct total
    }

    上面的这个程序里,sum公有或者私有都不对,为了解决这个问题,OpenMP 提供了reduction语句;

    int sum = 0;
    #pragma omp parallel for reduction(+:sum)   // fixed: "+" was stripped from the clause
    for (int i = 0; i < 100; i++) {   // fixed: increment operator was stripped
        sum += array[i];
    }

    内部实现中,OpenMP为每个线程提供了私有的sum变量(初始化为0),当线程退出时,OpenMP再把每个线程私有的sum加在一起得到最终结果。

    num_threads

    num_threads(4) same as omp_set_num_threads(4)

    // `num_threads(4)` same as `omp_set_num_threads(4)`
        #pragma omp parallel num_threads(4)
        {
            printf("Hello, I am Thread %d\n", omp_get_thread_num()); // 0,1,2,3,
        }

    schedule

    format

        #pragma omp parallel for schedule(kind [, chunk size])

    kind: see openmp-loop-scheduling and whats-the-difference-between-static-and-dynamic-schedule-in-openmp

    • static: Divide the loop into equal-sized chunks or as equal as possible in the case where the number of loop iterations is not evenly divisible by the number of threads multiplied by the chunk size. By default, chunk size is loop_count/number_of_threads.
    • dynamic: Use the internal work queue to give a chunk-sized block of loop iterations to each thread. When a thread is finished, it retrieves the next block of loop iterations from the top of the work queue. By default, the chunk size is 1. Be careful when using this scheduling type because of the extra overhead involved.
    • guided: special case of dynamic. Similar to dynamic scheduling, but the chunk size starts off large and decreases to better handle load imbalance between iterations. The optional chunk parameter specifies them minimum size chunk to use. By default the chunk size is approximately loop_count/number_of_threads.
    • auto: When schedule (auto) is specified, the decision regarding scheduling is delegated to the compiler. The programmer gives the compiler the freedom to choose any possible mapping of iterations to threads in the team.
    • runtime: with the environment variable OMP_SCHEDULE, we can test 3 scheduling types (static, dynamic, guided) without recompiling the code.

    The optional parameter (chunk), when specified, must be a positive integer.

    默认情况下,OpenMP认为所有的循环迭代运行的时间都是一样的,这就导致了OpenMP会把不同的迭代等分到不同的核心上,并且让他们分布的尽可能减小内存访问冲突,这样做是因为循环一般会线性地访问内存, 所以把循环按照前一半后一半的方法分配可以最大程度的减少冲突. 然而对内存访问来说这可能是最好的方法, 但是对于负载均衡可能并不是最好的方法, 而且反过来最好的负载均衡可能也会破坏内存访问. 因此必须折衷考虑.

    内存访问vs负载均衡,需要折中考虑。

    openmp默认使用的schedule是取决于编译器实现的。gcc默认使用schedule(dynamic,1),也就是动态调度并且块大小是1.
    线程数不要大于实际核数,否则就是oversubscription

    isprime可以对dynamic做一个展示。

    functions

    • omp_get_num_procs, 返回运行本线程的多处理机的处理器个数。
    • omp_set_num_threads, 设置并行执行代码时的线程个数
    • omp_get_num_threads, 返回当前并行区域中的活动线程(active thread)个数,如果没有设置,默认为1。
    • omp_get_thread_num, 返回线程号(0,1,2,...)
    • omp_init_lock, 初始化一个简单锁
    • omp_set_lock, 上锁操作
    • omp_unset_lock, 解锁操作,要和omp_set_lock函数配对使用
    • omp_destroy_lock,关闭一个锁,要和 omp_init_lock函数配对使用

    check cpu

        cat /proc/cpuinfo | grep name | cut -f2 -d: | uniq -c 
            8  Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz

    omp_get_num_procs return 8.

    OpenMP Example

    omp_get_num_threads

    void test0()
    {
        // Report this thread's id, the current team size, and the processor count.
        const int tid   = omp_get_thread_num();
        const int nthr  = omp_get_num_threads();
        const int procs = omp_get_num_procs();
        printf("I am Thread %d,  omp_get_num_threads = %d, omp_get_num_procs = %d\n",
            tid, nthr, procs);
    }
    /*
    I am Thread 0,  omp_get_num_threads = 1, omp_get_num_procs = 8
    */

    parallel

    case1

    void test1()
    {
        // `parallel`: the block below is executed once by every thread in the team.
        // With no omp_set_num_threads call, the team size defaults to omp_get_num_procs (e.g. 8).
        //omp_set_num_threads(4); // usually keep the thread count <= number of CPU cores
    #pragma omp parallel
        {
            const int tid   = omp_get_thread_num();
            const int nthr  = omp_get_num_threads();
            const int procs = omp_get_num_procs();
            printf("Hello, I am Thread %d,  omp_get_num_threads = %d, omp_get_num_procs = %d\n",
                tid, nthr, procs);
        }
    }
    /*
    Hello, I am Thread 3,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 7,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 1,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 6,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 5,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 4,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 2,  omp_get_num_threads = 8, omp_get_num_procs = 8
    Hello, I am Thread 0,  omp_get_num_threads = 8, omp_get_num_procs = 8
    */

    case2

    void test1_2()
    {
        // `parallel`: the block below is executed once by each of the 4 threads.
        omp_set_num_threads(4); // usually keep the thread count <= number of CPU cores
    #pragma omp parallel
        {
            const int tid   = omp_get_thread_num();
            const int nthr  = omp_get_num_threads();
            const int procs = omp_get_num_procs();
            printf("Hello, I am Thread %d,  omp_get_num_threads = %d, omp_get_num_procs = %d\n",
                tid, nthr, procs);
            //std::cout << "Hello" << ", I am Thread " << omp_get_thread_num() << std::endl; // 0,1,2,3
        }
    }
    /*
    # use `cout`
    HelloHello, I am Thread Hello, I am Thread , I am Thread Hello, I am Thread 2
    1
    3
    0
    */
    
    /* use `printf`
    Hello, I am Thread 0,  omp_get_num_threads = 4, omp_get_num_procs = 8
    Hello, I am Thread 3,  omp_get_num_threads = 4, omp_get_num_procs = 8
    Hello, I am Thread 1,  omp_get_num_threads = 4, omp_get_num_procs = 8
    Hello, I am Thread 2,  omp_get_num_threads = 4, omp_get_num_procs = 8
    */
    

    notice the difference of std::cout and printf

    case3

    void test1_3()
    {
        // `parallel` alone replicates the block: every thread runs the full 3-iteration loop.
        omp_set_num_threads(4);
    #pragma omp parallel
        for (int i = 0; i < 3; i++)   // fixed: increment operator was stripped ("i  ")
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }    
    }
    /*
    i = 0, I am Thread 1
    i = 1, I am Thread 1
    i = 2, I am Thread 1
    i = 0, I am Thread 3
    i = 1, I am Thread 3
    i = 2, I am Thread 3
    i = 0, I am Thread 2
    i = 1, I am Thread 2
    i = 2, I am Thread 2
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    i = 2, I am Thread 0
    */

    omp parallel/for

    omp parallel + omp for

    void test2()
    {
        // `omp parallel` + `omp for` === `omp parallel for`   (fixed: "+" was stripped)
        // `omp for` splits the iterations of the following loop across the team's threads.
        // Here 8 iterations are divided over 4 threads, 2 iterations per thread.
        /*
        iter   #thread id
        0,1     0
        2,3     1
        4,5     2
        6,7     3
        */
        omp_set_num_threads(4);
    #pragma omp parallel
        {
            #pragma omp for
            for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped
            {
                printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
            }
        }
    }
    /*
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    i = 2, I am Thread 1
    i = 3, I am Thread 1
    i = 6, I am Thread 3
    i = 7, I am Thread 3
    i = 4, I am Thread 2
    i = 5, I am Thread 2
    */

    omp parallel for

    void test2_2()
    {
        // `parallel for` splits the iterations of the following loop across threads.
        // Here 8 iterations are divided over 4 threads, 2 iterations per thread.
        /*
        iter   #thread id
        0,1     0
        2,3     1
        4,5     2
        6,7     3
        */
        omp_set_num_threads(4);
    #pragma omp parallel for
        for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }
    }
    /*
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    i = 4, I am Thread 2
    i = 5, I am Thread 2
    i = 6, I am Thread 3
    i = 7, I am Thread 3
    i = 2, I am Thread 1
    i = 3, I am Thread 1
    */

    sqrt case

    void base_sqrt()
    {
        // One unit of CPU-bound work: 1e9 sqrt calls, wall-timed with boost.
        boost::posix_time::ptime pt1 = boost::posix_time::microsec_clock::local_time();
    
        float a = 0;
        for (int i=0;i<1000000000;i++)   // fixed: increment operator was stripped
            a = sqrt(i);
        
        boost::posix_time::ptime pt2 = boost::posix_time::microsec_clock::local_time();
        int64_t cost = (pt2 - pt1).total_milliseconds();
        // fixed: printing int64_t via %d is undefined behavior; use %lld with a cast
        printf("Worker Thread = %d, cost = %lld ms\n", omp_get_thread_num(), (long long)cost);
    }
    
    void test2_3()
    {
        // Run 8 independent work units on 8 threads and time the whole batch.
        boost::posix_time::ptime pt1 = boost::posix_time::microsec_clock::local_time();
    
        omp_set_num_threads(8);
    #pragma omp parallel for
        for (int i=0;i<8;i++)   // fixed: increment operator was stripped
            base_sqrt();
        
        boost::posix_time::ptime pt2 = boost::posix_time::microsec_clock::local_time();
        int64_t cost = (pt2 - pt1).total_milliseconds();
        // fixed: printing int64_t via %d is undefined behavior; use %lld with a cast
        printf("Main Thread = %d, cost = %lld ms\n", omp_get_thread_num(), (long long)cost);
    }

    sequential

        time ./demo_openmp
        Worker Thread = 0, cost = 1746 ms
        Worker Thread = 0, cost = 1711 ms
        Worker Thread = 0, cost = 1736 ms
        Worker Thread = 0, cost = 1734 ms
        Worker Thread = 0, cost = 1750 ms
        Worker Thread = 0, cost = 1718 ms
        Worker Thread = 0, cost = 1769 ms
        Worker Thread = 0, cost = 1732 ms
        Main Thread = 0, cost = 13899 ms
        ./demo_openmp  13.90s user 0.00s system 99% cpu 13.903 total

    parallel

        time ./demo_openmp
        Worker Thread = 1, cost = 1875 ms
        Worker Thread = 6, cost = 1876 ms
        Worker Thread = 0, cost = 1876 ms
        Worker Thread = 7, cost = 1876 ms
        Worker Thread = 5, cost = 1877 ms
        Worker Thread = 3, cost = 1963 ms
        Worker Thread = 4, cost = 2000 ms
        Worker Thread = 2, cost = 2027 ms
        Main Thread = 0, cost = 2031 ms
        ./demo_openmp  15.10s user 0.01s system 740% cpu 2.041 total

    2031ms + 10ms(system) = 2041ms (total)

    2.041* 740% = 15.1034 s

    parallel sections

    void test3()
    {
        // Time four independent work units dispatched via `parallel sections`.
        boost::posix_time::ptime pt1 = boost::posix_time::microsec_clock::local_time();
    
        omp_set_num_threads(4);
        // Each thread executes one of the `section` blocks below.
        #pragma omp parallel sections // parallel + sections (fixed: "+" was stripped)
        {
            #pragma omp section // thread-0
            {
                base_sqrt();
            }
    
            #pragma omp section // thread-1
            {
                base_sqrt();
            }
    
            #pragma omp section // thread-2
            {
                base_sqrt();
            }
    
            #pragma omp section // thread-3
            {
                base_sqrt();
            }
        }
    
        boost::posix_time::ptime pt2 = boost::posix_time::microsec_clock::local_time();
        int64_t cost = (pt2 - pt1).total_milliseconds();
        // fixed: printing int64_t via %d is undefined behavior; use %lld with a cast
        printf("Main Thread = %d, cost = %lld ms\n",omp_get_thread_num(), (long long)cost);
    }
    /*
    time ./demo_openmp
    Worker Thread = 0, cost = 1843 ms
    Worker Thread = 1, cost = 1843 ms
    Worker Thread = 3, cost = 1844 ms
    Worker Thread = 2, cost = 1845 ms
    Main Thread = 0, cost = 1845 ms
    ./demo_openmp  7.39s user 0.00s system 398% cpu 1.855 total
    */
    

    private

    error case

    void test4_error()
    {
        int i,j;
        omp_set_num_threads(4);
        // we get error result, because `j` is shared between all worker threads.
        #pragma omp parallel for
        for(i = 0; i < 4; i++) {   // fixed: increment operators were stripped
            for(j = 0; j < 8; j++) {
                printf("Worker Thread = %d, j = %d ms\n",omp_get_thread_num(), j);
            }
        }
    }
    /*
    Worker Thread = 3, j = 0 ms
    Worker Thread = 3, j = 1 ms
    Worker Thread = 0, j = 0 ms
    Worker Thread = 0, j = 3 ms
    Worker Thread = 0, j = 4 ms
    Worker Thread = 0, j = 5 ms
    Worker Thread = 3, j = 2 ms
    Worker Thread = 3, j = 7 ms
    Worker Thread = 0, j = 6 ms
    Worker Thread = 1, j = 0 ms
    Worker Thread = 2, j = 0 ms
    */

    error results.

    fix1 by changing code

    void test4_fix1()
    {
        int i;
        omp_set_num_threads(4);
        // we get error result, because `j` is shared between all worker threads.
        // fix1: we have to change original code to make j as local variable
        #pragma omp parallel for
        for(i = 0; i < 4; i++) {   // fixed: increment operators were stripped
            int j;  // fix1: `int j`
            for(j = 0; j < 8; j++) { 
                printf("Worker Thread = %d, j = %d ms\n",omp_get_thread_num(), j);
            }
        }
    }
    
    /*
    Worker Thread = 0, j = 0 ms
    Worker Thread = 0, j = 1 ms
    Worker Thread = 2, j = 0 ms
    Worker Thread = 2, j = 1 ms
    Worker Thread = 1, j = 0 ms
    Worker Thread = 1, j = 1 ms
    Worker Thread = 1, j = 2 ms
    Worker Thread = 1, j = 3 ms
    Worker Thread = 1, j = 4 ms
    Worker Thread = 1, j = 5 ms
    Worker Thread = 1, j = 6 ms
    Worker Thread = 1, j = 7 ms
    Worker Thread = 2, j = 2 ms
    Worker Thread = 2, j = 3 ms
    Worker Thread = 2, j = 4 ms
    Worker Thread = 2, j = 5 ms
    Worker Thread = 2, j = 6 ms
    Worker Thread = 2, j = 7 ms
    Worker Thread = 0, j = 2 ms
    Worker Thread = 0, j = 3 ms
    Worker Thread = 0, j = 4 ms
    Worker Thread = 0, j = 5 ms
    Worker Thread = 0, j = 6 ms
    Worker Thread = 0, j = 7 ms
    Worker Thread = 3, j = 0 ms
    Worker Thread = 3, j = 1 ms
    Worker Thread = 3, j = 2 ms
    Worker Thread = 3, j = 3 ms
    Worker Thread = 3, j = 4 ms
    Worker Thread = 3, j = 5 ms
    Worker Thread = 3, j = 6 ms
    Worker Thread = 3, j = 7 ms
    */
    

    fix2 by private(j)

    void test4_fix2()
    {
        int i,j;
        omp_set_num_threads(4);
        // we get error result, because `j` is shared between all worker threads.
        // fix1: we have to change original code to make j as local variable
        // fix2: use `private(j)`, no need to change original code
        #pragma omp parallel for private(j) // fix2
        for(i = 0; i < 4; i++) {   // fixed: increment operators were stripped
            for(j = 0; j < 8; j++) {
                printf("Worker Thread = %d, j = %d ms\n",omp_get_thread_num(), j);
            }
        }
    }
    
    /*
    Worker Thread = 0, j = 0 ms
    Worker Thread = 0, j = 1 ms
    Worker Thread = 0, j = 2 ms
    Worker Thread = 0, j = 3 ms
    Worker Thread = 0, j = 4 ms
    Worker Thread = 0, j = 5 ms
    Worker Thread = 0, j = 6 ms
    Worker Thread = 0, j = 7 ms
    Worker Thread = 2, j = 0 ms
    Worker Thread = 2, j = 1 ms
    Worker Thread = 2, j = 2 ms
    Worker Thread = 2, j = 3 ms
    Worker Thread = 2, j = 4 ms
    Worker Thread = 2, j = 5 ms
    Worker Thread = 2, j = 6 ms
    Worker Thread = 2, j = 7 ms
    Worker Thread = 3, j = 0 ms
    Worker Thread = 3, j = 1 ms
    Worker Thread = 3, j = 2 ms
    Worker Thread = 3, j = 3 ms
    Worker Thread = 3, j = 4 ms
    Worker Thread = 3, j = 5 ms
    Worker Thread = 1, j = 0 ms
    Worker Thread = 1, j = 1 ms
    Worker Thread = 1, j = 2 ms
    Worker Thread = 1, j = 3 ms
    Worker Thread = 1, j = 4 ms
    Worker Thread = 1, j = 5 ms
    Worker Thread = 1, j = 6 ms
    Worker Thread = 1, j = 7 ms
    Worker Thread = 3, j = 6 ms
    Worker Thread = 3, j = 7 ms
    */
    

    reduction

    error case

    void test5_error()
    {
        int array[8] = {0,1,2,3,4,5,6,7};
    
        int sum = 0;
        omp_set_num_threads(4);
    //#pragma omp parallel for reduction(+:sum)
    #pragma omp parallel for  // ERROR: unsynchronized += on shared sum is a data race
        for (int i = 0; i < 8; i++) {   // fixed: increment operator was stripped
            sum += array[i];   // fixed: "+=" was stripped to "  ="
            printf("Worker Thread = %d, sum = %d ms\n",omp_get_thread_num(), sum);
        }
        printf("Main Thread = %d, sum = %d ms\n",omp_get_thread_num(), sum);
    }
    /*
    // ERROR RESULT
    Worker Thread = 0, sum = 0 ms
    Worker Thread = 0, sum = 9 ms
    Worker Thread = 3, sum = 8 ms
    Worker Thread = 3, sum = 16 ms
    Worker Thread = 1, sum = 2 ms
    Worker Thread = 1, sum = 19 ms
    Worker Thread = 2, sum = 4 ms
    Worker Thread = 2, sum = 24 ms
    Main Thread = 0, sum = 24 ms
    */

    reduction(+:sum)

    void test5_fix()
    {
        int array[8] = {0,1,2,3,4,5,6,7};
    
        int sum = 0;
        /*
        sum must be private for the loop to parallelize, yet shared to produce the total;
        neither plain shared nor plain private is correct, so OpenMP provides `reduction`.
        Internally each thread gets a private sum initialized to 0, and on exit
        OpenMP adds the per-thread sums together into the final result.
        */
        omp_set_num_threads(4);
    #pragma omp parallel for reduction(+:sum)   // fixed: "+" was stripped from the clause
    //#pragma omp parallel for  // ERROR
        for (int i = 0; i < 8; i++) {   // fixed: increment operator was stripped
            sum += array[i];   // fixed: "+=" was stripped to "  ="
            printf("Worker Thread = %d, sum = %d ms\n",omp_get_thread_num(), sum);
        }
        printf("Main Thread = %d, sum = %d ms\n",omp_get_thread_num(), sum);
    }
    
    /*
    Worker Thread = 0, sum = 0 ms
    Worker Thread = 0, sum = 1 ms
    Worker Thread = 1, sum = 2 ms
    Worker Thread = 1, sum = 5 ms
    Worker Thread = 3, sum = 6 ms
    Worker Thread = 3, sum = 13 ms
    Worker Thread = 2, sum = 4 ms
    Worker Thread = 2, sum = 9 ms
    Main Thread = 0, sum = 28 ms
    */

    num_threads

    void test6()
    {
        // The `num_threads(4)` clause has the same effect as calling omp_set_num_threads(4).
        #pragma omp parallel num_threads(4)
        {
            const int tid = omp_get_thread_num(); // 0,1,2,3
            printf("Hello, I am Thread %d\n", tid);
        }
    }
    /*
    Hello, I am Thread 0
    Hello, I am Thread 2
    Hello, I am Thread 3
    Hello, I am Thread 1
    */

    schedule

    (static,2)

    void test7_1()
    {
        omp_set_num_threads(4);
        // static: contiguous chunks of 2 iterations assigned round-robin to threads
    #pragma omp parallel for schedule(static,2) 
        for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }
    }
    /*
    i = 2, I am Thread 1
    i = 3, I am Thread 1
    i = 6, I am Thread 3
    i = 7, I am Thread 3
    i = 4, I am Thread 2
    i = 5, I am Thread 2
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    */
    

    (static,4)

    void test7_2()
    {
        omp_set_num_threads(4);
        // static with chunk 4: 8 iterations fill only the first two threads' chunks
    #pragma omp parallel for schedule(static,4) 
        for (int i = 0; i < 8; i++)   // fixed: increment operator was stripped
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }
    }
    /*
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    i = 2, I am Thread 0
    i = 3, I am Thread 0
    i = 4, I am Thread 1
    i = 5, I am Thread 1
    i = 6, I am Thread 1
    i = 7, I am Thread 1
    */

    (dynamic,1)

    /// Demonstrates schedule(dynamic,1): each thread grabs one iteration at
    /// a time as it becomes free, so the i -> thread assignment varies run
    /// to run (good for loops with uneven per-iteration cost).
    void test7_3()
    {
        omp_set_num_threads(4);
        // dynamic
        // FIX: restored the "++" lost from the loop increment by the scrape.
    #pragma omp parallel for schedule(dynamic,1)
        for (int i = 0; i < 8; i++)
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }
    }
    /*
    i = 0, I am Thread 2
    i = 4, I am Thread 2
    i = 5, I am Thread 2
    i = 6, I am Thread 2
    i = 7, I am Thread 2
    i = 3, I am Thread 3
    i = 1, I am Thread 0
    i = 2, I am Thread 1
    */
    

    (dynamic,3)

    /// Demonstrates schedule(dynamic,3): threads claim chunks of three
    /// iterations on demand; with 8 iterations the last chunk has only two.
    void test7_4()
    {
        omp_set_num_threads(4);
        // dynamic
        // FIX: restored the "++" lost from the loop increment by the scrape.
    #pragma omp parallel for schedule(dynamic,3)
        for (int i = 0; i < 8; i++)
        {
            printf("i = %d, I am Thread %d\n", i, omp_get_thread_num());
        }
    }
    /*
    i = 0, I am Thread 0
    i = 1, I am Thread 0
    i = 2, I am Thread 0
    i = 6, I am Thread 2
    i = 7, I am Thread 2
    i = 3, I am Thread 1
    i = 4, I am Thread 1
    i = 5, I am Thread 1
    */

    schedule compare

    #define NUM 100000000
    
    /// Returns 1 if x is prime, 0 otherwise, by trial division up to sqrt(x).
    /// Used below by test8() to count primes in [2, NUM].
    int isprime( int x )
    {
        // 0, 1 and negatives are not prime; the original fell through the
        // loop and wrongly reported them as prime (harmless in test8, which
        // starts at 2, but wrong for general callers).
        if( x < 2 )
            return 0;
        // FIX: restored the "++" lost from the increment ("y   )") by the scrape.
        for( int y = 2; y * y <= x; y++ )
        {
            if( x % y == 0 )
                return 0;
        }
        return 1;
    }
    
    /// Counts primes in [2, NUM] in parallel. reduction(+:sum) gives each
    /// thread a private sum (initialised to 0) and adds them together at the
    /// end, avoiding a data race on sum. dynamic scheduling balances the
    /// very uneven cost of isprime() across threads.
    /// FIXES: restored the "+" in reduction(+:sum), the "++" in the loop
    /// increment, and the "+" in "sum +=", all lost when the page was scraped
    /// (the benchmark output below — 5761455 primes — confirms the intent).
    void test8()
    {
        int sum = 0;

        #pragma omp parallel for reduction (+:sum) schedule(dynamic,1)
        for( int i = 2; i <= NUM ; i++ )
        {
            sum += isprime(i);
        }

        printf( "Number of primes numbers: %d", sum );
    }

    no schedule

        Number of primes numbers: 5761455./demo_openmp  151.64s user 0.04s system 582% cpu 26.048 total

    schedule(static,1)

        
        Number of primes numbers: 5761455./demo_openmp  111.13s user 0.00s system 399% cpu 27.799 total
    

    schedule(dynamic,1)


        Number of primes numbers: 5761455./demo_openmp  167.22s user 0.02s system 791% cpu 21.135 total

    schedule(dynamic,200)

        Number of primes numbers: 5761455./demo_openmp  165.96s user 0.02s system 791% cpu 20.981 total
    
    

    OpenCV with OpenMP

    see how-opencv-use-openmp-thread-to-get-performance

    3 type OpenCV implementation

    • sequential implementation: default (slowest)
    • parallel implementation: OpenMP / TBB
    • GPU implementation: CUDA(fastest) / OpenCL

    With CMake-gui, Building OpenCV with the WITH_OPENMP flag means that the internal functions will use OpenMP to parallelize some of the algorithms, like cvCanny, cvSmooth and cvThreshold.

    In OpenCV, an algorithm can have a sequential (slowest) implementation; a parallel implementation using OpenMP or TBB; and a GPU implementation using OpenCL or CUDA(fastest). You can decide with the WITH_XXX flags which version to use.

    Of course, not every algorithm can be parallelized.

    Now, if you want to parallelize your methods with OpenMP, you have to implement it yourself.

    concepts

    avoiding extra copying

    from improving-image-processing-speed

    There is one important thing about increasing speed in OpenCV not related to processor nor algorithm and it is avoiding extra copying when dealing with matrices. I will give you an example taken from the documentation:

    "...by constructing a header for a part of another matrix. It can be a single row, single column, several rows, several columns, rectangular region in the matrix (called a minor in algebra) or a diagonal. Such operations are also O(1), because the new header will reference the same data. You can actually modify a part of the matrix using this feature, e.g."

    parallel for

    #include "opencv2/highgui/highgui.hpp"
    #include "opencv2/features2d/features2d.hpp"
    #include <iostream>
    #include <vector>
    #include <omp.h>
    
    /// Loads rgb0.jpg / rgb1.jpg, extracts ORB keypoints and descriptors for
    /// each image, then brute-force-matches and displays the result. Each
    /// loop iteration is independent, so enabling the commented-out pragma
    /// parallelises the per-image work.
    /// FIXES for scrape damage: restored the "++" in the loop increment,
    /// added the missing cv:: qualifier on DescriptorMatcher, and dropped the
    /// stray ", 2" argument to match() (DescriptorMatcher::match takes an
    /// optional Mat mask there, not an int).
    void opencv_vector()
    {
        int imNum = 2;
        std::vector<cv::Mat> imVec(imNum);
        std::vector<std::vector<cv::KeyPoint>> keypointVec(imNum);
        std::vector<cv::Mat> descriptorsVec(imNum);

        cv::Ptr<cv::ORB> detector = cv::ORB::create();
        cv::Ptr<cv::DescriptorMatcher> matcher = cv::DescriptorMatcher::create("BruteForce-Hamming");

        std::vector< cv::DMatch > matches;
        char filename[100];
        double t1 = omp_get_wtime();

    //#pragma omp parallel for
        for (int i=0;i<imNum;i++){
            sprintf(filename,"rgb%d.jpg",i);
            imVec[i] = cv::imread( filename, CV_LOAD_IMAGE_GRAYSCALE );
            detector->detect( imVec[i], keypointVec[i] );
            detector->compute( imVec[i],keypointVec[i],descriptorsVec[i]);
            std::cout<<"find "<<keypointVec[i].size()<<" keypoints in im"<<i<<std::endl;
        }

        double t2 = omp_get_wtime();
        std::cout<<"time: "<<t2-t1<<std::endl;

        matcher->match(descriptorsVec[0], descriptorsVec[1], matches); // uchar descriptor Mat

        cv::Mat img_matches;
        cv::drawMatches( imVec[0], keypointVec[0], imVec[1], keypointVec[1], matches, img_matches );
        cv::namedWindow("Matches",CV_WINDOW_AUTOSIZE);
        cv::imshow( "Matches", img_matches );
        cv::waitKey(0);
    }

    parallel sections

    // Fragment: two independent SURF pipelines run concurrently, one per
    // `omp section` (each section is assigned to one thread of the team).
    // NOTE(review): im0/im1, detector, extractor, keypoints0/1 and
    // descriptors0/1 are declared by the surrounding listing, not shown here.
    #pragma omp parallel sections
        {
    #pragma omp section
            {
                // Thread A: load image 0, detect keypoints, compute descriptors.
                std::cout<<"processing im0"<<std::endl;
                im0 = cv::imread("rgb0.jpg", CV_LOAD_IMAGE_GRAYSCALE );
                detector.detect( im0, keypoints0);
                extractor.compute( im0,keypoints0,descriptors0);
                std::cout<<"find "<<keypoints0.size()<<"keypoints in im0"<<std::endl;
            }
            
    #pragma omp section
            {
                // Thread B: same pipeline for image 1, fully independent of A.
                std::cout<<"processing im1"<<std::endl;
                im1 = cv::imread("rgb1.jpg", CV_LOAD_IMAGE_GRAYSCALE );
                detector.detect( im1, keypoints1);
                extractor.compute( im1,keypoints1,descriptors1);
                std::cout<<"find "<<keypoints1.size()<<"keypoints in im1"<<std::endl;
            }
        }

    Reference

    History

    • 20190403: created.

    Copyright

    展开全文
  • openMP的一点使用经验

    2012-08-22 15:44:44
    最近在看多核编程。简单来说,由于现在电脑CPU一般都有两个核,4核与8核的CPU也逐渐走入了寻常百姓家,传统的单线程编程方式难以发挥多核CPU的强大...这两天关注的多核编程的工具包括openMP和TBB。按照目前网上的讨论,

    最近在看多核编程。简单来说,由于现在电脑CPU一般都有两个核,4核与8核的CPU也逐渐走入了寻常百姓家,传统的单线程编程方式难以发挥多核CPU的强大功能,于是多核编程应运而生。按照我的理解,多核编程可以认为是对多线程编程做了一定程度的抽象,提供一些简单的API,使得用户不必花费太多精力来了解多线程的底层知识,从而提高编程效率。这两天关注的多核编程的工具包括openMP和TBB。按照目前网上的讨论,TBB风头要盖过openMP,比如openCV过去是使用openMP的,但从2.3版本开始抛弃openMP,转向TBB。但我试下来,TBB还是比较复杂的,相比之下,openMP则非常容易上手。因为精力和时间有限,没办法花费太多时间去学习TBB,就在这里分享下这两天学到的openMP的一点知识,和大家共同讨论。

    openMP支持的编程语言包括C语言、C++和Fortran,支持OpenMP的编译器包括Sun Studio,Intel Compiler,Microsoft Visual Studio,GCC。我使用的是Microsoft Visual Studio 2008,CPU为Intel i5 四核,首先讲一下在Microsoft Visual Studio 2008上openMP的配置。非常简单,总共分2步:

    (1) 新建一个工程。这个不再多讲。

    (2) 建立工程后,点击 菜单栏->Project->Properties,弹出菜单里,点击 Configuration Properties->C/C++->Language->OpenMP Support,在下拉菜单里选择Yes。

    至此配置结束。下面我们通过一个小例子来说明openMP的易用性。这个例子是 有一个简单的test()函数,然后在main()里,用一个for循环把这个test()函数跑8遍。

    复制代码
     1 #include <iostream>
    2 #include <time.h>
    3 void test()
    4 {
    5 int a = 0;
    6 for (int i=0;i<100000000;i++)
    7 a++;
    8 }
    9 int main()
    10 {
    11 clock_t t1 = clock();
    12 for (int i=0;i<8;i++)
    13 test();
    14 clock_t t2 = clock();
    15 std::cout<<"time: "<<t2-t1<<std::endl;
    16 }
    复制代码

    编译运行后,打印出来的耗时为:1.971秒。下面我们用一句话把上面代码变成多核运行。

    复制代码
     1 #include <iostream>
    2 #include <time.h>
    3 void test()
    4 {
    5 int a = 0;
    6 for (int i=0;i<100000000;i++)
    7 a++;
    8 }
    9 int main()
    10 {
    11 clock_t t1 = clock();
    12 #pragma omp parallel for
    13 for (int i=0;i<8;i++)
    14 test();
    15 clock_t t2 = clock();
    16 std::cout<<"time: "<<t2-t1<<std::endl;
    17 }
    复制代码

    编译运行后,打印出来的耗时为:0.546秒,几乎为上面时间的1/4。

    由此我们可以看到openMP的简单易用。在上面的代码里,我们一没有额外include头文件,二没有额外link库文件,只是在for循环前加了一句#pragma omp parallel for。而且这段代码在单核机器上,或者编译器没有将openMP设为Yes的机器上编译也不会报错,将自动忽略#pragma这行代码,然后按照传统单核串行的方式编译运行!我们唯一要多做的一步,是从C:\Program Files\Microsoft Visual Studio 9.0\VC\redist\x86\Microsoft.VC90.OPENMP和C:\Program Files\Microsoft Visual Studio 9.0\VC\redist\Debug_NonRedist\x86\Microsoft.VC90.DebugOpenMP目录下分别拷贝vcomp90d.dll和vcomp90.dll文件到工程文件当前目录下。

    对上面代码按照我的理解做个简单的剖析。

    当编译器发现#pragma omp parallel for后,自动将下面的for循环分成N份,(N为电脑CPU核数),然后把每份指派给一个核去执行,而且多核之间为并行执行。下面的代码验证了这种分析。

    复制代码
    1 #include <iostream>
    2 int main()
    3 {
    4 #pragma omp parallel for
    5 for (int i=0;i<10;i++)
    6 std::cout<<i<<std::endl;
    7 return 0;
    8 }
    复制代码

    会发现控制台打印出了0 3 4 5 8 9 6 7 1 2。注意:因为每个核之间是并行执行,所以每次执行时打印出的顺序可能都是不一样的。

    下面我们来了谈谈竞态条件(race condition)的问题,这是所有多线程编程最棘手的问题。该问题可表述为,当多个线程并行执行时,有可能多个线程同时对某变量进行了读写操作,从而导致不可预知的结果。比如下面的例子,对于包含10个整型元素的数组a,我们用for循环求它各元素之和,并将结果保存在变量sum里。

    复制代码
     1 #include <iostream>
    2 int main()
    3 {
    4 int sum = 0;
    5 int a[10] = {1,2,3,4,5,6,7,8,9,10};
    6 #pragma omp parallel for
    7 for (int i=0;i<10;i++)
    8 sum = sum + a[i];
    9 std::cout<<"sum: "<<sum<<std::endl;
    10 return 0;
    11 }
    复制代码

    如果我们注释掉#pragma omp parallel for,让程序先按照传统串行的方式执行,很明显,sum = 55。但按照并行方式执行后,sum则会变成其他值,比如在某次运行过程中,sum = 49。其原因是,当某线程A执行sum = sum + a[i]的同时,另一线程B正好在更新sum,而此时A还在用旧的sum做累加,于是出现了错误。

    那么用openMP怎么实现并行数组求和呢?下面我们先给出一个基本的解决方案。该方案的思想是,首先生成一个数组sumArray,其长度为并行执行的线程的个数(默认情况下,该个数等于CPU的核数),在for循环里,让各个线程更新自己线程对应的sumArray里的元素,最后再将sumArray里的元素累加到sum里,代码如下

    复制代码
     1 #include <iostream>
    2 #include <omp.h>
    3 int main(){
    4 int sum = 0;
    5 int a[10] = {1,2,3,4,5,6,7,8,9,10};
    6 int coreNum = omp_get_num_procs();//获得处理器个数
    7 int* sumArray = new int[coreNum];//对应处理器个数,先生成一个数组
    8 for (int i=0;i<coreNum;i++)//将数组各元素初始化为0
    9 sumArray[i] = 0;
    10 #pragma omp parallel for
    11 for (int i=0;i<10;i++)
    12 {
    13 int k = omp_get_thread_num();//获得每个线程的ID
    14 sumArray[k] = sumArray[k]+a[i];
    15 }
    16 for (int i = 0;i<coreNum;i++)
    17 sum = sum + sumArray[i];
    18 std::cout<<"sum: "<<sum<<std::endl;
    19 return 0;
    20 }
    复制代码

    需要注意的是,在上面代码里,我们用omp_get_num_procs()函数来获取处理器个数,用omp_get_thread_num()函数来获得每个线程的ID,为了使用这两个函数,我们需要include <omp.h>。

    上面的代码虽然达到了目的,但它产生了较多的额外操作,比如要先生成数组sumArray,最后还要用一个for循环将它的各元素累加起来,有没有更简便的方式呢?答案是有,openMP为我们提供了另一个工具,归约(reduction),见下面代码:

    复制代码
     1 #include <iostream>
    2 int main(){
    3 int sum = 0;
    4 int a[10] = {1,2,3,4,5,6,7,8,9,10};
    5 #pragma omp parallel for reduction(+:sum)
    6 for (int i=0;i<10;i++)
    7 sum = sum + a[i];
    8 std::cout<<"sum: "<<sum<<std::endl;
    9 return 0;
    10 }
    复制代码

    上面代码里,我们在#pragma omp parallel for 后面加上了 reduction(+:sum),它的意思是告诉编译器:下面的for循环你要分成多个线程跑,但每个线程都要保存变量sum的拷贝,循环结束后,所有线程把自己的sum累加起来作为最后的输出。

    reduction虽然很方便,但它只支持一些基本操作,比如+,-,*,&,|,&&,||等。有些情况下,我们既要避免race condition,但涉及到的操作又超出了reduction的能力范围,应该怎么办呢?这就要用到openMP的另一个工具,critical。来看下面的例子,该例中我们求数组a的最大值,将结果保存在max里。

    复制代码
     1 #include <iostream>
    2 int main(){
    3 int max = 0;
    4 int a[10] = {11,2,33,49,113,20,321,250,689,16};
    5 #pragma omp parallel for
    6 for (int i=0;i<10;i++)
    7 {
    8 int temp = a[i];
    9 #pragma omp critical
    10 {
    11 if (temp > max)
    12 max = temp;
    13 }
    14 }
    15 std::cout<<"max: "<<max<<std::endl;
    16 return 0;
    17 }
    复制代码

    上例中,for循环还是被自动分成N份来并行执行,但我们用#pragma omp critical将 if (temp > max) max = temp 括了起来,它的意思是:各个线程还是并行执行for里面的语句,但当你们执行到critical里面时,要注意有没有其他线程正在里面执行,如果有的话,要等其他线程执行完再进去执行。这样就避免了race condition问题,但显而易见,它的执行速度会变低,因为可能存在线程等待的情况。
    有了以上基本知识,对我来说做很多事情都足够了。下面我们来看一个具体的应用例,从硬盘读入两幅图像,对这两幅图像分别提取特征点,特征点匹配,最后将图像与匹配特征点画出来。理解该例子需要一些图像处理的基本知识,我不在此详细介绍。另外,编译该例需要opencv,我用的版本是2.3.1,关于opencv的安装与配置也不在此介绍。我们首先来看传统串行编程的方式。

    复制代码
     1 #include "opencv2/highgui/highgui.hpp"
    2 #include "opencv2/features2d/features2d.hpp"
    3 #include <iostream>
    4 #include <omp.h>
    5 int main( ){
    6 cv::SurfFeatureDetector detector( 400 );
    7 cv::SurfDescriptorExtractor extractor;
    8 cv::BruteForceMatcher<cv::L2<float> > matcher;
    9 std::vector< cv::DMatch > matches;
    10 cv::Mat im0,im1;
    11 std::vector<cv::KeyPoint> keypoints0,keypoints1;
    12 cv::Mat descriptors0, descriptors1;
    13 double t1 = omp_get_wtime( );
    14 //先处理第一幅图像
    15 im0 = cv::imread("rgb0.jpg", CV_LOAD_IMAGE_GRAYSCALE );
    16 detector.detect( im0, keypoints0);
    17 extractor.compute( im0,keypoints0,descriptors0);
    18 std::cout<<"find "<<keypoints0.size()<<"keypoints in im0"<<std::endl;
    19 //再处理第二幅图像
    20 im1 = cv::imread("rgb1.jpg", CV_LOAD_IMAGE_GRAYSCALE );
    21 detector.detect( im1, keypoints1);
    22 extractor.compute( im1,keypoints1,descriptors1);
    23 std::cout<<"find "<<keypoints1.size()<<"keypoints in im1"<<std::endl;
    24 double t2 = omp_get_wtime( );
    25 std::cout<<"time: "<<t2-t1<<std::endl;
    26 matcher.match( descriptors0, descriptors1, matches );
    27 cv::Mat img_matches;
    28 cv::drawMatches( im0, keypoints0, im1, keypoints1, matches, img_matches );
    29 cv::namedWindow("Matches",CV_WINDOW_AUTOSIZE);
    30 cv::imshow( "Matches", img_matches );
    31 cv::waitKey(0);
    32 return 1;
    33 }
    复制代码

    很明显,读入图像,提取特征点与特征描述子这部分可以改为并行执行,修改如下:

    复制代码
     1 #include "opencv2/highgui/highgui.hpp"
    2 #include "opencv2/features2d/features2d.hpp"
    3 #include <iostream>
    4 #include <vector>
    5 #include <omp.h>
    6 int main( ){
    7 int imNum = 2;
    8 std::vector<cv::Mat> imVec(imNum);
    9 std::vector<std::vector<cv::KeyPoint>>keypointVec(imNum);
    10 std::vector<cv::Mat> descriptorsVec(imNum);
    11 cv::SurfFeatureDetector detector( 400 ); cv::SurfDescriptorExtractor extractor;
    12 cv::BruteForceMatcher<cv::L2<float> > matcher;
    13 std::vector< cv::DMatch > matches;
    14 char filename[100];
    15 double t1 = omp_get_wtime( );
    16 #pragma omp parallel for
    17 for (int i=0;i<imNum;i++){
    18 sprintf(filename,"rgb%d.jpg",i);
    19 imVec[i] = cv::imread( filename, CV_LOAD_IMAGE_GRAYSCALE );
    20 detector.detect( imVec[i], keypointVec[i] );
    21 extractor.compute( imVec[i],keypointVec[i],descriptorsVec[i]);
    22 std::cout<<"find "<<keypointVec[i].size()<<"keypoints in im"<<i<<std::endl;
    23 }
    24 double t2 = omp_get_wtime( );
    25 std::cout<<"time: "<<t2-t1<<std::endl;
    26 matcher.match( descriptorsVec[0], descriptorsVec[1], matches );
    27 cv::Mat img_matches;
    28 cv::drawMatches( imVec[0], keypointVec[0], imVec[1], keypointVec[1], matches, img_matches );
    29 cv::namedWindow("Matches",CV_WINDOW_AUTOSIZE);
    30 cv::imshow( "Matches", img_matches );
    31 cv::waitKey(0);
    32 return 1;
    33 }
    复制代码

    两种执行方式做比较,时间为:2.343秒v.s. 1.2441秒

    在上面代码中,为了改成适合#pragma omp parallel for执行的方式,我们用了STL的vector来分别存放两幅图像、特征点与特征描述子,但在某些情况下,变量可能不适合放在vector里,此时应该怎么办呢?这就要用到openMP的另一个工具,section,代码如下:

    复制代码
     1 #include "opencv2/highgui/highgui.hpp"
    2 #include "opencv2/features2d/features2d.hpp"
    3 #include <iostream>
    4 #include <omp.h>
    5 int main( ){
    6 cv::SurfFeatureDetector detector( 400 ); cv::SurfDescriptorExtractor extractor;
    7 cv::BruteForceMatcher<cv::L2<float> > matcher;
    8 std::vector< cv::DMatch > matches;
    9 cv::Mat im0,im1;
    10 std::vector<cv::KeyPoint> keypoints0,keypoints1;
    11 cv::Mat descriptors0, descriptors1;
    12 double t1 = omp_get_wtime( );
    13 #pragma omp parallel sections
    14 {
    15 #pragma omp section
    16 {
    17 std::cout<<"processing im0"<<std::endl;
    18 im0 = cv::imread("rgb0.jpg", CV_LOAD_IMAGE_GRAYSCALE );
    19 detector.detect( im0, keypoints0);
    20 extractor.compute( im0,keypoints0,descriptors0);
    21 std::cout<<"find "<<keypoints0.size()<<"keypoints in im0"<<std::endl;
    22 }
    23 #pragma omp section
    24 {
    25 std::cout<<"processing im1"<<std::endl;
    26 im1 = cv::imread("rgb1.jpg", CV_LOAD_IMAGE_GRAYSCALE );
    27 detector.detect( im1, keypoints1);
    28 extractor.compute( im1,keypoints1,descriptors1);
    29 std::cout<<"find "<<keypoints1.size()<<"keypoints in im1"<<std::endl;
    30 }
    31 }
    32 double t2 = omp_get_wtime( );
    33 std::cout<<"time: "<<t2-t1<<std::endl;
    34 matcher.match( descriptors0, descriptors1, matches );
    35 cv::Mat img_matches;
    36 cv::drawMatches( im0, keypoints0, im1, keypoints1, matches, img_matches );
    37 cv::namedWindow("Matches",CV_WINDOW_AUTOSIZE);
    38 cv::imshow( "Matches", img_matches );
    39 cv::waitKey(0);
    40 return 1;
    41 }
    复制代码

    上面代码中,我们首先用#pragma omp parallel sections将要并行执行的内容括起来,在它里面,用了两个#pragma omp section,每个里面执行了图像读取、特征点与特征描述子提取。将其简化为伪代码形式即为:

    复制代码
     1 #pragma omp parallel sections
    2 {
    3 #pragma omp section
    4 {
    5 function1();
    6 }
    7   #pragma omp section
    8 {
    9 function2();
    10 }
    11 }
    复制代码

    意思是:parallel sections里面的内容要并行执行,具体分工上,每个线程执行其中的一个section,如果section数大于线程数,那么就等某线程执行完它的section后,再继续执行剩下的section。在时间上,这种方式与人为用vector构造for循环的方式差不多,但无疑该种方式更方便,而且在单核机器上或没有开启openMP的编译器上,该种方式不需任何改动即可正确编译,并按照单核串行方式执行。

    以上分享了这两天关于openMP的一点学习体会,其中难免有错误,欢迎指正。另外的一点疑问是,看到各种openMP教程里经常用到private,shared等来修饰变量,这些修饰符的意义和作用我大致明白,但在我上面所有例子中,不加这些修饰符似乎并不影响运行结果,不知道这里面有哪些讲究。

    在写上文的过程中,参考了包括以下两个网址在内的多个地方的资源,不再一 一列出,在此一并表示感谢。

    http://blog.csdn.net/drzhouweiming/article/details/4093624
    http://software.intel.com/zh-cn/articles/more-work-sharing-with-openmp

    展开全文
  • // openmptest的测试程序 // #include "stdafx.h" void Test(int n){ for(int i=0;i<10000;i++) { int j=0; j=j+1; } printf("%d",n); } int _tmain(...
    // openmptest的测试程序
    //
     
    #include "stdafx.h"
     
    // Busy-work payload for the timing demo: spins a short loop purely to
    // burn CPU, then prints its argument. `j` is intentionally unused work.
    void Test(int n){
        for (int i=0;i<10000;i++)
        {
            int j=0;
            j = j+1;
        }
        printf("%d",n);
    }
     
    int _tmain(int argc_TCHARargv[])
    {
        for (int i=0;i<10;i++)
        {
            Test(i);
        }
        getchar();
        return 0;
    }
    而开启openmp
    代码
    // openmptest的测试程序
    //
     
    #include "stdafx.h"
     
    // Busy-work payload (duplicate listing of the serial version above):
    // spins a short loop to burn CPU, then prints its argument.
    void Test(int n){
        for (int i=0;i<10000;i++)
        {
            int j=0;
            j = j+1;
        }
        printf("%d",n);
    }
     
    int _tmain(int argc_TCHARargv[])
    {
        for (int i=0;i<10;i++)
        {
            Test(i);
        }
        getchar();
        return 0;
    }
    速度更快。
    在最简单的层次上,openmp提供了粗颗粒的并行算法。一直以来,我都在寻找图像处理的加速算法,但是由于图像处理的特性(大多为线性项目),所以很难有好的提速方法。但是对于批量的图像处理,采用我们这种方法将是非常好用的。
    编写较为复杂的opencv 程序
    // openmptest的测试程序
    //
     
    #include "stdafx.h"
    #include <iostream>
    #include <opencv2/opencv.hpp>  
    #include "GoCvHelper.h"
    using namespace std;
    using namespace cv;
    using namespace GO;
     
    // Per-image processing step for the batch demo: convert to grayscale,
    // binarise with Otsu (the 100 is ignored when THRESH_OTSU is set — the
    // threshold is computed from the histogram), then run the project-local
    // connection2() helper from GoCvHelper on the binary image.
    // NOTE(review): connection2 presumably produces a connected-components
    // visualisation in `draw` — confirm against GoCvHelper.h.
    Mat Test(Mat src){
        Mat draw;
        Mat gray;
        cvtColor(src,gray,COLOR_BGR2GRAY);
        threshold(gray,gray,100,255,THRESH_OTSU);
        connection2(gray,draw);
        return draw;
    }
     
     
    int _tmain(int argc_TCHARargv[])
    {    
        //时间记录
        const int64 start = getTickCount();
        vector<MatvectorMats;
        //文件目录
        char cbuf[100] = "F:/图片资源/纹理库brodatz/brodatzjpg";
        //获取所有文件
        getFiles(cbuf,vectorMats);
        //循环处理
       // #pragma omp parallel for
        for (int i=0;i<vectorMats.size();i++)
        {
            Mat dst = Test(vectorMats[i]);
        }
        
        //时间
        double duration = (cv::getTickCount() - start)/getTickFrequency();
        printf("共消耗时间%f",duration);
        waitKey();
        return 0;
    }
    不用mp的是这么长时间
    不看算法本身的效率,在解决这个问题的时候,这种方法还是相当好用的。
     
    目前方向:图像拼接融合、图像识别 联系方式:jsxyhelu@foxmail.com
    展开全文
  • 1.循环语句中的循环变量必须是有符号整形,如果是无符号整形就无法使用OpenMP3.0中取消了这个约束 2.循环语句中的比较操作必须是这样的样式:loop_variable ,>=loop_invariant_interger 3.循环语句中必须是整数加...
  • 一个openMP编程处理图像的示例

    千次阅读 2017-05-26 23:55:45
    一个openMP编程处理图像的示例: 从硬盘读入两幅图像,对这两幅图像分别提取特征点,特征点匹配,最后将图像与匹配特征点画出来。
  • 一、算法测试 // openmptest的测试程序 #include "stdafx.h" void Test(int n){ for(int i=0;i<10000;i++){ int j=0; j=j+1; } printf("%d",n); } int _tmain(int argc, _TCHAR* argv[]){ for(...
  • VS OpenMP基础使用

    千次阅读 2015-07-14 09:37:37
    openMP支持的编程语言包括C语言、C++和Fortran,支持OpenMP的编译器包括Sun Studio,Intel Compiler,Microsoft Visual Studio,GCC。  OpenMP是CPU并行加速相关的编译处理方案,VS很早的版本就对其提供了支持,...
  • OpenMP是一种应用于多处理器程序设计的并行编程处理方案,它提供了对于并行编程的高层抽象,只需要在程序中添加简单的指令,就可以编写高效的并行程序,而不用关心具体的并行实现细节,降低了并行编程的难度和复杂度...
  • 简单实用openmp

    2016-12-22 14:11:55
    我把它简单理解为并行加速,做图像处理时特别有用,可以以最简单的方法极大的提升效率,节约时间,特别是高分辨率图像处理使用openmp,在vs中使用需要将openmp勾选一下, 然后加上头文件, 下面是使用例子...
  • OpenMP是一种应用于多处理器程序设计的并行编程处理方案,它提供了对于并行编程的高层抽象,只需要在程序中添加简单的指令,就可以编写搞笑的并行程序,而不用关心具体的并行实现细节,降低了并行编程的难度和复杂度...
  • 使用OpenMP在并行处理图像矩阵的时候,当以 img.at(i,j) 方式访问 图像像素的时候,总是会发生地址冲突的问题。 改变访问方式 以 uchat *data = ( img.ptr(i))[j] 进行访问,可以避免这样的问题。
  • openMP的一点使用经验—加速opencv

    千次阅读 2013-05-11 14:45:44
    最近在看多核编程。简单来说,由于现在电脑CPU一般都有两个核,4核与8核的CPU也逐渐走入了寻常百姓家,传统的单线程编程方式难以发挥多核CPU的强大功能,于是多核编程应运而生。按照我的理解,多核编程可以认为是对...
1 2 3 4 5 ... 20
收藏数 2,305
精华内容 922
关键字:

图像处理用openmp