2014年6月9日 星期一

[ OpenCL ] Set up OpenCL in Linux

Preface: 
這邊會介紹如何安裝 AMD 的 OpenCL SDK, 並使用一個簡單範例來驗證安裝成功. 當前我使用的版本是 AMD-APP-SDK-v2.9-lnx64.tgz: 
 

Install Process: 
首先請依照自己機器的作業系統, 下載對應版本的 SDK 到本機上, 接著解壓縮該 tgz 檔, 並進行安裝: 
# tar -xvf AMD-APP-SDK-v2.9-lnx64.tgz // 解壓縮 tgz 
AMD-APP-SDK-v2.9-RC-lnx64.tgz
default-install_lnx_64.pl
icd-registration.tgz
Install-AMD-APP.sh
ReadMe.txt

# ./Install-AMD-APP.sh // 開始安裝 OpenCL SDK
===========================================================
64-bit Operating System Found..

Starting Installation of AMD APPSDK v2.9 ....
...
AMD APPSDK v2.9 installation Completed
>
> Reboot required to reflect the changes
===============================================================
*****Please refer 'AMD_APPSDK_v2.9.log' in the same directory*****
*****Refer 'README.txt' for FAQ/help in the same directory********

接著 reboot 讓安裝生效, 便完成 OpenCL SDK 的設定. 而後續會用到的 OpenCL 標頭檔與函式庫會放在 /opt/AMDAPP/ 路徑下. 

Sample Code: 
接著我們會使用下面的範例代碼來確認 OpenCL 環境已經可以使用. 該範例代碼會將陣列 A 與陣列 B 的元素逐一相加 (向量加法) 後存放到陣列 C. 代碼如下: 
- hw6d.c 
  1. // System includes  
  2. #include <stdio.h>  
  3. #include <stdlib.h>  
  4.   
  5. // OpenCL includes  
  6. #include <CL/cl.h>  
  7.   
  8. // OpenCL C source of the "vecadd" kernel, kept as a string that main() hands to clCreateProgramWithSource/clBuildProgram at run time; each work-item reads its index from get_global_id(0) and stores A[idx]+B[idx] into C[idx].  
  9. const char* programSource =   
  10.     "__kernel                                                         \n"  
  11.     "void vecadd(__global int *A,                                     \n"  
  12.     "            __global int *B,                                     \n"  
  13.     "            __global int *C)                                     \n"  
  14.     "{                                                                \n"  
  15.     " // Get the work-item's unique ID                                \n"  
  16.     " int idx=get_global_id(0);                                       \n"  
  17.     "                                                                 \n"  
  18.     " // Add the corresponding locations of                           \n"  
  19.     " // 'A' and 'B', and store the result in 'C'.                    \n"  
  20.     " C[idx]=A[idx]+B[idx];                                           \n"  
  21.     "}                                                                \n"  
  22.     ;  
  23. // Returns 1 when status is CL_SUCCESS; otherwise reports the failed step on stderr and returns 0.  
  24. static int cl_ok(cl_int status, const char *step)  
  25. {  
  26.     if(status == CL_SUCCESS)  
  27.     {  
  28.         return 1;  
  29.     }  
  30.     fprintf(stderr, "\t[Error] %s Fail!\n", step);  
  31.     return 0;  
  32. }  
  33.   
  34. // Adds two 2048-element int arrays on the first device of the first OpenCL platform  
  35. // and verifies the result on the host.  
  36. // Returns EXIT_SUCCESS only when every OpenCL call succeeded and the output is correct;  
  37. // on any failure the step is reported on stderr, acquired resources are released and  
  38. // EXIT_FAILURE is returned.  
  39. int main(void){  
  40.     // This code executes on the OpenCL host  
  41.     int exitCode = EXIT_FAILURE;  
  42.     // Elements in each array  
  43.     const int elements = 2048;  
  44.     // Compute the size of the data  
  45.     size_t datasize = sizeof(int)*elements;  
  46.     // Host data  
  47.     int *A = NULL; // Input array  
  48.     int *B = NULL; // Input array  
  49.     int *C = NULL; // Output array  
  50.     // Every handle starts as NULL so that cleanup releases only what was acquired  
  51.     cl_platform_id *platforms = NULL;  
  52.     cl_device_id *devices = NULL;  
  53.     cl_context context = NULL;  
  54.     cl_command_queue cmdQueue = NULL;  
  55.     cl_mem bufA = NULL;  
  56.     cl_mem bufB = NULL;  
  57.     cl_mem bufC = NULL;  
  58.     cl_program program = NULL;  
  59.     cl_kernel kernel = NULL;  
  60.     // clGetPlatformIDs/clGetDeviceIDs report their counts through cl_uint*, not cl_int*  
  61.     cl_uint numPlatforms = 0;  
  62.     cl_uint numDevices = 0;  
  63.     // Use this to check the output of each API call  
  64.     cl_int status;  
  65.     size_t globalWorkSize[1];  
  66.     int result = 1;  
  67.     int i;  
  68.   
  69.     // Allocate space for input/output data  
  70.     A = malloc(datasize);  
  71.     B = malloc(datasize);  
  72.     C = malloc(datasize);  
  73.     if(A == NULL || B == NULL || C == NULL)  
  74.     {  
  75.         fprintf(stderr, "\t[Error] Allocate host memory Fail!\n");  
  76.         goto cleanup;  
  77.     }  
  78.     // Initialize the input data  
  79.     for(i=0; i<elements; i++)  
  80.     {  
  81.         A[i]=i;  
  82.         B[i]=i;  
  83.     }  
  84.   
  85.     // Retrieve the number of platforms  
  86.     status = clGetPlatformIDs(0, NULL, &numPlatforms);  
  87.     printf("\t[Info] Total %u platforms...\n", (unsigned)numPlatforms);  
  88.     if(!cl_ok(status, "Get PlatformIDs"))  
  89.     {  
  90.         goto cleanup;  
  91.     }  
  92.     if(numPlatforms == 0)  
  93.     {  
  94.         // platforms[0] is used below, so an empty platform list cannot be handled  
  95.         fprintf(stderr, "\t[Error] No OpenCL platform found!\n");  
  96.         goto cleanup;  
  97.     }  
  98.     // Allocate enough space for each platform  
  99.     platforms = malloc(numPlatforms * sizeof *platforms);  
  100.     if(platforms == NULL)  
  101.     {  
  102.         fprintf(stderr, "\t[Error] Allocate platform list Fail!\n");  
  103.         goto cleanup;  
  104.     }  
  105.     // Fill in the platforms  
  106.     status = clGetPlatformIDs(numPlatforms, platforms, NULL);  
  107.     if(!cl_ok(status, "Get PlatformIDs"))  
  108.     {  
  109.         goto cleanup;  
  110.     }  
  111.   
  112.     // Retrieve the number of devices  
  113.     status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);  
  114.     printf("\t[Info] Total %u devices...\n", (unsigned)numDevices);  
  115.     if(!cl_ok(status, "Get DeviceIDs"))  
  116.     {  
  117.         goto cleanup;  
  118.     }  
  119.     if(numDevices == 0)  
  120.     {  
  121.         // devices[0] is used below, so an empty device list cannot be handled  
  122.         fprintf(stderr, "\t[Error] No OpenCL device found!\n");  
  123.         goto cleanup;  
  124.     }  
  125.     // Allocate enough space for each device  
  126.     devices = malloc(numDevices * sizeof *devices);  
  127.     if(devices == NULL)  
  128.     {  
  129.         fprintf(stderr, "\t[Error] Allocate device list Fail!\n");  
  130.         goto cleanup;  
  131.     }  
  132.     // Fill in the devices  
  133.     status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);  
  134.     if(!cl_ok(status, "Get DeviceIDs"))  
  135.     {  
  136.         goto cleanup;  
  137.     }  
  138.     printf("\t[Info] Get DeviceIDs Success!\n");  
  139.   
  140.     // Create a context and associate it with the devices  
  141.     context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);  
  142.     if(!cl_ok(status, "Create Context"))  
  143.     {  
  144.         goto cleanup;  
  145.     }  
  146.     printf("\t[Info] Create Context Success!\n");  
  147.     // Create a command queue and associate it with the device  
  148.     cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status);  
  149.     if(!cl_ok(status, "Create CommandQueue"))  
  150.     {  
  151.         goto cleanup;  
  152.     }  
  153.     printf("\t[Info] Create CommandQueue Success!\n");  
  154.   
  155.     // Create a buffer object that will contain the data from the host array A  
  156.     bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status);  
  157.     if(!cl_ok(status, "Create Buffer A"))  
  158.     {  
  159.         goto cleanup;  
  160.     }  
  161.     // Create a buffer object that will contain the data from the host array B  
  162.     bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status);  
  163.     if(!cl_ok(status, "Create Buffer B"))  
  164.     {  
  165.         goto cleanup;  
  166.     }  
  167.     // Create a buffer object that will hold the output data  
  168.     bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status);  
  169.     if(!cl_ok(status, "Create Buffer C"))  
  170.     {  
  171.         goto cleanup;  
  172.     }  
  173.     // Write input array A to the device buffer bufA.  
  174.     // The writes are blocking (CL_TRUE) so A and B are no longer in use by the runtime  
  175.     // when a later error path frees them.  
  176.     status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_TRUE, 0, datasize, A, 0, NULL, NULL);  
  177.     if(!cl_ok(status, "Write Buffer A"))  
  178.     {  
  179.         goto cleanup;  
  180.     }  
  181.     // Write input array B to the device buffer bufB  
  182.     status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_TRUE, 0, datasize, B, 0, NULL, NULL);  
  183.     if(!cl_ok(status, "Write Buffer B"))  
  184.     {  
  185.         goto cleanup;  
  186.     }  
  187.   
  188.     // Create a program with source code  
  189.     program = clCreateProgramWithSource(context,  
  190.         1,  
  191.         (const char**)&programSource,  
  192.         NULL,  
  193.         &status);  
  194.     if(!cl_ok(status, "Create Program"))  
  195.     {  
  196.         goto cleanup;  
  197.     }  
  198.     printf("\t[Info] Create Program Success!\n");  
  199.     // Build (compile) the program for the device  
  200.     status = clBuildProgram(program,        // The program object.  
  201.         numDevices,                         // The number of devices listed in device_list.  
  202.         devices,                            // A pointer to a list of devices associated with program.  
  203.         NULL,  
  204.         NULL,  
  205.         NULL  
  206.         );  
  207.     if(!cl_ok(status, "Build Program"))  
  208.     {  
  209.         // Show the compiler output for the first device so the kernel error can be diagnosed  
  210.         size_t logSize = 0;  
  211.         if(clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize) == CL_SUCCESS  
  212.             && logSize > 0)  
  213.         {  
  214.             char *buildLog = malloc(logSize);  
  215.             if(buildLog != NULL  
  216.                 && clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, logSize, buildLog, NULL) == CL_SUCCESS)  
  217.             {  
  218.                 fprintf(stderr, "%.*s\n", (int)logSize, buildLog);  
  219.             }  
  220.             free(buildLog);  
  221.         }  
  222.         goto cleanup;  
  223.     }  
  224.     printf("\t[Info] Build Program Success!\n");  
  225.     kernel = clCreateKernel(program,  
  226.         "vecadd",  
  227.         &status);  
  228.     if(!cl_ok(status, "Create Kernel"))  
  229.     {  
  230.         goto cleanup;  
  231.     }  
  232.   
  233.     // Associate the input/output buffers with the kernel  
  234.     status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);  
  235.     if(!cl_ok(status, "Set Kernel Arg 0"))  
  236.     {  
  237.         goto cleanup;  
  238.     }  
  239.     status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufB);  
  240.     if(!cl_ok(status, "Set Kernel Arg 1"))  
  241.     {  
  242.         goto cleanup;  
  243.     }  
  244.     status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufC);  
  245.     if(!cl_ok(status, "Set Kernel Arg 2"))  
  246.     {  
  247.         goto cleanup;  
  248.     }  
  249.     // Define an index space (global work size) of work items for execution.  
  250.     // A workgroup size (local work size) is not required, but can be used.  
  251.     // There are 'elements' work-items, one per array element  
  252.     globalWorkSize[0]=elements;  
  253.     // Enqueue the kernel for execution  
  254.     status = clEnqueueNDRangeKernel(cmdQueue,       // A valid command-queue  
  255.         kernel,         // A valid kernel object.  
  256.         1,              // work_dim  
  257.         NULL,           // *global_work_offset  
  258.         globalWorkSize, // *global_work_size  
  259.         NULL,           // local_work_size  
  260.         0,              // num_events_in_wait_list  
  261.         NULL,           // *event_wait_list  
  262.         NULL            // *event  
  263.         );  
  264.     if(!cl_ok(status, "Enqueue Kernel"))  
  265.     {  
  266.         goto cleanup;  
  267.     }  
  268.   
  269.     // Read the device output buffer to the host output array  
  270.     status = clEnqueueReadBuffer(cmdQueue,     // Refers to the command-queue in which the read command will be queued  
  271.         bufC,         // Refers to a valid buffer object.  
  272.         CL_TRUE,      // Blocking read: C is complete when the call returns.  
  273.         0,            // The offset in bytes in the buffer object to read from.  
  274.         datasize,     // The size in bytes of data being read.  
  275.         C,            // The pointer to buffer in host memory where data is to be read into.  
  276.         0,            // num_events_in_wait_list  
  277.         NULL,         // *event_wait_list  
  278.         NULL          // *event  
  279.         );  
  280.     if(!cl_ok(status, "Read Buffer C"))  
  281.     {  
  282.         // C was never filled in, so there is nothing meaningful to verify  
  283.         goto cleanup;  
  284.     }  
  285.     // Verify the output  
  286.     for(i=0; i<elements; i++)  
  287.     {  
  288.         if(C[i]!=i+i)  
  289.         {  
  290.             printf("C[%d]=%d (Should be %d)!\n", i, C[i], i+i);  
  291.             result=0;  
  292.             break;  
  293.         }  
  294.     }  
  295.     if(result)  
  296.     {  
  297.         printf("Output is correct!\n");  
  298.         exitCode = EXIT_SUCCESS;  
  299.     }  
  300.     else  
  301.     {  
  302.         printf("Output is incorrect!\n");  
  303.     }  
  304.   
  305. cleanup:  
  306.     printf("\t[Info] Free resources...\n");  
  307.     // Free OpenCL resources (only the ones that were actually created)  
  308.     if(kernel != NULL)  
  309.     {  
  310.         clReleaseKernel(kernel);  
  311.     }  
  312.     if(program != NULL)  
  313.     {  
  314.         clReleaseProgram(program);  
  315.     }  
  316.     if(cmdQueue != NULL)  
  317.     {  
  318.         clReleaseCommandQueue(cmdQueue);  
  319.     }  
  320.     if(bufA != NULL)  
  321.     {  
  322.         clReleaseMemObject(bufA);  
  323.     }  
  324.     if(bufB != NULL)  
  325.     {  
  326.         clReleaseMemObject(bufB);  
  327.     }  
  328.     if(bufC != NULL)  
  329.     {  
  330.         clReleaseMemObject(bufC);  
  331.     }  
  332.     if(context != NULL)  
  333.     {  
  334.         clReleaseContext(context);  
  335.     }  
  336.   
  337.     // Free host resources (free(NULL) is a no-op)  
  338.     free(A);  
  339.     free(B);  
  340.     free(C);  
  341.     free(platforms);  
  342.     free(devices);  
  343.     return exitCode;  
  344. }  
接著準備一個 Makefile 方便編譯過程: 
- makefile_hw6vecadd 
  1. # Makefile for hw6d: builds the OpenCL vector-add sample against the AMD APP SDK under /opt/AMDAPP.  
  2. # NOTE: when copying this listing, every recipe (command) line must begin with a TAB character.  
  3. CC=gcc  
  4. CFLAGS=-I /opt/AMDAPP/include/ -L /opt/AMDAPP/lib/x86_64/  
  5.   
  6. # clean is not a file; without .PHONY a file named "clean" would stop the rule from running  
  7. .PHONY: clean  
  8.   
  9. # Link step: the library is listed after the object file that uses it  
  10. hw6d:hw6d.o  
  11.         $(CC) ${CFLAGS} $^ -lOpenCL -o $@  
  12.   
  13. # Compile step: no linking happens with -c, so -L/-lOpenCL are not needed here (kept as in the original listing)  
  14. hw6d.o: hw6d.c  
  15.         $(CC) -c $(CFLAGS) $< -lOpenCL  
  16.   
  17. clean:  
  18.         rm -rf hw6d.o hw6d  
最後便是進行編譯與執行: 
# make -f makefile_hw6vecadd // 編譯程式
gcc -c -I /opt/AMDAPP/include/ -L /opt/AMDAPP/lib/x86_64/ hw6d.c -lOpenCL
gcc -I /opt/AMDAPP/include/ -L /opt/AMDAPP/lib/x86_64/ hw6d.o -lOpenCL -o hw6d

# ./hw6d // 執行程式
[Info] Total 1 platforms...
[Info] Total 1 devices...
[Info] Get DeviceIDs Success!
[Info] Create Context Success!
[Info] Create CommandQueue Success!
[Info] Create Program Success!
[Info] Build Program Success!
Output is correct!
[Info] Free resources...

Supplement: 
OpenCLHowTo - How to set up OpenCL in Linux 
Anteru's Blog - Getting started with OpenCL, Part #1 
Matrix Multiplication 1 (OpenCL) 
OpenCL Cookbook: Building a program and debugging failures

沒有留言:

張貼留言

[Git 常見問題] error: The following untracked working tree files would be overwritten by merge

  Source From  Here 方案1: // x -----删除忽略文件已经对 git 来说不识别的文件 // d -----删除未被添加到 git 的路径中的文件 // f -----强制运行 #   git clean -d -fx 方案2: 今天在服务器上  gi...