0

I'm a beginner at OpenCL. I was trying to build a simple app that just adds two vectors to get the results. The following is my host code:

#define USE_PLATFORM 0
#define USE_DEVICE 2
#define DATA_SIZE 1024

#define USE_KERNEL_PATH "/Users/huangxin/Documents/August13Programming/FirstEGOpenCL/FirstEGOpenCL/kernel.cl"

using namespace std;

int main(int argc, const char * argv[]) {
    int err;
    cl_uint numPlatforms;
    cl_uint numDevices;
    cl_command_queue command;
    size_t global;

    //Query the number of platforms supported.
    err = clGetPlatformIDs(0, NULL, &numPlatforms);
    if (err != CL_SUCCESS || USE_PLATFORM >= numPlatforms)
    {
        printf("Error at: clGetPlatformIDs(querying platforms count failed):\n");
        exit(-1);
    }

    //Get all platforms.
    vector<cl_platform_id> platforms(numPlatforms);
    err = clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clGetPlatformIDs(getting all platforms failed):\n");
        exit(-1);
    }

    //Query the number of devices supported by the platform spicified.
    err = clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
    if (err != CL_SUCCESS || USE_PLATFORM >= numDevices)
    {
        printf("Error at: clGetDeviceIDs(querying devices count failed):\n");
        exit(-1);
    }

    //Get all devices.
    vector<cl_device_id> devices(numDevices);
    err=clGetDeviceIDs(platforms[USE_PLATFORM], CL_DEVICE_TYPE_ALL, numDevices, &devices[0], &numDevices);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clGetDeviceIDs(getting all devices failed):\n");
        exit(-1);
    }


    //Get device infomation.
    char deviceInfo[1024];
    //get device max work item dimensions.
    size_t maxItemSize[3];
    clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_NAME, sizeof(deviceInfo)*1024, deviceInfo, NULL);
    clGetDeviceInfo(devices[USE_DEVICE], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, maxItemSize, NULL);
    cout << "Device selected: " << deviceInfo << endl;
    cout << "Max item size: " << maxItemSize[0] << "," << maxItemSize[1] << ","<< maxItemSize[2] << endl;

    //Set property with certain platform
    cl_context_properties prop[] = {CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platforms[USE_PLATFORM]), 0};

    //create context with certain property.
    cl_context context = clCreateContextFromType(prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clCreateContextFromType(get context failed):\n");
        exit(-1);
    }

    //create command queue using selected device and context.
    command = clCreateCommandQueue(context, devices[USE_DEVICE], 0, NULL);

    //create program with specified kernel source.
    const char *kernelSource = getKernelSource(USE_KERNEL_PATH);
    cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, 0, &err);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clCreateProgramWithSource(get program failed):\n");
        exit(-1);
    }

    //since OpenCL is a dynamic-compile architechture, we need to build the program.
    err = clBuildProgram(program, 0, 0, 0, 0, 0);
    if (err != CL_SUCCESS)
    {
        cout << err << endl;
        size_t len;
        char buffer[2048];

        printf("Error: Failed to build program executable!\n");
        clGetProgramBuildInfo(program, devices[USE_DEVICE], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
        printf("%s\n", buffer);
        exit(1);
    }

    //kernel是OpenCL中对执行在一个最小粒度的compute item上的代码及参数的抽象
    //create the kernel function using the built program.
    cl_kernel adder = clCreateKernel(program, "adder", &err);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clCreateKernel(get kernel function failed):\n");
        exit(-1);
    }

    //create the vector of input random data.
    vector<float> inA(DATA_SIZE), inB(DATA_SIZE);
    for(int i = 0; i < DATA_SIZE; i++) {
        inA[i] = (float)(random() % DATA_SIZE) / 1000;
        inB[i] = (float)(random() % DATA_SIZE) / 1000;
    }

    //create the read-only device mem using specified context, that is to copy the host mem to the device mem.
    cl_mem cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inA[0], NULL);
    cl_mem cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * DATA_SIZE, &inB[0], NULL);
    //create the result mem.
    cl_mem cl_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);

    //setting up the arguement of kernel memory
    clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
    clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
    clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);

    START_CHECK_RUNNING_TIME
    //enqueue the kernel into the specified command(#TODO:come back later to check the remaining arguement.
    global = DATA_SIZE;
    err = clEnqueueNDRangeKernel(command, adder, 1, 0, &global, 0, 0, 0, 0);
    if (err != CL_SUCCESS)
    {
        printf("Error at: clEnqueueNDRangeKernel(enqueue kernel failed):\n");
        exit(-1);
    }

    printf("*****************FLAG***************");

    //copy the results from the kernel into the host(CPU).
    vector<float> res(DATA_SIZE);
     err = clEnqueueReadBuffer(command, cl_res, CL_TRUE, 0, sizeof(float) * DATA_SIZE, &res[0], 0, 0, 0);
    END_CHECK_RUNNING_TIME

    //check the number of right compute.
    int cnt = 0;
    for (int i = 0; i < res.size(); i++) {
        cnt += (res[i] == inA[i] + inB[i] ? 1 : 0);
    }
    cout << "Computed " << res.size() << " values\n";
    cout << "Correct values:(" << cnt << "/" << res.size() << "),correct rate:" << (float)cnt / res.size() * 100 << "%" << endl;

    gettimeofday(&sTime, NULL);
    for (int i = 0; i < res.size(); i++) {
        for (int j = 0; j < 10000; j++)
            res[i] = inA[i] + inB[i];
    }
    gettimeofday(&eTime, NULL);timeuse = 1000000 * ( eTime.tv_sec - sTime.tv_sec ) + eTime.tv_usec -sTime.tv_usec; printf("Running time: %fs\n", (double)timeuse/(1000000));

    //cleaning up the variables.
    clReleaseKernel(adder);
    clReleaseProgram(program);
    clReleaseMemObject(cl_a);
    clReleaseMemObject(cl_b);
    clReleaseMemObject(cl_res);
    clReleaseCommandQueue(command);
    clReleaseContext(context);
    return 0;
}

The code is a bit long, but it's really doing simple stuff. This is my kernel code:

// Element-wise vector addition: each work-item handles the element selected by
// its global id in dimension 0 and stores a[gid] + b[gid] into result[gid].
// The store is repeated 10000 times; every pass writes the same value.
kernel void adder(global const float* a, global const float* b, global float* result)
{
    const size_t gid = get_global_id(0);
    int pass = 0;
    while (pass < 10000) {
        result[gid] = a[gid] + b[gid];
        ++pass;
    }
}

And I got the following result:

Device selected: GeForce GT 650M
-11
Error: Failed to build program executable!
No kernels or only kernel prototypes found.

I don't quite understand what "No kernels or only kernel prototypes found." means, and it's really strange that the same code runs perfectly if I use the first device (CPU) or my second device (HD Graphics 4000).

I want to know what is wrong and why it happens.

I was running this code in Xcode on Mac OS X 10.10.

2
  • Shouldn't it be __kernel void adder and not kernel void adder? Commented Jan 15, 2015 at 13:56
  • Actually that's not the problem, it could be either kernel or __kernel Commented Jan 15, 2015 at 13:57

1 Answer 1

2

As the comments say, it is good practice to use:

__kernel void adder(__global const float* a, __global const float* b, __global float* result)

Because that way you clearly show that those are special CL flags. Typically all CL kernels follow that rule, even though the spec allows both.


But your problem is probably due to calling clBuildProgram() without any device in the device list. Therefore, nothing gets compiled at all!

In CL every device has a specific compiler (the CPUs don't have the same compiler as the GPUs, and sometimes not even the same instruction set). So you should give the API the list of devices for which the kernels have to be compiled.

The proper way would be this:

err = clBuildProgram(program, 1, &devices[USE_DEVICE], "", 0, 0);

Note: I added "", because probably in the future you will want to add some build parameters, better to have it ready :)

Sign up to request clarification or add additional context in comments.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.