#include //Forget the program #include "cl.cvl" #include #include #include #include $input int NUM_DEVICES; $input int MAX_NUM_DEVICES; $assume(0 < NUM_DEVICES && NUM_DEVICES < MAX_NUM_DEVICES); $input int DATA_SIZE; $input int MAX_DATA_SIZE; $assume(0 < DATA_SIZE && DATA_SIZE < MAX_NUM_DEVICES); $input int LOCAL; $input int MAX_LOCAL; $assume(0 < LOCAL && LOCAL < MAX_LOCAL); //this args struct will hold all the parameters of for the kernel function typedef struct { //Variables for kernels /* float * input; float * output; int count; */ void * input; void * output; void * count; }args; /* args * argument - Takes in the struct, which is changed for every program using a different kernel */ cl_kernel clCreateKernel(args * argument, char * function) { cl_kernel kernel; kernel.arguments = argument; kernel.method = function; return kernel; } /* This is the kernel that processes compute with int workgroup - Gives the workgroup that a particular process came from, made by clEnqueueNDRangeKernel int global_id - Gives the global_id that a particular process has, given by workfunc int local_id - Gives the local_id that a particular process has, given by workfunc float* input - Kernel argument float* output - Kernel argument int count - Kernel argument */ void square(int workgroup, int global_id, int local_id, float* input, float* output, int count) { //int i = get_global_id(0); int i = global_id; if (i < count) { output[i] = input[i] * input[i]; //printf("output[%d] is %d\n", i, output[i]); } } /* workfunc assigns local and global ids, before calling the kernel. Note: The function should be identical in all transformations except the calling of the kernel, which means that it cannot be in openCLshared.cvl size_t local - The size of the workgroups, used to calculate blocks size_t global - The total amount of work to be done cl_kernel param - Holds the data for local_id, global_id, and the workgroup Use the print statement to get a better idea of what it means to split workgroups, local_ids, and global_ids */ void empty() { } void workfunc(size_t local, size_t global, cl_kernel param) { $proc procs[local]; for(int i = local * param.workgroup; i < local * param.workgroup + local; i++) { int count = *(int*)(((args*)param.arguments)->count); param.local_id = i % local; param.global_id = i; printf("My workgroup id is %d, my global id is %d, my local id is %d, and my method is %s\n", param.workgroup, param.global_id, param.local_id, param.method); procs[param.local_id] = $spawn square(param.workgroup, param.global_id, param.local_id, ((args*)param.arguments)->input, ((args*)param.arguments)->output, count); } for(int j = 0; j < local; j++) { $wait(procs[j]); } } /* Splits up and spawns processes based on global and local, using block TODO: remove cl_command_queue completely and put into a "just in case" file, currently not needed cl_command_queue commands - Holds a queue of the order that devices are to be executed cl_kernel kernel - Holds all the arguments for the kernel, as well as local_id, global_id, and the workgroup size_t global - The total amount of work to be done size_t local - Number to split into workgroups by */ int clEnqueueNDRangeKernel(cl_kernel kernel, size_t global, size_t local) { $assert((global % local == 0)); int numworkgroups = global/local; cl_kernel param[numworkgroups]; $proc procs[numworkgroups]; //consider $parfor /* $domain(1) dom = {0 .. numworkgroups - 1}; $for(int i: dom) { param[i] = kernel; param[i].workgroup = i; } $parfor(int i: dom) { workfunc(local, global, param[i]); } */ for(int i = 0; i < global/local; i++) { param[i] = kernel; param[i].workgroup = i; procs[i] = $spawn workfunc(local, global, param[i]); } //this part here is the new clFinish(commands); for(int i = 0; i < global/local; i++) { $wait(procs[i]); } return CL_SUCCESS; } int main(int argc, char** argv) { //Make function pointers for every method void (*squarePtr) (int, int, int, float *, float *, int); squarePtr = □ args * arguments; arguments = (args*)malloc(sizeof(args)); float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue //cl_program program; // compute program cl_kernel kernel; // compute kernel int err; float * input; // device memory used for the input array float * output; // device memory used for the output array //Puts in data for input unsigned int count = DATA_SIZE; for(int i = 0; i < count; i++) { data[i] = i; } //clCreateProgram is far different from the real version, this just stores parameters for the kernel //In order to make this clear, it is clCreateProgram and not something like clCreateProgramFromSource, which actually exists in openCL code //program = clCreateProgram(arguments); kernel = clCreateKernel(arguments, "square"); //replaces clCreateBuffer input = (float *) malloc(sizeof(float) * count); output = (float *) malloc(sizeof(float) * count); //replaces clEnqueueWriteBuffer, puts data into the input to be put into the kernel arguments memcpy(input, data, sizeof(float) * count); /* err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err = clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); */ //use pointer instead of malloc + memcpy for global variables //((args*)kernel.arguments)->input = (float*)malloc(sizeof(float) * count); //memcpy(((args *)kernel.arguments)->input, input, sizeof(float) * count); ((args*)kernel.arguments)->input = input; //((args*)kernel.arguments)->output = (float*)malloc(sizeof(float) * count); //memcpy(((args*)kernel.arguments)->output, output, sizeof(float)); ((args*)kernel.arguments)->output = output; ((args*)kernel.arguments)->count = (int *)malloc(sizeof(int)); memcpy(((args*)kernel.arguments)->count, &count, sizeof(int)); //no malloc needed for non pointers //clGetKernelWorkGroupInfo would get a local size optimal for a device, but is not needed here local = LOCAL; global = count; /* commands holds the "order" of devices kernel holds program, which holds variables offset not implemented */ err = clEnqueueNDRangeKernel(kernel, global, local); //Replaces clEnqueueReadBuffer, which takes one of the saved variables and puts it out to another one memcpy(results, output, sizeof(float) * count); //verifies that all values in results are actually squared correct = 0; for(int i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) { correct++; } } printf("Computed '%d/%d' correct values!\n", correct, count); //TODO: Think of using void * array instead of regular arguments to make freeing easier free(((args*)kernel.arguments)->input); free(((args*)kernel.arguments)->output); free(((args*)kernel.arguments)->count); /* free(input); free(output); */ free(arguments); return 0; } /*Note, pointers to global memory cannot be stored in local memory support for global, local, constant and group, local being the default private (local) - variable assignment, or malloc + memcpy global - pointer with memory location. For regular non pointer variables, make a pointer and assign it to it, then use the pointer Global variables can be declared in program source but they must use the "constant" address space qualifier and need to be initialized. You cannot have global variables that can be modified by kernels and where the modified values are persistent across work-groups and kernel executions. For this, you should use memory objects instead. Constant is just like global, but read only group - ??? */