source: CIVL/examples/opencl/2.15/reduceCL.cvl

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 9.2 KB
Line 
1#include <civlc.cvh>
2
3
4#include "cl.cvl"
5#include <stdio.h>
6#include <stdlib.h>
7#include <math.h>
8#include <string.h>
9#include <civlc.h>
10
11int numArgs = 4;
12
13#include "reduce_int_kernel.cvl"
14
15$input int MAX_WORKGROUP_SIZE;
16
17/*
18 __global int *output,
19 __global const int *input,
20 __local int *shared,
21 const unsigned int n
22*/
23typedef struct
24{
25 void * param[numArgs];
26 int mallocflag[numArgs];
27}args;
28
29cl_kernel clCreateKernel(args * argument, char * function)
30{
31 cl_kernel kernel;
32 kernel.arguments = argument;
33 kernel.method = function;
34
35 for(int j = 0; j < numArgs; j++)
36 {
37 ((args*)kernel.arguments)->mallocflag[j] = 0;
38 }
39
40 return kernel;
41}
42
43void clReleaseKernel(cl_kernel kernel)
44{
45 for (int i = 0; i < numArgs; i++)
46 {
47 //printf("I am argument %d with value %d\n", i, ((args*)kernel.arguments)->mallocflag[i]);
48 if (((args*)kernel.arguments)->mallocflag[i] == 1)
49 {
50 //printf("and I pass the flag check\n");
51 free(((args*)kernel.arguments)->param[i]);
52 }
53 }
54}
55
56void workfunc(size_t local, size_t global, cl_kernel param)
57{
58 $proc procs[local];
59 char * reduceChar = "reduce";
60 for(int i = local * param.workgroup; i < local * param.workgroup + local; i++)
61 {
62 int n = *(int*)(((args*)param.arguments)->param[3]);
63 param.local_id = i % local;
64 param.global_id = i;
65 printf("My workgroup id is %d, my global id is %d, my local id is %d, and my method is %s\n", param.workgroup, param.global_id, param.local_id, param.method);
66
67
68 if(strcmp(param.method, reduceChar) == 0)
69 {
70 procs[param.local_id] = $spawn reduce(param.workgroup, param.global_id, param.local_id, ((args*)param.arguments)->param[0], ((args*)param.arguments)->param[1], ((args*)param.arguments)->param[2], n);
71 }
72 }
73 for(int j = 0; j < local; j++)
74 {
75 $wait(procs[j]);
76 }
77}
78
79int clEnqueueNDRangeKernel(cl_kernel kernel, size_t global, size_t local)
80{
81
82 $assert((global % local == 0));
83 int numworkgroups = global/local;
84 cl_kernel param[numworkgroups];
85 $proc procs[numworkgroups];
86 //consider $parfor
87
88 /*
89 $domain(1) dom = {0 .. numworkgroups - 1};
90
91 $for(int i: dom)
92 {
93 param[i] = kernel;
94 param[i].workgroup = i;
95 }
96 $parfor(int i: dom)
97 {
98 workfunc(local, global, param[i]);
99 }
100 */
101
102 for(int i = 0; i < global/local; i++)
103 {
104 param[i] = kernel;
105 param[i].workgroup = i;
106 procs[i] = $spawn workfunc(local, global, param[i]);
107 }
108
109 //this part here is the new clFinish(commands);
110 for(int i = 0; i < global/local; i++)
111 {
112 $wait(procs[i]);
113 }
114
115 return CL_SUCCESS;
116
117}
118
119#define MIN_ERROR (1e-7)
120#define MAX_GROUPS (64)
121#define MAX_WORK_ITEMS (64)
122#define SEPARATOR ("----------------------------------------------------------------------\n")
123
124//count is too large, was 1024 * 1024
125static int iterations = 1000;
126static int count = 4 * 4;
127static int channels = 1;
128static bool integer = true;
129
130
131void reduce_validate_int(int *data, int size, int * result)
132{
133 int i;
134 int sum = data[0];
135 int c = (int)0.0f;
136 for (i = 1; i < size; i++)
137 {
138 int y = data[i] - c;
139 int t = sum + y;
140 c = (t - sum) - y;
141 sum = t;
142 }
143 result[0] = sum;
144}
145
146void create_reduction_pass_counts(
147 int count,
148 int max_group_size,
149 int max_groups,
150 int max_work_items,
151 int *pass_count,
152 size_t **group_counts,
153 size_t **work_item_counts,
154 int **operation_counts,
155 int **entry_counts)
156{
157 int work_items = (count < max_work_items * 2) ? count / 2 : max_work_items;
158 if(count < 1)
159 work_items = 1;
160
161 int groups = count / (work_items * 2);
162 groups = max_groups < groups ? max_groups : groups;
163
164 int max_levels = 1;
165 int s = groups;
166
167 while(s > 1)
168 {
169 int work_items = (s < max_work_items * 2) ? s / 2 : max_work_items;
170 s = s / (work_items*2);
171 max_levels++;
172 }
173
174 *group_counts = (size_t*)malloc(max_levels * sizeof(size_t));
175 *work_item_counts = (size_t*)malloc(max_levels * sizeof(size_t));
176 *operation_counts = (int*)malloc(max_levels * sizeof(int));
177 *entry_counts = (int*)malloc(max_levels * sizeof(int));
178
179 (*pass_count) = max_levels;
180 (*group_counts)[0] = groups;
181 (*work_item_counts)[0] = work_items;
182 (*operation_counts)[0] = 1;
183 (*entry_counts)[0] = count;
184 if(max_group_size < work_items)
185 {
186 (*operation_counts)[0] = work_items;
187 (*work_item_counts)[0] = max_group_size;
188 }
189
190 s = groups;
191 int level = 1;
192
193 while(s > 1)
194 {
195 int work_items = (s < max_work_items * 2) ? s / 2 : max_work_items;
196 int groups = s / (work_items * 2);
197 groups = (max_groups < groups) ? max_groups : groups;
198
199 (*group_counts)[level] = groups;
200 (*work_item_counts)[level] = work_items;
201 (*operation_counts)[level] = 1;
202 (*entry_counts)[level] = s;
203 if(max_group_size < work_items)
204 {
205 (*operation_counts)[level] = work_items;
206 (*work_item_counts)[level] = max_group_size;
207 }
208
209 s = s / (work_items*2);
210 level++;
211 }
212}
213
214
215int main(int argc, char** argv)
216{
217
218 int t1 = 0;
219 int t2 = 0;
220 int err;
221 cl_device_id device_id;
222 cl_command_queue commands;
223 cl_context context;
224 /*
225 cl_mem output_buffer;
226 cl_mem input_buffer;
227 cl_mem partials_buffer;
228 */
229 void * output_buffer;
230 void * input_buffer;
231 void * partials_buffer;
232 size_t typesize;
233 int pass_count = 0;
234 size_t* group_counts = 0;
235 size_t* work_item_counts = 0;
236 int* operation_counts = 0;
237 int* entry_counts = 0;
238 int use_gpu = 1;
239
240 int i;
241 int c;
242
243 float *float_data = (float*)malloc(count * channels * sizeof(float));
244 int *integer_data = (int*)malloc(count * channels * sizeof(int));
245
246 //Cannot use strstr
247 use_gpu = 0;
248 integer = true;
249 channels = 1;
250
251 //can use random, but cannot check numbers
252 for (i = 0; i < count * channels; i++)
253 {
254 float_data[i] = ((float) rand() / (float) RAND_MAX);
255 integer_data[i] = (int) (255.0f * float_data[i]);
256 }
257
258 size_t returned_size = 0;
259 size_t max_workgroup_size = 0;
260 //clGetDeviceInfo happens here
261 max_workgroup_size = MAX_WORKGROUP_SIZE;
262
263 printf(SEPARATOR);
264 typesize = integer ? (sizeof(int)) : (sizeof(float));
265
266
267 size_t buffer_size = typesize * count * channels;
268 input_buffer = (int*)malloc(buffer_size);
269
270 void *input_data = (integer) ? (void*)integer_data : (void*)float_data;
271
272 //clEnqueueWriteBuffer
273 memcpy(input_buffer, input_data, buffer_size);
274
275 //clCreateBuffer
276 partials_buffer = (int*)malloc(buffer_size);
277 output_buffer = (int*)malloc(buffer_size);
278
279 create_reduction_pass_counts(
280 count, max_workgroup_size,
281 MAX_GROUPS, MAX_WORK_ITEMS,
282 &pass_count, &group_counts,
283 &work_item_counts, &operation_counts,
284 &entry_counts);
285
286 args * arguments;
287 arguments = (args*)malloc(sizeof(args) * pass_count);
288
289 cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
290 //memset(kernels, 0, pass_count * sizeof(cl_kernel));
291 //no memset
292
293 for(i = 0; i < pass_count; i++)
294 {
295 kernels[i] = clCreateKernel(arguments+i, "reduce");
296 }
297
298 void * pass_swap;
299 void * pass_input = output_buffer;
300 void * pass_output = input_buffer;
301
302 for(i = 0; i < pass_count; i++)
303 {
304 size_t global = group_counts[i] * work_item_counts[i];
305 size_t local = work_item_counts[i];
306 unsigned int operations = operation_counts[i];
307 unsigned int entries = entry_counts[i];
308 size_t shared_size = typesize * channels * local * operations;
309
310 printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n", i,
311 (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);
312
313 pass_swap = pass_input;
314 pass_input = pass_output;
315 pass_output = pass_swap;
316/*
317 __global int *output,
318 __global const int *input,
319 __local int *shared,
320 const unsigned int n
321*/
322 ((args*)kernels[i].arguments)->param[0] = pass_output;
323 ((args*)kernels[i].arguments)->param[1] = pass_input;
324
325 ((args*)kernels[i].arguments)->param[2] = (int*)malloc(shared_size);
326
327
328 ((args*)kernels[i].arguments)->param[3] = (int*)malloc(sizeof(int));
329 memcpy(((args*)kernels[i].arguments)->param[3], &entries,sizeof(int));
330 ((args*)kernels[i].arguments)->mallocflag[3] = 1;
331
332 if(pass_input == input_buffer)
333 {
334 pass_input = partials_buffer;
335 }
336 clEnqueueNDRangeKernel(kernels[i], global, local);
337 }
338 /*
339 for (int k = 0 ; k < iterations; k++)
340 {
341 for(int i = 0; i < pass_count; i++)
342 {
343 size_t global = group_counts[i] * work_item_counts[i];
344 size_t local = work_item_counts[i];
345
346 clEnqueueNDRangeKernel(kernels[i], global, local);
347
348 }
349 }
350 */
351 for(i = 0; i < pass_count; i++)
352 {
353 clReleaseKernel(kernels[i]);
354 }
355
356 free(arguments);
357
358 free(output_buffer);
359 free(partials_buffer);
360 free(input_buffer);
361
362 free(group_counts);
363 free(work_item_counts);
364 free(operation_counts);
365 free(entry_counts);
366 //free(computed_result);
367 free(kernels);
368 free(float_data);
369 free(integer_data);
370
371 return 0;
372}
373
Note: See TracBrowser for help on using the repository browser.