source: CIVL/examples/opencl/2.15/reduceCL.cvl@ 41340c1

1.23 2.0 main test-branch
Last change on this file since 41340c1 was 6317abc, checked in by Ziqing Luo <ziqing@…>, 11 years ago

renaming opencl version files
move div0 to arithmetic

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1768 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 9.2 KB
Line 
1
2
3#include "cl.cvl"
4#include <stdio.h>
5#include <stdlib.h>
6#include <math.h>
7#include <string.h>
8#include <civlc.h>
9
10int numArgs = 4;
11
12#include "reduce_int_kernel.cvl"
13
14$input int MAX_WORKGROUP_SIZE;
15
16/*
17 __global int *output,
18 __global const int *input,
19 __local int *shared,
20 const unsigned int n
21*/
22typedef struct
23{
24 void * param[numArgs];
25 int mallocflag[numArgs];
26}args;
27
28cl_kernel clCreateKernel(args * argument, char * function)
29{
30 cl_kernel kernel;
31 kernel.arguments = argument;
32 kernel.method = function;
33
34 for(int j = 0; j < numArgs; j++)
35 {
36 ((args*)kernel.arguments)->mallocflag[j] = 0;
37 }
38
39 return kernel;
40}
41
42void clReleaseKernel(cl_kernel kernel)
43{
44 for (int i = 0; i < numArgs; i++)
45 {
46 //printf("I am argument %d with value %d\n", i, ((args*)kernel.arguments)->mallocflag[i]);
47 if (((args*)kernel.arguments)->mallocflag[i] == 1)
48 {
49 //printf("and I pass the flag check\n");
50 free(((args*)kernel.arguments)->param[i]);
51 }
52 }
53}
54
55void workfunc(size_t local, size_t global, cl_kernel param)
56{
57 $proc procs[local];
58 char * reduceChar = "reduce";
59 for(int i = local * param.workgroup; i < local * param.workgroup + local; i++)
60 {
61 int n = *(int*)(((args*)param.arguments)->param[3]);
62 param.local_id = i % local;
63 param.global_id = i;
64 printf("My workgroup id is %d, my global id is %d, my local id is %d, and my method is %s\n", param.workgroup, param.global_id, param.local_id, param.method);
65
66
67 if(strcmp(param.method, reduceChar) == 0)
68 {
69 procs[param.local_id] = $spawn reduce(param.workgroup, param.global_id, param.local_id, ((args*)param.arguments)->param[0], ((args*)param.arguments)->param[1], ((args*)param.arguments)->param[2], n);
70 }
71 }
72 for(int j = 0; j < local; j++)
73 {
74 $wait(procs[j]);
75 }
76}
77
78int clEnqueueNDRangeKernel(cl_kernel kernel, size_t global, size_t local)
79{
80
81 $assert(global % local == 0);
82 int numworkgroups = global/local;
83 cl_kernel param[numworkgroups];
84 $proc procs[numworkgroups];
85 //consider $parfor
86
87 /*
88 $domain(1) dom = {0 .. numworkgroups - 1};
89
90 $for(int i: dom)
91 {
92 param[i] = kernel;
93 param[i].workgroup = i;
94 }
95 $parfor(int i: dom)
96 {
97 workfunc(local, global, param[i]);
98 }
99 */
100
101 for(int i = 0; i < global/local; i++)
102 {
103 param[i] = kernel;
104 param[i].workgroup = i;
105 procs[i] = $spawn workfunc(local, global, param[i]);
106 }
107
108 //this part here is the new clFinish(commands);
109 for(int i = 0; i < global/local; i++)
110 {
111 $wait(procs[i]);
112 }
113
114 return CL_SUCCESS;
115
116}
117
118#define MIN_ERROR (1e-7)
119#define MAX_GROUPS (64)
120#define MAX_WORK_ITEMS (64)
121#define SEPARATOR ("----------------------------------------------------------------------\n")
122
123//count is too large, was 1024 * 1024
124static int iterations = 1000;
125static int count = 4 * 4;
126static int channels = 1;
127static bool integer = true;
128
129
130void reduce_validate_int(int *data, int size, int * result)
131{
132 int i;
133 int sum = data[0];
134 int c = (int)0.0f;
135 for (i = 1; i < size; i++)
136 {
137 int y = data[i] - c;
138 int t = sum + y;
139 c = (t - sum) - y;
140 sum = t;
141 }
142 result[0] = sum;
143}
144
145void create_reduction_pass_counts(
146 int count,
147 int max_group_size,
148 int max_groups,
149 int max_work_items,
150 int *pass_count,
151 size_t **group_counts,
152 size_t **work_item_counts,
153 int **operation_counts,
154 int **entry_counts)
155{
156 int work_items = (count < max_work_items * 2) ? count / 2 : max_work_items;
157 if(count < 1)
158 work_items = 1;
159
160 int groups = count / (work_items * 2);
161 groups = max_groups < groups ? max_groups : groups;
162
163 int max_levels = 1;
164 int s = groups;
165
166 while(s > 1)
167 {
168 int work_items = (s < max_work_items * 2) ? s / 2 : max_work_items;
169 s = s / (work_items*2);
170 max_levels++;
171 }
172
173 *group_counts = (size_t*)malloc(max_levels * sizeof(size_t));
174 *work_item_counts = (size_t*)malloc(max_levels * sizeof(size_t));
175 *operation_counts = (int*)malloc(max_levels * sizeof(int));
176 *entry_counts = (int*)malloc(max_levels * sizeof(int));
177
178 (*pass_count) = max_levels;
179 (*group_counts)[0] = groups;
180 (*work_item_counts)[0] = work_items;
181 (*operation_counts)[0] = 1;
182 (*entry_counts)[0] = count;
183 if(max_group_size < work_items)
184 {
185 (*operation_counts)[0] = work_items;
186 (*work_item_counts)[0] = max_group_size;
187 }
188
189 s = groups;
190 int level = 1;
191
192 while(s > 1)
193 {
194 int work_items = (s < max_work_items * 2) ? s / 2 : max_work_items;
195 int groups = s / (work_items * 2);
196 groups = (max_groups < groups) ? max_groups : groups;
197
198 (*group_counts)[level] = groups;
199 (*work_item_counts)[level] = work_items;
200 (*operation_counts)[level] = 1;
201 (*entry_counts)[level] = s;
202 if(max_group_size < work_items)
203 {
204 (*operation_counts)[level] = work_items;
205 (*work_item_counts)[level] = max_group_size;
206 }
207
208 s = s / (work_items*2);
209 level++;
210 }
211}
212
213
214int main(int argc, char** argv)
215{
216
217 int t1 = 0;
218 int t2 = 0;
219 int err;
220 cl_device_id device_id;
221 cl_command_queue commands;
222 cl_context context;
223 /*
224 cl_mem output_buffer;
225 cl_mem input_buffer;
226 cl_mem partials_buffer;
227 */
228 void * output_buffer;
229 void * input_buffer;
230 void * partials_buffer;
231 size_t typesize;
232 int pass_count = 0;
233 size_t* group_counts = 0;
234 size_t* work_item_counts = 0;
235 int* operation_counts = 0;
236 int* entry_counts = 0;
237 int use_gpu = 1;
238
239 int i;
240 int c;
241
242 float *float_data = (float*)malloc(count * channels * sizeof(float));
243 int *integer_data = (int*)malloc(count * channels * sizeof(int));
244
245 //Cannot use strstr
246 use_gpu = 0;
247 integer = true;
248 channels = 1;
249
250 //can use random, but cannot check numbers
251 for (i = 0; i < count * channels; i++)
252 {
253 float_data[i] = ((float) rand() / (float) RAND_MAX);
254 integer_data[i] = (int) (255.0f * float_data[i]);
255 }
256
257 size_t returned_size = 0;
258 size_t max_workgroup_size = 0;
259 //clGetDeviceInfo happens here
260 max_workgroup_size = MAX_WORKGROUP_SIZE;
261
262 printf(SEPARATOR);
263 typesize = integer ? (sizeof(int)) : (sizeof(float));
264
265
266 size_t buffer_size = typesize * count * channels;
267 input_buffer = (int*)malloc(buffer_size);
268
269 void *input_data = (integer) ? (void*)integer_data : (void*)float_data;
270
271 //clEnqueueWriteBuffer
272 memcpy(input_buffer, input_data, buffer_size);
273
274 //clCreateBuffer
275 partials_buffer = (int*)malloc(buffer_size);
276 output_buffer = (int*)malloc(buffer_size);
277
278 create_reduction_pass_counts(
279 count, max_workgroup_size,
280 MAX_GROUPS, MAX_WORK_ITEMS,
281 &pass_count, &group_counts,
282 &work_item_counts, &operation_counts,
283 &entry_counts);
284
285 args * arguments;
286 arguments = (args*)malloc(sizeof(args) * pass_count);
287
288 cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
289 //memset(kernels, 0, pass_count * sizeof(cl_kernel));
290 //no memset
291
292 for(i = 0; i < pass_count; i++)
293 {
294 kernels[i] = clCreateKernel(arguments+i, "reduce");
295 }
296
297 void * pass_swap;
298 void * pass_input = output_buffer;
299 void * pass_output = input_buffer;
300
301 for(i = 0; i < pass_count; i++)
302 {
303 size_t global = group_counts[i] * work_item_counts[i];
304 size_t local = work_item_counts[i];
305 unsigned int operations = operation_counts[i];
306 unsigned int entries = entry_counts[i];
307 size_t shared_size = typesize * channels * local * operations;
308
309 printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n", i,
310 (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);
311
312 pass_swap = pass_input;
313 pass_input = pass_output;
314 pass_output = pass_swap;
315/*
316 __global int *output,
317 __global const int *input,
318 __local int *shared,
319 const unsigned int n
320*/
321 ((args*)kernels[i].arguments)->param[0] = pass_output;
322 ((args*)kernels[i].arguments)->param[1] = pass_input;
323
324 ((args*)kernels[i].arguments)->param[2] = (int*)malloc(shared_size);
325
326
327 ((args*)kernels[i].arguments)->param[3] = (int*)malloc(sizeof(int));
328 memcpy(((args*)kernels[i].arguments)->param[3], &entries,sizeof(int));
329 ((args*)kernels[i].arguments)->mallocflag[3] = 1;
330
331 if(pass_input == input_buffer)
332 {
333 pass_input = partials_buffer;
334 }
335 clEnqueueNDRangeKernel(kernels[i], global, local);
336 }
337 /*
338 for (int k = 0 ; k < iterations; k++)
339 {
340 for(int i = 0; i < pass_count; i++)
341 {
342 size_t global = group_counts[i] * work_item_counts[i];
343 size_t local = work_item_counts[i];
344
345 clEnqueueNDRangeKernel(kernels[i], global, local);
346
347 }
348 }
349 */
350 for(i = 0; i < pass_count; i++)
351 {
352 clReleaseKernel(kernels[i]);
353 }
354
355 free(arguments);
356
357 free(output_buffer);
358 free(partials_buffer);
359 free(input_buffer);
360
361 free(group_counts);
362 free(work_item_counts);
363 free(operation_counts);
364 free(entry_counts);
365 //free(computed_result);
366 free(kernels);
367 free(float_data);
368 free(integer_data);
369
370 return 0;
371}
372
Note: See TracBrowser for help on using the repository browser.