source: CIVL/examples/opencl/2.14/reduce/reduce.c

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 25.6 KB
RevLine 
[5693398]1/*
2 * Parallel reduction example from:
3 * https://developer.apple.com/library/mac/samplecode/OpenCL_Parallel_Reduction_Example/Introduction/Intro.html
4 */
5
6//
7// File: reduce.c
8//
9// Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
10// Reduce is a common data parallel primitive which can be used for variety
11// of different operations -- this example computes the global sum for a large
12// number of values, and includes kernels for integer and floating point vector
13// types.
14//
15// Version: <1.0>
16//
17// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
18// in consideration of your agreement to the following terms, and your use,
19// installation, modification or redistribution of this Apple software
20// constitutes acceptance of these terms. If you do not agree with these
21// terms, please do not use, install, modify or redistribute this Apple
22// software.
23//
24// In consideration of your agreement to abide by the following terms, and
25// subject to these terms, Apple grants you a personal, non - exclusive
26// license, under Apple's copyrights in this original Apple software ( the
27// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
28// Software, with or without modifications, in source and / or binary forms;
29// provided that if you redistribute the Apple Software in its entirety and
30// without modifications, you must retain this notice and the following text
31// and disclaimers in all such redistributions of the Apple Software. Neither
32// the name, trademarks, service marks or logos of Apple Inc. may be used to
33// endorse or promote products derived from the Apple Software without specific
34// prior written permission from Apple. Except as expressly stated in this
35// notice, no other rights or licenses, express or implied, are granted by
36// Apple herein, including but not limited to any patent rights that may be
37// infringed by your derivative works or by other works in which the Apple
38// Software may be incorporated.
39//
40// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
41// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
42// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
43// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
44// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
45//
46// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
47// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
48// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
49// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
50// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
51// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
52// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53//
54// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
55//
56////////////////////////////////////////////////////////////////////////////////////////////////////
57
58#include <libc.h>
59#include <stdbool.h>
60#include <sys/stat.h>
61#include <sys/types.h>
62#include <stdio.h>
63#include <stdlib.h>
64#include <mach/mach_time.h>
65#include <math.h>
66
67#include <OpenCL/opencl.h>
68
69////////////////////////////////////////////////////////////////////////////////////////////////////
70
71#define MIN_ERROR (1e-7)
72#define MAX_GROUPS (64)
73#define MAX_WORK_ITEMS (64)
74#define SEPARATOR ("----------------------------------------------------------------------\n")
75
76static int iterations = 1000;
77static int count = 1024 * 1024;
78static int channels = 1;
79static bool integer = true;
80
81////////////////////////////////////////////////////////////////////////////////////////////////////
82
83uint64_t
84current_time()
85{
86 return mach_absolute_time();
87}
88
89double
90subtract_time_in_seconds( uint64_t endtime, uint64_t starttime )
91{
92 static double conversion = 0.0;
93 uint64_t difference = endtime - starttime;
94 if( 0 == conversion )
95 {
96 mach_timebase_info_data_t timebase;
97 kern_return_t kError = mach_timebase_info( &timebase );
98 if( kError == 0 )
99 conversion = 1e-9 * (double) timebase.numer / (double) timebase.denom;
100 }
101
102 return conversion * (double) difference;
103}
104
105static char *
106load_program_source(const char *filename)
107{
108 struct stat statbuf;
109 FILE *fh;
110 char *source;
111
112 fh = fopen(filename, "r");
113 if (fh == 0)
114 return 0;
115
116 stat(filename, &statbuf);
117 source = (char *) malloc(statbuf.st_size + 1);
118 fread(source, statbuf.st_size, 1, fh);
119 source[statbuf.st_size] = '\0';
120
121 return source;
122}
123
124////////////////////////////////////////////////////////////////////////////////////////////////////
125
126void reduce_validate_float(float *data, int size, float * result)
127{
128 int i;
129 float sum = data[0];
130 float c = (float)0.0f;
131 for (i = 1; i < size; i++)
132 {
133 float y = data[i] - c;
134 float t = sum + y;
135 c = (t - sum) - y;
136 sum = t;
137 }
138 result[0] = sum;
139}
140
141void reduce_validate_float2(float *data, int size, float *result)
142{
143 int i;
144 float c[2] = { 0.0f, 0.0f };
145
146 result[0] = data[0*2+0];
147 result[1] = data[0*2+1];
148
149 for (i = 1; i < size; i++)
150 {
151 float y[2] = { data[i*2+0] - c[0], data[i*2+1] - c[1] };
152 float t[2] = { result[0] + y[0], result[1] + y[1] };
153
154 c[0] = (t[0] - result[0]) - y[0];
155 c[1] = (t[1] - result[1]) - y[1];
156
157 result[0] = t[0];
158 result[1] = t[1];
159 }
160}
161
162void reduce_validate_float4(float *data, int size, float *result)
163{
164 int i;
165 float c[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
166
167 result[0] = data[0*4+0];
168 result[1] = data[0*4+1];
169 result[2] = data[0*4+2];
170 result[3] = data[0*4+3];
171
172 for (i = 1; i < size; i++)
173 {
174 float y[4] = { data[i*4+0] - c[0], data[i*4+1] - c[1], data[i*4+2] - c[2], data[i*4+3] - c[3] };
175 float t[4] = { result[0] + y[0], result[1] + y[1], result[2] + y[2], result[3] + y[3] };
176
177 c[0] = (t[0] - result[0]) - y[0];
178 c[1] = (t[1] - result[1]) - y[1];
179 c[2] = (t[2] - result[2]) - y[2];
180 c[3] = (t[3] - result[3]) - y[3];
181
182 result[0] = t[0];
183 result[1] = t[1];
184 result[2] = t[2];
185 result[3] = t[3];
186 }
187}
188
189void reduce_validate_int(int *data, int size, int * result)
190{
191 int i;
192 int sum = data[0];
193 int c = (int)0.0f;
194 for (i = 1; i < size; i++)
195 {
196 int y = data[i] - c;
197 int t = sum + y;
198 c = (t - sum) - y;
199 sum = t;
200 }
201 result[0] = sum;
202}
203
204void reduce_validate_int2(int *data, int size, int *result)
205{
206 int i;
207 int c[2] = { 0.0f, 0.0f };
208
209 result[0] = data[0*2+0];
210 result[1] = data[0*2+1];
211
212 for (i = 1; i < size; i++)
213 {
214 int y[2] = { data[i*2+0] - c[0], data[i*2+1] - c[1] };
215 int t[2] = { result[0] + y[0], result[1] + y[1] };
216
217 c[0] = (t[0] - result[0]) - y[0];
218 c[1] = (t[1] - result[1]) - y[1];
219
220 result[0] = t[0];
221 result[1] = t[1];
222 }
223}
224
225void reduce_validate_int4(int *data, int size, int *result)
226{
227 int i;
228 int c[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
229
230 result[0] = data[0*4+0];
231 result[1] = data[0*4+1];
232 result[2] = data[0*4+2];
233 result[3] = data[0*4+3];
234
235 for (i = 1; i < size; i++)
236 {
237 int y[4] = { data[i*4+0] - c[0], data[i*4+1] - c[1], data[i*4+2] - c[2], data[i*4+3] - c[3] };
238 int t[4] = { result[0] + y[0], result[1] + y[1], result[2] + y[2], result[3] + y[3] };
239
240 c[0] = (t[0] - result[0]) - y[0];
241 c[1] = (t[1] - result[1]) - y[1];
242 c[2] = (t[2] - result[2]) - y[2];
243 c[3] = (t[3] - result[3]) - y[3];
244
245 result[0] = t[0];
246 result[1] = t[1];
247 result[2] = t[2];
248 result[3] = t[3];
249 }
250}
251
252////////////////////////////////////////////////////////////////////////////////////////////////////
253
254void create_reduction_pass_counts(
255 int count,
256 int max_group_size,
257 int max_groups,
258 int max_work_items,
259 int *pass_count,
260 size_t **group_counts,
261 size_t **work_item_counts,
262 int **operation_counts,
263 int **entry_counts)
264{
265 int work_items = (count < max_work_items * 2) ? count / 2 : max_work_items;
266 if(count < 1)
267 work_items = 1;
268
269 int groups = count / (work_items * 2);
270 groups = max_groups < groups ? max_groups : groups;
271
272 int max_levels = 1;
273 int s = groups;
274
275 while(s > 1)
276 {
277 int work_items = (s < max_work_items * 2) ? s / 2 : max_work_items;
278 s = s / (work_items*2);
279 max_levels++;
280 }
281
282 *group_counts = (size_t*)malloc(max_levels * sizeof(size_t));
283 *work_item_counts = (size_t*)malloc(max_levels * sizeof(size_t));
284 *operation_counts = (int*)malloc(max_levels * sizeof(int));
285 *entry_counts = (int*)malloc(max_levels * sizeof(int));
286
287 (*pass_count) = max_levels;
288 (*group_counts)[0] = groups;
289 (*work_item_counts)[0] = work_items;
290 (*operation_counts)[0] = 1;
291 (*entry_counts)[0] = count;
292 if(max_group_size < work_items)
293 {
294 (*operation_counts)[0] = work_items;
295 (*work_item_counts)[0] = max_group_size;
296 }
297
298 s = groups;
299 int level = 1;
300
301 while(s > 1)
302 {
303 int work_items = (s < max_work_items * 2) ? s / 2 : max_work_items;
304 int groups = s / (work_items * 2);
305 groups = (max_groups < groups) ? max_groups : groups;
306
307 (*group_counts)[level] = groups;
308 (*work_item_counts)[level] = work_items;
309 (*operation_counts)[level] = 1;
310 (*entry_counts)[level] = s;
311 if(max_group_size < work_items)
312 {
313 (*operation_counts)[level] = work_items;
314 (*work_item_counts)[level] = max_group_size;
315 }
316
317 s = s / (work_items*2);
318 level++;
319 }
320}
321
322/////////////////////////////////////////////////////////////////////////////
323
324int main(int argc, char **argv)
325{
326 uint64_t t1 = 0;
327 uint64_t t2 = 0;
328 int err;
329 cl_device_id device_id;
330 cl_command_queue commands;
331 cl_context context;
332 cl_mem output_buffer;
333 cl_mem input_buffer;
334 cl_mem partials_buffer;
335 size_t typesize;
336 int pass_count = 0;
337 size_t* group_counts = 0;
338 size_t* work_item_counts = 0;
339 int* operation_counts = 0;
340 int* entry_counts = 0;
341 int use_gpu = 1;
342
343 int i;
344 int c;
345
346 // Parse command line options
347 //
348 for( i = 0; i < argc && argv; i++)
349 {
350 if(!argv[i])
351 continue;
352
353 if(strstr(argv[i], "cpu"))
354 {
355 use_gpu = 0;
356 }
357 else if(strstr(argv[i], "gpu"))
358 {
359 use_gpu = 1;
360 }
361 else if(strstr(argv[i], "float2"))
362 {
363 integer = false;
364 channels = 2;
365 }
366 else if(strstr(argv[i], "float4"))
367 {
368 integer = false;
369 channels = 4;
370 }
371 else if(strstr(argv[i], "float"))
372 {
373 integer = false;
374 channels = 1;
375 }
376 else if(strstr(argv[i], "int2"))
377 {
378 integer = true;
379 channels = 2;
380 }
381 else if(strstr(argv[i], "int4"))
382 {
383 integer = true;
384 channels = 4;
385 }
386 else if(strstr(argv[i], "int"))
387 {
388 integer = true;
389 channels = 1;
390 }
391 }
392
393 // Create some random input data on the host
394 //
395 float *float_data = (float*)malloc(count * channels * sizeof(float));
396 int *integer_data = (int*)malloc(count * channels * sizeof(int));
397 for (i = 0; i < count * channels; i++)
398 {
399 float_data[i] = ((float) rand() / (float) RAND_MAX);
400 integer_data[i] = (int) (255.0f * float_data[i]);
401 }
402
403 // Connect to a compute device
404 //
405 err = clGetDeviceIDs(NULL, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
406 if (err != CL_SUCCESS)
407 {
408 printf("Error: Failed to locate a compute device!\n");
409 return EXIT_FAILURE;
410 }
411
412 size_t returned_size = 0;
413 size_t max_workgroup_size = 0;
414 err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size);
415 if (err != CL_SUCCESS)
416 {
417 printf("Error: Failed to retrieve device info!\n");
418 return EXIT_FAILURE;
419 }
420
421 cl_char vendor_name[1024] = {0};
422 cl_char device_name[1024] = {0};
423 err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
424 err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
425 if (err != CL_SUCCESS)
426 {
427 printf("Error: Failed to retrieve device info!\n");
428 return EXIT_FAILURE;
429 }
430
431 printf(SEPARATOR);
432 printf("Connecting to %s %s...\n", vendor_name, device_name);
433
434 // Load the compute program from disk into a cstring buffer
435 //
436 typesize = integer ? (sizeof(int)) : (sizeof(float));
437 const char* filename = 0;
438 switch(channels)
439 {
440 case 4:
441 filename = integer ? "reduce_int4_kernel.cl" : "reduce_float4_kernel.cl";
442 break;
443 case 2:
444 filename = integer ? "reduce_int2_kernel.cl" : "reduce_float2_kernel.cl";
445 break;
446 case 1:
447 filename = integer ? "reduce_int_kernel.cl" : "reduce_float_kernel.cl";
448 break;
449 default:
450 printf("Invalid channel count specified!\n");
451 return EXIT_FAILURE;
452 };
453
454 printf(SEPARATOR);
455 printf("Loading program '%s'...\n", filename);
456 printf(SEPARATOR);
457
458 char *source = load_program_source(filename);
459 if(!source)
460 {
461 printf("Error: Failed to load compute program from file!\n");
462 return EXIT_FAILURE;
463 }
464
465 // Create a compute context
466 //
467 context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
468 if (!context)
469 {
470 printf("Error: Failed to create a compute context!\n");
471 return EXIT_FAILURE;
472 }
473
474 // Create a command queue
475 //
476 commands = clCreateCommandQueue(context, device_id, 0, &err);
477 if (!commands)
478 {
479 printf("Error: Failed to create a command commands!\n");
480 return EXIT_FAILURE;
481 }
482
483 // Create the input buffer on the device
484 //
485 size_t buffer_size = typesize * count * channels;
486 input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
487 if (!input_buffer)
488 {
489 printf("Error: Failed to allocate input buffer on device!\n");
490 return EXIT_FAILURE;
491 }
492
493 // Fill the input buffer with the host allocated random data
494 //
495 void *input_data = (integer) ? (void*)integer_data : (void*)float_data;
496 err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL);
497 if (err != CL_SUCCESS)
498 {
499 printf("Error: Failed to write to source array!\n");
500 return EXIT_FAILURE;
501 }
502
503 // Create an intermediate data buffer for intra-level results
504 //
505 partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
506 if (!partials_buffer)
507 {
508 printf("Error: Failed to allocate partial sum buffer on device!\n");
509 return EXIT_FAILURE;
510 }
511
512 // Create the output buffer on the device
513 //
514 output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL);
515 if (!output_buffer)
516 {
517 printf("Error: Failed to allocate result buffer on device!\n");
518 return EXIT_FAILURE;
519 }
520
521 // Determine the reduction pass configuration for each level in the pyramid
522 //
523 create_reduction_pass_counts(
524 count, max_workgroup_size,
525 MAX_GROUPS, MAX_WORK_ITEMS,
526 &pass_count, &group_counts,
527 &work_item_counts, &operation_counts,
528 &entry_counts);
529
530 // Create specialized programs and kernels for each level of the reduction
531 //
532 cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program));
533 memset(programs, 0, pass_count * sizeof(cl_program));
534
535 cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel));
536 memset(kernels, 0, pass_count * sizeof(cl_kernel));
537
538 for(i = 0; i < pass_count; i++)
539 {
540 char *block_source = malloc(strlen(source) + 1024);
541 size_t source_length = strlen(source) + 1024;
542 memset(block_source, 0, source_length);
543
544 // Insert macro definitions to specialize the kernel to a particular group size
545 //
546 const char group_size_macro[] = "#define GROUP_SIZE";
547 const char operations_macro[] = "#define OPERATIONS";
548 sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n",
549 group_size_macro, (int)group_counts[i],
550 operations_macro, (int)operation_counts[i],
551 source);
552
553 // Create the compute program from the source buffer
554 //
555 programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err);
556 if (!programs[i] || err != CL_SUCCESS)
557 {
558 printf("%s\n", block_source);
559 printf("Error: Failed to create compute program!\n");
560 return EXIT_FAILURE;
561 }
562
563 // Build the program executable
564 //
565 err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL);
566 if (err != CL_SUCCESS)
567 {
568 size_t length;
569 char build_log[2048];
570 printf("%s\n", block_source);
571 printf("Error: Failed to build program executable!\n");
572 clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length);
573 printf("%s\n", build_log);
574 return EXIT_FAILURE;
575 }
576
577 // Create the compute kernel from within the program
578 //
579 kernels[i] = clCreateKernel(programs[i], "reduce", &err);
580 if (!kernels[i] || err != CL_SUCCESS)
581 {
582 printf("Error: Failed to create compute kernel!\n");
583 return EXIT_FAILURE;
584 }
585
586 free(block_source);
587 }
588
589 // Do the reduction for each level
590 //
591 cl_mem pass_swap;
592 cl_mem pass_input = output_buffer;
593 cl_mem pass_output = input_buffer;
594
595 for(i = 0; i < pass_count; i++)
596 {
597 size_t global = group_counts[i] * work_item_counts[i];
598 size_t local = work_item_counts[i];
599 unsigned int operations = operation_counts[i];
600 unsigned int entries = entry_counts[i];
601 size_t shared_size = typesize * channels * local * operations;
602
603 printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n", i,
604 (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries);
605
606 // Swap the inputs and outputs for each pass
607 //
608 pass_swap = pass_input;
609 pass_input = pass_output;
610 pass_output = pass_swap;
611
612 err = CL_SUCCESS;
613 err |= clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &pass_output);
614 err |= clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &pass_input);
615 err |= clSetKernelArg(kernels[i], 2, shared_size, NULL);
616 err |= clSetKernelArg(kernels[i], 3, sizeof(int), &entries);
617 if (err != CL_SUCCESS)
618 {
619 printf("Error: Failed to set kernel arguments!\n");
620 return EXIT_FAILURE;
621 }
622
623 // After the first pass, use the partial sums for the next input values
624 //
625 if(pass_input == input_buffer)
626 pass_input = partials_buffer;
627
628 err = CL_SUCCESS;
629 err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
630 if (err != CL_SUCCESS)
631 {
632 printf("Error: Failed to execute kernel!\n");
633 return EXIT_FAILURE;
634 }
635 }
636
637 err = clFinish(commands);
638 if (err != CL_SUCCESS)
639 {
640 printf("Error: Failed to wait for command queue to finish! %d\n", err);
641 return EXIT_FAILURE;
642 }
643
644 // Start the timing loop and execute the kernel over several iterations
645 //
646 printf(SEPARATOR);
647 printf("Timing %d iterations of reduction with %d elements of type %s%s...\n",
648 iterations, count, integer ? "int" : "float",
649 (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4");
650 printf(SEPARATOR);
651
652 int k;
653 err = CL_SUCCESS;
654 t1 = current_time();
655 for (k = 0 ; k < iterations; k++)
656 {
657 for(i = 0; i < pass_count; i++)
658 {
659 size_t global = group_counts[i] * work_item_counts[i];
660 size_t local = work_item_counts[i];
661
662 err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL);
663 if (err != CL_SUCCESS)
664 {
665 printf("Error: Failed to execute kernel!\n");
666 return EXIT_FAILURE;
667 }
668 }
669 }
670 err = clFinish(commands);
671 if (err != CL_SUCCESS)
672 {
673 printf("Error: Failed to wait for command queue to finish! %d\n", err);
674 return EXIT_FAILURE;
675 }
676 t2 = current_time();
677
678 // Calculate the statistics for execution time and throughput
679 //
680 double t = subtract_time_in_seconds(t2, t1);
681 printf("Exec Time: %.2f ms\n", 1000.0 * t / (double)(iterations));
682 printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t);
683 printf(SEPARATOR);
684
685 // Read back the results that were computed on the device
686 //
687 void *computed_result = malloc(typesize * channels);
688 memset(computed_result, 0, typesize * channels);
689 err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL);
690 if (err)
691 {
692 printf("Error: Failed to read back results from the device!\n");
693 return EXIT_FAILURE;
694 }
695
696
697 // Verify the results are correct
698 //
699 if(integer)
700 {
701 int reference[4] = { 0, 0, 0, 0};
702 switch(channels)
703 {
704 case 4:
705 reduce_validate_int4(integer_data, count, reference);
706 break;
707 case 2:
708 reduce_validate_int2(integer_data, count, reference);
709 break;
710 case 1:
711 reduce_validate_int(integer_data, count, reference);
712 break;
713 default:
714 printf("Invalid channel count specified!\n");
715 return EXIT_FAILURE;
716 }
717
718 int result[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
719 for(c = 0; c < channels; c++)
720 {
721 int v = ((int*) computed_result)[c];
722 result[c] += v;
723 }
724
725 float error = 0.0f;
726 float diff = 0.0f;
727 for(c = 0; c < channels; c++)
728 {
729 diff = fabs(reference[c] - result[c]);
730 error = diff > error ? diff : error;
731 }
732
733 if (error > MIN_ERROR)
734 {
735 for(c = 0; c < channels; c++)
736 printf("Result[%d] %d != %d\n", c, reference[c], result[c]);
737
738 printf("Error: Incorrect results obtained! Max error = %f\n", error);
739 return EXIT_FAILURE;
740 }
741 else
742 {
743 printf("Results Validated!\n");
744 printf(SEPARATOR);
745 }
746 }
747 else
748 {
749 float reference[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
750 switch(channels)
751 {
752 case 4:
753 reduce_validate_float4(float_data, count, reference);
754 break;
755 case 2:
756 reduce_validate_float2(float_data, count, reference);
757 break;
758 case 1:
759 reduce_validate_float(float_data, count, reference);
760 break;
761 default:
762 printf("Invalid channel count specified!\n");
763 return EXIT_FAILURE;
764 }
765
766 float result[4] = { 0.0f, 0.0f, 0.0f, 0.0f};
767 for(c = 0; c < channels; c++)
768 {
769 float v = ((float*) computed_result)[c];
770 result[c] += v;
771 }
772
773 float error = 0.0f;
774 float diff = 0.0f;
775 for(c = 0; c < channels; c++)
776 {
777 diff = fabs(reference[c] - result[c]);
778 error = diff > error ? diff : error;
779 }
780
781 if (error > MIN_ERROR)
782 {
783 for(c = 0; c < channels; c++)
784 printf("Result[%d] %f != %f\n", c, reference[c], result[c]);
785
786 printf("Error: Incorrect results obtained! Max error = %f\n", error);
787 return EXIT_FAILURE;
788 }
789 else
790 {
791 printf("Results Validated!\n");
792 printf(SEPARATOR);
793 }
794 }
795
796 // Shutdown and cleanup
797 //
798 for(i = 0; i < pass_count; i++)
799 {
800 clReleaseKernel(kernels[i]);
801 clReleaseProgram(programs[i]);
802 }
803
804 clReleaseMemObject(input_buffer);
805 clReleaseMemObject(output_buffer);
806 clReleaseMemObject(partials_buffer);
807 clReleaseCommandQueue(commands);
808 clReleaseContext(context);
809
810 free(group_counts);
811 free(work_item_counts);
812 free(operation_counts);
813 free(entry_counts);
814 free(computed_result);
815 free(kernels);
816 free(float_data);
817 free(integer_data);
818
819
820 return 0;
821}
822
Note: See TracBrowser for help on using the repository browser.