source: CIVL/examples/opencl/2.14/reduce/reduce_float_kernel.cl@ a389857

main test-branch
Last change on this file since a389857 was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 6.8 KB
Line 
1//
2// File: reduce_float_kernel.cl
3//
4// Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
5// Reduce is a common data parallel primitive which can be used for variety
6// of different operations -- this example computes the global sum for a large
7// number of values, and includes kernels for integer and floating point vector
8// types.
9//
10// Version: <1.0>
11//
12// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
13// in consideration of your agreement to the following terms, and your use,
14// installation, modification or redistribution of this Apple software
15// constitutes acceptance of these terms. If you do not agree with these
16// terms, please do not use, install, modify or redistribute this Apple
17// software.
18//
19// In consideration of your agreement to abide by the following terms, and
20// subject to these terms, Apple grants you a personal, non - exclusive
21// license, under Apple's copyrights in this original Apple software ( the
22// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
23// Software, with or without modifications, in source and / or binary forms;
24// provided that if you redistribute the Apple Software in its entirety and
25// without modifications, you must retain this notice and the following text
26// and disclaimers in all such redistributions of the Apple Software. Neither
27// the name, trademarks, service marks or logos of Apple Inc. may be used to
28// endorse or promote products derived from the Apple Software without specific
29// prior written permission from Apple. Except as expressly stated in this
30// notice, no other rights or licenses, express or implied, are granted by
31// Apple herein, including but not limited to any patent rights that may be
32// infringed by your derivative works or by other works in which the Apple
33// Software may be incorporated.
34//
35// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
36// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
37// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
38// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
39// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
40//
41// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
42// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
45// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
46// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
47// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48//
49// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
50//
51////////////////////////////////////////////////////////////////////////////////////////////////////
52
53#ifndef GROUP_SIZE
54#define GROUP_SIZE (64)
55#endif
56
57#ifndef OPERATIONS
58#define OPERATIONS (1)
59#endif
60
61////////////////////////////////////////////////////////////////////////////////////////////////////
62
63#define LOAD_GLOBAL_F1(s, i) \
64 ((__global const float*)(s))[(size_t)(i)]
65
66#define STORE_GLOBAL_F1(s, i, v) \
67 ((__global float*)(s))[(size_t)(i)] = (v)
68
69////////////////////////////////////////////////////////////////////////////////////////////////////
70
71#define LOAD_LOCAL_F1(s, i) \
72 ((__local float*)(s))[(size_t)(i)]
73
74#define STORE_LOCAL_F1(s, i, v) \
75 ((__local float*)(s))[(size_t)(i)] = (v)
76
77#define ACCUM_LOCAL_F1(s, i, j) \
78{ \
79 float x = ((__local float*)(s))[(size_t)(i)]; \
80 float y = ((__local float*)(s))[(size_t)(j)]; \
81 ((__local float*)(s))[(size_t)(i)] = (x + y); \
82}
83
84////////////////////////////////////////////////////////////////////////////////////////////////////
85
86__kernel void
87reduce(
88 __global float *output,
89 __global const float *input,
90 __local float *shared,
91 const unsigned int n)
92{
93 const float zero = 0.0f;
94 const unsigned int group_id = get_global_id(0) / get_local_size(0);
95 const unsigned int group_size = GROUP_SIZE;
96 const unsigned int group_stride = 2 * group_size;
97 const size_t local_stride = group_stride * group_size;
98
99 unsigned int op = 0;
100 unsigned int last = OPERATIONS - 1;
101 for(op = 0; op < OPERATIONS; op++)
102 {
103 const unsigned int offset = (last - op);
104 const size_t local_id = get_local_id(0) + offset;
105
106 STORE_LOCAL_F1(shared, local_id, zero);
107
108 size_t i = group_id * group_stride + local_id;
109 while (i < n)
110 {
111 float a = LOAD_GLOBAL_F1(input, i);
112 float b = LOAD_GLOBAL_F1(input, i + group_size);
113 float s = LOAD_LOCAL_F1(shared, local_id);
114 STORE_LOCAL_F1(shared, local_id, (a + b + s));
115 i += local_stride;
116 }
117
118 barrier(CLK_LOCAL_MEM_FENCE);
119 #if (GROUP_SIZE >= 512)
120 if (local_id < 256) { ACCUM_LOCAL_F1(shared, local_id, local_id + 256); }
121 #endif
122
123 barrier(CLK_LOCAL_MEM_FENCE);
124 #if (GROUP_SIZE >= 256)
125 if (local_id < 128) { ACCUM_LOCAL_F1(shared, local_id, local_id + 128); }
126 #endif
127
128 barrier(CLK_LOCAL_MEM_FENCE);
129 #if (GROUP_SIZE >= 128)
130 if (local_id < 64) { ACCUM_LOCAL_F1(shared, local_id, local_id + 64); }
131 #endif
132
133 barrier(CLK_LOCAL_MEM_FENCE);
134 #if (GROUP_SIZE >= 64)
135 if (local_id < 32) { ACCUM_LOCAL_F1(shared, local_id, local_id + 32); }
136 #endif
137
138 barrier(CLK_LOCAL_MEM_FENCE);
139 #if (GROUP_SIZE >= 32)
140 if (local_id < 16) { ACCUM_LOCAL_F1(shared, local_id, local_id + 16); }
141 #endif
142
143 barrier(CLK_LOCAL_MEM_FENCE);
144 #if (GROUP_SIZE >= 16)
145 if (local_id < 8) { ACCUM_LOCAL_F1(shared, local_id, local_id + 8); }
146 #endif
147
148 barrier(CLK_LOCAL_MEM_FENCE);
149 #if (GROUP_SIZE >= 8)
150 if (local_id < 4) { ACCUM_LOCAL_F1(shared, local_id, local_id + 4); }
151 #endif
152
153 barrier(CLK_LOCAL_MEM_FENCE);
154 #if (GROUP_SIZE >= 4)
155 if (local_id < 2) { ACCUM_LOCAL_F1(shared, local_id, local_id + 2); }
156 #endif
157
158 barrier(CLK_LOCAL_MEM_FENCE);
159 #if (GROUP_SIZE >= 2)
160 if (local_id < 1) { ACCUM_LOCAL_F1(shared, local_id, local_id + 1); }
161 #endif
162
163 }
164
165 barrier(CLK_LOCAL_MEM_FENCE);
166 if (get_local_id(0) == 0)
167 {
168 float v = LOAD_LOCAL_F1(shared, 0);
169 STORE_GLOBAL_F1(output, group_id, v);
170 }
171}
Note: See TracBrowser for help on using the repository browser.