source: CIVL/examples/opencl/2.14/reduce/reduce_float2_kernel.cl

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 7.1 KB
Line 
1//
2// File: reduce_float2_kernel.cl
3//
4// Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
5// Reduce is a common data parallel primitive which can be used for variety
6// of different operations -- this example computes the global sum for a large
7// number of values, and includes kernels for integer and floating point vector
8// types.
9//
10// Version: <1.0>
11//
12// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
13// in consideration of your agreement to the following terms, and your use,
14// installation, modification or redistribution of this Apple software
15// constitutes acceptance of these terms. If you do not agree with these
16// terms, please do not use, install, modify or redistribute this Apple
17// software.
18//
19// In consideration of your agreement to abide by the following terms, and
20// subject to these terms, Apple grants you a personal, non - exclusive
21// license, under Apple's copyrights in this original Apple software ( the
22// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
23// Software, with or without modifications, in source and / or binary forms;
24// provided that if you redistribute the Apple Software in its entirety and
25// without modifications, you must retain this notice and the following text
26// and disclaimers in all such redistributions of the Apple Software. Neither
27// the name, trademarks, service marks or logos of Apple Inc. may be used to
28// endorse or promote products derived from the Apple Software without specific
29// prior written permission from Apple. Except as expressly stated in this
30// notice, no other rights or licenses, express or implied, are granted by
31// Apple herein, including but not limited to any patent rights that may be
32// infringed by your derivative works or by other works in which the Apple
33// Software may be incorporated.
34//
35// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
36// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
37// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
38// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
39// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
40//
41// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
42// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
45// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
46// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
47// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48//
49// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
50//
51////////////////////////////////////////////////////////////////////////////////////////////////////
52
53#ifndef GROUP_SIZE
54#define GROUP_SIZE (64)
55#endif
56
57#ifndef OPERATIONS
58#define OPERATIONS (1)
59#endif
60
61////////////////////////////////////////////////////////////////////////////////////////////////////
62
63#define LOAD_GLOBAL_F2(s, i) \
64 vload2((size_t)(i), (__global const float*)(s))
65
66#define STORE_GLOBAL_F2(s, i, v) \
67 vstore2((v), (size_t)(i), (__global float*)(s))
68
69////////////////////////////////////////////////////////////////////////////////////////////////////
70
71#define LOAD_LOCAL_F1(s, i) \
72 ((__local const float*)(s))[(size_t)(i)]
73
74#define STORE_LOCAL_F1(s, i, v) \
75 ((__local float*)(s))[(size_t)(i)] = (v)
76
77#define LOAD_LOCAL_F2(s, i) \
78 (float2)( (LOAD_LOCAL_F1(s, i)), \
79 (LOAD_LOCAL_F1(s, i + GROUP_SIZE)))
80
81#define STORE_LOCAL_F2(s, i, v) \
82 STORE_LOCAL_F1(s, i, (v)[0]); \
83 STORE_LOCAL_F1(s, i + GROUP_SIZE, (v)[1])
84
85#define ACCUM_LOCAL_F2(s, i, j) \
86{ \
87 float2 x = LOAD_LOCAL_F2(s, i); \
88 float2 y = LOAD_LOCAL_F2(s, j); \
89 float2 xy = (x + y); \
90 STORE_LOCAL_F2(s, i, xy); \
91}
92
93////////////////////////////////////////////////////////////////////////////////////////////////////
94
95__kernel void
96reduce(
97 __global float2 *output,
98 __global const float2 *input,
99 __local float2 *shared,
100 const unsigned int n)
101{
102 const float2 zero = (float2)(0.0f, 0.0f);
103 const unsigned int group_id = get_global_id(0) / get_local_size(0);
104 const unsigned int group_size = GROUP_SIZE;
105 const unsigned int group_stride = 2 * group_size;
106 const size_t local_stride = group_stride * group_size;
107
108 unsigned int op = 0;
109 unsigned int last = OPERATIONS - 1;
110 for(op = 0; op < OPERATIONS; op++)
111 {
112 const unsigned int offset = (last - op);
113 const size_t local_id = get_local_id(0) + offset;
114
115 STORE_LOCAL_F2(shared, local_id, zero);
116
117 size_t i = group_id * group_stride + local_id;
118 while (i < n)
119 {
120 float2 a = LOAD_GLOBAL_F2(input, i);
121 float2 b = LOAD_GLOBAL_F2(input, i + group_size);
122 float2 s = LOAD_LOCAL_F2(shared, local_id);
123 STORE_LOCAL_F2(shared, local_id, (a + b + s));
124 i += local_stride;
125 }
126
127 barrier(CLK_LOCAL_MEM_FENCE);
128 #if (GROUP_SIZE >= 512)
129 if (local_id < 256) { ACCUM_LOCAL_F2(shared, local_id, local_id + 256); }
130 #endif
131
132 barrier(CLK_LOCAL_MEM_FENCE);
133 #if (GROUP_SIZE >= 256)
134 if (local_id < 128) { ACCUM_LOCAL_F2(shared, local_id, local_id + 128); }
135 #endif
136
137 barrier(CLK_LOCAL_MEM_FENCE);
138 #if (GROUP_SIZE >= 128)
139 if (local_id < 64) { ACCUM_LOCAL_F2(shared, local_id, local_id + 64); }
140 #endif
141
142 barrier(CLK_LOCAL_MEM_FENCE);
143 #if (GROUP_SIZE >= 64)
144 if (local_id < 32) { ACCUM_LOCAL_F2(shared, local_id, local_id + 32); }
145 #endif
146
147 barrier(CLK_LOCAL_MEM_FENCE);
148 #if (GROUP_SIZE >= 32)
149 if (local_id < 16) { ACCUM_LOCAL_F2(shared, local_id, local_id + 16); }
150 #endif
151
152 barrier(CLK_LOCAL_MEM_FENCE);
153 #if (GROUP_SIZE >= 16)
154 if (local_id < 8) { ACCUM_LOCAL_F2(shared, local_id, local_id + 8); }
155 #endif
156
157 barrier(CLK_LOCAL_MEM_FENCE);
158 #if (GROUP_SIZE >= 8)
159 if (local_id < 4) { ACCUM_LOCAL_F2(shared, local_id, local_id + 4); }
160 #endif
161
162 barrier(CLK_LOCAL_MEM_FENCE);
163 #if (GROUP_SIZE >= 4)
164 if (local_id < 2) { ACCUM_LOCAL_F2(shared, local_id, local_id + 2); }
165 #endif
166
167 barrier(CLK_LOCAL_MEM_FENCE);
168 #if (GROUP_SIZE >= 2)
169 if (local_id < 1) { ACCUM_LOCAL_F2(shared, local_id, local_id + 1); }
170 #endif
171
172 }
173
174 barrier(CLK_LOCAL_MEM_FENCE);
175 if (get_local_id(0) == 0)
176 {
177 float2 v = LOAD_LOCAL_F2(shared, 0);
178 STORE_GLOBAL_F2(output, group_id, v);
179 }
180}
Note: See TracBrowser for help on using the repository browser.