source: CIVL/examples/opencl/2.14/reduce/reduce_int4_kernel.cl

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 7.5 KB
Line 
1//
2// File: reduce_int4_kernel.cl
3//
4// Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
5// Reduce is a common data parallel primitive which can be used for variety
6// of different operations -- this example computes the global sum for a large
7// number of values, and includes kernels for integer and floating point vector
8// types.
9//
10// Version: <1.0>
11//
12// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
13// in consideration of your agreement to the following terms, and your use,
14// installation, modification or redistribution of this Apple software
15// constitutes acceptance of these terms. If you do not agree with these
16// terms, please do not use, install, modify or redistribute this Apple
17// software.
18//
19// In consideration of your agreement to abide by the following terms, and
20// subject to these terms, Apple grants you a personal, non - exclusive
21// license, under Apple's copyrights in this original Apple software ( the
22// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
23// Software, with or without modifications, in source and / or binary forms;
24// provided that if you redistribute the Apple Software in its entirety and
25// without modifications, you must retain this notice and the following text
26// and disclaimers in all such redistributions of the Apple Software. Neither
27// the name, trademarks, service marks or logos of Apple Inc. may be used to
28// endorse or promote products derived from the Apple Software without specific
29// prior written permission from Apple. Except as expressly stated in this
30// notice, no other rights or licenses, express or implied, are granted by
31// Apple herein, including but not limited to any patent rights that may be
32// infringed by your derivative works or by other works in which the Apple
33// Software may be incorporated.
34//
35// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
36// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
37// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
38// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
39// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
40//
41// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
42// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
43// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
44// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
45// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
46// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
47// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48//
49// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
50//
51////////////////////////////////////////////////////////////////////////////////////////////////////
52
53#ifndef GROUP_SIZE
54#define GROUP_SIZE (64)
55#endif
56
57#ifndef OPERATIONS
58#define OPERATIONS (1)
59#endif
60
61////////////////////////////////////////////////////////////////////////////////////////////////////
62
63#define GROUP_SIZE_2 (GROUP_SIZE + GROUP_SIZE)
64#define GROUP_SIZE_3 (GROUP_SIZE_2 + GROUP_SIZE)
65
66////////////////////////////////////////////////////////////////////////////////////////////////////
67
68#define LOAD_GLOBAL_I4(s, i) \
69 vload4((size_t)(i), (__global const int*)(s))
70
71#define STORE_GLOBAL_I4(s, i, v) \
72 vstore4((v), (size_t)(i), (__global int*)(s))
73
74////////////////////////////////////////////////////////////////////////////////////////////////////
75
76#define LOAD_LOCAL_I1(s, i) \
77 ((__local const int*)(s))[(size_t)(i)]
78
79#define STORE_LOCAL_I1(s, i, v) \
80 ((__local int*)(s))[(size_t)(i)] = (v)
81
82#define LOAD_LOCAL_I4(s, i) \
83 (int4)( (LOAD_LOCAL_I1(s, i )), \
84 (LOAD_LOCAL_I1(s, i + GROUP_SIZE )), \
85 (LOAD_LOCAL_I1(s, i + GROUP_SIZE_2)), \
86 (LOAD_LOCAL_I1(s, i + GROUP_SIZE_3)))
87
88#define STORE_LOCAL_I4(s, i, v) \
89 STORE_LOCAL_I1(s, i, (v)[0]); \
90 STORE_LOCAL_I1(s, i + GROUP_SIZE, (v)[1]); \
91 STORE_LOCAL_I1(s, i + GROUP_SIZE_2, (v)[2]); \
92 STORE_LOCAL_I1(s, i + GROUP_SIZE_3, (v)[3])
93
94#define ACCUM_LOCAL_I4(s, i, j) \
95{ \
96 int4 x = LOAD_LOCAL_I4(s, i); \
97 int4 y = LOAD_LOCAL_I4(s, j); \
98 int4 xy = x + y; \
99 STORE_LOCAL_I4(s, i, xy); \
100}
101////////////////////////////////////////////////////////////////////////////////////////////////////
102
103__kernel void
104reduce(
105 __global int4 *output,
106 __global const int4 *input,
107 __local int4 *shared,
108 const unsigned int n)
109{
110 const int4 zero = (int4)(0.0f, 0.0f, 0.0f, 0.0f);
111 const unsigned int group_id = get_global_id(0) / get_local_size(0);
112 const unsigned int group_size = GROUP_SIZE;
113 const unsigned int group_stride = 2 * group_size;
114 const size_t local_stride = group_stride * group_size;
115
116 unsigned int op = 0;
117 unsigned int last = OPERATIONS - 1;
118 for(op = 0; op < OPERATIONS; op++)
119 {
120 const unsigned int offset = (last - op);
121 const size_t local_id = get_local_id(0) + offset;
122
123 STORE_LOCAL_I4(shared, local_id, zero);
124
125 size_t i = group_id * group_stride + local_id;
126 while (i < n)
127 {
128 int4 a = LOAD_GLOBAL_I4(input, i);
129 int4 b = LOAD_GLOBAL_I4(input, i + group_size);
130 int4 s = LOAD_LOCAL_I4(shared, local_id);
131 STORE_LOCAL_I4(shared, local_id, (a + b + s));
132 i += local_stride;
133 }
134
135 barrier(CLK_LOCAL_MEM_FENCE);
136 #if (GROUP_SIZE >= 512)
137 if (local_id < 256) { ACCUM_LOCAL_I4(shared, local_id, local_id + 256); }
138 #endif
139
140 barrier(CLK_LOCAL_MEM_FENCE);
141 #if (GROUP_SIZE >= 256)
142 if (local_id < 128) { ACCUM_LOCAL_I4(shared, local_id, local_id + 128); }
143 #endif
144
145 barrier(CLK_LOCAL_MEM_FENCE);
146 #if (GROUP_SIZE >= 128)
147 if (local_id < 64) { ACCUM_LOCAL_I4(shared, local_id, local_id + 64); }
148 #endif
149
150 barrier(CLK_LOCAL_MEM_FENCE);
151 #if (GROUP_SIZE >= 64)
152 if (local_id < 32) { ACCUM_LOCAL_I4(shared, local_id, local_id + 32); }
153 #endif
154
155 barrier(CLK_LOCAL_MEM_FENCE);
156 #if (GROUP_SIZE >= 32)
157 if (local_id < 16) { ACCUM_LOCAL_I4(shared, local_id, local_id + 16); }
158 #endif
159
160 barrier(CLK_LOCAL_MEM_FENCE);
161 #if (GROUP_SIZE >= 16)
162 if (local_id < 8) { ACCUM_LOCAL_I4(shared, local_id, local_id + 8); }
163 #endif
164
165 barrier(CLK_LOCAL_MEM_FENCE);
166 #if (GROUP_SIZE >= 8)
167 if (local_id < 4) { ACCUM_LOCAL_I4(shared, local_id, local_id + 4); }
168 #endif
169
170 barrier(CLK_LOCAL_MEM_FENCE);
171 #if (GROUP_SIZE >= 4)
172 if (local_id < 2) { ACCUM_LOCAL_I4(shared, local_id, local_id + 2); }
173 #endif
174
175 barrier(CLK_LOCAL_MEM_FENCE);
176 #if (GROUP_SIZE >= 2)
177 if (local_id < 1) { ACCUM_LOCAL_I4(shared, local_id, local_id + 1); }
178 #endif
179
180 }
181
182 barrier(CLK_LOCAL_MEM_FENCE);
183 if (get_local_id(0) == 0)
184 {
185 int4 v = LOAD_LOCAL_I4(shared, 0);
186 STORE_GLOBAL_I4(output, group_id, v);
187 }
188}
Note: See TracBrowser for help on using the repository browser.