| [5693398] | 1 | //
|
|---|
| 2 | // File: reduce_int4_kernel.cl
|
|---|
| 3 | //
|
|---|
| 4 | // Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
|
|---|
| 5 | // Reduce is a common data parallel primitive which can be used for variety
|
|---|
| 6 | // of different operations -- this example computes the global sum for a large
|
|---|
| 7 | // number of values, and includes kernels for integer and floating point vector
|
|---|
| 8 | // types.
|
|---|
| 9 | //
|
|---|
| 10 | // Version: <1.0>
|
|---|
| 11 | //
|
|---|
| 12 | // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
|
|---|
| 13 | // in consideration of your agreement to the following terms, and your use,
|
|---|
| 14 | // installation, modification or redistribution of this Apple software
|
|---|
| 15 | // constitutes acceptance of these terms. If you do not agree with these
|
|---|
| 16 | // terms, please do not use, install, modify or redistribute this Apple
|
|---|
| 17 | // software.
|
|---|
| 18 | //
|
|---|
| 19 | // In consideration of your agreement to abide by the following terms, and
|
|---|
| 20 | // subject to these terms, Apple grants you a personal, non - exclusive
|
|---|
| 21 | // license, under Apple's copyrights in this original Apple software ( the
|
|---|
| 22 | // "Apple Software" ), to use, reproduce, modify and redistribute the Apple
|
|---|
| 23 | // Software, with or without modifications, in source and / or binary forms;
|
|---|
| 24 | // provided that if you redistribute the Apple Software in its entirety and
|
|---|
| 25 | // without modifications, you must retain this notice and the following text
|
|---|
| 26 | // and disclaimers in all such redistributions of the Apple Software. Neither
|
|---|
| 27 | // the name, trademarks, service marks or logos of Apple Inc. may be used to
|
|---|
| 28 | // endorse or promote products derived from the Apple Software without specific
|
|---|
| 29 | // prior written permission from Apple. Except as expressly stated in this
|
|---|
| 30 | // notice, no other rights or licenses, express or implied, are granted by
|
|---|
| 31 | // Apple herein, including but not limited to any patent rights that may be
|
|---|
| 32 | // infringed by your derivative works or by other works in which the Apple
|
|---|
| 33 | // Software may be incorporated.
|
|---|
| 34 | //
|
|---|
| 35 | // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
|
|---|
| 36 | // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
|
|---|
| 37 | // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
|
|---|
| 38 | // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
|
|---|
| 39 | // ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
|
|---|
| 40 | //
|
|---|
| 41 | // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
|
|---|
| 42 | // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|---|
| 43 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|---|
| 44 | // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
|
|---|
| 45 | // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
|
|---|
| 46 | // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
|
|---|
| 47 | // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|---|
| 48 | //
|
|---|
| 49 | // Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
|
|---|
| 50 | //
|
|---|
| 51 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 52 |
|
|---|
| 53 | #ifndef GROUP_SIZE
|
|---|
| 54 | #define GROUP_SIZE (64)
|
|---|
| 55 | #endif
|
|---|
| 56 |
|
|---|
| 57 | #ifndef OPERATIONS
|
|---|
| 58 | #define OPERATIONS (1)
|
|---|
| 59 | #endif
|
|---|
| 60 |
|
|---|
| 61 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 62 |
|
|---|
| 63 | #define GROUP_SIZE_2 (GROUP_SIZE + GROUP_SIZE)
|
|---|
| 64 | #define GROUP_SIZE_3 (GROUP_SIZE_2 + GROUP_SIZE)
|
|---|
| 65 |
|
|---|
| 66 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 67 |
|
|---|
| 68 | #define LOAD_GLOBAL_I4(s, i) \
|
|---|
| 69 | vload4((size_t)(i), (__global const int*)(s))
|
|---|
| 70 |
|
|---|
| 71 | #define STORE_GLOBAL_I4(s, i, v) \
|
|---|
| 72 | vstore4((v), (size_t)(i), (__global int*)(s))
|
|---|
| 73 |
|
|---|
| 74 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 75 |
|
|---|
| 76 | #define LOAD_LOCAL_I1(s, i) \
|
|---|
| 77 | ((__local const int*)(s))[(size_t)(i)]
|
|---|
| 78 |
|
|---|
| 79 | #define STORE_LOCAL_I1(s, i, v) \
|
|---|
| 80 | ((__local int*)(s))[(size_t)(i)] = (v)
|
|---|
| 81 |
|
|---|
| 82 | #define LOAD_LOCAL_I4(s, i) \
|
|---|
| 83 | (int4)( (LOAD_LOCAL_I1(s, i )), \
|
|---|
| 84 | (LOAD_LOCAL_I1(s, i + GROUP_SIZE )), \
|
|---|
| 85 | (LOAD_LOCAL_I1(s, i + GROUP_SIZE_2)), \
|
|---|
| 86 | (LOAD_LOCAL_I1(s, i + GROUP_SIZE_3)))
|
|---|
| 87 |
|
|---|
| 88 | #define STORE_LOCAL_I4(s, i, v) \
|
|---|
| 89 | STORE_LOCAL_I1(s, i, (v)[0]); \
|
|---|
| 90 | STORE_LOCAL_I1(s, i + GROUP_SIZE, (v)[1]); \
|
|---|
| 91 | STORE_LOCAL_I1(s, i + GROUP_SIZE_2, (v)[2]); \
|
|---|
| 92 | STORE_LOCAL_I1(s, i + GROUP_SIZE_3, (v)[3])
|
|---|
| 93 |
|
|---|
| 94 | #define ACCUM_LOCAL_I4(s, i, j) \
|
|---|
| 95 | { \
|
|---|
| 96 | int4 x = LOAD_LOCAL_I4(s, i); \
|
|---|
| 97 | int4 y = LOAD_LOCAL_I4(s, j); \
|
|---|
| 98 | int4 xy = x + y; \
|
|---|
| 99 | STORE_LOCAL_I4(s, i, xy); \
|
|---|
| 100 | }
|
|---|
| 101 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 102 |
|
|---|
| 103 | __kernel void
|
|---|
| 104 | reduce(
|
|---|
| 105 | __global int4 *output,
|
|---|
| 106 | __global const int4 *input,
|
|---|
| 107 | __local int4 *shared,
|
|---|
| 108 | const unsigned int n)
|
|---|
| 109 | {
|
|---|
| 110 | const int4 zero = (int4)(0.0f, 0.0f, 0.0f, 0.0f);
|
|---|
| 111 | const unsigned int group_id = get_global_id(0) / get_local_size(0);
|
|---|
| 112 | const unsigned int group_size = GROUP_SIZE;
|
|---|
| 113 | const unsigned int group_stride = 2 * group_size;
|
|---|
| 114 | const size_t local_stride = group_stride * group_size;
|
|---|
| 115 |
|
|---|
| 116 | unsigned int op = 0;
|
|---|
| 117 | unsigned int last = OPERATIONS - 1;
|
|---|
| 118 | for(op = 0; op < OPERATIONS; op++)
|
|---|
| 119 | {
|
|---|
| 120 | const unsigned int offset = (last - op);
|
|---|
| 121 | const size_t local_id = get_local_id(0) + offset;
|
|---|
| 122 |
|
|---|
| 123 | STORE_LOCAL_I4(shared, local_id, zero);
|
|---|
| 124 |
|
|---|
| 125 | size_t i = group_id * group_stride + local_id;
|
|---|
| 126 | while (i < n)
|
|---|
| 127 | {
|
|---|
| 128 | int4 a = LOAD_GLOBAL_I4(input, i);
|
|---|
| 129 | int4 b = LOAD_GLOBAL_I4(input, i + group_size);
|
|---|
| 130 | int4 s = LOAD_LOCAL_I4(shared, local_id);
|
|---|
| 131 | STORE_LOCAL_I4(shared, local_id, (a + b + s));
|
|---|
| 132 | i += local_stride;
|
|---|
| 133 | }
|
|---|
| 134 |
|
|---|
| 135 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 136 | #if (GROUP_SIZE >= 512)
|
|---|
| 137 | if (local_id < 256) { ACCUM_LOCAL_I4(shared, local_id, local_id + 256); }
|
|---|
| 138 | #endif
|
|---|
| 139 |
|
|---|
| 140 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 141 | #if (GROUP_SIZE >= 256)
|
|---|
| 142 | if (local_id < 128) { ACCUM_LOCAL_I4(shared, local_id, local_id + 128); }
|
|---|
| 143 | #endif
|
|---|
| 144 |
|
|---|
| 145 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 146 | #if (GROUP_SIZE >= 128)
|
|---|
| 147 | if (local_id < 64) { ACCUM_LOCAL_I4(shared, local_id, local_id + 64); }
|
|---|
| 148 | #endif
|
|---|
| 149 |
|
|---|
| 150 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 151 | #if (GROUP_SIZE >= 64)
|
|---|
| 152 | if (local_id < 32) { ACCUM_LOCAL_I4(shared, local_id, local_id + 32); }
|
|---|
| 153 | #endif
|
|---|
| 154 |
|
|---|
| 155 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 156 | #if (GROUP_SIZE >= 32)
|
|---|
| 157 | if (local_id < 16) { ACCUM_LOCAL_I4(shared, local_id, local_id + 16); }
|
|---|
| 158 | #endif
|
|---|
| 159 |
|
|---|
| 160 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 161 | #if (GROUP_SIZE >= 16)
|
|---|
| 162 | if (local_id < 8) { ACCUM_LOCAL_I4(shared, local_id, local_id + 8); }
|
|---|
| 163 | #endif
|
|---|
| 164 |
|
|---|
| 165 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 166 | #if (GROUP_SIZE >= 8)
|
|---|
| 167 | if (local_id < 4) { ACCUM_LOCAL_I4(shared, local_id, local_id + 4); }
|
|---|
| 168 | #endif
|
|---|
| 169 |
|
|---|
| 170 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 171 | #if (GROUP_SIZE >= 4)
|
|---|
| 172 | if (local_id < 2) { ACCUM_LOCAL_I4(shared, local_id, local_id + 2); }
|
|---|
| 173 | #endif
|
|---|
| 174 |
|
|---|
| 175 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 176 | #if (GROUP_SIZE >= 2)
|
|---|
| 177 | if (local_id < 1) { ACCUM_LOCAL_I4(shared, local_id, local_id + 1); }
|
|---|
| 178 | #endif
|
|---|
| 179 |
|
|---|
| 180 | }
|
|---|
| 181 |
|
|---|
| 182 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 183 | if (get_local_id(0) == 0)
|
|---|
| 184 | {
|
|---|
| 185 | int4 v = LOAD_LOCAL_I4(shared, 0);
|
|---|
| 186 | STORE_GLOBAL_I4(output, group_id, v);
|
|---|
| 187 | }
|
|---|
| 188 | }
|
|---|