| [5693398] | 1 | //
|
|---|
| 2 | // File: reduce_float2_kernel.cl
|
|---|
| 3 | //
|
|---|
| 4 | // Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
|
|---|
| 5 | // Reduce is a common data parallel primitive which can be used for variety
|
|---|
| 6 | // of different operations -- this example computes the global sum for a large
|
|---|
| 7 | // number of values, and includes kernels for integer and floating point vector
|
|---|
| 8 | // types.
|
|---|
| 9 | //
|
|---|
| 10 | // Version: <1.0>
|
|---|
| 11 | //
|
|---|
| 12 | // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
|
|---|
| 13 | // in consideration of your agreement to the following terms, and your use,
|
|---|
| 14 | // installation, modification or redistribution of this Apple software
|
|---|
| 15 | // constitutes acceptance of these terms. If you do not agree with these
|
|---|
| 16 | // terms, please do not use, install, modify or redistribute this Apple
|
|---|
| 17 | // software.
|
|---|
| 18 | //
|
|---|
| 19 | // In consideration of your agreement to abide by the following terms, and
|
|---|
| 20 | // subject to these terms, Apple grants you a personal, non - exclusive
|
|---|
| 21 | // license, under Apple's copyrights in this original Apple software ( the
|
|---|
| 22 | // "Apple Software" ), to use, reproduce, modify and redistribute the Apple
|
|---|
| 23 | // Software, with or without modifications, in source and / or binary forms;
|
|---|
| 24 | // provided that if you redistribute the Apple Software in its entirety and
|
|---|
| 25 | // without modifications, you must retain this notice and the following text
|
|---|
| 26 | // and disclaimers in all such redistributions of the Apple Software. Neither
|
|---|
| 27 | // the name, trademarks, service marks or logos of Apple Inc. may be used to
|
|---|
| 28 | // endorse or promote products derived from the Apple Software without specific
|
|---|
| 29 | // prior written permission from Apple. Except as expressly stated in this
|
|---|
| 30 | // notice, no other rights or licenses, express or implied, are granted by
|
|---|
| 31 | // Apple herein, including but not limited to any patent rights that may be
|
|---|
| 32 | // infringed by your derivative works or by other works in which the Apple
|
|---|
| 33 | // Software may be incorporated.
|
|---|
| 34 | //
|
|---|
| 35 | // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
|
|---|
| 36 | // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
|
|---|
| 37 | // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
|
|---|
| 38 | // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
|
|---|
| 39 | // ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
|
|---|
| 40 | //
|
|---|
| 41 | // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
|
|---|
| 42 | // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|---|
| 43 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|---|
| 44 | // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
|
|---|
| 45 | // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
|
|---|
| 46 | // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
|
|---|
| 47 | // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|---|
| 48 | //
|
|---|
| 49 | // Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
|
|---|
| 50 | //
|
|---|
| 51 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 52 |
|
|---|
| 53 | #ifndef GROUP_SIZE
|
|---|
| 54 | #define GROUP_SIZE (64)
|
|---|
| 55 | #endif
|
|---|
| 56 |
|
|---|
| 57 | #ifndef OPERATIONS
|
|---|
| 58 | #define OPERATIONS (1)
|
|---|
| 59 | #endif
|
|---|
| 60 |
|
|---|
| 61 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 62 |
|
|---|
| 63 | #define LOAD_GLOBAL_F2(s, i) \
|
|---|
| 64 | vload2((size_t)(i), (__global const float*)(s))
|
|---|
| 65 |
|
|---|
| 66 | #define STORE_GLOBAL_F2(s, i, v) \
|
|---|
| 67 | vstore2((v), (size_t)(i), (__global float*)(s))
|
|---|
| 68 |
|
|---|
| 69 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 70 |
|
|---|
| 71 | #define LOAD_LOCAL_F1(s, i) \
|
|---|
| 72 | ((__local const float*)(s))[(size_t)(i)]
|
|---|
| 73 |
|
|---|
| 74 | #define STORE_LOCAL_F1(s, i, v) \
|
|---|
| 75 | ((__local float*)(s))[(size_t)(i)] = (v)
|
|---|
| 76 |
|
|---|
| 77 | #define LOAD_LOCAL_F2(s, i) \
|
|---|
| 78 | (float2)( (LOAD_LOCAL_F1(s, i)), \
|
|---|
| 79 | (LOAD_LOCAL_F1(s, i + GROUP_SIZE)))
|
|---|
| 80 |
|
|---|
| 81 | #define STORE_LOCAL_F2(s, i, v) \
|
|---|
| 82 | STORE_LOCAL_F1(s, i, (v)[0]); \
|
|---|
| 83 | STORE_LOCAL_F1(s, i + GROUP_SIZE, (v)[1])
|
|---|
| 84 |
|
|---|
| 85 | #define ACCUM_LOCAL_F2(s, i, j) \
|
|---|
| 86 | { \
|
|---|
| 87 | float2 x = LOAD_LOCAL_F2(s, i); \
|
|---|
| 88 | float2 y = LOAD_LOCAL_F2(s, j); \
|
|---|
| 89 | float2 xy = (x + y); \
|
|---|
| 90 | STORE_LOCAL_F2(s, i, xy); \
|
|---|
| 91 | }
|
|---|
| 92 |
|
|---|
| 93 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 94 |
|
|---|
| 95 | __kernel void
|
|---|
| 96 | reduce(
|
|---|
| 97 | __global float2 *output,
|
|---|
| 98 | __global const float2 *input,
|
|---|
| 99 | __local float2 *shared,
|
|---|
| 100 | const unsigned int n)
|
|---|
| 101 | {
|
|---|
| 102 | const float2 zero = (float2)(0.0f, 0.0f);
|
|---|
| 103 | const unsigned int group_id = get_global_id(0) / get_local_size(0);
|
|---|
| 104 | const unsigned int group_size = GROUP_SIZE;
|
|---|
| 105 | const unsigned int group_stride = 2 * group_size;
|
|---|
| 106 | const size_t local_stride = group_stride * group_size;
|
|---|
| 107 |
|
|---|
| 108 | unsigned int op = 0;
|
|---|
| 109 | unsigned int last = OPERATIONS - 1;
|
|---|
| 110 | for(op = 0; op < OPERATIONS; op++)
|
|---|
| 111 | {
|
|---|
| 112 | const unsigned int offset = (last - op);
|
|---|
| 113 | const size_t local_id = get_local_id(0) + offset;
|
|---|
| 114 |
|
|---|
| 115 | STORE_LOCAL_F2(shared, local_id, zero);
|
|---|
| 116 |
|
|---|
| 117 | size_t i = group_id * group_stride + local_id;
|
|---|
| 118 | while (i < n)
|
|---|
| 119 | {
|
|---|
| 120 | float2 a = LOAD_GLOBAL_F2(input, i);
|
|---|
| 121 | float2 b = LOAD_GLOBAL_F2(input, i + group_size);
|
|---|
| 122 | float2 s = LOAD_LOCAL_F2(shared, local_id);
|
|---|
| 123 | STORE_LOCAL_F2(shared, local_id, (a + b + s));
|
|---|
| 124 | i += local_stride;
|
|---|
| 125 | }
|
|---|
| 126 |
|
|---|
| 127 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 128 | #if (GROUP_SIZE >= 512)
|
|---|
| 129 | if (local_id < 256) { ACCUM_LOCAL_F2(shared, local_id, local_id + 256); }
|
|---|
| 130 | #endif
|
|---|
| 131 |
|
|---|
| 132 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 133 | #if (GROUP_SIZE >= 256)
|
|---|
| 134 | if (local_id < 128) { ACCUM_LOCAL_F2(shared, local_id, local_id + 128); }
|
|---|
| 135 | #endif
|
|---|
| 136 |
|
|---|
| 137 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 138 | #if (GROUP_SIZE >= 128)
|
|---|
| 139 | if (local_id < 64) { ACCUM_LOCAL_F2(shared, local_id, local_id + 64); }
|
|---|
| 140 | #endif
|
|---|
| 141 |
|
|---|
| 142 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 143 | #if (GROUP_SIZE >= 64)
|
|---|
| 144 | if (local_id < 32) { ACCUM_LOCAL_F2(shared, local_id, local_id + 32); }
|
|---|
| 145 | #endif
|
|---|
| 146 |
|
|---|
| 147 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 148 | #if (GROUP_SIZE >= 32)
|
|---|
| 149 | if (local_id < 16) { ACCUM_LOCAL_F2(shared, local_id, local_id + 16); }
|
|---|
| 150 | #endif
|
|---|
| 151 |
|
|---|
| 152 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 153 | #if (GROUP_SIZE >= 16)
|
|---|
| 154 | if (local_id < 8) { ACCUM_LOCAL_F2(shared, local_id, local_id + 8); }
|
|---|
| 155 | #endif
|
|---|
| 156 |
|
|---|
| 157 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 158 | #if (GROUP_SIZE >= 8)
|
|---|
| 159 | if (local_id < 4) { ACCUM_LOCAL_F2(shared, local_id, local_id + 4); }
|
|---|
| 160 | #endif
|
|---|
| 161 |
|
|---|
| 162 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 163 | #if (GROUP_SIZE >= 4)
|
|---|
| 164 | if (local_id < 2) { ACCUM_LOCAL_F2(shared, local_id, local_id + 2); }
|
|---|
| 165 | #endif
|
|---|
| 166 |
|
|---|
| 167 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 168 | #if (GROUP_SIZE >= 2)
|
|---|
| 169 | if (local_id < 1) { ACCUM_LOCAL_F2(shared, local_id, local_id + 1); }
|
|---|
| 170 | #endif
|
|---|
| 171 |
|
|---|
| 172 | }
|
|---|
| 173 |
|
|---|
| 174 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 175 | if (get_local_id(0) == 0)
|
|---|
| 176 | {
|
|---|
| 177 | float2 v = LOAD_LOCAL_F2(shared, 0);
|
|---|
| 178 | STORE_GLOBAL_F2(output, group_id, v);
|
|---|
| 179 | }
|
|---|
| 180 | }
|
|---|