| 1 | //
|
|---|
| 2 | // File: reduce_float_kernel.cl
|
|---|
| 3 | //
|
|---|
| 4 | // Abstract: This example shows how to perform an efficient parallel reduction using OpenCL.
|
|---|
| 5 | // Reduce is a common data parallel primitive which can be used for variety
|
|---|
| 6 | // of different operations -- this example computes the global sum for a large
|
|---|
| 7 | // number of values, and includes kernels for integer and floating point vector
|
|---|
| 8 | // types.
|
|---|
| 9 | //
|
|---|
| 10 | // Version: <1.0>
|
|---|
| 11 | //
|
|---|
| 12 | // Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
|
|---|
| 13 | // in consideration of your agreement to the following terms, and your use,
|
|---|
| 14 | // installation, modification or redistribution of this Apple software
|
|---|
| 15 | // constitutes acceptance of these terms. If you do not agree with these
|
|---|
| 16 | // terms, please do not use, install, modify or redistribute this Apple
|
|---|
| 17 | // software.
|
|---|
| 18 | //
|
|---|
| 19 | // In consideration of your agreement to abide by the following terms, and
|
|---|
| 20 | // subject to these terms, Apple grants you a personal, non - exclusive
|
|---|
| 21 | // license, under Apple's copyrights in this original Apple software ( the
|
|---|
| 22 | // "Apple Software" ), to use, reproduce, modify and redistribute the Apple
|
|---|
| 23 | // Software, with or without modifications, in source and / or binary forms;
|
|---|
| 24 | // provided that if you redistribute the Apple Software in its entirety and
|
|---|
| 25 | // without modifications, you must retain this notice and the following text
|
|---|
| 26 | // and disclaimers in all such redistributions of the Apple Software. Neither
|
|---|
| 27 | // the name, trademarks, service marks or logos of Apple Inc. may be used to
|
|---|
| 28 | // endorse or promote products derived from the Apple Software without specific
|
|---|
| 29 | // prior written permission from Apple. Except as expressly stated in this
|
|---|
| 30 | // notice, no other rights or licenses, express or implied, are granted by
|
|---|
| 31 | // Apple herein, including but not limited to any patent rights that may be
|
|---|
| 32 | // infringed by your derivative works or by other works in which the Apple
|
|---|
| 33 | // Software may be incorporated.
|
|---|
| 34 | //
|
|---|
| 35 | // The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
|
|---|
| 36 | // WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
|
|---|
| 37 | // WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
|
|---|
| 38 | // PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
|
|---|
| 39 | // ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
|
|---|
| 40 | //
|
|---|
| 41 | // IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
|
|---|
| 42 | // CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|---|
| 43 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|---|
| 44 | // INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
|
|---|
| 45 | // AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
|
|---|
| 46 | // UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
|
|---|
| 47 | // OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|---|
| 48 | //
|
|---|
| 49 | // Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
|
|---|
| 50 | //
|
|---|
| 51 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 52 |
|
|---|
| 53 | #ifndef GROUP_SIZE
|
|---|
| 54 | #define GROUP_SIZE (64)
|
|---|
| 55 | #endif
|
|---|
| 56 |
|
|---|
| 57 | #ifndef OPERATIONS
|
|---|
| 58 | #define OPERATIONS (1)
|
|---|
| 59 | #endif
|
|---|
| 60 |
|
|---|
| 61 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 62 |
|
|---|
| 63 | #define LOAD_GLOBAL_F1(s, i) \
|
|---|
| 64 | ((__global const float*)(s))[(size_t)(i)]
|
|---|
| 65 |
|
|---|
| 66 | #define STORE_GLOBAL_F1(s, i, v) \
|
|---|
| 67 | ((__global float*)(s))[(size_t)(i)] = (v)
|
|---|
| 68 |
|
|---|
| 69 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 70 |
|
|---|
| 71 | #define LOAD_LOCAL_F1(s, i) \
|
|---|
| 72 | ((__local float*)(s))[(size_t)(i)]
|
|---|
| 73 |
|
|---|
| 74 | #define STORE_LOCAL_F1(s, i, v) \
|
|---|
| 75 | ((__local float*)(s))[(size_t)(i)] = (v)
|
|---|
| 76 |
|
|---|
| 77 | #define ACCUM_LOCAL_F1(s, i, j) \
|
|---|
| 78 | { \
|
|---|
| 79 | float x = ((__local float*)(s))[(size_t)(i)]; \
|
|---|
| 80 | float y = ((__local float*)(s))[(size_t)(j)]; \
|
|---|
| 81 | ((__local float*)(s))[(size_t)(i)] = (x + y); \
|
|---|
| 82 | }
|
|---|
| 83 |
|
|---|
| 84 | ////////////////////////////////////////////////////////////////////////////////////////////////////
|
|---|
| 85 |
|
|---|
| 86 | __kernel void
|
|---|
| 87 | reduce(
|
|---|
| 88 | __global float *output,
|
|---|
| 89 | __global const float *input,
|
|---|
| 90 | __local float *shared,
|
|---|
| 91 | const unsigned int n)
|
|---|
| 92 | {
|
|---|
| 93 | const float zero = 0.0f;
|
|---|
| 94 | const unsigned int group_id = get_global_id(0) / get_local_size(0);
|
|---|
| 95 | const unsigned int group_size = GROUP_SIZE;
|
|---|
| 96 | const unsigned int group_stride = 2 * group_size;
|
|---|
| 97 | const size_t local_stride = group_stride * group_size;
|
|---|
| 98 |
|
|---|
| 99 | unsigned int op = 0;
|
|---|
| 100 | unsigned int last = OPERATIONS - 1;
|
|---|
| 101 | for(op = 0; op < OPERATIONS; op++)
|
|---|
| 102 | {
|
|---|
| 103 | const unsigned int offset = (last - op);
|
|---|
| 104 | const size_t local_id = get_local_id(0) + offset;
|
|---|
| 105 |
|
|---|
| 106 | STORE_LOCAL_F1(shared, local_id, zero);
|
|---|
| 107 |
|
|---|
| 108 | size_t i = group_id * group_stride + local_id;
|
|---|
| 109 | while (i < n)
|
|---|
| 110 | {
|
|---|
| 111 | float a = LOAD_GLOBAL_F1(input, i);
|
|---|
| 112 | float b = LOAD_GLOBAL_F1(input, i + group_size);
|
|---|
| 113 | float s = LOAD_LOCAL_F1(shared, local_id);
|
|---|
| 114 | STORE_LOCAL_F1(shared, local_id, (a + b + s));
|
|---|
| 115 | i += local_stride;
|
|---|
| 116 | }
|
|---|
| 117 |
|
|---|
| 118 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 119 | #if (GROUP_SIZE >= 512)
|
|---|
| 120 | if (local_id < 256) { ACCUM_LOCAL_F1(shared, local_id, local_id + 256); }
|
|---|
| 121 | #endif
|
|---|
| 122 |
|
|---|
| 123 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 124 | #if (GROUP_SIZE >= 256)
|
|---|
| 125 | if (local_id < 128) { ACCUM_LOCAL_F1(shared, local_id, local_id + 128); }
|
|---|
| 126 | #endif
|
|---|
| 127 |
|
|---|
| 128 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 129 | #if (GROUP_SIZE >= 128)
|
|---|
| 130 | if (local_id < 64) { ACCUM_LOCAL_F1(shared, local_id, local_id + 64); }
|
|---|
| 131 | #endif
|
|---|
| 132 |
|
|---|
| 133 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 134 | #if (GROUP_SIZE >= 64)
|
|---|
| 135 | if (local_id < 32) { ACCUM_LOCAL_F1(shared, local_id, local_id + 32); }
|
|---|
| 136 | #endif
|
|---|
| 137 |
|
|---|
| 138 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 139 | #if (GROUP_SIZE >= 32)
|
|---|
| 140 | if (local_id < 16) { ACCUM_LOCAL_F1(shared, local_id, local_id + 16); }
|
|---|
| 141 | #endif
|
|---|
| 142 |
|
|---|
| 143 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 144 | #if (GROUP_SIZE >= 16)
|
|---|
| 145 | if (local_id < 8) { ACCUM_LOCAL_F1(shared, local_id, local_id + 8); }
|
|---|
| 146 | #endif
|
|---|
| 147 |
|
|---|
| 148 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 149 | #if (GROUP_SIZE >= 8)
|
|---|
| 150 | if (local_id < 4) { ACCUM_LOCAL_F1(shared, local_id, local_id + 4); }
|
|---|
| 151 | #endif
|
|---|
| 152 |
|
|---|
| 153 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 154 | #if (GROUP_SIZE >= 4)
|
|---|
| 155 | if (local_id < 2) { ACCUM_LOCAL_F1(shared, local_id, local_id + 2); }
|
|---|
| 156 | #endif
|
|---|
| 157 |
|
|---|
| 158 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 159 | #if (GROUP_SIZE >= 2)
|
|---|
| 160 | if (local_id < 1) { ACCUM_LOCAL_F1(shared, local_id, local_id + 1); }
|
|---|
| 161 | #endif
|
|---|
| 162 |
|
|---|
| 163 | }
|
|---|
| 164 |
|
|---|
| 165 | barrier(CLK_LOCAL_MEM_FENCE);
|
|---|
| 166 | if (get_local_id(0) == 0)
|
|---|
| 167 | {
|
|---|
| 168 | float v = LOAD_LOCAL_F1(shared, 0);
|
|---|
| 169 | STORE_GLOBAL_F1(output, group_id, v);
|
|---|
| 170 | }
|
|---|
| 171 | }
|
|---|