| [cc87898] | 1 | // Copyright (C) 2013 Altera Corporation, San Jose, California, USA. All rights reserved.
|
|---|
| 2 | // Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
|---|
| 3 | // software and associated documentation files (the "Software"), to deal in the Software
|
|---|
| 4 | // without restriction, including without limitation the rights to use, copy, modify, merge,
|
|---|
| 5 | // publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
|
|---|
| 6 | // whom the Software is furnished to do so, subject to the following conditions:
|
|---|
| 7 | // The above copyright notice and this permission notice shall be included in all copies or
|
|---|
| 8 | // substantial portions of the Software.
|
|---|
| 9 | //
|
|---|
| 10 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|---|
| 11 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|---|
| 12 | // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|---|
| 13 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|---|
| 14 | // HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|---|
| 15 | // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|---|
| 16 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|---|
| 17 | // OTHER DEALINGS IN THE SOFTWARE.
|
|---|
| 18 | //
|
|---|
| 19 | // This agreement shall be governed in all respects by the laws of the State of California and
|
|---|
| 20 | // by the laws of the United States of America.
|
|---|
| 21 |
|
|---|
| 22 | ///////////////////////////////////////////////////////////////////////////////////
|
|---|
| 23 | // This host program executes a vector addition kernel to perform:
|
|---|
| 24 | // C = A + B
|
|---|
| 25 | // where A, B and C are vectors with N elements.
|
|---|
| 26 | //
|
|---|
| 27 | // This host program supports partitioning the problem across multiple OpenCL
|
|---|
| 28 | // devices if available. If there are M available devices, the problem is
|
|---|
| 29 | // divided so that each device operates on N/M points. The host program
|
|---|
| 30 | // assumes that all devices are of the same type (that is, the same binary can
|
|---|
| 31 | // be used), but the code can be generalized to support different device types
|
|---|
| 32 | // easily.
|
|---|
| 33 | //
|
|---|
| 34 | // Verification is performed against the same computation on the host CPU.
|
|---|
| 35 | ///////////////////////////////////////////////////////////////////////////////////
|
|---|
| 36 |
|
|---|
| 37 | #include <stdio.h>
|
|---|
| 38 | #include <stdlib.h>
|
|---|
| 39 | #include <math.h>
|
|---|
| 40 | #include "CL/opencl.h"
|
|---|
| 41 | #include "AOCL_Utils.h"
|
|---|
| 42 |
|
|---|
| 43 | using namespace aocl_utils;
|
|---|
| 44 |
|
|---|
| 45 | // OpenCL runtime configuration
|
|---|
| 46 | cl_platform_id platform = NULL;
|
|---|
| 47 | unsigned num_devices = 0;
|
|---|
| 48 | scoped_array<cl_device_id> device; // num_devices elements
|
|---|
| 49 | cl_context context = NULL;
|
|---|
| 50 | scoped_array<cl_command_queue> queue; // num_devices elements
|
|---|
| 51 | cl_program program = NULL;
|
|---|
| 52 | scoped_array<cl_kernel> kernel; // num_devices elements
|
|---|
| 53 | scoped_array<cl_mem> input_a_buf; // num_devices elements
|
|---|
| 54 | scoped_array<cl_mem> input_b_buf; // num_devices elements
|
|---|
| 55 | scoped_array<cl_mem> output_buf; // num_devices elements
|
|---|
| 56 |
|
|---|
| 57 | // Problem data.
|
|---|
| 58 | const unsigned N = 1000000; // problem size
|
|---|
| 59 | scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
|
|---|
| 60 | scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
|
|---|
| 61 | scoped_array<scoped_array<float> > ref_output; // num_devices elements
|
|---|
| 62 | scoped_array<unsigned> n_per_device; // num_devices elements
|
|---|
| 63 |
|
|---|
| 64 | // Function prototypes
|
|---|
| 65 | float rand_float();
|
|---|
| 66 | bool init_opencl();
|
|---|
| 67 | void init_problem();
|
|---|
| 68 | void run();
|
|---|
| 69 | void cleanup();
|
|---|
| 70 |
|
|---|
| 71 | // Entry point.
|
|---|
| 72 | int main() {
|
|---|
| 73 | // Initialize OpenCL.
|
|---|
| 74 | if(!init_opencl()) {
|
|---|
| 75 | return -1;
|
|---|
| 76 | }
|
|---|
| 77 |
|
|---|
| 78 | // Initialize the problem data.
|
|---|
| 79 | // Requires the number of devices to be known.
|
|---|
| 80 | init_problem();
|
|---|
| 81 |
|
|---|
| 82 | // Run the kernel.
|
|---|
| 83 | run();
|
|---|
| 84 |
|
|---|
| 85 | // Free the resources allocated
|
|---|
| 86 | cleanup();
|
|---|
| 87 |
|
|---|
| 88 | return 0;
|
|---|
| 89 | }
|
|---|
| 90 |
|
|---|
| 91 | /////// HELPER FUNCTIONS ///////
|
|---|
| 92 |
|
|---|
| 93 | // Randomly generate a floating-point number between -10 and 10.
|
|---|
| 94 | float rand_float() {
|
|---|
| 95 | return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
|
|---|
| 96 | }
|
|---|
| 97 |
|
|---|
| 98 | // Initializes the OpenCL objects.
|
|---|
| 99 | bool init_opencl() {
|
|---|
| 100 | cl_int status;
|
|---|
| 101 |
|
|---|
| 102 | printf("Initializing OpenCL\n");
|
|---|
| 103 |
|
|---|
| 104 | if(!setCwdToExeDir()) {
|
|---|
| 105 | return false;
|
|---|
| 106 | }
|
|---|
| 107 |
|
|---|
| 108 | // Get the OpenCL platform.
|
|---|
| 109 | platform = findPlatform("Altera");
|
|---|
| 110 | if(platform == NULL) {
|
|---|
| 111 | printf("ERROR: Unable to find Altera OpenCL platform.\n");
|
|---|
| 112 | return false;
|
|---|
| 113 | }
|
|---|
| 114 |
|
|---|
| 115 | // Query the available OpenCL device.
|
|---|
| 116 | device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
|
|---|
| 117 | printf("Platform: %s\n", getPlatformName(platform).c_str());
|
|---|
| 118 | printf("Using %d device(s)\n", num_devices);
|
|---|
| 119 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 120 | printf(" %s\n", getDeviceName(device[i]).c_str());
|
|---|
| 121 | }
|
|---|
| 122 |
|
|---|
| 123 | // Create the context.
|
|---|
| 124 | context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
|
|---|
| 125 | checkError(status, "Failed to create context");
|
|---|
| 126 |
|
|---|
| 127 | // Create the program for all device. Use the first device as the
|
|---|
| 128 | // representative device (assuming all device are of the same type).
|
|---|
| 129 | std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
|
|---|
| 130 | printf("Using AOCX: %s\n", binary_file.c_str());
|
|---|
| 131 | program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
|
|---|
| 132 |
|
|---|
| 133 | // Build the program that was just created.
|
|---|
| 134 | status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
|
|---|
| 135 | checkError(status, "Failed to build program");
|
|---|
| 136 |
|
|---|
| 137 | // Create per-device objects.
|
|---|
| 138 | queue.reset(num_devices);
|
|---|
| 139 | kernel.reset(num_devices);
|
|---|
| 140 | n_per_device.reset(num_devices);
|
|---|
| 141 | input_a_buf.reset(num_devices);
|
|---|
| 142 | input_b_buf.reset(num_devices);
|
|---|
| 143 | output_buf.reset(num_devices);
|
|---|
| 144 |
|
|---|
| 145 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 146 | // Command queue.
|
|---|
| 147 | queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
|
|---|
| 148 | checkError(status, "Failed to create command queue");
|
|---|
| 149 |
|
|---|
| 150 | // Kernel.
|
|---|
| 151 | const char *kernel_name = "vectorAdd";
|
|---|
| 152 | kernel[i] = clCreateKernel(program, kernel_name, &status);
|
|---|
| 153 | checkError(status, "Failed to create kernel");
|
|---|
| 154 |
|
|---|
| 155 | // Determine the number of elements processed by this device.
|
|---|
| 156 | n_per_device[i] = N / num_devices; // number of elements handled by this device
|
|---|
| 157 |
|
|---|
| 158 | // Spread out the remainder of the elements over the first
|
|---|
| 159 | // N % num_devices.
|
|---|
| 160 | if(i < (N % num_devices)) {
|
|---|
| 161 | n_per_device[i]++;
|
|---|
| 162 | }
|
|---|
| 163 |
|
|---|
| 164 | // Input buffers.
|
|---|
| 165 | input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
|
|---|
| 166 | n_per_device[i] * sizeof(float), NULL, &status);
|
|---|
| 167 | checkError(status, "Failed to create buffer for input A");
|
|---|
| 168 |
|
|---|
| 169 | input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
|
|---|
| 170 | n_per_device[i] * sizeof(float), NULL, &status);
|
|---|
| 171 | checkError(status, "Failed to create buffer for input B");
|
|---|
| 172 |
|
|---|
| 173 | // Output buffer.
|
|---|
| 174 | output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
|
|---|
| 175 | n_per_device[i] * sizeof(float), NULL, &status);
|
|---|
| 176 | checkError(status, "Failed to create buffer for output");
|
|---|
| 177 | }
|
|---|
| 178 |
|
|---|
| 179 | return true;
|
|---|
| 180 | }
|
|---|
| 181 |
|
|---|
| 182 | // Initialize the data for the problem. Requires num_devices to be known.
|
|---|
| 183 | void init_problem() {
|
|---|
| 184 | if(num_devices == 0) {
|
|---|
| 185 | checkError(-1, "No devices");
|
|---|
| 186 | }
|
|---|
| 187 |
|
|---|
| 188 | input_a.reset(num_devices);
|
|---|
| 189 | input_b.reset(num_devices);
|
|---|
| 190 | output.reset(num_devices);
|
|---|
| 191 | ref_output.reset(num_devices);
|
|---|
| 192 |
|
|---|
| 193 | // Generate input vectors A and B and the reference output consisting
|
|---|
| 194 | // of a total of N elements.
|
|---|
| 195 | // We create separate arrays for each device so that each device has an
|
|---|
| 196 | // aligned buffer.
|
|---|
| 197 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 198 | input_a[i].reset(n_per_device[i]);
|
|---|
| 199 | input_b[i].reset(n_per_device[i]);
|
|---|
| 200 | output[i].reset(n_per_device[i]);
|
|---|
| 201 | ref_output[i].reset(n_per_device[i]);
|
|---|
| 202 |
|
|---|
| 203 | for(unsigned j = 0; j < n_per_device[i]; ++j) {
|
|---|
| 204 | input_a[i][j] = rand_float();
|
|---|
| 205 | input_b[i][j] = rand_float();
|
|---|
| 206 | ref_output[i][j] = input_a[i][j] + input_b[i][j];
|
|---|
| 207 | }
|
|---|
| 208 | }
|
|---|
| 209 | }
|
|---|
| 210 |
|
|---|
| 211 | void run() {
|
|---|
| 212 | cl_int status;
|
|---|
| 213 |
|
|---|
| 214 | const double start_time = getCurrentTimestamp();
|
|---|
| 215 |
|
|---|
| 216 | // Launch the problem for each device.
|
|---|
| 217 | scoped_array<cl_event> kernel_event(num_devices);
|
|---|
| 218 | scoped_array<cl_event> finish_event(num_devices);
|
|---|
| 219 |
|
|---|
| 220 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 221 |
|
|---|
| 222 | // Transfer inputs to each device. Each of the host buffers supplied to
|
|---|
| 223 | // clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
|
|---|
| 224 | // for the host-to-device transfer.
|
|---|
| 225 | cl_event write_event[2];
|
|---|
| 226 | status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
|
|---|
| 227 | 0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
|
|---|
| 228 | checkError(status, "Failed to transfer input A");
|
|---|
| 229 |
|
|---|
| 230 | status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
|
|---|
| 231 | 0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
|
|---|
| 232 | checkError(status, "Failed to transfer input B");
|
|---|
| 233 |
|
|---|
| 234 | // Set kernel arguments.
|
|---|
| 235 | unsigned argi = 0;
|
|---|
| 236 |
|
|---|
| 237 | status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
|
|---|
| 238 | checkError(status, "Failed to set argument %d", argi - 1);
|
|---|
| 239 |
|
|---|
| 240 | status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
|
|---|
| 241 | checkError(status, "Failed to set argument %d", argi - 1);
|
|---|
| 242 |
|
|---|
| 243 | status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
|
|---|
| 244 | checkError(status, "Failed to set argument %d", argi - 1);
|
|---|
| 245 |
|
|---|
| 246 | // Enqueue kernel.
|
|---|
| 247 | // Use a global work size corresponding to the number of elements to add
|
|---|
| 248 | // for this device.
|
|---|
| 249 | //
|
|---|
| 250 | // We don't specify a local work size and let the runtime choose
|
|---|
| 251 | // (it'll choose to use one work-group with the same size as the global
|
|---|
| 252 | // work-size).
|
|---|
| 253 | //
|
|---|
| 254 | // Events are used to ensure that the kernel is not launched until
|
|---|
| 255 | // the writes to the input buffers have completed.
|
|---|
| 256 | const size_t global_work_size = n_per_device[i];
|
|---|
| 257 | printf("Launching for device %d (%d elements)\n", i, global_work_size);
|
|---|
| 258 |
|
|---|
| 259 | status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
|
|---|
| 260 | &global_work_size, NULL, 2, write_event, &kernel_event[i]);
|
|---|
| 261 | checkError(status, "Failed to launch kernel");
|
|---|
| 262 |
|
|---|
| 263 | // Read the result. This the final operation.
|
|---|
| 264 | status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
|
|---|
| 265 | 0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
|
|---|
| 266 |
|
|---|
| 267 | // Release local events.
|
|---|
| 268 | clReleaseEvent(write_event[0]);
|
|---|
| 269 | clReleaseEvent(write_event[1]);
|
|---|
| 270 | }
|
|---|
| 271 |
|
|---|
| 272 | // Wait for all devices to finish.
|
|---|
| 273 | clWaitForEvents(num_devices, finish_event);
|
|---|
| 274 |
|
|---|
| 275 | const double end_time = getCurrentTimestamp();
|
|---|
| 276 |
|
|---|
| 277 | // Wall-clock time taken.
|
|---|
| 278 | printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
|
|---|
| 279 |
|
|---|
| 280 | // Get kernel times using the OpenCL event profiling API.
|
|---|
| 281 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 282 | cl_ulong time_ns = getStartEndTime(kernel_event[i]);
|
|---|
| 283 | printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
|
|---|
| 284 | }
|
|---|
| 285 |
|
|---|
| 286 | // Release all events.
|
|---|
| 287 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 288 | clReleaseEvent(kernel_event[i]);
|
|---|
| 289 | clReleaseEvent(finish_event[i]);
|
|---|
| 290 | }
|
|---|
| 291 |
|
|---|
| 292 | // Verify results.
|
|---|
| 293 | bool pass = true;
|
|---|
| 294 | for(unsigned i = 0; i < num_devices && pass; ++i) {
|
|---|
| 295 | for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
|
|---|
| 296 | if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
|
|---|
| 297 | printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n",
|
|---|
| 298 | i, j, output[i][j], ref_output[i][j]);
|
|---|
| 299 | pass = false;
|
|---|
| 300 | }
|
|---|
| 301 | }
|
|---|
| 302 | }
|
|---|
| 303 |
|
|---|
| 304 | printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
|
|---|
| 305 | }
|
|---|
| 306 |
|
|---|
| 307 | // Free the resources allocated during initialization
|
|---|
| 308 | void cleanup() {
|
|---|
| 309 | for(unsigned i = 0; i < num_devices; ++i) {
|
|---|
| 310 | if(kernel && kernel[i]) {
|
|---|
| 311 | clReleaseKernel(kernel[i]);
|
|---|
| 312 | }
|
|---|
| 313 | if(queue && queue[i]) {
|
|---|
| 314 | clReleaseCommandQueue(queue[i]);
|
|---|
| 315 | }
|
|---|
| 316 | if(input_a_buf && input_a_buf[i]) {
|
|---|
| 317 | clReleaseMemObject(input_a_buf[i]);
|
|---|
| 318 | }
|
|---|
| 319 | if(input_b_buf && input_b_buf[i]) {
|
|---|
| 320 | clReleaseMemObject(input_b_buf[i]);
|
|---|
| 321 | }
|
|---|
| 322 | if(output_buf && output_buf[i]) {
|
|---|
| 323 | clReleaseMemObject(output_buf[i]);
|
|---|
| 324 | }
|
|---|
| 325 | }
|
|---|
| 326 |
|
|---|
| 327 | if(program) {
|
|---|
| 328 | clReleaseProgram(program);
|
|---|
| 329 | }
|
|---|
| 330 | if(context) {
|
|---|
| 331 | clReleaseContext(context);
|
|---|
| 332 | }
|
|---|
| 333 | }
|
|---|
| 334 |
|
|---|