source: CIVL/examples/opencl/2.14/vecadd/main.cpp

main
Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 11.3 KB
RevLine 
[cc87898]1// Copyright (C) 2013 Altera Corporation, San Jose, California, USA. All rights reserved.
2// Permission is hereby granted, free of charge, to any person obtaining a copy of this
3// software and associated documentation files (the "Software"), to deal in the Software
4// without restriction, including without limitation the rights to use, copy, modify, merge,
5// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
6// whom the Software is furnished to do so, subject to the following conditions:
7// The above copyright notice and this permission notice shall be included in all copies or
8// substantial portions of the Software.
9//
10// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
11// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
12// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
13// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
14// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
15// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
16// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
17// OTHER DEALINGS IN THE SOFTWARE.
18//
19// This agreement shall be governed in all respects by the laws of the State of California and
20// by the laws of the United States of America.
21
22///////////////////////////////////////////////////////////////////////////////////
23// This host program executes a vector addition kernel to perform:
24// C = A + B
25// where A, B and C are vectors with N elements.
26//
27// This host program supports partitioning the problem across multiple OpenCL
28// devices if available. If there are M available devices, the problem is
29// divided so that each device operates on N/M points. The host program
30// assumes that all devices are of the same type (that is, the same binary can
31// be used), but the code can be generalized to support different device types
32// easily.
33//
34// Verification is performed against the same computation on the host CPU.
35///////////////////////////////////////////////////////////////////////////////////
36
37#include <stdio.h>
38#include <stdlib.h>
39#include <math.h>
40#include "CL/opencl.h"
41#include "AOCL_Utils.h"
42
43using namespace aocl_utils;
44
45// OpenCL runtime configuration
46cl_platform_id platform = NULL;
47unsigned num_devices = 0;
48scoped_array<cl_device_id> device; // num_devices elements
49cl_context context = NULL;
50scoped_array<cl_command_queue> queue; // num_devices elements
51cl_program program = NULL;
52scoped_array<cl_kernel> kernel; // num_devices elements
53scoped_array<cl_mem> input_a_buf; // num_devices elements
54scoped_array<cl_mem> input_b_buf; // num_devices elements
55scoped_array<cl_mem> output_buf; // num_devices elements
56
57// Problem data.
58const unsigned N = 1000000; // problem size
59scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
60scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
61scoped_array<scoped_array<float> > ref_output; // num_devices elements
62scoped_array<unsigned> n_per_device; // num_devices elements
63
64// Function prototypes
65float rand_float();
66bool init_opencl();
67void init_problem();
68void run();
69void cleanup();
70
71// Entry point.
72int main() {
73 // Initialize OpenCL.
74 if(!init_opencl()) {
75 return -1;
76 }
77
78 // Initialize the problem data.
79 // Requires the number of devices to be known.
80 init_problem();
81
82 // Run the kernel.
83 run();
84
85 // Free the resources allocated
86 cleanup();
87
88 return 0;
89}
90
91/////// HELPER FUNCTIONS ///////
92
93// Randomly generate a floating-point number between -10 and 10.
94float rand_float() {
95 return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
96}
97
98// Initializes the OpenCL objects.
99bool init_opencl() {
100 cl_int status;
101
102 printf("Initializing OpenCL\n");
103
104 if(!setCwdToExeDir()) {
105 return false;
106 }
107
108 // Get the OpenCL platform.
109 platform = findPlatform("Altera");
110 if(platform == NULL) {
111 printf("ERROR: Unable to find Altera OpenCL platform.\n");
112 return false;
113 }
114
115 // Query the available OpenCL device.
116 device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
117 printf("Platform: %s\n", getPlatformName(platform).c_str());
118 printf("Using %d device(s)\n", num_devices);
119 for(unsigned i = 0; i < num_devices; ++i) {
120 printf(" %s\n", getDeviceName(device[i]).c_str());
121 }
122
123 // Create the context.
124 context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
125 checkError(status, "Failed to create context");
126
127 // Create the program for all device. Use the first device as the
128 // representative device (assuming all device are of the same type).
129 std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
130 printf("Using AOCX: %s\n", binary_file.c_str());
131 program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
132
133 // Build the program that was just created.
134 status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
135 checkError(status, "Failed to build program");
136
137 // Create per-device objects.
138 queue.reset(num_devices);
139 kernel.reset(num_devices);
140 n_per_device.reset(num_devices);
141 input_a_buf.reset(num_devices);
142 input_b_buf.reset(num_devices);
143 output_buf.reset(num_devices);
144
145 for(unsigned i = 0; i < num_devices; ++i) {
146 // Command queue.
147 queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
148 checkError(status, "Failed to create command queue");
149
150 // Kernel.
151 const char *kernel_name = "vectorAdd";
152 kernel[i] = clCreateKernel(program, kernel_name, &status);
153 checkError(status, "Failed to create kernel");
154
155 // Determine the number of elements processed by this device.
156 n_per_device[i] = N / num_devices; // number of elements handled by this device
157
158 // Spread out the remainder of the elements over the first
159 // N % num_devices.
160 if(i < (N % num_devices)) {
161 n_per_device[i]++;
162 }
163
164 // Input buffers.
165 input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
166 n_per_device[i] * sizeof(float), NULL, &status);
167 checkError(status, "Failed to create buffer for input A");
168
169 input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
170 n_per_device[i] * sizeof(float), NULL, &status);
171 checkError(status, "Failed to create buffer for input B");
172
173 // Output buffer.
174 output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
175 n_per_device[i] * sizeof(float), NULL, &status);
176 checkError(status, "Failed to create buffer for output");
177 }
178
179 return true;
180}
181
182// Initialize the data for the problem. Requires num_devices to be known.
183void init_problem() {
184 if(num_devices == 0) {
185 checkError(-1, "No devices");
186 }
187
188 input_a.reset(num_devices);
189 input_b.reset(num_devices);
190 output.reset(num_devices);
191 ref_output.reset(num_devices);
192
193 // Generate input vectors A and B and the reference output consisting
194 // of a total of N elements.
195 // We create separate arrays for each device so that each device has an
196 // aligned buffer.
197 for(unsigned i = 0; i < num_devices; ++i) {
198 input_a[i].reset(n_per_device[i]);
199 input_b[i].reset(n_per_device[i]);
200 output[i].reset(n_per_device[i]);
201 ref_output[i].reset(n_per_device[i]);
202
203 for(unsigned j = 0; j < n_per_device[i]; ++j) {
204 input_a[i][j] = rand_float();
205 input_b[i][j] = rand_float();
206 ref_output[i][j] = input_a[i][j] + input_b[i][j];
207 }
208 }
209}
210
211void run() {
212 cl_int status;
213
214 const double start_time = getCurrentTimestamp();
215
216 // Launch the problem for each device.
217 scoped_array<cl_event> kernel_event(num_devices);
218 scoped_array<cl_event> finish_event(num_devices);
219
220 for(unsigned i = 0; i < num_devices; ++i) {
221
222 // Transfer inputs to each device. Each of the host buffers supplied to
223 // clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
224 // for the host-to-device transfer.
225 cl_event write_event[2];
226 status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
227 0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
228 checkError(status, "Failed to transfer input A");
229
230 status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
231 0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
232 checkError(status, "Failed to transfer input B");
233
234 // Set kernel arguments.
235 unsigned argi = 0;
236
237 status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
238 checkError(status, "Failed to set argument %d", argi - 1);
239
240 status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
241 checkError(status, "Failed to set argument %d", argi - 1);
242
243 status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
244 checkError(status, "Failed to set argument %d", argi - 1);
245
246 // Enqueue kernel.
247 // Use a global work size corresponding to the number of elements to add
248 // for this device.
249 //
250 // We don't specify a local work size and let the runtime choose
251 // (it'll choose to use one work-group with the same size as the global
252 // work-size).
253 //
254 // Events are used to ensure that the kernel is not launched until
255 // the writes to the input buffers have completed.
256 const size_t global_work_size = n_per_device[i];
257 printf("Launching for device %d (%d elements)\n", i, global_work_size);
258
259 status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
260 &global_work_size, NULL, 2, write_event, &kernel_event[i]);
261 checkError(status, "Failed to launch kernel");
262
263 // Read the result. This the final operation.
264 status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
265 0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
266
267 // Release local events.
268 clReleaseEvent(write_event[0]);
269 clReleaseEvent(write_event[1]);
270 }
271
272 // Wait for all devices to finish.
273 clWaitForEvents(num_devices, finish_event);
274
275 const double end_time = getCurrentTimestamp();
276
277 // Wall-clock time taken.
278 printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
279
280 // Get kernel times using the OpenCL event profiling API.
281 for(unsigned i = 0; i < num_devices; ++i) {
282 cl_ulong time_ns = getStartEndTime(kernel_event[i]);
283 printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
284 }
285
286 // Release all events.
287 for(unsigned i = 0; i < num_devices; ++i) {
288 clReleaseEvent(kernel_event[i]);
289 clReleaseEvent(finish_event[i]);
290 }
291
292 // Verify results.
293 bool pass = true;
294 for(unsigned i = 0; i < num_devices && pass; ++i) {
295 for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
296 if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
297 printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n",
298 i, j, output[i][j], ref_output[i][j]);
299 pass = false;
300 }
301 }
302 }
303
304 printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
305}
306
307// Free the resources allocated during initialization
308void cleanup() {
309 for(unsigned i = 0; i < num_devices; ++i) {
310 if(kernel && kernel[i]) {
311 clReleaseKernel(kernel[i]);
312 }
313 if(queue && queue[i]) {
314 clReleaseCommandQueue(queue[i]);
315 }
316 if(input_a_buf && input_a_buf[i]) {
317 clReleaseMemObject(input_a_buf[i]);
318 }
319 if(input_b_buf && input_b_buf[i]) {
320 clReleaseMemObject(input_b_buf[i]);
321 }
322 if(output_buf && output_buf[i]) {
323 clReleaseMemObject(output_buf[i]);
324 }
325 }
326
327 if(program) {
328 clReleaseProgram(program);
329 }
330 if(context) {
331 clReleaseContext(context);
332 }
333}
334
Note: See TracBrowser for help on using the repository browser.