Context Navigation

main.cpp

main

Last change on this file was ea777aa, checked in by Alex Wilton <awilton@…>, 3 years ago

Moved examples, include, build_default.properties, common.xml, and README out from dev.civl.com into the root of the repo.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@5704 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 11.3 KB

Rev	Line
[cc87898]	1	// Copyright (C) 2013 Altera Corporation, San Jose, California, USA. All rights reserved.
	2	// Permission is hereby granted, free of charge, to any person obtaining a copy of this
	3	// software and associated documentation files (the "Software"), to deal in the Software
	4	// without restriction, including without limitation the rights to use, copy, modify, merge,
	5	// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to
	6	// whom the Software is furnished to do so, subject to the following conditions:
	7	// The above copyright notice and this permission notice shall be included in all copies or
	8	// substantial portions of the Software.
	9	//
	10	// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
	11	// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
	12	// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
	13	// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
	14	// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
	15	// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	16	// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
	17	// OTHER DEALINGS IN THE SOFTWARE.
	18	//
	19	// This agreement shall be governed in all respects by the laws of the State of California and
	20	// by the laws of the United States of America.
	21
	22	///////////////////////////////////////////////////////////////////////////////////
	23	// This host program executes a vector addition kernel to perform:
	24	// C = A + B
	25	// where A, B and C are vectors with N elements.
	26	//
	27	// This host program supports partitioning the problem across multiple OpenCL
	28	// devices if available. If there are M available devices, the problem is
	29	// divided so that each device operates on N/M points. The host program
	30	// assumes that all devices are of the same type (that is, the same binary can
	31	// be used), but the code can be generalized to support different device types
	32	// easily.
	33	//
	34	// Verification is performed against the same computation on the host CPU.
	35	///////////////////////////////////////////////////////////////////////////////////
	36
	37	#include <stdio.h>
	38	#include <stdlib.h>
	39	#include <math.h>
	40	#include "CL/opencl.h"
	41	#include "AOCL_Utils.h"
	42
	43	using namespace aocl_utils;
	44
	45	// OpenCL runtime configuration
	46	cl_platform_id platform = NULL;
	47	unsigned num_devices = 0;
	48	scoped_array<cl_device_id> device; // num_devices elements
	49	cl_context context = NULL;
	50	scoped_array<cl_command_queue> queue; // num_devices elements
	51	cl_program program = NULL;
	52	scoped_array<cl_kernel> kernel; // num_devices elements
	53	scoped_array<cl_mem> input_a_buf; // num_devices elements
	54	scoped_array<cl_mem> input_b_buf; // num_devices elements
	55	scoped_array<cl_mem> output_buf; // num_devices elements
	56
	57	// Problem data.
	58	const unsigned N = 1000000; // problem size
	59	scoped_array<scoped_aligned_ptr<float> > input_a, input_b; // num_devices elements
	60	scoped_array<scoped_aligned_ptr<float> > output; // num_devices elements
	61	scoped_array<scoped_array<float> > ref_output; // num_devices elements
	62	scoped_array<unsigned> n_per_device; // num_devices elements
	63
	64	// Function prototypes
	65	float rand_float();
	66	bool init_opencl();
	67	void init_problem();
	68	void run();
	69	void cleanup();
	70
	71	// Entry point.
	72	int main() {
	73	// Initialize OpenCL.
	74	if(!init_opencl()) {
	75	return -1;
	76	}
	77
	78	// Initialize the problem data.
	79	// Requires the number of devices to be known.
	80	init_problem();
	81
	82	// Run the kernel.
	83	run();
	84
	85	// Free the resources allocated
	86	cleanup();
	87
	88	return 0;
	89	}
	90
	91	/////// HELPER FUNCTIONS ///////
	92
	93	// Randomly generate a floating-point number between -10 and 10.
	94	float rand_float() {
	95	return float(rand()) / float(RAND_MAX) * 20.0f - 10.0f;
	96	}
	97
	98	// Initializes the OpenCL objects.
	99	bool init_opencl() {
	100	cl_int status;
	101
	102	printf("Initializing OpenCL\n");
	103
	104	if(!setCwdToExeDir()) {
	105	return false;
	106	}
	107
	108	// Get the OpenCL platform.
	109	platform = findPlatform("Altera");
	110	if(platform == NULL) {
	111	printf("ERROR: Unable to find Altera OpenCL platform.\n");
	112	return false;
	113	}
	114
	115	// Query the available OpenCL device.
	116	device.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
	117	printf("Platform: %s\n", getPlatformName(platform).c_str());
	118	printf("Using %d device(s)\n", num_devices);
	119	for(unsigned i = 0; i < num_devices; ++i) {
	120	printf(" %s\n", getDeviceName(device[i]).c_str());
	121	}
	122
	123	// Create the context.
	124	context = clCreateContext(NULL, num_devices, device, NULL, NULL, &status);
	125	checkError(status, "Failed to create context");
	126
	127	// Create the program for all device. Use the first device as the
	128	// representative device (assuming all device are of the same type).
	129	std::string binary_file = getBoardBinaryFile("vectorAdd", device[0]);
	130	printf("Using AOCX: %s\n", binary_file.c_str());
	131	program = createProgramFromBinary(context, binary_file.c_str(), device, num_devices);
	132
	133	// Build the program that was just created.
	134	status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
	135	checkError(status, "Failed to build program");
	136
	137	// Create per-device objects.
	138	queue.reset(num_devices);
	139	kernel.reset(num_devices);
	140	n_per_device.reset(num_devices);
	141	input_a_buf.reset(num_devices);
	142	input_b_buf.reset(num_devices);
	143	output_buf.reset(num_devices);
	144
	145	for(unsigned i = 0; i < num_devices; ++i) {
	146	// Command queue.
	147	queue[i] = clCreateCommandQueue(context, device[i], CL_QUEUE_PROFILING_ENABLE, &status);
	148	checkError(status, "Failed to create command queue");
	149
	150	// Kernel.
	151	const char *kernel_name = "vectorAdd";
	152	kernel[i] = clCreateKernel(program, kernel_name, &status);
	153	checkError(status, "Failed to create kernel");
	154
	155	// Determine the number of elements processed by this device.
	156	n_per_device[i] = N / num_devices; // number of elements handled by this device
	157
	158	// Spread out the remainder of the elements over the first
	159	// N % num_devices.
	160	if(i < (N % num_devices)) {
	161	n_per_device[i]++;
	162	}
	163
	164	// Input buffers.
	165	input_a_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
	166	n_per_device[i] * sizeof(float), NULL, &status);
	167	checkError(status, "Failed to create buffer for input A");
	168
	169	input_b_buf[i] = clCreateBuffer(context, CL_MEM_READ_ONLY,
	170	n_per_device[i] * sizeof(float), NULL, &status);
	171	checkError(status, "Failed to create buffer for input B");
	172
	173	// Output buffer.
	174	output_buf[i] = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
	175	n_per_device[i] * sizeof(float), NULL, &status);
	176	checkError(status, "Failed to create buffer for output");
	177	}
	178
	179	return true;
	180	}
	181
	182	// Initialize the data for the problem. Requires num_devices to be known.
	183	void init_problem() {
	184	if(num_devices == 0) {
	185	checkError(-1, "No devices");
	186	}
	187
	188	input_a.reset(num_devices);
	189	input_b.reset(num_devices);
	190	output.reset(num_devices);
	191	ref_output.reset(num_devices);
	192
	193	// Generate input vectors A and B and the reference output consisting
	194	// of a total of N elements.
	195	// We create separate arrays for each device so that each device has an
	196	// aligned buffer.
	197	for(unsigned i = 0; i < num_devices; ++i) {
	198	input_a[i].reset(n_per_device[i]);
	199	input_b[i].reset(n_per_device[i]);
	200	output[i].reset(n_per_device[i]);
	201	ref_output[i].reset(n_per_device[i]);
	202
	203	for(unsigned j = 0; j < n_per_device[i]; ++j) {
	204	input_a[i][j] = rand_float();
	205	input_b[i][j] = rand_float();
	206	ref_output[i][j] = input_a[i][j] + input_b[i][j];
	207	}
	208	}
	209	}
	210
	211	void run() {
	212	cl_int status;
	213
	214	const double start_time = getCurrentTimestamp();
	215
	216	// Launch the problem for each device.
	217	scoped_array<cl_event> kernel_event(num_devices);
	218	scoped_array<cl_event> finish_event(num_devices);
	219
	220	for(unsigned i = 0; i < num_devices; ++i) {
	221
	222	// Transfer inputs to each device. Each of the host buffers supplied to
	223	// clEnqueueWriteBuffer here is already aligned to ensure that DMA is used
	224	// for the host-to-device transfer.
	225	cl_event write_event[2];
	226	status = clEnqueueWriteBuffer(queue[i], input_a_buf[i], CL_FALSE,
	227	0, n_per_device[i] * sizeof(float), input_a[i], 0, NULL, &write_event[0]);
	228	checkError(status, "Failed to transfer input A");
	229
	230	status = clEnqueueWriteBuffer(queue[i], input_b_buf[i], CL_FALSE,
	231	0, n_per_device[i] * sizeof(float), input_b[i], 0, NULL, &write_event[1]);
	232	checkError(status, "Failed to transfer input B");
	233
	234	// Set kernel arguments.
	235	unsigned argi = 0;
	236
	237	status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_a_buf[i]);
	238	checkError(status, "Failed to set argument %d", argi - 1);
	239
	240	status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &input_b_buf[i]);
	241	checkError(status, "Failed to set argument %d", argi - 1);
	242
	243	status = clSetKernelArg(kernel[i], argi++, sizeof(cl_mem), &output_buf[i]);
	244	checkError(status, "Failed to set argument %d", argi - 1);
	245
	246	// Enqueue kernel.
	247	// Use a global work size corresponding to the number of elements to add
	248	// for this device.
	249	//
	250	// We don't specify a local work size and let the runtime choose
	251	// (it'll choose to use one work-group with the same size as the global
	252	// work-size).
	253	//
	254	// Events are used to ensure that the kernel is not launched until
	255	// the writes to the input buffers have completed.
	256	const size_t global_work_size = n_per_device[i];
	257	printf("Launching for device %d (%d elements)\n", i, global_work_size);
	258
	259	status = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL,
	260	&global_work_size, NULL, 2, write_event, &kernel_event[i]);
	261	checkError(status, "Failed to launch kernel");
	262
	263	// Read the result. This the final operation.
	264	status = clEnqueueReadBuffer(queue[i], output_buf[i], CL_FALSE,
	265	0, n_per_device[i] * sizeof(float), output[i], 1, &kernel_event[i], &finish_event[i]);
	266
	267	// Release local events.
	268	clReleaseEvent(write_event[0]);
	269	clReleaseEvent(write_event[1]);
	270	}
	271
	272	// Wait for all devices to finish.
	273	clWaitForEvents(num_devices, finish_event);
	274
	275	const double end_time = getCurrentTimestamp();
	276
	277	// Wall-clock time taken.
	278	printf("\nTime: %0.3f ms\n", (end_time - start_time) * 1e3);
	279
	280	// Get kernel times using the OpenCL event profiling API.
	281	for(unsigned i = 0; i < num_devices; ++i) {
	282	cl_ulong time_ns = getStartEndTime(kernel_event[i]);
	283	printf("Kernel time (device %d): %0.3f ms\n", i, double(time_ns) * 1e-6);
	284	}
	285
	286	// Release all events.
	287	for(unsigned i = 0; i < num_devices; ++i) {
	288	clReleaseEvent(kernel_event[i]);
	289	clReleaseEvent(finish_event[i]);
	290	}
	291
	292	// Verify results.
	293	bool pass = true;
	294	for(unsigned i = 0; i < num_devices && pass; ++i) {
	295	for(unsigned j = 0; j < n_per_device[i] && pass; ++j) {
	296	if(fabsf(output[i][j] - ref_output[i][j]) > 1.0e-5f) {
	297	printf("Failed verification @ device %d, index %d\nOutput: %f\nReference: %f\n",
	298	i, j, output[i][j], ref_output[i][j]);
	299	pass = false;
	300	}
	301	}
	302	}
	303
	304	printf("\nVerification: %s\n", pass ? "PASS" : "FAIL");
	305	}
	306
	307	// Free the resources allocated during initialization
	308	void cleanup() {
	309	for(unsigned i = 0; i < num_devices; ++i) {
	310	if(kernel && kernel[i]) {
	311	clReleaseKernel(kernel[i]);
	312	}
	313	if(queue && queue[i]) {
	314	clReleaseCommandQueue(queue[i]);
	315	}
	316	if(input_a_buf && input_a_buf[i]) {
	317	clReleaseMemObject(input_a_buf[i]);
	318	}
	319	if(input_b_buf && input_b_buf[i]) {
	320	clReleaseMemObject(input_b_buf[i]);
	321	}
	322	if(output_buf && output_buf[i]) {
	323	clReleaseMemObject(output_buf[i]);
	324	}
	325	}
	326
	327	if(program) {
	328	clReleaseProgram(program);
	329	}
	330	if(context) {
	331	clReleaseContext(context);
	332	}
	333	}
	334

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/opencl/2.14/vecadd/main.cpp

Download in other formats: