source: CIVL/examples/cuda/vectorAdd.cu@ b784507

1.23 2.0 main test-branch
Last change on this file since b784507 was 2321281, checked in by Manchun Zheng <zmanchun@…>, 11 years ago

fixed $elaborate for examples.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@2355 fb995dde-84ed-4084-dfe6-e5aef3e2452c

  • Property mode set to 100644
File size: 6.5 KB
Line 
1/**
2 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
3 *
4 * Please refer to the NVIDIA end user license agreement (EULA) associated
5 * with this source code for terms and conditions that govern your use of
6 * this software. Any use, reproduction, disclosure, or distribution of
7 * this software and related documentation outside the terms of the EULA
8 * is strictly prohibited.
9 *
10 */
11
12/**
13 * Vector addition: C = A + B.
14 *
15 * This sample is a very basic sample that implements element by element
16 * vector addition. It is the same as the sample illustrating Chapter 2
17 * of the programming guide with some additions like error checking.
18 */
19
20#include <stdio.h>
21#include <stdlib.h>
22#include <math.h>
23
24// For the CUDA runtime routines (prefixed with "cuda", e.g. cudaMalloc)
25#include <cuda.h>
26
// CIVL-C input variables for the verification run.
// N is used by main() as the vector length; B appears only as its upper
// bound in the assumption below.
$input int N;
$input int B;
$assume(N > 0 && N <= B);
// THREADS is used by main() as the number of threads per block; THREADS_B
// appears only as its upper bound in the assumption below.
$input int THREADS;
$input int THREADS_B;
$assume(THREADS > 0 && THREADS <= THREADS_B);
33
/**
 * CUDA kernel (device code): element-wise vector addition, C = A + B.
 *
 * Each thread derives one index from the x components of blockIdx,
 * blockDim and threadIdx and writes at most one element of C. Threads
 * whose index is not below numElements do nothing, so the launch may
 * contain more threads than there are elements.
 *
 * A, B         input vectors (read only)
 * C            output vector
 * numElements  number of elements to process in A, B and C
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Surplus threads past the end of the vectors have no work.
    if (idx >= numElements)
    {
        return;
    }

    C[idx] = A[idx] + B[idx];
}
50
/**
 * Evaluates the CUDA runtime call `call` exactly once. If the result is not
 * cudaSuccess, prints `fmt` to stderr and exits with EXIT_FAILURE.
 *
 * `fmt` must be a printf format string containing a single %s conversion;
 * it receives cudaGetErrorString() of the failing result.
 */
#define CUDA_TRY(call, fmt)                                      \
    do                                                           \
    {                                                            \
        cudaError_t status_ = (call);                            \
        if (status_ != cudaSuccess)                              \
        {                                                        \
            fprintf(stderr, fmt, cudaGetErrorString(status_));   \
            exit(EXIT_FAILURE);                                  \
        }                                                        \
    } while (0)

/**
 * Host entry point.
 *
 * Fills two host vectors of N floats with rand() values, copies them to the
 * device, launches vectorAdd with THREADS threads per block, copies the
 * result back and compares every element against the host-side sum.
 * Any failed allocation, CUDA call or mismatch terminates the program with
 * EXIT_FAILURE; otherwise all buffers are released and 0 is returned.
 */
int main(void)
{
    // CIVL primitive applied to the THREADS input.
    // NOTE(review): presumably enumerates concrete values of THREADS for the
    // launch configuration -- confirm against the CIVL-C manual.
    $elaborate(THREADS);

    // Vector length and the matching buffer size in bytes.
    const int numElements = N;
    const size_t bytes = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Host buffers: two inputs and one output.
    float *h_A = (float *)malloc(bytes);
    float *h_B = (float *)malloc(bytes);
    float *h_C = (float *)malloc(bytes);

    if (!h_A || !h_B || !h_C)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Fill both inputs with values in [0, 1].
    for (int k = 0; k < numElements; ++k)
    {
        h_A[k] = (float)rand() / (float)RAND_MAX;
        h_B[k] = (float)rand() / (float)RAND_MAX;
    }

    // Device buffers mirroring the three host vectors.
    float *d_A = NULL;
    float *d_B = NULL;
    float *d_C = NULL;
    CUDA_TRY(cudaMalloc((void **)&d_A, bytes),
             "Failed to allocate device vector A (error code %s)!\n");
    CUDA_TRY(cudaMalloc((void **)&d_B, bytes),
             "Failed to allocate device vector B (error code %s)!\n");
    CUDA_TRY(cudaMalloc((void **)&d_C, bytes),
             "Failed to allocate device vector C (error code %s)!\n");

    // Upload the two input vectors.
    printf("Copy input data from the host memory to the CUDA device\n");
    CUDA_TRY(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice),
             "Failed to copy vector A from host to device (error code %s)!\n");
    CUDA_TRY(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice),
             "Failed to copy vector B from host to device (error code %s)!\n");

    // Launch with enough blocks to cover every element (ceiling division);
    // the kernel itself ignores the surplus threads of the last block.
    const int threadsPerBlock = THREADS;
    const int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);

    // A kernel launch returns no status; launch errors are read back here.
    CUDA_TRY(cudaGetLastError(),
             "Failed to launch vectorAdd kernel (error code %s)!\n");

    // Download the result vector.
    printf("Copy output data from the CUDA device to the host memory\n");
    CUDA_TRY(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost),
             "Failed to copy vector C from device to host (error code %s)!\n");

    // Compare each device result with the sum computed on the host.
    for (int k = 0; k < numElements; ++k)
    {
        if (fabs(h_A[k] + h_B[k] - h_C[k]) > 1e-5)
        {
            // Flag the mismatch to CIVL before the ordinary error path.
            $assert($false);
            fprintf(stderr, "Result verification failed at element %d!\n", k);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Release device memory.
    CUDA_TRY(cudaFree(d_A),
             "Failed to free device vector A (error code %s)!\n");
    CUDA_TRY(cudaFree(d_B),
             "Failed to free device vector B (error code %s)!\n");
    CUDA_TRY(cudaFree(d_C),
             "Failed to free device vector C (error code %s)!\n");

    // Release host memory.
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device before exiting. cudaDeviceReset makes the driver
    // clean up all state; it is not mandatory in normal operation but is
    // good practice, and it ensures profile data is flushed when the
    // application is being profiled.
    CUDA_TRY(cudaDeviceReset(),
             "Failed to deinitialize the device! error=%s\n");

    printf("Done\n");
    return 0;
}
223
Note: See TracBrowser for help on using the repository browser.