/**
 * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
|
|---|
| 11 |
|
|---|
/**
 * Vector addition: C = A + B.
 *
 * This sample is a very basic sample that implements element by element
 * vector addition. It is the same as the sample illustrating Chapter 2
 * of the programming guide with some additions like error checking.
 */
|
|---|
| 19 |
|
|---|
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>

// CIVL-C verification inputs (not standard CUDA): N is the vector length
// (concrete default 8) bounded above by B; THREADS is the block size
// (concrete default 2) bounded above by THREADS_B. The $assume clauses
// constrain the symbolic state explored by the CIVL model checker.
$input int N=8;
$input int B;
$assume(0 < N && N <= B);
$input int THREADS=2;
$input int THREADS_B;
$assume(0 < THREADS && THREADS <= THREADS_B);
|
|---|
/**
 * CUDA Kernel Device code
 *
 * Element-wise vector addition: C[idx] = A[idx] + B[idx] for every
 * idx in [0, numElements). Expects a 1-D grid of 1-D blocks; threads
 * in the final block that fall past the end of the vectors do nothing.
 */
__global__ void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    // Flat global thread index across the whole grid.
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Tail guard: the grid may contain more threads than elements.
    if (idx >= numElements)
        return;

    C[idx] = A[idx] + B[idx];
}
|
|---|
| 50 |
|
|---|
/**
 * Host main routine
 *
 * Allocates and initializes host and device buffers, launches the
 * vectorAdd kernel, copies the result back, and verifies it against a
 * host-side reference sum. Exits with EXIT_FAILURE on any error.
 */

// Report 'what' and exit if a CUDA runtime call returned an error.
// Centralizes the error-check boilerplate that was repeated verbatim
// after every CUDA call; the printed message matches the original
// "<what> (error code <string>)!" format.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess)
    {
        fprintf(stderr, "%s (error code %s)!\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

int
main(void)
{
    // Vector length comes from the CIVL symbolic input N (0 < N <= B).
    int numElements = N;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);

    // Allocate the host input vectors A and B and the host output vector C.
    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Verify that the host allocations succeeded.
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize the host input vectors with pseudo-random values in [0, 1].
    for (int i = 0; i < numElements; ++i)
    {
        h_A[i] = (float)rand()/(float)RAND_MAX;
        h_B[i] = (float)rand()/(float)RAND_MAX;
    }

    // Allocate the device input and output vectors.
    float *d_A = NULL;
    checkCuda(cudaMalloc((void **)&d_A, size), "Failed to allocate device vector A");

    float *d_B = NULL;
    checkCuda(cudaMalloc((void **)&d_B, size), "Failed to allocate device vector B");

    float *d_C = NULL;
    checkCuda(cudaMalloc((void **)&d_C, size), "Failed to allocate device vector C");

    // Copy the host input vectors A and B to the device input vectors.
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice),
              "Failed to copy vector A from host to device");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice),
              "Failed to copy vector B from host to device");

    // Launch the vector-add kernel: THREADS (CIVL input) threads per block,
    // and enough blocks (ceiling division) to cover all numElements elements.
    int threadsPerBlock = THREADS;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    // Kernel launches are asynchronous and return no status: query the
    // launch error explicitly.
    checkCuda(cudaGetLastError(), "Failed to launch vectorAdd kernel");

    // Copy the device result vector back to the host result vector.
    // (The blocking cudaMemcpy also synchronizes with the kernel launch.)
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost),
              "Failed to copy vector C from device to host");

    // Verify the result against a host-side reference sum (with a small
    // tolerance, since bit-exact float agreement is not guaranteed).
    for (int i = 0; i < numElements; ++i)
    {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
        {
            // CIVL: report the verification failure to the model checker.
            $assert($false);
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    printf("Test PASSED\n");

    // Free device global memory.
    checkCuda(cudaFree(d_A), "Failed to free device vector A");
    checkCuda(cudaFree(d_B), "Failed to free device vector B");
    checkCuda(cudaFree(d_C), "Failed to free device vector C");

    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);

    // Reset the device and exit.
    // cudaDeviceReset causes the driver to clean up all state. While not
    // mandatory in normal operation, it is good practice, and it is needed
    // to ensure all profile data is flushed before the application exits.
    // (Message format unified with the other checks; the original printed
    // "error=%s" here.)
    checkCuda(cudaDeviceReset(), "Failed to deinitialize the device");

    printf("Done\n");
    return 0;
}
|
|---|
| 222 |
|
|---|