Context Navigation

cuda-omp.cu@ b32c2d8

1.23 2.0 main test-branch

Last change on this file since b32c2d8 was 354d5fa, checked in by Andre Marianiello <andre.marianiello@…>, 11 years ago

Updated cuda examples with #ifdef _CIVL in appropriate places.

git-svn-id: svn://vsl.cis.udel.edu/civl/trunk@1981 fb995dde-84ed-4084-dfe6-e5aef3e2452c

Property mode set to 100644

File size: 4.1 KB

Line
1	//http://www.arc.vt.edu/resources/software/cuda/docs/cuda-omp.cu
2
3	#include <omp.h>
4	#include <cuda.h>
5	#include <stdio.h>
6	#include <stdlib.h>
7
8
// Launch geometry used by main(): BLOCKS thread blocks of THREADS_PER_BLOCK
// threads. Ordinary builds use the fixed macros below; when _CIVL is defined
// the two values are CIVL inputs constrained by the $assume clauses.
#ifndef _CIVL
#define BLOCKS 64
#define THREADS_PER_BLOCK 128
#else
// Number of thread blocks, assumed to lie in [1, BLOCK_B].
$input int BLOCKS;
$input int BLOCK_B;
$assume 1 <= BLOCKS && BLOCKS <= BLOCK_B;
// Number of threads per block, assumed to lie in [1, THREADS_B].
$input int THREADS_PER_BLOCK;
$input int THREADS_B;
$assume 1 <= THREADS_PER_BLOCK && THREADS_PER_BLOCK <= THREADS_B;
#endif
20
// Kernel: adds the constant b to one element of g_a per thread.
//
// Each thread updates the element at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x). There is no bounds guard, so the
// launch must not supply more threads than g_a has elements.
__global__ void kernelAddConstant(int *g_a, const int b)
{
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    g_a[element] = g_a[element] + b;
}
28
// Verify that every one of the first n elements of data equals its own index
// plus b. Returns 1 when all elements match (also when n <= 0, since nothing
// is inspected) and 0 as soon as a mismatch is found.
int correctResult(int *data, const int n, const int b)
{
    int pos = 0;

    // Advance while the current element still holds the expected value.
    while (pos < n && data[pos] == pos + b)
        ++pos;

    // Reaching the end means no mismatch was encountered.
    return (pos >= n) ? 1 : 0;
}
37
// Program entry point.
//
// Fills a host array with a[i] = i, splits it into equal slices across the
// OpenMP host threads (one thread requested per detected CUDA device), has
// each thread copy its slice to its device, add the constant b with
// kernelAddConstant, and copy the slice back. The host then verifies the
// whole array with correctResult.
//
// Returns 1 if no CUDA device is detected or the host allocation fails, and
// 0 otherwise (including when the verification prints "Test FAILED").
int main(int argc, char *argv[])
{
#ifdef _CIVL
    elaborate(BLOCKS);
    elaborate(THREADS_PER_BLOCK);
#endif

    // Variable which holds number of GPUs
    int num_gpus = 0;

    // Determine the number of CUDA capable GPUs
    cudaGetDeviceCount(&num_gpus);
    if(num_gpus < 1)
    {
        printf("No CUDA Capable GPU(s) Detected \n");
        return 1;
    }

    // Display the CPU and GPU processor specification
    int num_procs = omp_get_num_procs();
    printf("number of host CPUs:\t%d\n", num_procs);
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for(int i = 0; i < num_gpus; i++)
    {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("\t Device %d is a %s\n", i, dprop.name);
    }

    // Initialize the variables
    unsigned int n = num_gpus * THREADS_PER_BLOCK * BLOCKS;
    unsigned int nbytes = n * sizeof(int);
    int *a = 0; // pointer to data on the CPU
    int b = 3;  // value by which each array element will be incremented
    a = (int*)malloc(nbytes);

    if(0 == a)
    {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }

    for(unsigned int i = 0; i < n; i++)
        a[i] = i;

    // Set the number of threads to the number of GPUs on the system
    omp_set_num_threads(num_gpus);

    #pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();

        // Assign and check the GPU device for each thread
        int gpu_id = -1;
        cudaSetDevice(cpu_thread_id % num_gpus);
        cudaGetDevice(&gpu_id);

        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

        // Variable on the device associated with this CPU thread
        int *d_a = 0;

        // Start of this thread's slice of the host array: the array is cut
        // into num_cpu_threads equal parts and thread k owns part k.
        int *sub_a = a + cpu_thread_id * n / num_cpu_threads;

        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3 gpu_threads = {THREADS_PER_BLOCK, 1, 1}; // THREADS_PER_BLOCK threads per block
        // Enough blocks for one thread per element of this slice.
        dim3 gpu_blocks = {(n / (gpu_threads.x * num_cpu_threads)), 1, 1};

        //Allocate memory on the device
        cudaMalloc((void**)&d_a, nbytes_per_kernel);

        //Initialize the array on the device with zeros
        cudaMemset(d_a, 0, nbytes_per_kernel);

        //Copy data from host to device
        cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice);

        //Launch the kernel
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

        //Copy the result from the device to the host
        cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost);

        //Deallocate the memory on the device
        cudaFree(d_a);
    }

    // cudaGetLastError() resets the stored error to cudaSuccess, so query it
    // exactly once and keep the result; a second call would report no error.
    cudaError_t err = cudaGetLastError();
    if(cudaSuccess != err)
    {
        const char *err_str = cudaGetErrorString(err);
        printf("%s\n", err_str);
    }

    //Check for correctness of the result
    if(correctResult(a, n, b)) {
#ifdef _CIVL
        $assert($true);
#endif
        printf("Test PASSED\n");
    } else
        printf("Test FAILED\n");

    //Deallocate the CPU memory
    free(a);

    // deprecated
    // cudaThreadExit();

    return 0;
}
154

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: CIVL/examples/cuda/cuda-omp.cu@ b32c2d8

Download in other formats: