1. Introduction to Parallel Programming With CUDA & OpenCL
Moayad H. Almohaishi
Graduate student, Computer Science
Louisiana Tech University
mha023@latech.edu
2. Outline
• Introduction
• Introduction to CUDA
– Hello World
– Addition application
– Array Addition
– CUDA Memories
– Matrix Multiplication
– Performance considerations
• Introduction to OpenCL
– Addition Kernel
– Differences from the CUDA kernel
– Setting up the OpenCL host code
• Sources and additional resources
3. Introduction
• Why GPUs?
– Available in almost all new desktops and laptops
– Many-core
• 512 cores on the GTX 580
– High floating-point throughput
• The GTX 580 offers a peak of ≈1.5 TFLOPS (single precision)
– High memory bandwidth
• The GTX 580 offers 192.4 GB/sec
4. Introduction to CUDA
• CUDA Architecture
– The physical technology on the GPU
• CUDA C
– The programming language for harnessing the power of the CUDA architecture
– Based on standard C
5. What do you need to know?
Today:
• You will need some knowledge of C
• You don't need to know about parallel programming
• You don't need to know about the CUDA architecture
6. Terminology
• Host
– The CPU and its dedicated system memory (RAM).
• Device
– The GPU and its on-board memory
7. C Hello World
#include <stdio.h>

int main( void ) {
    printf("Hello World!\n");
    return 0;
}

This hello-world C code compiles without problems under the NVIDIA CUDA compiler.
8. CUDA Kernel
__global__ void kernel( void ){
}

int main( void ) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}
9. CUDA Kernel
__global__ is the keyword that defines the function as a CUDA kernel.
kernel<<<1,1>>>(); is the command that calls the CUDA kernel from the host code.

__global__ void kernel( void ){
}

int main( void ) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}
10. Single Addition on the CPU
float add( float a, float b ){
    return a + b;
}

int main( void ) {
    float a, b, c;
    ... // setting a and b values
    c = add(a, b);
    printf("%f + %f = %f\n", a, b, c);
    return 0;
}
11. Single Addition on the GPU
__global__ void add( float *a, float *b, float *c ){
    *c = *a + *b;
}

int main( void ) {
    float *a, *b, *c;
    ... // setting a and b values
    add<<<1,1>>>(a, b, c);
    printf("%f + %f = %f\n", *a, *b, *c);
    return 0;
}
12. Single Addition on the GPU
?! This will not work as written: the kernel runs on the device, so it cannot use plain host pointers.

__global__ void add( float *a, float *b, float *c ){
    *c = *a + *b;
}

int main( void ) {
    float *a, *b, *c;
    ... // setting a and b values
    add<<<1,1>>>(a, b, c); // c will need to be copied to the host
    printf("%f + %f = %f\n", *a, *b, *c);
    return 0;
}
13. CUDA Global Memory
• To be able to use the GPU memory you will need to:
– Allocate memory on the GPU using cudaMalloc()
– Copy the host memory to the device memory using cudaMemcpy()
– Free the memory using cudaFree()
• These mirror the original C memory commands: malloc(), memcpy(), and free().
14. Single Addition on the GPU
The kernel is correct and will stay the same:

__global__ void add( float *a, float *b, float *c ){
    *c = *a + *b;
}
15. Single Addition on the GPU
We need to define separate variables for the host and device memories, then allocate the device memory:

int main( void ) {
    float h_a, h_b, h_c;
    float *d_a, *d_b, *d_c;
    int size = sizeof(float);
    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);
    h_a = 150; h_b = 89;
16. Single Addition on the GPU
Copy memory to and from the device, then free the device memory:

    cudaMemcpy(d_a, &h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &h_b, size, cudaMemcpyHostToDevice);
    add<<<1,1>>>(d_a, d_b, d_c);
    cudaMemcpy(&h_c, d_c, size, cudaMemcpyDeviceToHost);
    printf("%f + %f = %f\n", h_a, h_b, h_c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
17. Is this the right thing to do?
• The GPU is about massive parallelism, so running this program on the GPU is inefficient and will be slower than the CPU version.
• You need large data.
18. Array Addition on the CPU
The add function will stay the same:

int main( void ) {
    int n = 512; // 2^9
    float a[n], b[n], c[n];
    ... // setting a and b values
    for (int i = 0; i < n; i++){
        c[i] = add(a[i], b[i]);
        printf("%f + %f = %f\n", a[i], b[i], c[i]);
    }
    return 0;
}
19. Array Addition on the GPU
We have to modify the allocation size:

int main( void ) {
    int n = 512;
    float h_a[n], h_b[n], h_c[n];
    float *d_a, *d_b, *d_c;
    int size = sizeof(float) * n;
    cudaMalloc((void**) &d_a, size);
    cudaMalloc((void**) &d_b, size);
    cudaMalloc((void**) &d_c, size);
    ... // setting the input data h_a and h_b
21. Blocks
• CUDA runs the kernel on a grid containing n blocks.
• The maximum value of n can differ from device to device; the current devices' limit is 65535 blocks per grid.
• We will use blockIdx.x to access the block ID from inside the kernel.
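This is the kernel shape such a launch expects; a minimal sketch in which each block computes one element, indexed by its block ID:

__global__ void add( float *a, float *b, float *c ){
    int idx = blockIdx.x; // one element per block
    c[idx] = a[idx] + b[idx];
}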
22. Array Addition on the GPU
n blocks will run the kernel:

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    add<<<n,1>>>(d_a, d_b, d_c);
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
        printf("%f + %f = %f\n", h_a[i], h_b[i], h_c[i]);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
24. Threads
• Each block can contain up to 512 parallel threads on the first and second CUDA architectures.
• On the Fermi architecture each block can contain up to 1024 parallel threads.
• We will use threadIdx.x to access the thread ID from inside the kernel.
25. Array Addition on the GPU
n threads in a single block will run the kernel:

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);
    add<<<1,n>>>(d_a, d_b, d_c);
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
        printf("%f + %f = %f\n", h_a[i], h_b[i], h_c[i]);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
26. Array Addition Kernel
CUDA runs the threads as half-warps, so it is more efficient to have at least 16 threads per block:

__global__ void add( float *a, float *b, float *c ){
    int idx = threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
27. MORE
• Is it still massive parallelism?
• What about more than 512 elements?
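The usual answer is to combine blocks and threads: build the global index from both IDs and guard against the last partial block. A minimal sketch (the extra n parameter is an illustrative addition):

__global__ void add( float *a, float *b, float *c, int n ){
    int idx = threadIdx.x + blockIdx.x * blockDim.x; // global index
    if (idx < n) // guard the last partial block
        c[idx] = a[idx] + b[idx];
}

// host: launch enough 256-thread blocks to cover n elements
add<<<(n + 255) / 256, 256>>>(d_a, d_b, d_c, n);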
33. Exercises
• What is the maximum number of threads that can be run on a grid?
• How can we go over that limit?
34. How to point each thread to the right global memory address?
• Allowing each thread to do 2 computations.
Hint: you need to find an idx formula that counts one global memory index and skips the next one; you then access the second index through idx + 1.
[Figure: three blocks (blockIdx.x = 0, 1, 2) of seven threads each (threadIdx.x = 0-6), mapped onto global memory indices 0-20.]
35. How to point each thread to the right global memory address?
• Allowing each thread to do 2 computations.
Hint: you need to find an idx formula that counts one global memory index and jumps ahead by the block size; you then access the second index through idx + blockDim.x.
[Figure: the same three blocks of seven threads, with each thread handling one memory index in its block's first half and the index blockDim.x positions later.]
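A sketch of the matching kernel, assuming the arrays hold 2 * gridDim.x * blockDim.x elements (the name add2 is illustrative):

__global__ void add2( float *a, float *b, float *c ){
    // each thread handles element idx and element idx + blockDim.x
    int idx = threadIdx.x + blockIdx.x * blockDim.x * 2;
    c[idx] = a[idx] + b[idx];
    c[idx + blockDim.x] = a[idx + blockDim.x] + b[idx + blockDim.x];
}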
36. What you learned
• Creating a CUDA kernel
• Calling the kernel from the host
• Allocating CUDA memory
• Copying to/from the device memory
• Freeing the device memory
• Controlling the number of threads through the block size and the number of blocks per grid
38. Dot Product
• If each thread does one multiplication, which thread will do the addition?
39. Shared Memory
• Shared memory is very fast memory on the GPU chip itself.
• Each block has its own shared memory space.
• It can be declared using the __shared__ CUDA keyword.
• To make sure all the threads have finished computing, use the CUDA keyword __syncthreads().
40. Dot Product Kernel
__global__ void dotP( int *a, int *b, int *c ){
    __shared__ int temp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();
    if (threadIdx.x == 0) {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
    }
}
41. Exercise
• In this application the addition runs on thread 0 only. Is that efficient?
• How can we make it better?
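One common improvement is a tree-style reduction, where the number of active threads is halved at each step; a sketch assuming blockDim.x (and N) is a power of two:

__global__ void dotP( int *a, int *b, int *c ){
    __shared__ int temp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();
    // at each step, the first half of the threads adds in the second half
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2){
        if (threadIdx.x < stride)
            temp[threadIdx.x] += temp[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        *c = temp[0];
}

This takes log2(N) parallel steps instead of N sequential additions on thread 0.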
45. Simple Matrix Multiplication Kernel
__global__ void matrixMul( float *a, float *b, float *c ){
    int x = threadIdx.x; // row
    int y = threadIdx.y; // column
    int width = blockDim.x; // width of the square matrices
    float temp = 0;
    for (int i = 0; i < width; i++){
        temp += a[x * width + i] * b[i * width + y];
    }
    c[x * width + y] = temp;
}
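Because the kernel uses both threadIdx.x and threadIdx.y, the host launches it with a two-dimensional block via dim3. A sketch for a single block covering width x width matrices (width can be at most 22 on pre-Fermi devices, since 22 * 22 = 484 < 512 threads):

dim3 block(width, width); // width x width threads in one block
matrixMul<<<1, block>>>(d_a, d_b, d_c);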
46. Exercise
• Use the shared memory to optimize the matrix multiplication algorithm (hint: look at the code in the SDK). A sketch of the tiled approach follows below.
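A sketch of that tiled approach, assuming square matrices whose width is a multiple of the tile size (the kernel name and the TILE constant are illustrative, not the SDK's code):

#define TILE 16

__global__ void matrixMulTiled( float *a, float *b, float *c, int width ){
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float temp = 0;
    // walk over the tiles of a's row band and b's column band
    for (int t = 0; t < width / TILE; t++){
        As[threadIdx.y][threadIdx.x] = a[row * width + t * TILE + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = b[(t * TILE + threadIdx.y) * width + col];
        __syncthreads(); // the tile must be fully loaded before use
        for (int i = 0; i < TILE; i++)
            temp += As[threadIdx.y][i] * Bs[i][threadIdx.x];
        __syncthreads(); // finish with this tile before loading the next
    }
    c[row * width + col] = temp;
}

Each tile element is loaded from global memory once and then read TILE times from shared memory, cutting global memory traffic by roughly a factor of TILE.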
47. What you learned
• Using the shared memory to share data among the threads in a block
• Synchronizing the threads
• Setting a block size of more than one dimension using dim3
48. Performance Considerations
• For maximum performance:
– Reduce the global memory accesses.
– Maximize the occupancy (allow scheduling of 1024 threads per streaming multiprocessor):
• use the right block size
• use the right number of registers
• use the right amount of shared memory
– Increase the number of independent instructions.
– Coalesce the memory accesses (see the sketch below).
– Use the right instruction:byte ratio.
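A sketch of what coalescing means, using two illustrative kernels: in the first, neighboring threads read neighboring addresses and the hardware merges them into a few memory transactions; in the second, each thread jumps stride elements, splitting one warp's accesses into many transactions:

__global__ void coalescedCopy( float *in, float *out ){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    out[idx] = in[idx]; // consecutive threads touch consecutive addresses
}

__global__ void stridedCopy( float *in, float *out, int stride ){
    int idx = (threadIdx.x + blockIdx.x * blockDim.x) * stride;
    out[idx] = in[idx]; // consecutive threads are stride elements apart
}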
49. Introduction to OpenCL
• OpenCL is an open standard.
• Cross-platform; it can run on:
– Multi-core CPUs
– GPUs (NVIDIA, ATI)
– Cell B/E
– others
• Close to CUDA.
50. How the program works
The host drives the device (GPU) through these steps:
• Allocating the memory on the device
• Initializing the data in the host memory objects
• Copying the data from the host to the device
• Running the kernel
• Copying the results back to the host memory
• Clearing the memory and freeing the resources
[Figure: the host (kernel code and host memory holding A[], B[], C[]) connected to the device's stream processors and device memory (A[], B[], C[]).]
51. Basic OpenCL program Structure
• OpenCL Kernel
• Host program containing:
– a. Device Context
– b. Command Queue
– c. Memory Objects
– d. OpenCL Program
– e. Kernel and its Memory Arguments
(A sketch of these pieces follows this list.)
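A minimal sketch of those five pieces for the VectorAdd kernel, written against the standard OpenCL 1.x API (error checking omitted; the variable names match the later slides and the kernel source and host arrays defined there, but the exact code here is an assumption, not this deck's listing):

// a. device and context
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);

// b. command queue
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);

// c. memory objects; the inputs are initialized from the host arrays
cl_mem GPUVector1 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   sizeof(int) * SIZE, HostVector1, NULL);
cl_mem GPUVector2 = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   sizeof(int) * SIZE, HostVector2, NULL);
cl_mem GPUOutputVector = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                        sizeof(int) * SIZE, NULL, NULL);

// d. program built from the kernel source strings
cl_program program = clCreateProgramWithSource(context,
        sizeof(OpenCLSource) / sizeof(char*), OpenCLSource, NULL, NULL);
clBuildProgram(program, 1, &device, NULL, NULL, NULL);

// e. kernel and its arguments
cl_kernel kernel = clCreateKernel(program, "VectorAdd", NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &GPUOutputVector);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &GPUVector1);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &GPUVector2);

// run SIZE work-items, then read the result back to the host
size_t WorkSize = SIZE;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &WorkSize, NULL, 0, NULL, NULL);
clEnqueueReadBuffer(queue, GPUOutputVector, CL_TRUE, 0,
                    sizeof(int) * SIZE, HostOutputVector, 0, NULL, NULL);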
52. Creating the Kernel
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

const char* OpenCLSource[] = {
    "__kernel void VectorAdd(__global int* c, __global int* a,\n",
    "                        __global int* b)\n",
    "{\n",
    "    unsigned int n = get_global_id(0);\n",
    "    c[n] = a[n] + b[n];\n",
    "}\n"
};
53. Creating the Kernel
Notice that the whole kernel here is stored as an array of char strings:

const char* OpenCLSource[] = { ... };
54. Creating the Kernel
Differences from the CUDA kernel:
• The __kernel keyword is the equivalent of __global__ in CUDA.
• get_global_id() is a built-in function, used instead of calculating the global ID by hand as in CUDA.
• The pointer parameters need to be declared __global, while you don't need that in CUDA.
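For comparison, a sketch of the same kernel written in CUDA C, where the global index is computed by hand and the __global qualifiers disappear:

__global__ void VectorAdd( int* c, int* a, int* b ){
    unsigned int n = threadIdx.x + blockIdx.x * blockDim.x;
    c[n] = a[n] + b[n];
}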
56. Creating the main function
int main (int argc, char **argv)
{
    int HostVector1[SIZE], HostVector2[SIZE];
    for (int c = 0; c < SIZE; c++) {
        HostVector1[c] = InitialData1[c % 12];
        HostVector2[c] = InitialData2[c % 12];
    }
71. Cleaning the GPU device
    clReleaseMemObject(GPUVector1);
    clReleaseMemObject(GPUVector2);
    clReleaseMemObject(GPUOutputVector);
    free(GPUDevices);
    for (int c = 0; c < 305; c++)
        printf("%c", (char)HostOutputVector[c]);
    return 0;
}
72. What you learned
• Writing an OpenCL kernel
• Writing an OpenCL application:
– Setting the context
– Preparing the command queue
– Setting the memory objects
– Setting the program
– Setting the kernel and its arguments
73. Sources and additional resources
• Jason Sanders, "Introduction to CUDA" book and GTC presentation
• OpenCL specification document
• NVIDIA CUDA programming guide
• NVIDIA OpenCL getting started guide
• Videos from GTC'10 at:
• http://www.nvidia.com/object/gtc2010-presentation-a