Weitere Àhnliche Inhalte
Ähnlich wie ISCA Final Presentation - Compilations (20)
Mehr von HSA Foundation (20)
KĂŒrzlich hochgeladen (20)
ISCA Final Presentation - Compilations
- 2. KEY HSA FEATURES FOR COMPILATION
ALL-PROCESSORS-EQUAL
• GPU and CPU have equal
flexibility to create and
dispatch work items
EQUAL ACCESS TO
ENTIRE SYSTEM MEMORY
• GPU and CPU have
uniform visibility into entire
memory space
Unified Coherent
Memory
GPUCPU
Single Dispatch Path
GPUCPU
© Copyright 2014 HSA Foundation. All Rights Reserved
- 3. A QUICK REVIEW OF OPENCL
CURRENT STATE OF PORTABLE HETEROGENEOUS
PARALLEL PROGRAMMING
- 4. DEVICE CODE IN OPENCL
SIMPLE MATRIX MULTIPLICATION
/*
 * OpenCL kernel: C = A * B for row-major matrices.
 * A is (hA x wA), B is (wA x wB), C is (hA x wB).
 * One work-item computes one element of C.
 *
 * Fix: the result must be indexed with the row stride of C, which is
 * wB (C's width), not wA — the original only worked for square matrices.
 */
__kernel void
matrixMul(__global float* C, __global float* A, __global float* B, int wA, int wB) {
    int tx = get_global_id(0);   /* column index into C */
    int ty = get_global_id(1);   /* row index into C */
    float value = 0;
    /* Dot product of row ty of A with column tx of B. */
    for (int k = 0; k < wA; ++k)
    {
        float elementA = A[ty * wA + k];
        float elementB = B[k * wB + tx];
        value += elementA * elementB;
    }
    C[ty * wB + tx] = value;     /* was C[ty * wA + tx] */
}
Explicit thread index usage.
Reasonably readable.
Portable across CPUs, GPUs, and FPGAs
© Copyright 2014 HSA Foundation. All Rights Reserved
- 5. HOST CODE IN OPENCL -
CONCEPTUAL
1. allocate and initialize memory on host side
2. Initialize OpenCL
3. allocate device memory and move the data
4. Load and build device code
5. Launch kernel
a. append arguments
6. move the data back from device
© Copyright 2014 HSA Foundation. All Rights Reserved
- 6. int main(int argc, char** argv){
// set seed for rand()
srand(2006);
/****************************************************/
/* Allocate and initialize memory on Host Side */
/****************************************************/
// allocate and initialize host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float* h_A = (float*) malloc(mem_size_A);
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(mem_size_B);
randomInit(h_A, size_A);
randomInit(h_B, size_B);
// allocate host memory for the result C
unsigned int size_C = WC * HC;
unsigned int mem_size_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(mem_size_C);
/*****************************************/
/* Initialize OpenCL */
/*****************************************/
// OpenCL specific variables
cl_context clGPUContext;
cl_command_queue clCommandQue;
cl_program clProgram;
size_t dataBytes;
size_t kernelLength;
cl_int errcode;
// OpenCL device memory pointers for matrices
cl_mem d_A;
cl_mem d_B;
cl_mem d_C;
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 3. Allocate device memory and move data
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
// 4. Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My commentn",
&kernelLength);
shrCheckError(clMatrixMul != NULL, shrTRUE);
clProgram = clCreateProgramWithSource(clGPUContext,
1, (const char **)&clMatrixMul,
&kernelLength, &errcode);
shrCheckError(errcode, CL_SUCCESS);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
clKernel = clCreateKernel(clProgram,
"matrixMul", &errcode);
shrCheckError(errcode, CL_SUCCESS);
// 5. Launch OpenCL kernel
size_t localWorkSize[2], globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
shrCheckError(errcode, CL_SUCCESS);
localWorkSize[0] = 16;
localWorkSize[1] = 16;
globalWorkSize[0] = 1024;
globalWorkSize[1] = 1024;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 6. Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS);
// 7. clean up memory
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
free(clMatrixMul);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);}
almost 100 lines of code
– tedious and hard to maintain
It does not take advantage of HSA features.
It will likely need to be changed for OpenCL 2.0.
- 7. COMPARING SEVERAL HIGH-LEVEL
PROGRAMMING INTERFACES
C++AMP Thrust Bolt OpenACC SYCL
C++ Language
extension
proposed by
Microsoft
library
proposed
by CUDA
library
proposed
by AMD
Annotation
and
Pragmas
proposed
by PGI
C++
wrapper
for
OpenCL
All these proposals aim to reduce tedious boiler
plate code and provide transparent porting to future
systems (future proofing).
© Copyright 2014 HSA Foundation. All Rights Reserved
- 9. OPENACC
- SIMPLE MATRIX MULTIPLICATION EXAMPLE
/*
 * OpenACC matrix multiplication: C = A * B, row-major.
 * A is (hA x wA), B is (wA x wB), C is (hA x wB).
 *
 * Reconstructed from the slide: stray slide line numbers removed, and
 * the result index fixed — the original wrote C[i*Nw+j] where "Nw" is
 * undefined; the row stride of C is wB.
 */
void MatrixMulti(float *C, const float *A, const float *B, int hA, int wA, int wB)
{
    /* Annotate the outer loop as a parallel region and describe data
       movement: A and B are inputs, C is an output. */
    #pragma acc parallel loop copyin(A[0:hA*wA]) copyin(B[0:wA*wB]) copyout(C[0:hA*wB])
    for (int i = 0; i < hA; i++) {
        #pragma acc loop
        for (int j = 0; j < wB; j++) {
            float sum = 0;
            /* Dot product over the shared dimension wA. */
            for (int k = 0; k < wA; k++) {
                float a = A[i*wA + k];
                float b = B[k*wB + j];
                sum += a * b;
            }
            C[i*wB + j] = sum;   /* was C[i*Nw+j] */
        }
    }
}
Little Host Code Overhead
Programmer annotation of
kernel computation
Programmer annotation of data movement
© Copyright 2014 HSA Foundation. All Rights Reserved
- 10. ADVANTAGE OF HSA FOR OPENACC
• Flexibility in copyin and copyout implementation
• Flexible code generation for nested acc parallel loops
• E.g., inner loop bounds that depend on outer loop iterations
• Compiler data affinity optimization (especially OpenACC kernel regions)
• The compiler does not have to undo programmer managed data transfers
© Copyright 2014 HSA Foundation. All Rights Reserved
- 11. C++AMP
HSA ENABLES EFFICIENT COMPILATION OF AN
EVEN HIGHER LEVEL OF PROGRAMMING
INTERFACE
© Copyright 2014 HSA Foundation. All Rights Reserved
- 12. C++ AMP
• C++ Accelerated Massive Parallelism
• Designed for data level parallelism
• Extension of C++11 proposed by Microsoft
• An open specification with multiple implementations aiming at standardization
• MS Visual Studio 2013
• MulticoreWare CLAMP
• GPU data modeled as C++14-like containers for multidimensional arrays
• GPU kernels modeled as C++11 lambda
• Minimal extension to C++ for simplicity and future proofing
© Copyright 2014 HSA Foundation. All Rights Reserved
- 13. MATRIX MULTIPLICATION IN C++AMP
// C++ AMP matrix multiplication: product (ha x hc) = a (ha x hb) * b (hb x hc).
// GPU data is modeled as array_view containers over the host arrays; the
// kernel is the lambda passed to parallel_for_each.
//
// Fix: the inner reduction must run over the shared dimension hb, not the
// hard-coded literal 2 left over from the fixed 3x2 * 2x3 example.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix,
                     int ha, int hb, int hc) {
    array_view<int, 2> a(ha, hb, aMatrix);
    array_view<int, 2> b(hb, hc, bMatrix);
    array_view<int, 2> product(ha, hc, productMatrix);

    parallel_for_each(
        product.extent,
        [=](index<2> idx) restrict(amp) {
            int row = idx[0];
            int col = idx[1];
            for (int inner = 0; inner < hb; inner++) {   // was inner < 2
                product[idx] += a(row, inner) * b(inner, col);
            }
        }
    );
    // Synchronize the array_view back to productMatrix on the host.
    product.synchronize();
}
clGPUContext = clCreateContextFromType(0,
CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS);
// get the list of GPU devices associated
// with context
errcode = clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, 0, NULL,
&dataBytes);
cl_device_id *clDevices = (cl_device_id *)
malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext,
CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS);
//Create a command-queue
clCommandQue =
clCreateCommandQueue(clGPUContext,
clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS);
/* OpenCL kernel shown on this slide for comparison with the C++ AMP
 * version: one work-item computes one element of C = A * B. */
__kernel void
matrixMul(__global float* C, __global float* A,
__global float* B, int wA, int wB) {
int tx = get_global_id(0); /* column index into C */
int ty = get_global_id(1); /* row index into C */
float value = 0;
/* dot product over the shared dimension wA */
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
/* NOTE(review): indexes C with wA; correct only when wA == wB (square) */
C[ty * wA + tx] = value;}
© Copyright 2014 HSA Foundation. All Rights Reserved
- 14. C++AMP PROGRAMMING MODEL
/* C++ AMP example: product (3x3) = a (3x2) * b (2x3), row-major. */
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
/* GPU data modeled as array_view containers over the host arrays. */
array_view<int, 2> a(3, 2, aMatrix);
array_view<int, 2> b(2, 3, bMatrix);
array_view<int, 2> product(3, 3, productMatrix);
parallel_for_each(
product.extent,
/* one lambda invocation per element of product */
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
/* shared dimension is 2 in this fixed-size example */
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
/* copy results back into productMatrix */
product.synchronize();}
GPU data
modeled as
data container
© Copyright 2014 HSA Foundation. All Rights Reserved
- 15. C++AMP PROGRAMMING MODEL
// C++ AMP demo: product (3x3) = a (3x2) * b (2x3), row-major.
// The kernel is a lambda; the captured array_views carry the data
// implicitly, so no explicit copyin/copyout is written by the programmer.
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
    array_view<int, 2> a(3, 2, aMatrix);
    array_view<int, 2> b(2, 3, bMatrix);
    array_view<int, 2> product(3, 3, productMatrix);

    parallel_for_each(product.extent, [=](index<2> idx) restrict(amp) {
        int r = idx[0];
        int c = idx[1];
        for (int k = 0; k < 2; k++) {
            product[idx] += a(r, k) * b(k, c);
        }
    });

    // Bring the result back to the host-side productMatrix.
    product.synchronize();
}
Kernels modeled as
lambdas; arguments are
implicitly modeled as
captured variables,
programmer do not need to
specify copyin and copyout
© Copyright 2014 HSA Foundation. All Rights Reserved
- 16. C++AMP PROGRAMMING MODEL
/* C++ AMP example: product (3x3) = a (3x2) * b (2x3), row-major. */
void MultiplyWithAMP(int* aMatrix, int* bMatrix, int *productMatrix) {
array_view<int, 2> a(3, 2, aMatrix);
array_view<int, 2> b(2, 3, bMatrix);
array_view<int, 2> product(3, 3, productMatrix);
/* parallel_for_each marks an implicitly parallel region for GPU execution */
parallel_for_each(
product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
/* shared dimension is 2 in this fixed-size example */
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
}
);
/* copy results back into productMatrix */
product.synchronize();
}
Execution
interface; marking
an implicitly
parallel region for
GPU execution
© Copyright 2014 HSA Foundation. All Rights Reserved
- 17. MCW C++AMP (CLAMP)
â Runs on Linux and Mac OS X
â Output code compatible with all major OpenCL stacks: AMD, Apple/Intel (OS X),
NVIDIA and even POCL
â Clang/LLVM-based, open source
o Translate C++AMP code to OpenCL C or OpenCL 1.2 SPIR
o With template helper library
â Runtime: OpenCL 1.1/HSA Runtime and GMAC for non-HSA systems
â One of the two C++ AMP implementations recognized by HSA foundation
© Copyright 2014 HSA Foundation. All Rights Reserved
- 18. MCW C++ AMP COMPILER
â Device Path
o generate OpenCL C code and SPIR
o emit kernel function
â Host Path
o preparation to launch the code
C++ AMP
source code
Clang/LLVM 3.3
Device
Code
Host
Code
© Copyright 2014 HSA Foundation. All Rights Reserved
- 19. TRANSLATION
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
/* OpenCL kernel that the C++ AMP lambda above is translated into:
 * one work-item computes one element of C = A * B. */
__kernel void
matrixMul(__global float* C, __global float*
A,
__global float* B, int wA, int wB){
int tx = get_global_id(0); /* column index into C */
int ty = get_global_id(1); /* row index into C */
float value = 0;
/* dot product over the shared dimension wA */
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
/* NOTE(review): indexes C with wA; correct only when wA == wB (square) */
C[ty * wA + tx] = value;}
• Append the arguments
• Set the index
• Emit the kernel function
• Implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved
- 20. EXECUTION ON NON-HSA OPENCL
PLATFORMS
C++ AMP
source code
Clang/LLVM
3.3
Device Code
C++ AMP
source code
Clang/LLVM
3.3
Host Code
gmac
OpenCL
Our work
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
- 21. GMAC
â unified virtual address space in
software
â Can have high overhead
sometimes
• In HSA (e.g., AMD Kaveri), GMAC
is no longer needed
Gelado, et al, ASPLOS 2010
© Copyright 2014 HSA Foundation. All Rights Reserved
- 22. CASE STUDY:
BINOMIAL OPTION PRICING
• Lines of Code
0
50
100
150
200
250
300
350
C++AMP OpenCL
Lines of Code by Cloc
Host
Kernel
© Copyright 2014 HSA Foundation. All Rights Reserved
- 23. PERFORMANCE ON NON-HSA SYSTEMS
BINOMIAL OPTION PRICING
0
0.02
0.04
0.06
0.08
0.1
0.12
Total GPU Time Kernel-only
TimeinSeconds
Performance on an NV Tesla C2050
OpenCL
C++AMP
© Copyright 2014 HSA Foundation. All Rights Reserved
- 24. EXECUTION ON HSA
C++ AMP
source code
Clang/LLVM
3.3
Device SPIR
C++ AMP
source code
Clang/LLVM
3.3
Host SPIR
HSA Runtime
Compile Time
Runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
- 25. WHAT WE NEED TO DO?
â Kernel function
o emit the kernel function with required arguments
â On Host side
o a function that recursively traverses the object and append the arguments to OpenCL
stack.
â On Device side
o reconstruct it on the device code for future use.
© Copyright 2014 HSA Foundation. All Rights Reserved
- 26. WHY COMPILING C++AMP TO OPENCL IS
NOT TRIVIAL
• C++AMP → LLVM IR → OpenCL C or SPIR
• arguments passing (lambda capture vs. function calls)
• explicit vs. implicit memory transfer
• Heavy lifting is done by compiler and runtime
© Copyright 2014 HSA Foundation. All Rights Reserved
- 27. EXAMPLE
struct A { int a; };struct B : A { int b; };struct C { B b; int c; };
struct C c;
c.c = 100;
auto fn = [=] () { int qq = c.c; };
© Copyright 2014 HSA Foundation. All Rights Reserved
- 28. TRANSLATION
parallel_for_each(product.extent,
[=](index<2> idx) restrict(amp) {
int row = idx[0];
int col = idx[1];
for (int inner = 0; inner < 2; inner++) {
product[idx] += a(row, inner) * b(inner, col);
}
});
/* Generated OpenCL kernel corresponding to the parallel_for_each above:
 * captured variables become kernel arguments, index<2> becomes the
 * global work-item IDs. */
__kernel void
matrixMul(__global float* C, __global float* A,
__global float* B, int wA, int wB){
int tx = get_global_id(0); /* column index into C */
int ty = get_global_id(1); /* row index into C */
float value = 0;
/* dot product over the shared dimension wA */
for (int k = 0; k < wA; ++k)
{
float elementA = A[ty * wA + k];
float elementB = B[k * wB + tx];
value += elementA * elementB;
}
/* NOTE(review): indexes C with wA; correct only when wA == wB (square) */
C[ty * wA + tx] = value;}
â Compiler
â Turn captured variables into
OpenCL arguments
â Populate the index<N> in OCL
kernel
â Runtime
â Implicit memory management
© Copyright 2014 HSA Foundation. All Rights Reserved