In this deck from the Stanford HPC Conference, Doug Miles from NVIDIA presents: "Accelerating HPC Applications on NVIDIA GPUs with OpenACC."
"OpenACC is a directive-based parallel programming model for GPU accelerated and heterogeneous parallel HPC systems. It offers higher programmer productivity compared to use of explicit models like CUDA and OpenCL.
Application source code instrumented with OpenACC directives remains portable to any system with a standard Fortran/C/C++ compiler, and can be efficiently parallelized for various types of HPC systems — multicore CPUs, heterogeneous CPU+GPU, and manycore processors.
This talk will include an introduction to the OpenACC programming model, provide examples of its use in a number of production applications, explain how OpenACC and CUDA Unified Memory working together can dramatically simplify GPU programming, and close with a few thoughts on OpenACC future directions."
Watch the video: https://youtu.be/CaE3n89QM8o
Learn more: https://www.openacc.org/
and
http://hpcadvisorycouncil.com
Sign up for our insideHPC Newsletter: http://insidehpc.com/newsletter
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
Â
Accelerating HPC Applications on NVIDIA GPUs with OpenACC
1. Doug Miles, PGI Compilers & Tools, NVIDIA
High Performance Computing Advisory Council
February 21, 2018
ACCELERATING HPC APPLICATIONS
ON NVIDIA GPUS WITH OPENACC
2. 2
PGI — THE NVIDIA HPC SDK
Fortran, C & C++ Compilers
Optimizing, SIMD Vectorizing, OpenMP
Accelerated Computing Features
CUDA Fortran, OpenACC Directives
Multi-Platform Solution
X86-64 and OpenPOWER Multicore CPUs
NVIDIA Tesla GPUs
Supported on Linux, macOS, Windows
MPI/OpenMP/OpenACC Tools
Debugger
Performance Profiler
Interoperable with DDT, TotalView
5. 5
! CUDA Fortran kernel: tiled matrix multiply C = A*B (A is NxM, B is MxL,
! C is NxL) using 16x16 shared-memory tiles. Launch with a 16x16 thread
! block per tile of C; assumes N, M and L are multiples of 16.
attributes(global) subroutine mm_kernel( A, B, C, N, M, L )
  implicit none
  integer, value :: N, M, L
  real :: A(N,M), B(M,L), C(N,L)
  real :: Cij
  integer :: i, j, kb, k, tx, ty
  real, shared :: Asub(16,16), Bsub(16,16)
  tx = threadidx%x
  ty = threadidx%y
  ! Global row/column owned by this thread. blockidx is 1-based in CUDA
  ! Fortran, so subtract 1 before scaling by the tile size.
  i = (blockidx%x-1) * 16 + tx
  j = (blockidx%y-1) * 16 + ty
  Cij = 0.0
  do kb = 1, M, 16
    ! Stage one 16x16 tile of A (row block i) and B (column block j)
    ! into shared memory; each thread loads one element of each tile.
    Asub(tx,ty) = A(i,kb+ty-1)
    Bsub(tx,ty) = B(kb+tx-1,j)
    call syncthreads()   ! both tiles fully loaded before use
    do k = 1,16
      Cij = Cij + Asub(tx,k) * Bsub(k,ty)
    enddo
    call syncthreads()   ! finish reading tiles before next iteration overwrites them
  enddo
  C(i,j) = Cij
end subroutine mm_kernel
real, device, allocatable, dimension(:,:) ::
Adev,Bdev,Cdev
. . .
allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
call mm_kernel <<<dim3(N/16,M/16),dim3(16,16)>>>
( Adev, Bdev, Cdev, N, M, L )
C(1:N,1:L) = Cdev
deallocate ( Adev, Bdev, Cdev )
. . .
CPU Code Tesla Code
CUDA FORTRAN
6. 6
CUDA FORTRAN
module madd_device_module
  use cudafor
  implicit none
contains
  !> Compute a = b + c elementwise over an n1 x n2 domain and accumulate
  !> the running total into sum, entirely on the GPU.
  !>
  !> The !$cuf kernel directive asks the compiler to generate the device
  !> kernel from the nested loops; <<<(*,*),(32,4)>>> lets the compiler
  !> pick the grid shape with 32x4-thread blocks. The scalar accumulation
  !> into sum is recognized by the CUF-kernel machinery as a reduction.
  subroutine madd_dev(a,b,c,sum,n1,n2)
    real, dimension(:,:), device :: a,b,c
    real :: sum
    integer :: n1,n2
    integer :: i, j
    !$cuf kernel do (2) <<<(*,*),(32,4)>>>
    do j = 1,n2
      do i = 1,n1
        a(i,j) = b(i,j) + c(i,j)
        sum = sum + a(i,j)
      enddo
    enddo
  end subroutine
end module
Equivalent
hand-written
CUDA kernels
module madd_device_module
use cudafor
implicit none
contains
! CUDA Fortran kernel: a = b + c elementwise over n1 x n2, plus one
! partial sum of the results per thread block, written to blocksum.
! Assumes each thread block contains exactly 256 threads.
attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
real, dimension(:,:) :: a,b,c
real, dimension(:) :: blocksum
integer, value :: n1,n2
integer :: i,j,tindex,tneighbor,bindex
real :: mysum
real, shared :: bsum(256)
! Do this thread's work
! Both loops step by blockdim*griddim, so the whole n1 x n2 domain is
! covered for any launch geometry, each thread keeping a private mysum.
mysum = 0.0
do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
a(i,j) = b(i,j) + c(i,j)
mysum = mysum + a(i,j) ! accumulates partial sum per thread
enddo
enddo
! Now add up all partial sums for the whole thread block
! Compute this thread's linear index in the thread block
! We assume 256 threads in the thread block
tindex = threadidx%x + (threadidx%y-1)*blockdim%x
! Store this thread's partial sum in the shared memory block
bsum(tindex) = mysum
call syncthreads()
! Accumulate all the partial sums for this thread block to a single value
! Pairwise tree reduction: the active range (1..tneighbor, 1-based)
! halves each pass until bsum(1) holds the block's total. syncthreads
! is outside the if, so all threads reach it.
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
! Store the partial sum for the thread block
! bindex linearizes the 2-D block index across the grid.
bindex = blockidx%x + (blockidx%y-1)*griddim%x
if( tindex == 1 ) blocksum(bindex) = bsum(1)
end subroutine
! Add up partial sums for all thread blocks to a single cumulative sum
attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
real, dimension(:) :: blocksum
real :: dsum
integer, value :: nb
real, shared :: bsum(256)
integer :: tindex,tneighbor,i
! Again, we assume 256 threads in the thread block
! accumulate a partial sum for each thread
tindex = threadidx%x
bsum(tindex) = 0.0
do i = tindex, nb, blockdim%x
bsum(tindex) = bsum(tindex) + blocksum(i)
enddo
call syncthreads()
! This code is copied from the previous kernel
! Accumulate all the partial sums for this thread block to a single value
! Since there is only one thread block, this single value is the final result
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
if( tindex == 1 ) dsum = bsum(1)
end subroutine
!> Host-side driver: computes a = b + c and the total sum of a on the GPU
!> in two stages (per-block partial sums, then a single-block reduction).
!> a, b, c are n1 x n2 device arrays; dsum is a device scalar result.
subroutine madd_dev(a,b,c,dsum,n1,n2)
real, dimension(:,:), device, intent(inout) :: a
real, dimension(:,:), device, intent(in) :: b,c
real, device, intent(out) :: dsum
real, dimension(:), allocatable, device :: blocksum
integer, intent(in) :: n1,n2
integer :: nb
type(dim3) :: grid, block
integer :: r
! Compute grid/block size; block size must be 256 threads
! (both kernels' shared-memory reductions assume exactly 256 = 32x8).
grid = dim3((n1+31)/32, (n2+7)/8, 1)
block = dim3(32,8,1)
nb = grid%x * grid%y
allocate(blocksum(1:nb))
! Stage 1: elementwise add, one partial sum per thread block.
call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
! Stage 2: a single 256-thread block reduces the nb partials into dsum.
call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
! Wait for the kernels so blocksum isn't deallocated too early.
! cudaThreadSynchronize is deprecated; use cudaDeviceSynchronize.
r = cudaDeviceSynchronize()
deallocate(blocksum)
end subroutine
!$CUF KERNEL Directives
13. 13
ANSYS FLUENT
We've effectively used
OpenACC for heterogeneous
computing in ANSYS Fluent
with impressive performance.
We're now applying this work
to more of our models and
new platforms.
Sunil Sathe
Lead Software Developer
ANSYS Fluent
Image courtesy: ANSYS
14. 14
VASP
For VASP, OpenACC is the way
forward for GPU acceleration.
Performance is similar and in some
cases better than CUDA C, and
OpenACC dramatically decreases
GPU development and maintenance
efforts. We're excited to collaborate
with NVIDIA and PGI as an early
adopter of CUDA Unified Memory.
Prof. Georg Kresse
Computational Materials Physics
University of Vienna
15. 15
David Gutzwiller
Lead Software Developer
NUMECA
NUMECA FINE/Open
Porting our unstructured C++ CFD
solver FINE/Open to GPUs using
OpenACC would have been
impossible two or three years ago,
but OpenACC has developed
enough that we're now getting
some really good results.
16. 16
MPAS-A
Our team has been evaluating
OpenACC as a pathway to
performance portability for the Model
for Prediction Across Scales (MPAS) atmospheric
model. Using this approach on the
MPAS dynamical core, we have
achieved performance on a single
P100 GPU equivalent to 2.7 dual
socketed Intel Xeon nodes on our new
Cheyenne supercomputer.
Richard Loft
Director, Technology Development
NCAR
Image courtesy: NCAR
17. 17
OpenACC made it practical to
develop for GPU-based hardware
while retaining a single source for
almost all the COSMO physics
code.
Dr. Oliver Fuhrer
Senior Scientist
Meteoswiss
COSMO
18. 18
GAMERA FOR GPU
With OpenACC and a compute
node based on NVIDIA's Tesla
P100 GPU, we achieved more
than a 14X speed up over a K
Computer node running our
earthquake disaster simulation
code
Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo
Hori, Lalith Wijerathne
The University of Tokyo
Map courtesy University of Tokyo
19. 19
QUANTUM ESPRESSO
CUDA Fortran gives us the full
performance potential of the
CUDA programming model and
NVIDIA GPUs. !$CUF KERNELS
directives give us productivity and
source code maintainability. It's
the best of both worlds.
Filippo Spiga
Head of Research Software Engineering
University of Cambridge
21. 21
Programming GPU-Accelerated Systems
CUDA Unified Memory for Dynamically Allocated Data
GPU Developer View With
CUDA Unified Memory
Unified Memory
GPU Developer View
System
Memory
GPU Memory
PCIe
22. 22
How CUDA Unified Memory Works on TESLA GPUs
Servicing CPU and GPU Page Faults for Allocatable Data
GPU Memory MappingCPU Memory Mapping
PCIe or NVLink
Page
Fault
Page
Fault
arrayarray
// CUDA kernel: write a single byte val at ptr[index].
__global__
void setValue(char *ptr, int index, char val)
{
    ptr[index] = val;
}

// Host side (fragment): allocate with CUDA Unified Memory so the same
// pointer is usable from both CPU and GPU; the CPU memset and the GPU
// kernel each fault the pages to their side on demand.
cudaMallocManaged(&array, size);
memset(array, 0, size);   // memset takes (ptr, value, size); the value argument was missing
setValue<<<...>>>(array, size/2, 5);
...
23. 23
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
GPU Developer View With
CUDA Unified Memory
Unified Memory
24. 24
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
GPU Developer View With
CUDA Unified Memory
Unified Memory
25. 25
Center for Accelerated Application
Readiness (CAAR)
Oak Ridge Leadership Computing Facility
IBM POWER9 CPUs
NVIDIA Volta V100 GPUs
26. 26
GTC: An OpenACC Production Application
The gyrokinetic toroidal
code (GTC) is a massively
parallel, particle-in-cell
production code for
turbulence simulation in
support of the burning
plasma experiment ITER,
the crucial next step in the
quest for fusion energy.
Being ported for runs on the ORNL Summit supercomputer
http://phoenix.ps.uci.edu/gtc_group
27. 27
GTC Performance using OpenACC
P8 : IBM POWER8NVL, 2 sockets, 20 cores, NVLINK
UM : No Data Directives in sources, compiled with -ta=tesla:managed
2x
4x
6x
8x
10x
12x
OpenPOWER | NVLink | Unified Memory | P100 | V100
14x
16x
Data Directives Data Directives Data Directives
6.1X 5.9X
12.1X 12X
16.5X
20-core P8 P8+2xP100
UM
P8+2xP100 P8+4xP100
UM
P8+4xP100 x64+4xV100
29. 29
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP
Derived Type 1
Members:
3 dynamic
1 derived type 2
Derived Type 2
Members:
21 dynamic
1 derived type 3
1 derived type 4
Derived Type 3
Members:
only static
Derived Type 4
Members:
8 dynamic
4 derived type 5
2 derived type 6
Derived Type 5
Members:
3 dynamic
Derived Type 6
Members:
8 dynamic
• Real-world applications often have complex,
aggregate data structures
• CUDA Unified Memory can automatically
manage Deep Copy, but …
30. 30
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP
Derived Type 1
Members:
3 dynamic
1 derived type 2
Derived Type 2
Members:
21 dynamic
1 derived type 3
1 derived type 4
Derived Type 3
Members:
only static
Derived Type 4
Members:
8 dynamic
4 derived type 5
2 derived type 6
Derived Type 5
Members:
3 dynamic
Derived Type 6
Members:
8 dynamic
• Real-world applications often have complex,
aggregate data structures
• CUDA Unified Memory can automatically
manage Deep Copy, but …
âą CUDA Unified Memory is only for allocatable
data today
38. 38
FORTRAN 2018 DO CONCURRENT
75
76
77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) &
78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, &
min_cell_volume,energy_change,recip_volume)
79
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105
106 ENDDO
107
Fortran 2018 DO CONCURRENT
+ True Parallel Loops
+ Loop-scope shared/private data
– No support for reductions
– No support for atomics
– No support for data management
39. 39
OPENACC FOR EVERYONE
The PGI Community Edition, pgicompilers.com/community
PROGRAMMING MODELS
OpenACC, CUDA Fortran, OpenMP,
C/C++/Fortran Compilers and Tools
PLATFORMS
X86, OpenPOWER, NVIDIA GPU
UPDATES 1-2 times a year 6-9 times a year 6-9 times a year
SUPPORT User Forums PGI Support
PGI Premier
Services
LICENSE Annual Perpetual Volume/Site
FREE