ACCELERATING HPC APPLICATIONS
ON NVIDIA GPUS WITH OPENACC
Doug Miles, PGI Compilers & Tools, NVIDIA
High Performance Computing Advisory Council
February 21, 2018
PGI — THE NVIDIA HPC SDK
Fortran, C & C++ Compilers
Optimizing, SIMD Vectorizing, OpenMP
Accelerated Computing Features
CUDA Fortran, OpenACC Directives
Multi-Platform Solution
X86-64 and OpenPOWER Multicore CPUs
NVIDIA Tesla GPUs
Supported on Linux, macOS, Windows
MPI/OpenMP/OpenACC Tools
Debugger
Performance Profiler
Interoperable with DDT, TotalView
Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories

[Figure: GPU developer view with separate system memory and GPU memory, connected by PCIe]
Programming GPU-Accelerated Systems
Separate CPU System and GPU Memories

[Figure: the same developer view with system memory and GPU memory connected by NVLink]
CUDA FORTRAN

Tesla Code
attributes(global) subroutine mm_kernel( A, B, C, N, M, L )
real :: A(N,M), B(M,L), C(N,L), Cij
integer, value :: N, M, L
integer :: i, j, kb, k, tx, ty
real, shared :: Asub(16,16),Bsub(16,16)
tx = threadidx%x
ty = threadidx%y
i = (blockidx%x-1) * 16 + tx
j = (blockidx%y-1) * 16 + ty
Cij = 0.0
do kb = 1, M, 16
Asub(tx,ty) = A(i,kb+ty-1)
Bsub(tx,ty) = B(kb+tx-1,j)
call syncthreads()
do k = 1,16
Cij = Cij + Asub(tx,k) * Bsub(k,ty)
enddo
call syncthreads()
enddo
C(i,j) = Cij
end subroutine mm_kernel
CPU Code

real, device, allocatable, dimension(:,:) :: Adev, Bdev, Cdev
. . .
allocate (Adev(N,M), Bdev(M,L), Cdev(N,L))
Adev = A(1:N,1:M)
Bdev = B(1:M,1:L)
call mm_kernel <<<dim3(N/16,L/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L )
C(1:N,1:L) = Cdev
deallocate ( Adev, Bdev, Cdev )
. . .
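The launch above assumes N and L are exact multiples of 16. A minimal sketch of a more general launch, rounding the grid up with ceiling division; the kernel would then also need a bounds guard such as if (i <= N .and. j <= L) before storing C(i,j):

   type(dim3) :: dimGrid, dimBlock
   dimBlock = dim3(16, 16, 1)
   dimGrid  = dim3((N+15)/16, (L+15)/16, 1)   ! round the grid up for any N, L
   call mm_kernel<<<dimGrid, dimBlock>>>( Adev, Bdev, Cdev, N, M, L )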
CUDA FORTRAN
!$CUF KERNEL Directives
module madd_device_module
use cudafor
contains
subroutine madd_dev(a,b,c,sum,n1,n2)
real,dimension(:,:),device :: a,b,c
real :: sum
integer :: n1,n2
type(dim3) :: grid, block
!$cuf kernel do (2) <<<(*,*),(32,4)>>>
do j = 1,n2
do i = 1,n1
a(i,j) = b(i,j) + c(i,j)
sum = sum + a(i,j)
enddo
enddo
end subroutine
end module
Equivalent hand-written CUDA kernels:
module madd_device_module
use cudafor
implicit none
contains
attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2)
real, dimension(:,:) :: a,b,c
real, dimension(:) :: blocksum
integer, value :: n1,n2
integer :: i,j,tindex,tneighbor,bindex
real :: mysum
real, shared :: bsum(256)
! Do this thread's work
mysum = 0.0
do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y
do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x
a(i,j) = b(i,j) + c(i,j)
mysum = mysum + a(i,j) ! accumulates partial sum per thread
enddo
enddo
! Now add up all partial sums for the whole thread block
! Compute this thread's linear index in the thread block
! We assume 256 threads in the thread block
tindex = threadidx%x + (threadidx%y-1)*blockdim%x
! Store this thread's partial sum in the shared memory block
bsum(tindex) = mysum
call syncthreads()
! Accumulate all the partial sums for this thread block to a single value
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
! Store the partial sum for the thread block
bindex = blockidx%x + (blockidx%y-1)*griddim%x
if( tindex == 1 ) blocksum(bindex) = bsum(1)
end subroutine
! Add up partial sums for all thread blocks to a single cumulative sum
attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb)
real, dimension(:) :: blocksum
real :: dsum
integer, value :: nb
real, shared :: bsum(256)
integer :: tindex,tneighbor,i
! Again, we assume 256 threads in the thread block
! accumulate a partial sum for each thread
tindex = threadidx%x
bsum(tindex) = 0.0
do i = tindex, nb, blockdim%x
bsum(tindex) = bsum(tindex) + blocksum(i)
enddo
call syncthreads()
! This code is copied from the previous kernel
! Accumulate all the partial sums for this thread block to a single value
! Since there is only one thread block, this single value is the final result
tneighbor = 128
do while( tneighbor >= 1 )
if( tindex <= tneighbor ) &
bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor)
tneighbor = tneighbor / 2
call syncthreads()
enddo
if( tindex == 1 ) dsum = bsum(1)
end subroutine
subroutine madd_dev(a,b,c,dsum,n1,n2)
real, dimension(:,:), device :: a,b,c
real, device :: dsum
real, dimension(:), allocatable, device :: blocksum
integer :: n1,n2,nb
type(dim3) :: grid, block
integer :: r
! Compute grid/block size; block size must be 256 threads
grid = dim3((n1+31)/32, (n2+7)/8, 1)
block = dim3(32,8,1)
nb = grid%x * grid%y
allocate(blocksum(1:nb))
call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2)
call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb)
r = cudaThreadSynchronize() ! don't deallocate too early
deallocate(blocksum)
end subroutine
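A hypothetical driver sketch for the !$cuf kernel version of madd_dev above; the compiler generates the kernel launch and handles the scalar sum reduction automatically:

   program test_madd
      use cudafor
      use madd_device_module
      implicit none
      integer, parameter :: n1 = 512, n2 = 512
      real :: a(n1,n2), b(n1,n2), c(n1,n2), sum
      real, device :: ad(n1,n2), bd(n1,n2), cd(n1,n2)
      b = 1.0; c = 2.0
      bd = b; cd = c              ! copy inputs to the device
      sum = 0.0
      call madd_dev(ad, bd, cd, sum, n1, n2)
      a = ad                      ! copy the result back to the host
      print *, 'sum = ', sum      ! expect 3.0 * n1 * n2
   end program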
OpenACC Directives

Manage Data Movement
Initiate Parallel Execution
Optimize Loop Mappings
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
CPU, GPU, Manycore
Performance portable
Interoperable
Single source
Incremental
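The same pattern carries over directly to Fortran; a minimal sketch, assuming arrays a, b and c of length n:

   !$acc data copyin(a,b) copyout(c)
      !$acc parallel
      !$acc loop gang vector
      do i = 1, n
         c(i) = a(i) + b(i)
      enddo
      !$acc end parallel
   !$acc end data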
OpenACC for GPUs in a Nutshell

[Figure: arrays a and b resident in GPU memory for the duration of the data region; b is copied in at entry and out at exit, while a is created on the device only]
...
#pragma acc data copy(b[0:n][0:m]) create(a[0:n][0:m])
{
for (iter = 1; iter <= p; ++iter){
#pragma acc parallel loop
for (i = 1; i < n-1; ++i){
for (j = 1; j < m-1; ++j){
a[i][j]=w0*b[i][j]+
w1*(b[i-1][j]+b[i+1][j]+
b[i][j-1]+b[i][j+1])+
w2*(b[i-1][j-1]+b[i-1][j+1]+
b[i+1][j-1]+b[i+1][j+1]);
} }
#pragma acc parallel loop
for( i = 1; i < n-1; ++i )
for( j = 1; j < m-1; ++j )
b[i][j] = a[i][j];
}
}
...
OpenACC is for Multicore, Manycore & GPUs

Multicore CPU
% pgfortran -ta=multicore -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
Generating Multicore code
100, !$acc loop gang
102, Loop is parallelizable
Tesla GPU
% pgfortran -ta=tesla -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
102, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
100, !$acc loop gang, vector(4) ! blockidx%y threadidx%y
102, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
98 !$acc parallel
99 !$acc loop independent
100 do k=y_min-depth,y_max+depth
101 !$acc loop independent
102 do j=1,depth
103 density0(x_min-j,k)=left_density0(left_xmax+1-j,k)
104 enddo
105 enddo
106 !$acc end parallel
SPEC ACCEL 1.2 BENCHMARKS

[Chart: OpenMP 4.5 geomean seconds, Intel 2018 vs PGI 18.1, on 2-socket Skylake (40 cores / 80 threads), 2-socket EPYC (48 cores / 48 threads), and 2-socket Broadwell (40 cores / 80 threads)]
[Chart: OpenACC geomean seconds, PGI 18.1, 2-socket Broadwell vs 1x Volta V100: 4.4x speed-up]

Performance measured February 2018. Skylake: two 20-core Intel Xeon Gold 6148 CPUs @ 2.4GHz w/ 376GB memory, hyperthreading enabled. EPYC: two 24-core AMD EPYC 7451 CPUs @ 2.3GHz w/ 256GB memory. Broadwell: two 20-core Intel Xeon E5-2698 v4 CPUs @ 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX-1 system with two 20-core Intel Xeon E5-2698 v4 CPUs @ 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB GPU @ 1.53GHz. SPEC® is a registered trademark of the Standard Performance Evaluation Corporation (www.spec.org).
OPENACC APPLICATIONS
GAUSSIAN 16
Using OpenACC allowed us to continue development of our fundamental algorithms and software capabilities simultaneously with the GPU-related work. In the end, we could use the same code base for SMP, cluster/network and GPU parallelism. PGI's compilers were essential to the success of our efforts.
Mike Frisch, Ph.D.
President and CEO
Gaussian, Inc.
Project contributors: Roberto Gomperts (NVIDIA), Michael Frisch (Gaussian), Brent Leback (NVIDIA/PGI), Gio


%GPUCPU=0-7=0-7: use GPUs 0-7 with CPUs 0-7 as their controllers. Detailed information is available on the Gaussian website.
ANSYS FLUENT
We’ve effectively used OpenACC for heterogeneous computing in ANSYS Fluent with impressive performance. We’re now applying this work to more of our models and new platforms.
Sunil Sathe
Lead Software Developer
ANSYS Fluent
Image courtesy: ANSYS
VASP
For VASP, OpenACC is the way forward for GPU acceleration. Performance is similar and in some cases better than CUDA C, and OpenACC dramatically decreases GPU development and maintenance efforts. We’re excited to collaborate with NVIDIA and PGI as an early adopter of CUDA Unified Memory.
Prof. Georg Kresse
Computational Materials Physics
University of Vienna
NUMECA FINE/Open

Porting our unstructured C++ CFD solver FINE/Open to GPUs using OpenACC would have been impossible two or three years ago, but OpenACC has developed enough that we’re now getting some really good results.

David Gutzwiller
Lead Software Developer
NUMECA
MPAS-A
Our team has been evaluating OpenACC as a pathway to performance portability for the Model for Prediction Across Scales (MPAS) atmospheric model. Using this approach on the MPAS dynamical core, we have achieved performance on a single P100 GPU equivalent to 2.7 dual-socket Intel Xeon nodes on our new Cheyenne supercomputer.
Richard Loft
Director, Technology Development
NCAR
Image courtesy: NCAR
COSMO

OpenACC made it practical to develop for GPU-based hardware while retaining a single source for almost all the COSMO physics code.

Dr. Oliver Fuhrer
Senior Scientist
MeteoSwiss
GAMERA FOR GPU
With OpenACC and a compute node based on NVIDIA's Tesla P100 GPU, we achieved more than a 14X speed-up over a K computer node running our earthquake disaster simulation code.

Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo Hori, Lalith Wijerathne
The University of Tokyo
Map courtesy University of Tokyo
QUANTUM ESPRESSO
CUDA Fortran gives us the full performance potential of the CUDA programming model and NVIDIA GPUs. !$CUF KERNELS directives give us productivity and source code maintainability. It’s the best of both worlds.
Filippo Spiga
Head of Research Software Engineering
University of Cambridge
OPENACC AND CUDA UNIFIED MEMORY
Programming GPU-Accelerated Systems
CUDA Unified Memory for Dynamically Allocated Data
[Figure: the GPU developer view with separate system and GPU memories over PCIe, alongside the developer view with CUDA Unified Memory presenting a single unified memory]
How CUDA Unified Memory Works on TESLA GPUs
Servicing CPU and GPU Page Faults for Allocatable Data
[Figure: CPU and GPU memory mappings connected by PCIe or NVLink; a page fault on either side migrates pages of the managed array on demand]
__global__
void setValue(char *ptr, int index, char val)
{
ptr[index] = val;
}
cudaMallocManaged(&array, size);
memset(array, 0, size);
setValue<<<...>>>(array, size/2, 5);
...
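CUDA Fortran exposes the same mechanism through the managed attribute; a minimal sketch, assuming an array x of length n:

   integer :: i, n
   real, managed, allocatable :: x(:)
   n = 1024*1024
   allocate(x(n))
   x = 0.0                   ! initialized on the host
   !$cuf kernel do <<< *, * >>>
   do i = 1, n
      x(i) = x(i) + 1.0      ! updated on the device; pages migrate on demand
   enddo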
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
#pragma acc data copyin(a,b) copyout(c)
{
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
}
PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option
C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory
...
#pragma acc parallel
{
#pragma acc loop gang vector
for (i = 0; i < n; ++i) {
c[i] = a[i] + b[i];
...
}
}
...
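The Fortran picture is analogous; a minimal sketch assuming allocatable arrays and compilation with -ta=tesla:managed:

   real, allocatable :: a(:), b(:), c(:)
   integer :: i, n
   n = 1000000
   allocate(a(n), b(n), c(n))    ! placed in CUDA Unified Memory by the compiler
   a = 1.0; b = 2.0
   !$acc parallel loop           ! note: no data clauses required
   do i = 1, n
      c(i) = a(i) + b(i)
   enddo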
Center for Accelerated Application Readiness (CAAR)
Oak Ridge Leadership Computing Facility
IBM POWER9 CPUs
NVIDIA Volta V100 GPUs
GTC: An OpenACC Production Application
The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell production code for turbulence simulation in support of the burning plasma experiment ITER, the crucial next step in the quest for fusion energy.
Being ported for runs on the ORNL Summit supercomputer
http://phoenix.ps.uci.edu/gtc_group
GTC Performance using OpenACC
OpenPOWER | NVLink | Unified Memory | P100 | V100

Speed-up vs a 20-core P8 node:
  P8 + 2x P100, Unified Memory:     6.1x
  P8 + 2x P100, data directives:    5.9x
  P8 + 4x P100, Unified Memory:    12.1x
  P8 + 4x P100, data directives:   12x
  x64 + 4x V100, data directives:  16.5x

P8: IBM POWER8NVL, 2 sockets, 20 cores, NVLink
UM: no data directives in sources, compiled with -ta=tesla:managed
DEEP COPY
Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP

Derived Type 1: members are 3 dynamic, 1 of derived type 2
Derived Type 2: members are 21 dynamic, 1 of derived type 3, 1 of derived type 4
Derived Type 3: members are only static
Derived Type 4: members are 8 dynamic, 4 of derived type 5, 2 of derived type 6
Derived Type 5: members are 3 dynamic
Derived Type 6: members are 8 dynamic

‱ Real-world applications often have complex, aggregate data structures
‱ CUDA Unified Memory can automatically manage Deep Copy, but 
‱ 
 CUDA Unified Memory is only for allocatable data today
FORTRAN AUTOMATIC FULL DEEP COPY
Fortran Derived Types
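A minimal sketch of what automatic full deep copy enables, assuming a hypothetical points type and compilation with a deep-copy option such as -ta=tesla:deepcopy:

   subroutine add_points( n )
      integer :: n, i
      type points
         real, allocatable :: x(:), y(:)
      end type
      type(points) :: p
      allocate(p%x(n), p%y(n))
      p%x = 1.0; p%y = 2.0
      !$acc data copy(p)         ! full deep copy: p%x and p%y move with p
      !$acc parallel loop
      do i = 1, n
         p%x(i) = p%x(i) + p%y(i)
      enddo
      !$acc end data
   end subroutine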
OPENACC 2.6 MANUAL DEEP COPY
typedef struct points {
float* x; float* y; float* z;
int n;
float coef, direction;
} points;
void sub ( int n, float* y ) {
points p;
#pragma acc data create (p)
{
p.n = n;
p.x = ( float*) malloc ( sizeof ( float )*n );
p.y = ( float*) malloc ( sizeof ( float )*n );
p.z = ( float*) malloc ( sizeof ( float )*n );
#pragma acc update device (p.n)
#pragma acc data copyin (p.x[0:n], p.y[0:n])
{
#pragma acc parallel loop
for ( int i = 0; i < p.n; ++i ) p.x[i] += p.y[i];
. . .
Supported Today in PGI Compilers
DRAFT OPENACC 3.0 TRUE DEEP COPY
typedef struct points {
float* x; float* y; float* z;
int n;
float coef, direction;
#pragma acc policy inout(x[0:n],y[0:n])
} points;
void sub ( int n, float* y ) {
points p;
p.n = n;
p.x = ( float*) malloc ( sizeof ( float )*n );
p.y = ( float*) malloc ( sizeof ( float )*n );
p.z = ( float*) malloc ( sizeof ( float )*n );
#pragma acc data copy (p)
{
#pragma acc parallel loop
for ( int i = 0; i < p.n; ++i ) p.x[i] += p.y[i];
. . .
Still in definition by the OpenACC Committee
WHITHER OPENACC?
CLOVERLEAF
AWE Hydrodynamics mini-app, bm32 data set
http://uk-mac.github.io/CloverLeaf

[Chart: speed-up vs a single Haswell core. PGI 18.1 OpenACC and Intel 2018 OpenMP on multicore Haswell, Broadwell and Skylake (roughly 7.6x to 15x), and PGI OpenACC on Kepler, Pascal and Volta V100 GPUs (up to 142x on V100)]

Systems: Haswell: 2x16-core Haswell server, four K80s, CentOS 7.2 (perf-hsw10); Broadwell: 2x20-core Broadwell server, eight P100s (dgx1-prd-01); Broadwell server, eight V100s (dgx07); Skylake: 2x20-core Xeon Gold server (sky-4).
Compilers: Intel 2018.0.128, PGI 18.1
Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7, 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC)
Data compiled by PGI, February 2018.
OPENACC DIRECTIVES FOR GPUS
75 !$ACC KERNELS
76 !$ACC LOOP INDEPENDENT
77 DO k=y_min,y_max
78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux,min_cell_volume,energy_change,recip_volume)
79 DO j=x_min,x_max
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105 ENDDO
106 ENDDO
107 !$ACC END KERNELS
% pgfortran -fast -ta=tesla -Minfo -c PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
79, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
77, !$acc loop gang, vector(4) ! blockidx%y
! threadidx%y
79, !$acc loop gang, vector(32)! blockidx%x
! threadidx%x
...
http://uk-mac.github.io/CloverLeaf
OPENACC DIRECTIVES FOR MULTICORE CPUS
75 !$ACC KERNELS
76 !$ACC LOOP INDEPENDENT
77 DO k=y_min,y_max
78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux,min_cell_volume,energy_change,recip_volume)
79 DO j=x_min,x_max
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105 ENDDO
106 ENDDO
107 !$ACC END KERNELS
% pgfortran -fast -ta=multicore ... PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
Generating Multicore code
77, !$acc loop gang
79, Loop is parallelizable
3 loop-carried redundant expressions removed
with 9 operations and 9 arrays
Innermost loop distributed: 2 new loops
Generated vector SIMD code for the loop
Generated 2 prefetch instructions for the loop
Generated 12 prefetch instructions for the loop
...
http://uk-mac.github.io/CloverLeaf
FORTRAN 2018 DO CONCURRENT
75
76
77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) &
78 LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux,min_cell_volume,energy_change,recip_volume)
79
80
81 left_flux= (xarea(j ,k )*(xvel0(j ,k )+xvel0(j ,k+1) &
82 +xvel0(j ,k )+xvel0(j ,k+1)))*0.25_8*dt*0.5
83 right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
84 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
85 bottom_flux=(yarea(j ,k )*(yvel0(j ,k )+yvel0(j+1,k ) &
86 +yvel0(j ,k )+yvel0(j+1,k )))*0.25_8*dt*0.5
87 top_flux= (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
88 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
89 total_flux=right_flux-left_flux+top_flux-bottom_flux
90
91 volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
92
93 min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
94 ,volume(j,k)+right_flux-left_flux &
95 ,volume(j,k)+top_flux-bottom_flux)
97 recip_volume=1.0/volume(j,k)
99 energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101 energy1(j,k)=energy0(j,k)-energy_change
103 density1(j,k)=density0(j,k)*volume_change(j,k)
105
106 ENDDO
107
Fortran 2018 DO CONCURRENT
+ True Parallel Loops
+ Loop-scope shared/private data
− No support for reductions (see the sketch below)
− No support for atomics
− No support for data management
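To make the reductions gap concrete: a sum like the one below still requires a directive-based model today. A minimal OpenACC sketch, assuming a real array a(n):

   total = 0.0
   !$acc parallel loop reduction(+:total)
   do j = 1, n
      total = total + a(j)
   enddo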
OPENACC FOR EVERYONE
The PGI Community Edition, pgicompilers.com/community

                    Community          Professional       Enterprise
PROGRAMMING MODELS  OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran Compilers and Tools
PLATFORMS           X86, OpenPOWER, NVIDIA GPU
UPDATES             1-2 times a year   6-9 times a year   6-9 times a year
SUPPORT             User Forums        PGI Support        PGI Premier Services
LICENSE             Annual             Perpetual          Volume/Site
PRICE               FREE

Weitere Àhnliche Inhalte

Was ist angesagt?

LFCollab14: Xen vs Xen Automotive
LFCollab14: Xen vs Xen AutomotiveLFCollab14: Xen vs Xen Automotive
LFCollab14: Xen vs Xen AutomotiveThe Linux Foundation
 
ACPI Debugging from Linux Kernel
ACPI Debugging from Linux KernelACPI Debugging from Linux Kernel
ACPI Debugging from Linux KernelSUSE Labs Taipei
 
LLVM Backend Porting
LLVM Backend PortingLLVM Backend Porting
LLVM Backend PortingShiva Chen
 
from Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu Worksfrom Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu WorksZhen Wei
 
ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„
ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„
ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„wata2ki
 
GPU Computing
GPU ComputingGPU Computing
GPU ComputingKhan Mostafa
 
Summary of linux kernel security protections
Summary of linux kernel security protectionsSummary of linux kernel security protections
Summary of linux kernel security protectionsShubham Dubey
 
Boosting I/O Performance with KVM io_uring
Boosting I/O Performance with KVM io_uringBoosting I/O Performance with KVM io_uring
Boosting I/O Performance with KVM io_uringShapeBlue
 
Project ACRN hypervisor introduction
Project ACRN hypervisor introduction Project ACRN hypervisor introduction
Project ACRN hypervisor introduction Project ACRN
 
Linux MMAP & Ioremap introduction
Linux MMAP & Ioremap introductionLinux MMAP & Ioremap introduction
Linux MMAP & Ioremap introductionGene Chang
 
Embedded Linux BSP Training (Intro)
Embedded Linux BSP Training (Intro)Embedded Linux BSP Training (Intro)
Embedded Linux BSP Training (Intro)RuggedBoardGroup
 
Fun with Network Interfaces
Fun with Network InterfacesFun with Network Interfaces
Fun with Network InterfacesKernel TLV
 
Android デバッグ氏ネタ
Android デバッグ氏ネタAndroid デバッグ氏ネタ
Android デバッグ氏ネタl_b__
 
Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)
Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)
Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)Shinya Takamaeda-Y
 
Introduction to char device driver
Introduction to char device driverIntroduction to char device driver
Introduction to char device driverVandana Salve
 

Was ist angesagt? (20)

LFCollab14: Xen vs Xen Automotive
LFCollab14: Xen vs Xen AutomotiveLFCollab14: Xen vs Xen Automotive
LFCollab14: Xen vs Xen Automotive
 
ACPI Debugging from Linux Kernel
ACPI Debugging from Linux KernelACPI Debugging from Linux Kernel
ACPI Debugging from Linux Kernel
 
LLVM Backend Porting
LLVM Backend PortingLLVM Backend Porting
LLVM Backend Porting
 
from Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu Worksfrom Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu Works
 
ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„
ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„
ARM LinuxăźïŒ­ïŒ­ïŒ”ăŻă‚ă‹ă‚Šă«ăă„
 
GPU Computing
GPU ComputingGPU Computing
GPU Computing
 
Summary of linux kernel security protections
Summary of linux kernel security protectionsSummary of linux kernel security protections
Summary of linux kernel security protections
 
Boosting I/O Performance with KVM io_uring
Boosting I/O Performance with KVM io_uringBoosting I/O Performance with KVM io_uring
Boosting I/O Performance with KVM io_uring
 
Project ACRN hypervisor introduction
Project ACRN hypervisor introduction Project ACRN hypervisor introduction
Project ACRN hypervisor introduction
 
L4 Microkernel :: Design Overview
L4 Microkernel :: Design OverviewL4 Microkernel :: Design Overview
L4 Microkernel :: Design Overview
 
Linux MMAP & Ioremap introduction
Linux MMAP & Ioremap introductionLinux MMAP & Ioremap introduction
Linux MMAP & Ioremap introduction
 
Embedded Linux BSP Training (Intro)
Embedded Linux BSP Training (Intro)Embedded Linux BSP Training (Intro)
Embedded Linux BSP Training (Intro)
 
Xvisor: embedded and lightweight hypervisor
Xvisor: embedded and lightweight hypervisorXvisor: embedded and lightweight hypervisor
Xvisor: embedded and lightweight hypervisor
 
Fun with Network Interfaces
Fun with Network InterfacesFun with Network Interfaces
Fun with Network Interfaces
 
Android デバッグ氏ネタ
Android デバッグ氏ネタAndroid デバッグ氏ネタ
Android デバッグ氏ネタ
 
Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)
Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)
Debian Linux on Zynq (Xilinx ARM-SoC FPGA) Setup Flow (Vivado 2015.4)
 
OpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel ComputingOpenCL Heterogeneous Parallel Computing
OpenCL Heterogeneous Parallel Computing
 
Embedded Hypervisor for ARM
Embedded Hypervisor for ARMEmbedded Hypervisor for ARM
Embedded Hypervisor for ARM
 
Linux Internals - Part I
Linux Internals - Part ILinux Internals - Part I
Linux Internals - Part I
 
Introduction to char device driver
Introduction to char device driverIntroduction to char device driver
Introduction to char device driver
 

Ähnlich wie Accelerating HPC Applications on NVIDIA GPUs with OpenACC

GPGPU Computation
GPGPU ComputationGPGPU Computation
GPGPU Computationjtsagata
 
Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...mouhouioui
 
Porting and optimizing UniFrac for GPUs
Porting and optimizing UniFrac for GPUsPorting and optimizing UniFrac for GPUs
Porting and optimizing UniFrac for GPUsIgor Sfiligoi
 
Intro2 Cuda Moayad
Intro2 Cuda MoayadIntro2 Cuda Moayad
Intro2 Cuda MoayadMoayadhn
 
Introduction to CUDA
Introduction to CUDAIntroduction to CUDA
Introduction to CUDARaymond Tay
 
Gpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cudaGpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cudaFerdinand Jamitzky
 
Hardware & Software Platforms for HPC, AI and ML
Hardware & Software Platforms for HPC, AI and MLHardware & Software Platforms for HPC, AI and ML
Hardware & Software Platforms for HPC, AI and MLinside-BigData.com
 
CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...
CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...
CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...Thom Lane
 
CUDA and Caffe for deep learning
CUDA and Caffe for deep learningCUDA and Caffe for deep learning
CUDA and Caffe for deep learningAmgad Muhammad
 
Parallel Implementation of K Means Clustering on CUDA
Parallel Implementation of K Means Clustering on CUDAParallel Implementation of K Means Clustering on CUDA
Parallel Implementation of K Means Clustering on CUDAprithan
 
Introduction to cuda geek camp singapore 2011
Introduction to cuda   geek camp singapore 2011Introduction to cuda   geek camp singapore 2011
Introduction to cuda geek camp singapore 2011Raymond Tay
 
An Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptxAn Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptxAnirudhGarg35
 
Csw2016 wheeler barksdale-gruskovnjak-execute_mypacket
Csw2016 wheeler barksdale-gruskovnjak-execute_mypacketCsw2016 wheeler barksdale-gruskovnjak-execute_mypacket
Csw2016 wheeler barksdale-gruskovnjak-execute_mypacketCanSecWest
 
Distributed Multi-GPU Computing with Dask, CuPy and RAPIDS
Distributed Multi-GPU Computing with Dask, CuPy and RAPIDSDistributed Multi-GPU Computing with Dask, CuPy and RAPIDS
Distributed Multi-GPU Computing with Dask, CuPy and RAPIDSPeterAndreasEntschev
 
Intro to GPGPU Programming with Cuda
Intro to GPGPU Programming with CudaIntro to GPGPU Programming with Cuda
Intro to GPGPU Programming with CudaRob Gillen
 
Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)Alexander Dolbilov
 
Using GPUs to handle Big Data with Java by Adam Roberts.
Using GPUs to handle Big Data with Java by Adam Roberts.Using GPUs to handle Big Data with Java by Adam Roberts.
Using GPUs to handle Big Data with Java by Adam Roberts.J On The Beach
 
lecture11_GPUArchCUDA01.pptx
lecture11_GPUArchCUDA01.pptxlecture11_GPUArchCUDA01.pptx
lecture11_GPUArchCUDA01.pptxssuser413a98
 

Ähnlich wie Accelerating HPC Applications on NVIDIA GPUs with OpenACC (20)

Gpu perf-presentation
Gpu perf-presentationGpu perf-presentation
Gpu perf-presentation
 
GPGPU Computation
GPGPU ComputationGPGPU Computation
GPGPU Computation
 
Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
Etude Ă©ducatif sur les GPUs & CPUs et les architectures paralleles -Programmi...
 
GPU Programming
GPU ProgrammingGPU Programming
GPU Programming
 
Porting and optimizing UniFrac for GPUs
Porting and optimizing UniFrac for GPUsPorting and optimizing UniFrac for GPUs
Porting and optimizing UniFrac for GPUs
 
Intro2 Cuda Moayad
Intro2 Cuda MoayadIntro2 Cuda Moayad
Intro2 Cuda Moayad
 
Introduction to CUDA
Introduction to CUDAIntroduction to CUDA
Introduction to CUDA
 
Gpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cudaGpu workshop cluster universe: scripting cuda
Gpu workshop cluster universe: scripting cuda
 
Hardware & Software Platforms for HPC, AI and ML
Hardware & Software Platforms for HPC, AI and MLHardware & Software Platforms for HPC, AI and ML
Hardware & Software Platforms for HPC, AI and ML
 
CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...
CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...
CIFAR-10 for DAWNBench: Wide ResNets, Mixup Augmentation and "Super Convergen...
 
CUDA and Caffe for deep learning
CUDA and Caffe for deep learningCUDA and Caffe for deep learning
CUDA and Caffe for deep learning
 
Parallel Implementation of K Means Clustering on CUDA
Parallel Implementation of K Means Clustering on CUDAParallel Implementation of K Means Clustering on CUDA
Parallel Implementation of K Means Clustering on CUDA
 
Introduction to cuda geek camp singapore 2011
Introduction to cuda   geek camp singapore 2011Introduction to cuda   geek camp singapore 2011
Introduction to cuda geek camp singapore 2011
 
An Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptxAn Introduction to CUDA-OpenCL - University.pptx
An Introduction to CUDA-OpenCL - University.pptx
 
Csw2016 wheeler barksdale-gruskovnjak-execute_mypacket
Csw2016 wheeler barksdale-gruskovnjak-execute_mypacketCsw2016 wheeler barksdale-gruskovnjak-execute_mypacket
Csw2016 wheeler barksdale-gruskovnjak-execute_mypacket
 
Distributed Multi-GPU Computing with Dask, CuPy and RAPIDS
Distributed Multi-GPU Computing with Dask, CuPy and RAPIDSDistributed Multi-GPU Computing with Dask, CuPy and RAPIDS
Distributed Multi-GPU Computing with Dask, CuPy and RAPIDS
 
Intro to GPGPU Programming with Cuda
Intro to GPGPU Programming with CudaIntro to GPGPU Programming with Cuda
Intro to GPGPU Programming with Cuda
 
Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)
 
Using GPUs to handle Big Data with Java by Adam Roberts.
Using GPUs to handle Big Data with Java by Adam Roberts.Using GPUs to handle Big Data with Java by Adam Roberts.
Using GPUs to handle Big Data with Java by Adam Roberts.
 
lecture11_GPUArchCUDA01.pptx
lecture11_GPUArchCUDA01.pptxlecture11_GPUArchCUDA01.pptx
lecture11_GPUArchCUDA01.pptx
 

Mehr von inside-BigData.com

Major Market Shifts in IT
Major Market Shifts in ITMajor Market Shifts in IT
Major Market Shifts in ITinside-BigData.com
 
Preparing to program Aurora at Exascale - Early experiences and future direct...
Preparing to program Aurora at Exascale - Early experiences and future direct...Preparing to program Aurora at Exascale - Early experiences and future direct...
Preparing to program Aurora at Exascale - Early experiences and future direct...inside-BigData.com
 
Transforming Private 5G Networks
Transforming Private 5G NetworksTransforming Private 5G Networks
Transforming Private 5G Networksinside-BigData.com
 
The Incorporation of Machine Learning into Scientific Simulations at Lawrence...
The Incorporation of Machine Learning into Scientific Simulations at Lawrence...The Incorporation of Machine Learning into Scientific Simulations at Lawrence...
The Incorporation of Machine Learning into Scientific Simulations at Lawrence...inside-BigData.com
 
How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...
How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...
How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...inside-BigData.com
 
Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...
Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...
Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...inside-BigData.com
 
HPC Impact: EDA Telemetry Neural Networks
HPC Impact: EDA Telemetry Neural NetworksHPC Impact: EDA Telemetry Neural Networks
HPC Impact: EDA Telemetry Neural Networksinside-BigData.com
 
Biohybrid Robotic Jellyfish for Future Applications in Ocean Monitoring
Biohybrid Robotic Jellyfish for Future Applications in Ocean MonitoringBiohybrid Robotic Jellyfish for Future Applications in Ocean Monitoring
Biohybrid Robotic Jellyfish for Future Applications in Ocean Monitoringinside-BigData.com
 
Machine Learning for Weather Forecasts
Machine Learning for Weather ForecastsMachine Learning for Weather Forecasts
Machine Learning for Weather Forecastsinside-BigData.com
 
HPC AI Advisory Council Update
HPC AI Advisory Council UpdateHPC AI Advisory Council Update
HPC AI Advisory Council Updateinside-BigData.com
 
Fugaku Supercomputer joins fight against COVID-19
Fugaku Supercomputer joins fight against COVID-19Fugaku Supercomputer joins fight against COVID-19
Fugaku Supercomputer joins fight against COVID-19inside-BigData.com
 
Energy Efficient Computing using Dynamic Tuning
Energy Efficient Computing using Dynamic TuningEnergy Efficient Computing using Dynamic Tuning
Energy Efficient Computing using Dynamic Tuninginside-BigData.com
 
HPC at Scale Enabled by DDN A3i and NVIDIA SuperPOD
HPC at Scale Enabled by DDN A3i and NVIDIA SuperPODHPC at Scale Enabled by DDN A3i and NVIDIA SuperPOD
HPC at Scale Enabled by DDN A3i and NVIDIA SuperPODinside-BigData.com
 
Versal Premium ACAP for Network and Cloud Acceleration
Versal Premium ACAP for Network and Cloud AccelerationVersal Premium ACAP for Network and Cloud Acceleration
Versal Premium ACAP for Network and Cloud Accelerationinside-BigData.com
 
Zettar: Moving Massive Amounts of Data across Any Distance Efficiently
Zettar: Moving Massive Amounts of Data across Any Distance EfficientlyZettar: Moving Massive Amounts of Data across Any Distance Efficiently
Zettar: Moving Massive Amounts of Data across Any Distance Efficientlyinside-BigData.com
 
Scaling TCO in a Post Moore's Era
Scaling TCO in a Post Moore's EraScaling TCO in a Post Moore's Era
Scaling TCO in a Post Moore's Erainside-BigData.com
 
CUDA-Python and RAPIDS for blazing fast scientific computing
CUDA-Python and RAPIDS for blazing fast scientific computingCUDA-Python and RAPIDS for blazing fast scientific computing
CUDA-Python and RAPIDS for blazing fast scientific computinginside-BigData.com
 
Introducing HPC with a Raspberry Pi Cluster
Introducing HPC with a Raspberry Pi ClusterIntroducing HPC with a Raspberry Pi Cluster
Introducing HPC with a Raspberry Pi Clusterinside-BigData.com
 
Overview of HPC Interconnects
Overview of HPC InterconnectsOverview of HPC Interconnects
Overview of HPC Interconnectsinside-BigData.com
 

Mehr von inside-BigData.com (20)

Major Market Shifts in IT
Major Market Shifts in ITMajor Market Shifts in IT
Major Market Shifts in IT
 
Preparing to program Aurora at Exascale - Early experiences and future direct...
Preparing to program Aurora at Exascale - Early experiences and future direct...Preparing to program Aurora at Exascale - Early experiences and future direct...
Preparing to program Aurora at Exascale - Early experiences and future direct...
 
Transforming Private 5G Networks
Transforming Private 5G NetworksTransforming Private 5G Networks
Transforming Private 5G Networks
 
The Incorporation of Machine Learning into Scientific Simulations at Lawrence...
The Incorporation of Machine Learning into Scientific Simulations at Lawrence...The Incorporation of Machine Learning into Scientific Simulations at Lawrence...
The Incorporation of Machine Learning into Scientific Simulations at Lawrence...
 
How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...
How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...
How to Achieve High-Performance, Scalable and Distributed DNN Training on Mod...
 
Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...
Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...
Evolving Cyberinfrastructure, Democratizing Data, and Scaling AI to Catalyze ...
 
HPC Impact: EDA Telemetry Neural Networks
HPC Impact: EDA Telemetry Neural NetworksHPC Impact: EDA Telemetry Neural Networks
HPC Impact: EDA Telemetry Neural Networks
 
Biohybrid Robotic Jellyfish for Future Applications in Ocean Monitoring
Biohybrid Robotic Jellyfish for Future Applications in Ocean MonitoringBiohybrid Robotic Jellyfish for Future Applications in Ocean Monitoring
Biohybrid Robotic Jellyfish for Future Applications in Ocean Monitoring
 
Machine Learning for Weather Forecasts
Machine Learning for Weather ForecastsMachine Learning for Weather Forecasts
Machine Learning for Weather Forecasts
 
HPC AI Advisory Council Update
HPC AI Advisory Council UpdateHPC AI Advisory Council Update
HPC AI Advisory Council Update
 
Fugaku Supercomputer joins fight against COVID-19
Fugaku Supercomputer joins fight against COVID-19Fugaku Supercomputer joins fight against COVID-19
Fugaku Supercomputer joins fight against COVID-19
 
Energy Efficient Computing using Dynamic Tuning
Energy Efficient Computing using Dynamic TuningEnergy Efficient Computing using Dynamic Tuning
Energy Efficient Computing using Dynamic Tuning
 
HPC at Scale Enabled by DDN A3i and NVIDIA SuperPOD
HPC at Scale Enabled by DDN A3i and NVIDIA SuperPODHPC at Scale Enabled by DDN A3i and NVIDIA SuperPOD
HPC at Scale Enabled by DDN A3i and NVIDIA SuperPOD
 
State of ARM-based HPC
State of ARM-based HPCState of ARM-based HPC
State of ARM-based HPC
 
Versal Premium ACAP for Network and Cloud Acceleration
Versal Premium ACAP for Network and Cloud AccelerationVersal Premium ACAP for Network and Cloud Acceleration
Versal Premium ACAP for Network and Cloud Acceleration
 
Zettar: Moving Massive Amounts of Data across Any Distance Efficiently
Zettar: Moving Massive Amounts of Data across Any Distance EfficientlyZettar: Moving Massive Amounts of Data across Any Distance Efficiently
Zettar: Moving Massive Amounts of Data across Any Distance Efficiently
 
Scaling TCO in a Post Moore's Era
Scaling TCO in a Post Moore's EraScaling TCO in a Post Moore's Era
Scaling TCO in a Post Moore's Era
 
CUDA-Python and RAPIDS for blazing fast scientific computing
CUDA-Python and RAPIDS for blazing fast scientific computingCUDA-Python and RAPIDS for blazing fast scientific computing
CUDA-Python and RAPIDS for blazing fast scientific computing
 
Introducing HPC with a Raspberry Pi Cluster
Introducing HPC with a Raspberry Pi ClusterIntroducing HPC with a Raspberry Pi Cluster
Introducing HPC with a Raspberry Pi Cluster
 
Overview of HPC Interconnects
Overview of HPC InterconnectsOverview of HPC Interconnects
Overview of HPC Interconnects
 

KĂŒrzlich hochgeladen

What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?Antenna Manufacturer Coco
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Drew Madelung
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Scriptwesley chun
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...Martijn de Jong
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Igalia
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoffsammart93
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsEnterprise Knowledge
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘RTylerCroy
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Enterprise Knowledge
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreternaman860154
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024The Digital Insurer
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century educationjfdjdjcjdnsjd
 
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking MenDelhi Call girls
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdflior mazor
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonAnna Loughnan Colquhoun
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)Gabriella Davis
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerThousandEyes
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processorsdebabhi2
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfEnterprise Knowledge
 

KĂŒrzlich hochgeladen (20)

What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?What Are The Drone Anti-jamming Systems Technology?
What Are The Drone Anti-jamming Systems Technology?
 
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
Strategies for Unlocking Knowledge Management in Microsoft 365 in the Copilot...
 
Automating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps ScriptAutomating Google Workspace (GWS) & more with Apps Script
Automating Google Workspace (GWS) & more with Apps Script
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
Raspberry Pi 5: Challenges and Solutions in Bringing up an OpenGL/Vulkan Driv...
 
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot TakeoffStrategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
Strategize a Smooth Tenant-to-tenant Migration and Copilot Takeoff
 
IAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI SolutionsIAC 2024 - IA Fast Track to Search Focused AI Solutions
IAC 2024 - IA Fast Track to Search Focused AI Solutions
 
🐬 The future of MySQL is Postgres 🐘
🐬  The future of MySQL is Postgres   🐘🐬  The future of MySQL is Postgres   🐘
🐬 The future of MySQL is Postgres 🐘
 
Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...Driving Behavioral Change for Information Management through Data-Driven Gree...
Driving Behavioral Change for Information Management through Data-Driven Gree...
 
Presentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreterPresentation on how to chat with PDF using ChatGPT code interpreter
Presentation on how to chat with PDF using ChatGPT code interpreter
 
Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024Partners Life - Insurer Innovation Award 2024
Partners Life - Insurer Innovation Award 2024
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century education
 
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
08448380779 Call Girls In Diplomatic Enclave Women Seeking Men
 
GenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdfGenAI Risks & Security Meetup 01052024.pdf
GenAI Risks & Security Meetup 01052024.pdf
 
Data Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt RobisonData Cloud, More than a CDP by Matt Robison
Data Cloud, More than a CDP by Matt Robison
 
A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)A Domino Admins Adventures (Engage 2024)
A Domino Admins Adventures (Engage 2024)
 
How to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected WorkerHow to Troubleshoot Apps for the Modern Connected Worker
How to Troubleshoot Apps for the Modern Connected Worker
 
Exploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone ProcessorsExploring the Future Potential of AI-Enabled Smartphone Processors
Exploring the Future Potential of AI-Enabled Smartphone Processors
 
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdfThe Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
The Role of Taxonomy and Ontology in Semantic Layers - Heather Hedden.pdf
 

Accelerating HPC Applications on NVIDIA GPUs with OpenACC

  • 1. Doug Miles, PGI Compilers & Tools, NVIDIA High Performance Computing Advisory Council February 21, 2018 ACCELERATING HPC APPLICATIONS ON NVIDIA GPUS WITH OPENACC
  • 2. 2 PGI — THE NVIDIA HPC SDK Fortran, C & C++ Compilers Optimizing, SIMD Vectorizing, OpenMP Accelerated Computing Features CUDA Fortran, OpenACC Directives Multi-Platform Solution X86-64 and OpenPOWER Multicore CPUs NVIDIA Tesla GPUs Supported on Linux, macOS, Windows MPI/OpenMP/OpenACC Tools Debugger Performance Profiler Interoperable with DDT, TotalView
  • 3. 3 Programming GPU-Accelerated Systems Separate CPU System and GPU Memories GPU Developer View System Memory GPU Memory PCIe
  • 4. 4 NVLink Programming GPU-Accelerated Systems Separate CPU System and GPU Memories GPU Developer View System Memory GPU Memory
  • 5. 5 attributes(global) subroutine mm_kernel ( A, B, C, N, M, L ) real :: A(N,M), B(M,L), C(N,L), Cij integer, value :: N, M, L integer :: i, j, kb, k, tx, ty real, shared :: Asub(16,16),Bsub(16,16) tx = threadidx%x ty = threadidx%y i = blockidx%x * 16 + tx j = blockidx%y * 16 + ty Cij = 0.0 do kb = 1, M, 16 Asub(tx,ty) = A(i,kb+tx-1) Bsub(tx,ty) = B(kb+ty-1,j) call syncthreads() do k = 1,16 Cij = Cij + Asub(tx,k) * Bsub(k,ty) enddo call syncthreads() enddo C(i,j) = Cij end subroutine mmul_kernel real, device, allocatable, dimension(:,:) :: Adev,Bdev,Cdev . . . allocate (Adev(N,M), Bdev(M,L), Cdev(N,L)) Adev = A(1:N,1:M) Bdev = B(1:M,1:L) call mm_kernel <<<dim3(N/16,M/16),dim3(16,16)>>> ( Adev, Bdev, Cdev, N, M, L ) C(1:N,1:L) = Cdev deallocate ( Adev, Bdev, Cdev ) . . . CPU Code Tesla Code CUDA FORTRAN
  • 6. 6 CUDA FORTRAN module madd_device_module use cudafor contains subroutine madd_dev(a,b,c,sum,n1,n2) real,dimension(:,:),device :: a,b,c real :: sum integer :: n1,n2 type(dim3) :: grid, block !$cuf kernel do (2) <<<(*,*),(32,4)>>> do j = 1,n2 do i = 1,n1 a(i,j) = b(i,j) + c(i,j) sum = sum + a(i,j) enddo enddo end subroutine end module Equivalent hand-written CUDA kernels module madd_device_module use cudafor implicit none contains attributes(global) subroutine madd_kernel(a,b,c,blocksum,n1,n2) real, dimension(:,:) :: a,b,c real, dimension(:) :: blocksum integer, value :: n1,n2 integer :: i,j,tindex,tneighbor,bindex real :: mysum real, shared :: bsum(256) ! Do this thread's work mysum = 0.0 do j = threadidx%y + (blockidx%y-1)*blockdim%y, n2, blockdim%y*griddim%y do i = threadidx%x + (blockidx%x-1)*blockdim%x, n1, blockdim%x*griddim%x a(i,j) = b(i,j) + c(i,j) mysum = mysum + a(i,j) ! accumulates partial sum per thread enddo enddo ! Now add up all partial sums for the whole thread block ! Compute this thread's linear index in the thread block ! We assume 256 threads in the thread block tindex = threadidx%x + (threadidx%y-1)*blockdim%x ! Store this thread's partial sum in the shared memory block bsum(tindex) = mysum call syncthreads() ! Accumulate all the partial sums for this thread block to a single value tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo ! Store the partial sum for the thread block bindex = blockidx%x + (blockidx%y-1)*griddim%x if( tindex == 1 ) blocksum(bindex) = bsum(1) end subroutine ! Add up partial sums for all thread blocks to a single cumulative sum attributes(global) subroutine madd_sum_kernel(blocksum,dsum,nb) real, dimension(:) :: blocksum real :: dsum integer, value :: nb real, shared :: bsum(256) integer :: tindex,tneighbor,i ! Again, we assume 256 threads in the thread block ! accumulate a partial sum for each thread tindex = threadidx%x bsum(tindex) = 0.0 do i = tindex, nb, blockdim%x bsum(tindex) = bsum(tindex) + blocksum(i) enddo call syncthreads() ! This code is copied from the previous kernel ! Accumulate all the partial sums for this thread block to a single value ! Since there is only one thread block, this single value is the final result tneighbor = 128 do while( tneighbor >= 1 ) if( tindex <= tneighbor ) & bsum(tindex) = bsum(tindex) + bsum(tindex+tneighbor) tneighbor = tneighbor / 2 call syncthreads() enddo if( tindex == 1 ) dsum = bsum(1) end subroutine subroutine madd_dev(a,b,c,dsum,n1,n2) real, dimension(:,:), device :: a,b,c real, device :: dsum real, dimension(:), allocatable, device :: blocksum integer :: n1,n2,nb type(dim3) :: grid, block integer :: r ! Compute grid/block size; block size must be 256 threads grid = dim3((n1+31)/32, (n2+7)/8, 1) block = dim3(32,8,1) nb = grid%x * grid%y allocate(blocksum(1:nb)) call madd_kernel<<< grid, block >>>(a,b,c,blocksum,n1,n2) call madd_sum_kernel<<< 1, 256 >>>(blocksum,dsum,nb) r = cudaThreadSynchronize() ! don't deallocate too early deallocate(blocksum) end subroutine !$CUF KERNEL Directives
  • 7. 7 OpenACC Directives Manage Data Movement Initiate Parallel Execution Optimize Loop Mappings #pragma acc data copyin(a,b) copyout(c) { ... #pragma acc parallel { #pragma acc loop gang vector for (i = 0; i < n; ++i) { c[i] = a[i] + b[i]; ... } } ... } CPU, GPU, Manycore Performance portable Interoperable Single source Incremental
  • 8. 8 GPU Memory System Memory ... #pragma acc data copy(b[0:n][0:m]) create(a[0:n][0:m]) { for (iter = 1; iter <= p; ++iter){ #pragma acc parallel loop for (i = 1; i < n-1; ++i){ for (j = 1; j < m-1; ++j){ a[i][j]=w0*b[i][j]+ w1*(b[i-1][j]+b[i+1][j]+ b[i][j-1]+b[i][j+1])+ w2*(b[i-1][j-1]+b[i-1][j+1]+ b[i+1][j-1]+b[i+1][j+1]); } } #pragma acc parallel loop for( i = 1; i < n-1; ++i ) for( j = 1; j < m-1; ++j ) b[i][j] = a[i][j]; } } ... AA BB S2 (B)S1 (B)S1 (B)S2 (B) S1 (B) Sp (B)Sp (B) Sp (B) OpenACC for GPUs in a Nutshell
• 9. 9 OpenACC is for Multicore, Manycore & GPUs

Source (update_tile_halo_kernel.f90):

 98 !$acc parallel
 99 !$acc loop independent
100 do k=y_min-depth,y_max+depth
101   !$acc loop independent
102   do j=1,depth
103     density0(x_min-j,k)=left_density0(left_xmax+1-j,k)
104   enddo
105 enddo
106 !$acc end parallel

Multicore CPU:

% pgfortran -ta=multicore -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
     Generating Multicore code
     100, !$acc loop gang
102, Loop is parallelizable

Tesla GPU:

% pgfortran -ta=tesla -fast -Minfo=acc -c update_tile_halo_kernel.f90
. . .
100, Loop is parallelizable
102, Loop is parallelizable
     Accelerator kernel generated
     Generating Tesla code
     100, !$acc loop gang, vector(4)  ! blockidx%y threadidx%y
     102, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
• 10. 10 SPEC ACCEL 1.2 BENCHMARKS

[Bar charts, geometric-mean seconds (lower is better):
OpenMP 4.5, Intel 2018 vs PGI 18.1: 2-socket Skylake (40 cores / 80 threads), 2-socket EPYC (48 cores / 48 threads), 2-socket Broadwell (40 cores / 80 threads).
OpenACC, PGI 18.1: 2-socket Broadwell vs 1x Volta V100, a 4.4x speedup on the GPU.]

Performance measured February 2018. Skylake: two 20-core Intel Xeon Gold 6148 CPUs @ 2.4GHz w/ 376GB memory, hyperthreading enabled. EPYC: two 24-core AMD EPYC 7451 CPUs @ 2.3GHz w/ 256GB memory. Broadwell: two 20-core Intel Xeon E5-2698 v4 CPUs @ 3.6GHz w/ 256GB memory, hyperthreading enabled. Volta: NVIDIA DGX-1 system with two 20-core Intel Xeon E5-2698 v4 CPUs @ 2.20GHz, 256GB memory, one NVIDIA Tesla V100-SXM2-16GB GPU @ 1.53GHz. SPEC® is a registered trademark of the Standard Performance Evaluation Corporation (www.spec.org).
• 12. 12 GAUSSIAN 16

“Using OpenACC allowed us to continue development of our fundamental algorithms and software capabilities simultaneously with the GPU-related work. In the end, we could use the same code base for SMP, cluster/network and GPU parallelism. PGI's compilers were essential to the success of our efforts.”

Mike Frisch, Ph.D., President and CEO, Gaussian, Inc.

Project contributors: Roberto Gomperts (NVIDIA), Michael Frisch (Gaussian), Brent Leback (NVIDIA/PGI), Gio

%GPUCPU=0-7=0-7   Use GPUs 0-7 with CPUs 0-7 as their controllers.
• 13. 13 ANSYS FLUENT

“We've effectively used OpenACC for heterogeneous computing in ANSYS Fluent with impressive performance. We're now applying this work to more of our models and new platforms.”

Sunil Sathe, Lead Software Developer, ANSYS Fluent

Image courtesy: ANSYS
• 14. 14 VASP

“For VASP, OpenACC is the way forward for GPU acceleration. Performance is similar and in some cases better than CUDA C, and OpenACC dramatically decreases GPU development and maintenance efforts. We're excited to collaborate with NVIDIA and PGI as an early adopter of CUDA Unified Memory.”

Prof. Georg Kresse, Computational Materials Physics, University of Vienna
• 15. 15 NUMECA FINE/Open

“Porting our unstructured C++ CFD solver FINE/Open to GPUs using OpenACC would have been impossible two or three years ago, but OpenACC has developed enough that we're now getting some really good results.”

David Gutzwiller, Lead Software Developer, NUMECA
• 16. 16 MPAS-A

“Our team has been evaluating OpenACC as a pathway to performance portability for the Model for Prediction Across Scales (MPAS) atmospheric model. Using this approach on the MPAS dynamical core, we have achieved performance on a single P100 GPU equivalent to 2.7 dual-socket Intel Xeon nodes on our new Cheyenne supercomputer.”

Richard Loft, Director, Technology Development, NCAR

Image courtesy: NCAR
• 17. 17 COSMO

“OpenACC made it practical to develop for GPU-based hardware while retaining a single source for almost all the COSMO physics code.”

Dr. Oliver Fuhrer, Senior Scientist, MeteoSwiss
• 18. 18 GAMERA FOR GPU

“With OpenACC and a compute node based on NVIDIA's Tesla P100 GPU, we achieved more than a 14x speedup over a K computer node running our earthquake disaster simulation code.”

Takuma Yamaguchi, Kohei Fujita, Tsuyoshi Ichimura, Muneo Hori, Lalith Wijerathne, The University of Tokyo

Map courtesy University of Tokyo
• 19. 19 QUANTUM ESPRESSO

“CUDA Fortran gives us the full performance potential of the CUDA programming model and NVIDIA GPUs. !$CUF KERNELS directives give us productivity and source code maintainability. It's the best of both worlds.”

Filippo Spiga, Head of Research Software Engineering, University of Cambridge
  • 20. 20 OPENACC AND CUDA UNIFIED MEMORY
• 21. 21 Programming GPU-Accelerated Systems
CUDA Unified Memory for Dynamically Allocated Data

[Diagrams: the earlier developer view, with separate System Memory and GPU Memory connected by PCIe, alongside the developer view with CUDA Unified Memory, where dynamically allocated data lives in a single Unified Memory visible to both CPU and GPU.]
• 22. 22 How CUDA Unified Memory Works on Tesla GPUs
Servicing CPU and GPU Page Faults for Allocatable Data

__global__ void setValue(char *ptr, int index, char val)
{
  ptr[index] = val;
}

cudaMallocManaged(&array, size);
memset(array, 0, size);               // CPU touches the pages: CPU page faults migrate them
setValue<<<...>>>(array, size/2, 5);  // GPU touches a page: a GPU page fault migrates it
...

[Diagram: the array's pages migrate between the CPU and GPU memory mappings over PCIe or NVLink as page faults are serviced.]
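CUDA Fortran exposes the same mechanism through the managed attribute. A minimal sketch (not from the deck); the array size and names are arbitrary.

program managed_demo
  use cudafor
  implicit none
  integer, parameter :: n = 1024*1024
  real, allocatable, managed :: x(:)   ! allocated in CUDA Unified Memory
  integer :: istat, i
  allocate(x(n))
  x = 0.0                     ! CPU touch: pages are resident on the host
  !$cuf kernel do(1) <<<*,*>>>
  do i = 1, n
    x(i) = x(i) + 1.0         ! GPU touch: page faults migrate pages to the GPU
  enddo
  istat = cudaDeviceSynchronize()
  print *, x(1), x(n)         ! CPU touch again: pages migrate back on demand
  deallocate(x)
end program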
• 23. 23 PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option: C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory

#pragma acc data copyin(a,b) copyout(c)
{
  ...
  #pragma acc parallel
  {
    #pragma acc loop gang vector
    for (i = 0; i < n; ++i) {
      c[i] = a[i] + b[i];
      ...
    }
  }
  ...
}
• 24. 24 PGI OpenACC and CUDA Unified Memory
Compiling with the -ta=tesla:managed option: C malloc, C++ new, Fortran allocate all mapped to CUDA Unified Memory

The same code as the previous slide, with the data directives removed:

...
#pragma acc parallel
{
  #pragma acc loop gang vector
  for (i = 0; i < n; ++i) {
    c[i] = a[i] + b[i];
    ...
  }
}
...
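The Fortran analogue is an ordinary allocatable array with no data directives at all; the managed build line does the rest. A sketch (not from the deck); the subroutine name, sizes, and build line are assumptions.

subroutine vadd_managed(n)
  implicit none
  integer :: n, i
  real, allocatable :: a(:), b(:), c(:)
  allocate(a(n), b(n), c(n))   ! mapped to CUDA Unified Memory under -ta=tesla:managed
  a = 1.0
  b = 2.0
  !$acc parallel loop gang vector
  do i = 1, n
    c(i) = a(i) + b(i)         ! no copyin/copyout clauses needed
  enddo
  print *, c(1), c(n)
  deallocate(a, b, c)
end subroutine

! Build: pgfortran -fast -ta=tesla:managed -Minfo=accel vadd_managed.f90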
• 25. 25 Center for Accelerated Application Readiness (CAAR)
Oak Ridge Leadership Computing Facility
IBM POWER9 CPUs | NVIDIA Volta V100 GPUs
• 26. 26 GTC: An OpenACC Production Application

The gyrokinetic toroidal code (GTC) is a massively parallel, particle-in-cell production code for turbulence simulation in support of the burning plasma experiment ITER, the crucial next step in the quest for fusion energy. It is being ported for runs on the ORNL Summit supercomputer.

http://phoenix.ps.uci.edu/gtc_group
• 27. 27 GTC Performance using OpenACC
OpenPOWER | NVLink | Unified Memory | P100 | V100

[Bar chart, speedup vs a 20-core POWER8 baseline:
P8 + 2x P100, Unified Memory: 6.1x
P8 + 2x P100, data directives: 5.9x
P8 + 4x P100, Unified Memory: 12.1x
P8 + 4x P100, data directives: 12x
x64 + 4x V100, data directives: 16.5x]

P8: IBM POWER8NVL, 2 sockets, 20 cores, NVLink
UM: no data directives in sources, compiled with -ta=tesla:managed
• 29–30. Managing Aggregate Data Structures with OpenACC
An Example from the OpenACC port of VASP

Derived Type 1: 3 dynamic members, 1 of Derived Type 2
Derived Type 2: 21 dynamic members, 1 of Derived Type 3, 1 of Derived Type 4
Derived Type 3: only static members
Derived Type 4: 8 dynamic members, 4 of Derived Type 5, 2 of Derived Type 6
Derived Type 5: 3 dynamic members
Derived Type 6: 8 dynamic members

• Real-world applications often have complex, aggregate data structures
• CUDA Unified Memory can automatically manage Deep Copy, but CUDA Unified Memory is only for allocatable data today
• 31. 31 FORTRAN AUTOMATIC FULL DEEP COPY
Fortran Derived Types
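To make the title concrete, a sketch of what automatic full deep copy looks like in practice. This is not from the deck and assumes PGI's -ta=tesla:deepcopy suboption, under which the compiler traverses the derived type and moves its allocatable members automatically; the module and names are illustrative.

module points_mod
  type points
    real, allocatable :: x(:), y(:), z(:)
    integer :: n
  end type
end module

subroutine sub(n)
  use points_mod
  implicit none
  integer :: n, i
  type(points) :: p
  p%n = n
  allocate(p%x(n), p%y(n), p%z(n))
  p%x = 1.0
  p%y = 2.0
  ! With automatic full deep copy, one clause moves p
  ! and all of its allocatable members
  !$acc data copy(p)
  !$acc parallel loop
  do i = 1, p%n
    p%x(i) = p%x(i) + p%y(i)
  enddo
  !$acc end data
end subroutine

! Build (assumed): pgfortran -fast -ta=tesla:deepcopy -Minfo=accel sub.f90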
• 32. 32 OPENACC 2.6 MANUAL DEEP COPY
Supported Today in PGI Compilers

typedef struct points {
  float* x; float* y; float* z;
  int n;
  float coef, direction;
} points;

void sub ( int n, float* y )
{
  points p;
  int i;
  #pragma acc data create (p)
  {
    p.n = n;
    p.x = (float*) malloc( sizeof(float)*n );
    p.y = (float*) malloc( sizeof(float)*n );
    p.z = (float*) malloc( sizeof(float)*n );
    #pragma acc update device (p.n)
    #pragma acc data copyin (p.x[0:n], p.y[0:n])
    {
      #pragma acc parallel loop
      for ( i = 0; i < p.n; ++i )
        p.x[i] += p.y[i];
      . . .
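The Fortran rendering of the same OpenACC 2.6 manual pattern, as a sketch (not from the deck), reusing the illustrative points_mod type sketched after slide 31: the parent is copied first, then the members are copied and attached explicitly.

subroutine sub_manual(n)
  use points_mod
  implicit none
  integer :: n, i
  type(points) :: p
  p%n = n
  allocate(p%x(n), p%y(n), p%z(n))
  p%x = 1.0
  p%y = 2.0
  ! Manual deep copy: shallow-copy the parent (including p%n),
  ! then copy the members; OpenACC attaches them to the device copy of p
  !$acc data copyin(p) copy(p%x(1:n)) copyin(p%y(1:n))
  !$acc parallel loop
  do i = 1, p%n
    p%x(i) = p%x(i) + p%y(i)
  enddo
  !$acc end data
  print *, p%x(1)   ! updated on the host by the copy clause
end subroutine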
• 33. 33 DRAFT OPENACC 3.0 TRUE DEEP COPY
Still in definition by the OpenACC Committee

typedef struct points {
  float* x; float* y; float* z;
  int n;
  float coef, direction;
  #pragma acc policy inout(x[0:n], y[0:n])
} points;

void sub ( int n, float* y )
{
  points p;
  int i;
  p.n = n;
  p.x = (float*) malloc( sizeof(float)*n );
  p.y = (float*) malloc( sizeof(float)*n );
  p.z = (float*) malloc( sizeof(float)*n );
  #pragma acc data copy (p)
  {
    #pragma acc parallel loop
    for ( i = 0; i < p.n; ++i )
      p.x[i] += p.y[i];
    . . .
• 35. 35 CLOVERLEAF
AWE Hydrodynamics mini-app, bm32 data set, http://uk-mac.github.io/CloverLeaf

[Bar chart: speedup vs a single Haswell core. PGI 18.1 OpenACC and Intel 2018 OpenMP are comparable on multicore Haswell, Broadwell and Skylake (bars in the 7.6x to 15x range); OpenACC on GPUs reaches 40x on Kepler and up to 142x on Volta V100 (intermediate GPU bars: 67x, 109x).]

Systems: Haswell: 2x16-core Haswell server, four K80s, CentOS 7.2 (perf-hsw10); Broadwell: 2x20-core Broadwell server, eight P100s (dgx1-prd-01); Broadwell server, eight V100s (dgx07); Skylake: 2x20-core Xeon Gold server (sky-4). Compilers: Intel 2018.0.128, PGI 18.1. Benchmark: CloverLeaf v1.3 downloaded from http://uk-mac.github.io/CloverLeaf the week of November 7, 2016; CloverLeaf_Serial; CloverLeaf_ref (MPI+OpenMP); CloverLeaf_OpenACC (MPI+OpenACC). Data compiled by PGI February 2018.
• 36. 36 OPENACC DIRECTIVES FOR GPUS
http://uk-mac.github.io/CloverLeaf

 75 !$ACC KERNELS
 76 !$ACC LOOP INDEPENDENT
 77 DO k=y_min,y_max
 78 !$ACC LOOP INDEPENDENT PRIVATE(right_flux,left_flux,top_flux,bottom_flux,total_flux, &
 78 !$ACC                          min_cell_volume,energy_change,recip_volume)
 79   DO j=x_min,x_max
 81     left_flux=  (xarea(j  ,k )*(xvel0(j  ,k )+xvel0(j  ,k+1) &
 82                 +xvel0(j  ,k )+xvel0(j  ,k+1)))*0.25_8*dt*0.5
 83     right_flux= (xarea(j+1,k )*(xvel0(j+1,k )+xvel0(j+1,k+1) &
 84                 +xvel0(j+1,k )+xvel0(j+1,k+1)))*0.25_8*dt*0.5
 85     bottom_flux=(yarea(j ,k  )*(yvel0(j ,k  )+yvel0(j+1,k  ) &
 86                 +yvel0(j ,k  )+yvel0(j+1,k  )))*0.25_8*dt*0.5
 87     top_flux=   (yarea(j ,k+1)*(yvel0(j ,k+1)+yvel0(j+1,k+1) &
 88                 +yvel0(j ,k+1)+yvel0(j+1,k+1)))*0.25_8*dt*0.5
 89     total_flux=right_flux-left_flux+top_flux-bottom_flux
 91     volume_change(j,k)=volume(j,k)/(volume(j,k)+total_flux)
 93     min_cell_volume=MIN(volume(j,k)+right_flux-left_flux+top_flux-bottom_flux &
 94                        ,volume(j,k)+right_flux-left_flux &
 95                        ,volume(j,k)+top_flux-bottom_flux)
 97     recip_volume=1.0/volume(j,k)
 99     energy_change=(pressure(j,k)/density0(j,k)+viscosity(j,k)/density0(j,k))*...
101     energy1(j,k)=energy0(j,k)-energy_change
103     density1(j,k)=density0(j,k)*volume_change(j,k)
105   ENDDO
106 ENDDO
107 !$ACC END KERNELS

% pgfortran -fast -ta=tesla -Minfo -c PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
79, Loop is parallelizable
    Accelerator kernel generated
    Generating Tesla code
    77, !$acc loop gang, vector(4)  ! blockidx%y threadidx%y
    79, !$acc loop gang, vector(32) ! blockidx%x threadidx%x
...
• 37. 37 OPENACC DIRECTIVES FOR MULTICORE CPUS
http://uk-mac.github.io/CloverLeaf

Same PdV_kernel.f90 source as the previous slide (lines 75-107), recompiled for multicore:

% pgfortran -fast -ta=multicore ... PdV_kernel.f90
pdv_kernel:
...
77, Loop is parallelizable
    Generating Multicore code
    77, !$acc loop gang
79, Loop is parallelizable
    3 loop-carried redundant expressions removed with 9 operations and 9 arrays
    Innermost loop distributed: 2 new loops
    Generated vector SIMD code for the loop
    Generated 2 prefetch instructions for the loop
    Generated 12 prefetch instructions for the loop
...
• 38. 38 FORTRAN 2018 DO CONCURRENT

The same kernel, written with DO CONCURRENT in place of the directives:

 77 DO CONCURRENT (k=y_min:y_max, j=x_min:x_max) &
 78   LOCAL (right_flux,left_flux,top_flux,bottom_flux,total_flux, &
 78          min_cell_volume,energy_change,recip_volume)
       ... [same loop body as the previous two slides, lines 81-103] ...
106 ENDDO

Fortran 2018 DO CONCURRENT:
+ True parallel loops
+ Loop-scope shared/private data
− No support for reductions
− No support for atomics
− No support for data management
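A self-contained DO CONCURRENT example, as a sketch (not from the deck); names and sizes are arbitrary. Note that LOCAL is a Fortran 2018 locality specifier, and compiler support for it was still emerging when this talk was given.

program do_conc_demo
  implicit none
  integer, parameter :: n = 1000, m = 1000
  real :: a(n,m), b(n,m)
  real :: t
  integer :: i, j
  b = 1.0
  ! Iterations may execute in any order, concurrently;
  ! LOCAL gives every iteration its own private copy of t
  do concurrent (j = 1:m, i = 1:n) local(t)
    t = 2.0*b(i,j)
    a(i,j) = t + 1.0
  end do
  print *, a(1,1), a(n,m)
end program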
• 39. 39 OPENACC FOR EVERYONE
The PGI Community Edition, pgicompilers.com/community

PROGRAMMING MODELS: OpenACC, CUDA Fortran, OpenMP, C/C++/Fortran compilers and tools (all editions)
PLATFORMS: X86, OpenPOWER, NVIDIA GPU (all editions)

              Community          Professional       Enterprise
UPDATES       1-2 times a year   6-9 times a year   6-9 times a year
SUPPORT       User Forums        PGI Support        PGI Premier Services
LICENSE       Annual             Perpetual          Volume/Site
PRICE         FREE