Matrix Multiplication using CUDA
1. Basic Example: Matrix Multiplication using CUDA
General-purpose Programming of Massively Parallel
Graphics Processors
Shiraz University, Spring 2010
Instructor: Reza Azimi
Some materials/slides are adapted from:
Andreas Moshovos’ Course at the University of Toronto
UIUC course by Wen-Mei Hwu and David Kirk
( 6 07 4 7 6 5 4 32 1 0) 0
A @ 9 8 B A
// Reference CPU implementation of square matrix multiply: P = M * N.
// M, N, P are row-major Width x Width float matrices; P is overwritten.
// Classic O(Width^3) triple loop — used as the golden result to check
// the GPU kernels against.
void MatrixMulOnHost(float* M, float* N, float* P, int Width) {
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            float sum = 0;
            // Dot product of row i of M with column j of N.
            for (int k = 0; k < Width; ++k) {
                float a = M[i * Width + k];
                float b = N[k * Width + j];
                sum += a * b;
            }
            P[i * Width + j] = sum;
        }
    }
}
M P
i
WIDTH
k
Adapted From:
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
(slide 2)
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
1
2. 60 IH 34 4 G F ED
A 8P A A@
// GPU kernel, version 1: one output element per thread, launched as a
// SINGLE thread block. Thread (threadIdx.x, threadIdx.y) computes
// d_P[threadIdx.y][threadIdx.x]. Because only threadIdx is used for
// indexing, Width*Width must not exceed the device's max threads per
// block — the limitation the later multi-block versions remove.
// d_M, d_N, d_P are device pointers to row-major Width x Width floats.
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
    int row = threadIdx.y;
    int col = threadIdx.x;

    float P_val = 0;
    // Dot product of row `row` of d_M with column `col` of d_N.
    // (The OCR of the slide dropped the `<` in the loop condition.)
    for (int k = 0; k < Width; ++k) {
        float M_elem = d_M[row * Width + k];
        float N_elem = d_N[k * Width + col];
        P_val += M_elem * N_elem;
    }
    // Was `d_p` on the slide — C is case-sensitive, the parameter is d_P.
    d_P[row * Width + col] = P_val;
}
WIDTH
k
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
(slide 3)
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
4W 7 60 6 33 ( I2 R
@ AT @ V U AT A 8TP T8T S 8 A
// Host-side driver: copies M and N to device memory, launches the
// single-block matrix-multiply kernel, and copies the result back
// into P. M, N, P are host buffers of Width*Width floats (row-major).
// NOTE: one Width x Width block is only legal while Width*Width stays
// within the device's max threads per block (512 on the hardware this
// deck targets) — the motivation for the multi-block versions below.
void MatrixMulOnDevice(float* M,
                       float* N,
                       float* P,
                       int Width)
{
    int matrix_size = Width * Width * sizeof(float);
    float *d_M, *d_N, *d_P;

    // Allocate and load M and N in device memory.
    // cudaMalloc takes the ADDRESS of the pointer (void**); the slide
    // passed the uninitialized pointer itself, which does not compile.
    cudaMalloc((void**)&d_M, matrix_size);
    cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_N, matrix_size);
    cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

    // Allocate P on the device.
    cudaMalloc((void**)&d_P, matrix_size);

    // Setup the execution configuration: one block of Width x Width threads.
    dim3 dimGrid(1, 1);
    dim3 dimBlock(Width, Width);

    // Launch the device computation threads!
    // (The text extraction dropped the <<< >>> launch syntax.)
    MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

    // Copy back the results from device to host.
    // cudaMemcpy synchronizes with the kernel on the default stream.
    cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

    // Free up the device memory matrices.
    cudaFree(d_P);
    cudaFree(d_M);
    cudaFree(d_N);
}
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 5)
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
Only One Thread Block Used
c
One Block of threads compute Grid 1 d_N
matrix d_P Block 1
2
4
d
Each thread
e
Loads a row of matrix d_M Thread
(2, 2)
2
e
Loads a column of matrix d_N 6
e
Perform one multiply and
addition for each pair of d_M
and d_N elements
e
Computes one element of d_P
3 2 5 4 48
Size of matrix limited by
the number of threads
allowed in a thread block
WIDTH d_P
d_M
Adapted From:
© ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
£
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
©¨© § % $ ! ¦ ©¤ # ! ! ¤ © b 6
3
4. 6 0 I 36) 6r 4 6 4 p p i 4 0D
A 8 s@ @ q UT @ V PT Y hg
threadIdx.x TILE_WIDTH
d_P
TILE_
threadIdx.y
7
Each thread is assigned to
a Tile of
TILE_WIDTHxTILE_WIDTH
entries
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 7)
Solution 1: Give Each Thread More
Work
// Solution 1: still ONE thread block, but each thread now computes a
// TILE_WIDTH x TILE_WIDTH tile of d_P instead of a single element,
// multiplying the matrix size a single block can cover.
// Requires TILE_WIDTH to be a compile-time constant defined elsewhere
// in this deck, and Width to be a multiple of blockDim * TILE_WIDTH.
// Drawback (stated on the slide): one block uses only one multiprocessor.
__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
    // This thread owns rows [start_row, end_row) x cols [start_col, end_col).
    int start_row = threadIdx.y * TILE_WIDTH;
    int end_row   = start_row + TILE_WIDTH;
    int start_col = threadIdx.x * TILE_WIDTH;
    int end_col   = start_col + TILE_WIDTH;

    // The OCR dropped the `<` from all three loop conditions below.
    for (int row = start_row; row < end_row; row++) {
        for (int col = start_col; col < end_col; col++) {
            float P_val = 0;
            for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
            }
            // Was `d_p` on the slide — the parameter is d_P.
            d_P[row * Width + col] = P_val;
        }
    }
}
4
5. 63 4 p 4 32 0 3I 47 F 6 0 I 36) 7
w UT @ V 8 q hv A 8 sP
threadIdx.x
blockIdx.x blockDim.x
d_P
blockDim.y
blockIdx.y
9 assigned to a
threadIdx.y thread
assigned to a
thread block
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 9)
Solution 2: Use Multiple Thread
Blocks
// Solution 2: use MULTIPLE thread blocks. Each thread computes one
// element of d_P; its global coordinates combine the block and thread
// indices, so the matrix size is no longer limited by one block.
// d_M, d_N, d_P are device pointers to row-major Width x Width floats.
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against grid overshoot: when Width is not a multiple of the
    // block dimensions, edge threads would otherwise read/write out of
    // bounds. (Also restores the `<` the OCR dropped from the loop.)
    if (row < Width && col < Width) {
        float P_val = 0;
        for (int k = 0; k < Width; ++k) {
            float M_elem = d_M[row * Width + k];
            float N_elem = d_N[k * Width + col];
            P_val += M_elem * N_elem;
        }
        // Was `d_p` on the slide — the parameter is d_P.
        d_P[row * Width + col] = P_val;
    }
}
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 10)
5
6. 26 60 6 R 34 4G 3I7 4 7
a ` B U AT A 8TP YA A@ 8
// Launch configuration for the multi-block kernel (Solution 2).
// BUG on the slide: block_size = 64 gives 64x64 = 4096 threads per
// block, far above the 512-thread-per-block limit this very deck
// quotes two slides later — that launch always fails. 16x16 = 256
// threads is legal and warp-aligned.
int block_size = 16;
// Setup the execution configuration.
// Assumes Width is a multiple of block_size; otherwise use
// (Width + block_size - 1) / block_size to cover the remainder.
dim3 dimGrid(Width / block_size, Width / block_size);
dim3 dimBlock(block_size, block_size);
// Launch the device computation threads!
// (The text extraction dropped the <<< >>> launch syntax.)
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
…
Size of matrix limited by the
number of threads allowed
on a device
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 11)
60 01 0 4 p D 7
A 8T8 ‚ UT @ V € v yV
ƒ
Max Number of Threads per Block: 512
ƒ
Max Number of Blocks per Streaming
Multiprocessor: 8
ƒ
Number of Streaming Multiprocessors: 30
ƒ
Total Number of Threads Available =
30 x 8 x 512 = 122880
Let me double-check this!
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 12)
6
7. 6 0 I 36) 6 4p 0 0„ 16 7
A 8 †V 8 …A A B
threadIdx.x
blockIdx.x blockDim.x
d_P
blockDim.y
blockIdx.y
TILE_WIDTH
13
threadIdx.y
TILE_WIDT
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 13)
6 0 I 36) 6 4p 0 0„ 16 7
A 8 †V 8 …A A B
// Combined version: multiple thread blocks AND a TILE_WIDTH x TILE_WIDTH
// tile of d_P per thread. Each block covers blockDim * TILE_WIDTH rows
// and columns. Requires TILE_WIDTH defined elsewhere in this deck, and
// Width a multiple of blockDim * TILE_WIDTH.
__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
    // BUG on the slide: it computed
    //   blockDim.y * blockIdx.y + threadIdx.y * TILE_WIDTH
    // which makes consecutive blocks' tiles overlap (each block spans
    // blockDim * TILE_WIDTH rows, but block starts advanced by only
    // blockDim). The whole global thread coordinate must be scaled by
    // TILE_WIDTH:
    int start_row = (blockIdx.y * blockDim.y + threadIdx.y) * TILE_WIDTH;
    int end_row   = start_row + TILE_WIDTH;
    int start_col = (blockIdx.x * blockDim.x + threadIdx.x) * TILE_WIDTH;
    int end_col   = start_col + TILE_WIDTH;

    // The OCR dropped the `<` from all three loop conditions below.
    for (int row = start_row; row < end_row; row++) {
        for (int col = start_col; col < end_col; col++) {
            float P_val = 0;
            for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
            }
            // Was `d_p` on the slide — the parameter is d_P.
            d_P[row * Width + col] = P_val;
        }
    }
}
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 14)
7