Matrix Multiplication using CUDA
1. Basic Example: Matrix Multiplication using CUDA
General-purpose Programming of Massively Parallel
Graphics Processors
Shiraz University, Spring 2010
Instructor: Reza Azimi
Some materials/slides are adapted from:
Andreas Moshovos’ Course at the University of Toronto
UIUC course by Wen-Mei Hwu and David Kirk
( 6 07 4 7 6 5 4 32 1 0) 0
A @ 9 8 B A
// Reference CPU implementation of square matrix multiply: P = M * N.
// M, N, P are row-major Width x Width float matrices; P is overwritten.
// Classic O(Width^3) triple loop — used as the golden result to check
// the GPU kernels against.
void MatrixMulOnHost(float* M, float* N, float* P, int Width) {
    for (int i = 0; i < Width; ++i) {
        for (int j = 0; j < Width; ++j) {
            float sum = 0;
            // Dot product of row i of M with column j of N.
            for (int k = 0; k < Width; ++k) {
                float a = M[i * Width + k];
                float b = N[k * Width + j];
                sum += a * b;
            }
            P[i * Width + j] = sum;
        }
    }
}
M P
i
WIDTH
k
Adapted From:
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
(slide 2)
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
1
2. 60 IH 34 4 G F ED
A 8P A A@
// GPU kernel, version 1: one output element per thread, launched as a
// SINGLE thread block. Thread (threadIdx.x, threadIdx.y) computes
// d_P[threadIdx.y][threadIdx.x]. Because only threadIdx is used for
// indexing, Width*Width must not exceed the device's max threads per
// block — the limitation the later multi-block versions remove.
// d_M, d_N, d_P are device pointers to row-major Width x Width floats.
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
    int row = threadIdx.y;
    int col = threadIdx.x;

    float P_val = 0;
    // Dot product of row `row` of d_M with column `col` of d_N.
    // (The OCR of the slide dropped the `<` in the loop condition.)
    for (int k = 0; k < Width; ++k) {
        float M_elem = d_M[row * Width + k];
        float N_elem = d_N[k * Width + col];
        P_val += M_elem * N_elem;
    }
    // Was `d_p` on the slide — C is case-sensitive, the parameter is d_P.
    d_P[row * Width + col] = P_val;
}
WIDTH
k
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
(slide 3)
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
4W 7 60 6 33 ( I2 R
@ AT @ V U AT A 8TP T8T S 8 A
// Host-side driver: copies M and N to device memory, launches the
// single-block matrix-multiply kernel, and copies the result back
// into P. M, N, P are host buffers of Width*Width floats (row-major).
// NOTE: one Width x Width block is only legal while Width*Width stays
// within the device's max threads per block (512 on the hardware this
// deck targets) — the motivation for the multi-block versions below.
void MatrixMulOnDevice(float* M,
                       float* N,
                       float* P,
                       int Width)
{
    int matrix_size = Width * Width * sizeof(float);
    float *d_M, *d_N, *d_P;

    // Allocate and load M and N in device memory.
    // cudaMalloc takes the ADDRESS of the pointer (void**); the slide
    // passed the uninitialized pointer itself, which does not compile.
    cudaMalloc((void**)&d_M, matrix_size);
    cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&d_N, matrix_size);
    cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

    // Allocate P on the device.
    cudaMalloc((void**)&d_P, matrix_size);

    // Setup the execution configuration: one block of Width x Width threads.
    dim3 dimGrid(1, 1);
    dim3 dimBlock(Width, Width);

    // Launch the device computation threads!
    // (The text extraction dropped the <<< >>> launch syntax.)
    MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

    // Copy back the results from device to host.
    // cudaMemcpy synchronizes with the kernel on the default stream.
    cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

    // Free up the device memory matrices.
    cudaFree(d_P);
    cudaFree(d_M);
    cudaFree(d_N);
}
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 5)
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign
Only One Thread Block Used
c
One Block of threads compute Grid 1 d_N
matrix d_P Block 1
2
4
d
Each thread
e
Loads a row of matrix d_M Thread
(2, 2)
2
e
Loads a column of matrix d_N 6
e
Perform one multiply and
addition for each pair of d_M
and d_N elements
e
Computes one element of d_P
3 2 5 4 48
Size of matrix limited by
the number of threads
allowed in a thread block
WIDTH d_P
d_M
Adapted From:
© ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
£
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
©¨© § % $ ! ¦ ©¤ # ! ! ¤ © b 6
3
4. 6 0 I 36) 6r 4 6 4 p p i 4 0D
A 8 s@ @ q UT @ V PT Y hg
threadIdx.x TILE_WIDTH
d_P
TILE_
threadIdx.y
7
Each thread is assigned to
a Tile of
TILE_WIDTHxTILE_WIDTH
entries
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 7)
Solution 1: Give Each Thread More
Work
// Solution 1: still ONE thread block, but each thread now computes a
// TILE_WIDTH x TILE_WIDTH tile of d_P instead of a single element,
// multiplying the matrix size a single block can cover.
// Requires TILE_WIDTH to be a compile-time constant defined elsewhere
// in this deck, and Width to be a multiple of blockDim * TILE_WIDTH.
// Drawback (stated on the slide): one block uses only one multiprocessor.
__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
    // This thread owns rows [start_row, end_row) x cols [start_col, end_col).
    int start_row = threadIdx.y * TILE_WIDTH;
    int end_row   = start_row + TILE_WIDTH;
    int start_col = threadIdx.x * TILE_WIDTH;
    int end_col   = start_col + TILE_WIDTH;

    // The OCR dropped the `<` from all three loop conditions below.
    for (int row = start_row; row < end_row; row++) {
        for (int col = start_col; col < end_col; col++) {
            float P_val = 0;
            for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
            }
            // Was `d_p` on the slide — the parameter is d_P.
            d_P[row * Width + col] = P_val;
        }
    }
}
4
5. 63 4 p 4 32 0 3I 47 F 6 0 I 36) 7
w UT @ V 8 q hv A 8 sP
threadIdx.x
blockIdx.x blockDim.x
d_P
blockDim.y
blockIdx.y
9 assigned to a
threadIdx.y thread
assigned to a
thread block
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 9)
Solution 2: Use Multiple Thread
Blocks
// Solution 2: use MULTIPLE thread blocks. Each thread computes one
// element of d_P; its global coordinates combine the block and thread
// indices, so the matrix size is no longer limited by one block.
// d_M, d_N, d_P are device pointers to row-major Width x Width floats.
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against grid overshoot: when Width is not a multiple of the
    // block dimensions, edge threads would otherwise read/write out of
    // bounds. (Also restores the `<` the OCR dropped from the loop.)
    if (row < Width && col < Width) {
        float P_val = 0;
        for (int k = 0; k < Width; ++k) {
            float M_elem = d_M[row * Width + k];
            float N_elem = d_N[k * Width + col];
            P_val += M_elem * N_elem;
        }
        // Was `d_p` on the slide — the parameter is d_P.
        d_P[row * Width + col] = P_val;
    }
}
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 10)
5
6. 26 60 6 R 34 4G 3I7 4 7
a ` B U AT A 8TP YA A@ 8
// Launch configuration for the multi-block kernel (Solution 2).
// BUG on the slide: block_size = 64 gives 64x64 = 4096 threads per
// block, far above the 512-thread-per-block limit this very deck
// quotes two slides later — that launch always fails. 16x16 = 256
// threads is legal and warp-aligned.
int block_size = 16;
// Setup the execution configuration.
// Assumes Width is a multiple of block_size; otherwise use
// (Width + block_size - 1) / block_size to cover the remainder.
dim3 dimGrid(Width / block_size, Width / block_size);
dim3 dimBlock(block_size, block_size);
// Launch the device computation threads!
// (The text extraction dropped the <<< >>> launch syntax.)
MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);
…
Size of matrix limited by the
number of threads allowed
on a device
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 11)
60 01 0 4 p D 7
A 8T8 ‚ UT @ V € v yV
ƒ
Max Number of Threads per Block: 512
ƒ
Max Number of Blocks per Streaming
Multiprocessor: 8
ƒ
Number of Streaming Multiprocessors: 30
ƒ
Total Number of Threads Available =
30 x 8 x 512 = 122880
Let me double-check this!
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 12)
6
7. 6 0 I 36) 6 4p 0 0„ 16 7
A 8 †V 8 …A A B
threadIdx.x
blockIdx.x blockDim.x
d_P
blockDim.y
blockIdx.y
TILE_WIDTH
13
threadIdx.y
TILE_WIDT
© David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 13)
6 0 I 36) 6 4p 0 0„ 16 7
A 8 †V 8 …A A B
// Combined version: multiple thread blocks AND a TILE_WIDTH x TILE_WIDTH
// tile of d_P per thread. Each block covers blockDim * TILE_WIDTH rows
// and columns. Requires TILE_WIDTH defined elsewhere in this deck, and
// Width a multiple of blockDim * TILE_WIDTH.
__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
    // BUG on the slide: it computed
    //   blockDim.y * blockIdx.y + threadIdx.y * TILE_WIDTH
    // which makes consecutive blocks' tiles overlap (each block spans
    // blockDim * TILE_WIDTH rows, but block starts advanced by only
    // blockDim). The whole global thread coordinate must be scaled by
    // TILE_WIDTH:
    int start_row = (blockIdx.y * blockDim.y + threadIdx.y) * TILE_WIDTH;
    int end_row   = start_row + TILE_WIDTH;
    int start_col = (blockIdx.x * blockDim.x + threadIdx.x) * TILE_WIDTH;
    int end_col   = start_col + TILE_WIDTH;

    // The OCR dropped the `<` from all three loop conditions below.
    for (int row = start_row; row < end_row; row++) {
        for (int col = start_col; col < end_col; col++) {
            float P_val = 0;
            for (int k = 0; k < Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
            }
            // Was `d_p` on the slide — the parameter is d_P.
            d_P[row * Width + col] = P_val;
        }
    }
}
ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign (slide 14)
7