SlideShare ist ein Scribd-Unternehmen logo
1 von 7
Downloaden Sie, um offline zu lesen
Basic Example: Matrix
                  Multiplication using CUDA

                   General-purpose Programming of Massively Parallel
                                 Graphics Processors
                                               Shiraz University, Spring 2010
                                                  Instructor: Reza Azimi


                                Some materials/slides are adapted from:
                          Andreas Moshovos’ Course at the University of Toronto
                              UIUC course by Wen-Mei Hwu and David Kirk
                                                                                                                     




     (      6 07 4    7 6 5 4 32 1 0)                                    0
           A    @  9 8                                                       B A



 void MatrixMulOnHost( float* M, float* N, float* P, int Width) {
   for (int i = 0; i < Width; ++i)                   N
     for (int j = 0; j < Width; ++j) {
         float sum = 0;                                                                                         k
         for (int k = 0; k < Width; ++k) {
                                                                                                                        WIDTH




              float a = M[i * Width + k];              j
              float b = N[k * Width + j];
              sum += a * b;
          }
          P[i * Width + j] = sum;
     }
 }
                                                          M                                         P

                                                                     i
                                                                                                                        WIDTH




                                                              k

Adapted From:
             © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                         £                                           WIDTH
                                                          ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©       WIDTH       '      2
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




                                                                                                                                1
60     IH 34   4 G F ED
     A  8P A       A@




 __global__
 void MatrixMulKernel(float* d_M,
                                                                                                    d_N
                      float* d_N,
                      float* d_P,                                                                                 k
                      int Width) {




                                                                                                                          WIDTH
   int row = threadIdx.y;
   int col = threadIdx.x;                                                                           col
   float P_val = 0;                                                                                 (threadIdx.x)
   for (int k = 0; k  Width; ++k) {
     float M_elem = d_M[row * Width + k];
     float N_elem = d_N[k * Width + col];
     P_val += M_elem * N_elem;
   }                            d_M                                                                 d_P
   d_p[row*Width+col] = P_val;        row
 }                                    (threadIdx.y)




                                                                                                                          WIDTH
                                                            k

             © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                         £                                           WIDTH
                                                          ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©         WIDTH       C      3
Adapted From:
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




     4W 7              60    6 33 (         I2  R
    @     AT @ V U AT A  8TP        T8T S 8   A




   void MatrixMulOnDevice(float* M,
                          float* N,
                          float* P,
                          int Width)
   {
      int matrix_size = Width * Width * sizeof(float);
      float *d_M, *d_N, *d_P;

        // Allocate and Load M and N to device memory
        cudaMalloc(d_M, matrix_size);
        cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice);

        cudaMalloc(d_N, matrix_size);
        cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice);

        // Allocate P on the device
        cudaMalloc(d_P, matrix_size);

             © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
Adapted From:
                         £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                     Q      4
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC




                                                                                                                                  2
26         60    6    R 34   4G                                                             3I7 4        7
     a `   B U AT A  8TP   YA      A@                                                                        8




         // Setup the execution configuration
         dim3 dimGrid(1, 1);
         dim3 dimBlock(Width, Width);


      // Launch the device computation threads!
      MatrixMulKerneldimGrid, dimBlock(d_M, d_N, d_P,
   Width);

         // Copy back the results from device to host
         cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost);

         // Free up the device memory matrices
         cudaFree(d_P);
         cudaFree(d_M);
         cudaFree(d_N);
     © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009
              © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                          £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                          X   5
     ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign




    Only One Thread Block Used
     c
         One block of threads computes                                                     Grid 1                    d_N
         the whole matrix d_P                                                           Block 1
                                                                                                                       2

                                                                                                                       4
     d
         Each thread
           e
               Loads a row of matrix d_M                                                       Thread
                                                                                                (2, 2)
                                                                                                                       2
           e
               Loads a column of matrix d_N                                                                            6
           e
               Performs one multiply and one
               addition for each pair of d_M
               and d_N elements
           e
               Computes one element of d_P


                                                                                 3     2       5         4             48
   Size of matrix limited by
   the number of threads
   allowed in a thread block
                                                                                     WIDTH                           d_P
                                                                                     d_M
Adapted From:
              © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                          £
David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC
                                                           ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                          b   6




                                                                                                                                    3
6 0 I 36)                  6r 4 6     4 p   p   i 4  0D
  A   8                     s@    @  q UT @  V  PT     Y   hg




                                                             threadIdx.x                           TILE_WIDTH


                                                         d_P
                                                                                                                    TILE_


                               threadIdx.y




                                                                                         7

Each thread is assigned to
a Tile of
TILE_WIDTHxTILE_WIDTH
entries


            © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                        £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                f




  Solution 1: Give Each Thread More
  Work
   __global__ void MatrixMulKernel(float* d_M,
                      float* d_N,
                      float* d_P,
                      int Width) {
     int start_row = threadIdx.y * TILE_WIDTH;
     int end_row = start_row + TILE_WIDTH;
     int start_col = threadIdx.x * TILE_WIDTH;
     int end_col = start_col + TILE_WIDTH;

       for (int row = start_row; row  end_row; row++) {
          for(int col = start_col; col  end_col; col++) {
             float P_val = 0;
             for (int k = 0; k  Width; ++k) {
                float M_elem = d_M[row * Width + k];
                float N_elem = d_N[k * Width + col];
                P_val += M_elem * N_elem;
             }                              With one block we utilize
             d_p[row*Width+col] = P_val;    only one multiprocessor!
          }
            © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                        £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                t

       }
   }




                                                                                                                            4
63    4 p   4 32 0 3I   47 F     6 0 I 36)                                                           7
  w UT @  V       8   q      hv A   8                                                           sP


                                                             threadIdx.x

                                    blockIdx.x                                     blockDim.x

       d_P
                                                                                                blockDim.y



      blockIdx.y




                                                              9                                 assigned to a
     threadIdx.y                                                                                thread

                                                                                                assigned to a
                                                                                                thread block



       © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                   £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                  u




Solution 2: Use Multiple Thread
Blocks
/*
 * Solution 2: many thread blocks; each thread computes exactly one
 * element of d_P.  Global (row, col) combine block and thread indices
 * in the standard way, so the grid can cover a matrix of any size.
 *
 * Fixes vs. the scraped slide: '<' restored in the loop condition;
 * final store used undeclared `d_p` instead of `d_P`.  A bounds guard
 * is added so a grid that overshoots the matrix (Width not a multiple
 * of the block size) does not write out of bounds.
 */
__global__
void MatrixMulKernel(float* d_M,
                     float* d_N,
                     float* d_P,
                     int Width) {
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= Width || col >= Width)
    return;  // thread falls outside the matrix: nothing to do

  float P_val = 0;
  for (int k = 0; k < Width; ++k) {
    float M_elem = d_M[row * Width + k];
    float N_elem = d_N[k * Width + col];
    P_val += M_elem * N_elem;
  }
  d_P[row * Width + col] = P_val;
}


       © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                   £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©               x 




                                                                                                                 5
26         60    6    R 34   4G                                                      3I7 4       7
 a `   B U AT A  8TP   YA      A@                                                                8




     // 16 x 16 = 256 threads per block.  The original slide used 64,
     // but 64 x 64 = 4096 threads per block exceeds the 512-threads-
     // per-block limit this very deck states two slides later, so the
     // launch would fail with an invalid-configuration error.
     int block_size = 16;

     // Setup the execution configuration.
     // NOTE(review): Width/block_size is integer division — this
     // assumes Width is a multiple of block_size.
     dim3 dimGrid(Width/block_size, Width/block_size);
     dim3 dimBlock(block_size, block_size);

     // Launch the device computation threads!
     // (the <<< >>> brackets were stripped by the HTML extraction)
     MatrixMulKernel<<<dimGrid, dimBlock>>>(d_M, d_N, d_P, Width);

     …
                                                                            Size of matrix limited by the
                                                                            number of threads allowed
                                                                            on a device



        © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                    £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                     11




  60    01 0    4 p         D                                                    7
 A  8T8     ‚ UT @  V  € v yV




 ƒ
     Max Number of Threads per Block: 512
 ƒ
     Max Number of Blocks per Streaming
     Multiprocessor: 8
 ƒ
     Number of Streaming Multiprocessors: 30
 ƒ
     Total Number of Threads Available =
                30 x 8 x 512 = 122880

                          Let me double-check this!

        © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                    £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                ' 




                                                                                                                      6
6 0 I 36) 6    4p     0  0„ 16                                                   7
    A   8        †V    8 …A A       B



                                                       threadIdx.x
                                blockIdx.x                                    blockDim.x

     d_P
                                                                                                 blockDim.y



     blockIdx.y


                                                                                                 TILE_WIDTH


                                                        13
    threadIdx.y
                                                                                                              TILE_WIDT




          © ¤ © § ¤©  ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡
                      £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                  C 




     6 0 I 36) 6    4p     0  0„ 16                                                   7
    A   8        †V    8 …A A       B




/*
 * Solution 3: multiple blocks AND multiple elements per thread.
 * Each thread computes a TILE_WIDTH x TILE_WIDTH tile of d_P, so each
 * block covers (blockDim.y * TILE_WIDTH) rows by
 * (blockDim.x * TILE_WIDTH) columns.  Requires TILE_WIDTH defined.
 *
 * BUG FIX vs. the slide: the whole global thread index must be scaled
 * by TILE_WIDTH.  The original
 *     start_row = blockDim.y * blockIdx.y + threadIdx.y * TILE_WIDTH;
 * scales only threadIdx, so tiles from different blocks overlap and
 * most of d_P is never written.  Correct form parenthesizes first:
 *     (blockIdx.y * blockDim.y + threadIdx.y) * TILE_WIDTH.
 * Also restored: '<' in the loop conditions; `d_p` -> `d_P`.
 */
__global__ void MatrixMulKernel(float* d_M,
                                float* d_N,
                                float* d_P,
                                int Width) {
  int start_row = (blockIdx.y * blockDim.y + threadIdx.y) * TILE_WIDTH;
  int end_row = start_row + TILE_WIDTH;
  int start_col = (blockIdx.x * blockDim.x + threadIdx.x) * TILE_WIDTH;
  int end_col = start_col + TILE_WIDTH;

  for (int row = start_row; row < end_row; row++) {
    for (int col = start_col; col < end_col; col++) {
      float P_val = 0;
      for (int k = 0; k < Width; ++k) {
        float M_elem = d_M[row * Width + k];
        float N_elem = d_N[k * Width + col];
        P_val += M_elem * N_elem;
      }
      d_P[row * Width + col] = P_val;
    }
  }
}
                      £                                ©¨©  §  % $ ! ¦ ©¤ # ! ! ¤ ©                  Q 




                                                                                                                          7

Weitere ähnliche Inhalte

Ähnlich wie Matrix multiplication using CUDA

Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)
Ray Buse
 

Ähnlich wie Matrix multiplication using CUDA (14)

Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)Automatically Describing Program Structure and Behavior (PhD Defense)
Automatically Describing Program Structure and Behavior (PhD Defense)
 
openFrameworks 007 - 3D
openFrameworks 007 - 3DopenFrameworks 007 - 3D
openFrameworks 007 - 3D
 
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeksBeginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
Beginning direct3d gameprogramming06_firststepstoanimation_20161115_jintaeks
 
Triggering patterns of topology changes in dynamic attributed graphs
Triggering patterns of topology changes in dynamic attributed graphsTriggering patterns of topology changes in dynamic attributed graphs
Triggering patterns of topology changes in dynamic attributed graphs
 
Lab Practices and Works Documentation / Report on Computer Graphics
Lab Practices and Works Documentation / Report on Computer GraphicsLab Practices and Works Documentation / Report on Computer Graphics
Lab Practices and Works Documentation / Report on Computer Graphics
 
CS 354 Transformation, Clipping, and Culling
CS 354 Transformation, Clipping, and CullingCS 354 Transformation, Clipping, and Culling
CS 354 Transformation, Clipping, and Culling
 
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency BandDWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
DWT-DCT-SVD Based Semi Blind Image Watermarking Using Middle Frequency Band
 
10CSL67 CG LAB PROGRAM 1
10CSL67 CG LAB PROGRAM 110CSL67 CG LAB PROGRAM 1
10CSL67 CG LAB PROGRAM 1
 
Writing a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 CanvasWriting a Space Shooter with HTML5 Canvas
Writing a Space Shooter with HTML5 Canvas
 
rgDefense
rgDefensergDefense
rgDefense
 
oop Lecture 5
oop Lecture 5oop Lecture 5
oop Lecture 5
 
Kirti Kumawat
Kirti KumawatKirti Kumawat
Kirti Kumawat
 
2.docx
2.docx2.docx
2.docx
 
Interactive High-Dimensional Visualization of Social Graphs
Interactive High-Dimensional Visualization of Social GraphsInteractive High-Dimensional Visualization of Social Graphs
Interactive High-Dimensional Visualization of Social Graphs
 

Mehr von Piyush Mittal (20)

Power mock
Power mockPower mock
Power mock
 
Design pattern tutorial
Design pattern tutorialDesign pattern tutorial
Design pattern tutorial
 
Reflection
ReflectionReflection
Reflection
 
Gpu archi
Gpu archiGpu archi
Gpu archi
 
Cuda Architecture
Cuda ArchitectureCuda Architecture
Cuda Architecture
 
Intel open mp
Intel open mpIntel open mp
Intel open mp
 
Intro to parallel computing
Intro to parallel computingIntro to parallel computing
Intro to parallel computing
 
Cuda toolkit reference manual
Cuda toolkit reference manualCuda toolkit reference manual
Cuda toolkit reference manual
 
Channel coding
Channel codingChannel coding
Channel coding
 
Basics of Coding Theory
Basics of Coding TheoryBasics of Coding Theory
Basics of Coding Theory
 
Java cheat sheet
Java cheat sheetJava cheat sheet
Java cheat sheet
 
Google app engine cheat sheet
Google app engine cheat sheetGoogle app engine cheat sheet
Google app engine cheat sheet
 
Git cheat sheet
Git cheat sheetGit cheat sheet
Git cheat sheet
 
Vi cheat sheet
Vi cheat sheetVi cheat sheet
Vi cheat sheet
 
Css cheat sheet
Css cheat sheetCss cheat sheet
Css cheat sheet
 
Cpp cheat sheet
Cpp cheat sheetCpp cheat sheet
Cpp cheat sheet
 
Ubuntu cheat sheet
Ubuntu cheat sheetUbuntu cheat sheet
Ubuntu cheat sheet
 
Php cheat sheet
Php cheat sheetPhp cheat sheet
Php cheat sheet
 
oracle 9i cheat sheet
oracle 9i cheat sheetoracle 9i cheat sheet
oracle 9i cheat sheet
 
Open ssh cheet sheat
Open ssh cheet sheatOpen ssh cheet sheat
Open ssh cheet sheat
 

Kürzlich hochgeladen

The basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptxThe basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptx
heathfieldcps1
 
1029-Danh muc Sach Giao Khoa khoi 6.pdf
1029-Danh muc Sach Giao Khoa khoi  6.pdf1029-Danh muc Sach Giao Khoa khoi  6.pdf
1029-Danh muc Sach Giao Khoa khoi 6.pdf
QucHHunhnh
 
Making and Justifying Mathematical Decisions.pdf
Making and Justifying Mathematical Decisions.pdfMaking and Justifying Mathematical Decisions.pdf
Making and Justifying Mathematical Decisions.pdf
Chris Hunter
 
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in DelhiRussian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
kauryashika82
 

Kürzlich hochgeladen (20)

Role Of Transgenic Animal In Target Validation-1.pptx
Role Of Transgenic Animal In Target Validation-1.pptxRole Of Transgenic Animal In Target Validation-1.pptx
Role Of Transgenic Animal In Target Validation-1.pptx
 
The basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptxThe basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptx
 
1029-Danh muc Sach Giao Khoa khoi 6.pdf
1029-Danh muc Sach Giao Khoa khoi  6.pdf1029-Danh muc Sach Giao Khoa khoi  6.pdf
1029-Danh muc Sach Giao Khoa khoi 6.pdf
 
Key note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdfKey note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdf
 
PROCESS RECORDING FORMAT.docx
PROCESS      RECORDING        FORMAT.docxPROCESS      RECORDING        FORMAT.docx
PROCESS RECORDING FORMAT.docx
 
Grant Readiness 101 TechSoup and Remy Consulting
Grant Readiness 101 TechSoup and Remy ConsultingGrant Readiness 101 TechSoup and Remy Consulting
Grant Readiness 101 TechSoup and Remy Consulting
 
Making and Justifying Mathematical Decisions.pdf
Making and Justifying Mathematical Decisions.pdfMaking and Justifying Mathematical Decisions.pdf
Making and Justifying Mathematical Decisions.pdf
 
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in DelhiRussian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
 
General Principles of Intellectual Property: Concepts of Intellectual Proper...
General Principles of Intellectual Property: Concepts of Intellectual  Proper...General Principles of Intellectual Property: Concepts of Intellectual  Proper...
General Principles of Intellectual Property: Concepts of Intellectual Proper...
 
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
 
Unit-V; Pricing (Pharma Marketing Management).pptx
Unit-V; Pricing (Pharma Marketing Management).pptxUnit-V; Pricing (Pharma Marketing Management).pptx
Unit-V; Pricing (Pharma Marketing Management).pptx
 
ICT Role in 21st Century Education & its Challenges.pptx
ICT Role in 21st Century Education & its Challenges.pptxICT Role in 21st Century Education & its Challenges.pptx
ICT Role in 21st Century Education & its Challenges.pptx
 
Measures of Dispersion and Variability: Range, QD, AD and SD
Measures of Dispersion and Variability: Range, QD, AD and SDMeasures of Dispersion and Variability: Range, QD, AD and SD
Measures of Dispersion and Variability: Range, QD, AD and SD
 
INDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptx
INDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptxINDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptx
INDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptx
 
Food Chain and Food Web (Ecosystem) EVS, B. Pharmacy 1st Year, Sem-II
Food Chain and Food Web (Ecosystem) EVS, B. Pharmacy 1st Year, Sem-IIFood Chain and Food Web (Ecosystem) EVS, B. Pharmacy 1st Year, Sem-II
Food Chain and Food Web (Ecosystem) EVS, B. Pharmacy 1st Year, Sem-II
 
Micro-Scholarship, What it is, How can it help me.pdf
Micro-Scholarship, What it is, How can it help me.pdfMicro-Scholarship, What it is, How can it help me.pdf
Micro-Scholarship, What it is, How can it help me.pdf
 
On National Teacher Day, meet the 2024-25 Kenan Fellows
On National Teacher Day, meet the 2024-25 Kenan FellowsOn National Teacher Day, meet the 2024-25 Kenan Fellows
On National Teacher Day, meet the 2024-25 Kenan Fellows
 
Ecological Succession. ( ECOSYSTEM, B. Pharmacy, 1st Year, Sem-II, Environmen...
Ecological Succession. ( ECOSYSTEM, B. Pharmacy, 1st Year, Sem-II, Environmen...Ecological Succession. ( ECOSYSTEM, B. Pharmacy, 1st Year, Sem-II, Environmen...
Ecological Succession. ( ECOSYSTEM, B. Pharmacy, 1st Year, Sem-II, Environmen...
 
Nutritional Needs Presentation - HLTH 104
Nutritional Needs Presentation - HLTH 104Nutritional Needs Presentation - HLTH 104
Nutritional Needs Presentation - HLTH 104
 
Holdier Curriculum Vitae (April 2024).pdf
Holdier Curriculum Vitae (April 2024).pdfHoldier Curriculum Vitae (April 2024).pdf
Holdier Curriculum Vitae (April 2024).pdf
 

Matrix multiplication using CUDA

  • 1. Basic Example: Matrix Multiplication using CUDA General-purpose Programming of Massively Parallel Graphics Processors Shiraz University, Spring 2010 Instructor: Reza Azimi Some materials/slides are adapted from: Andreas Moshovos’ Course at the University of Toronto UIUC course by Wen-Mei Hwu and David Kirk   ( 6 07 4 7 6 5 4 32 1 0) 0 A @ 9 8 B A void MatrixMulOnHost( float* M, float* N, float* P, int Width) { for (int i = 0; i < Width; ++i) N for (int j = 0; j < Width; ++j) { float sum = 0; k for (int k = 0; k < Width; ++k) { WIDTH float a = M[i * Width + k]; j float b = N[k * Width + j]; sum += a * b; } P[i * Width + j] = sum; } } M P i WIDTH k Adapted From: © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ WIDTH ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © WIDTH ' 2 David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC 1
  • 2. 60 IH 34 4 G F ED A 8P A A@ __global__ void MatrixMulKernel(float* d_M, d_N float* d_N, float* d_P, k int Width) { WIDTH int row = threadIdx.y; int col = threadIdx.x; col float P_val = 0; (threadIdx.x) for (int k = 0; k Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } d_M d_P d_p[row*Width+col] = P_val; row } (threadIdx.y) WIDTH k © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ WIDTH ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © WIDTH C 3 Adapted From: David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC 4W 7 60 6 33 ( I2 R @ AT @ V U AT A 8TP T8T S 8 A void MatrixMulOnDevice(float* M, float* N, float* P, int Width) { int matrix_size = Width * Width * sizeof(float); float *d_M, *d_N, *d_P; // Allocate and Load M and N to device memory cudaMalloc(d_M, matrix_size); cudaMemcpy(d_M, M, matrix_size, cudaMemcpyHostToDevice); cudaMalloc(d_N, matrix_size); cudaMemcpy(d_N, N, matrix_size, cudaMemcpyHostToDevice); // Allocate P on the device cudaMalloc(d_P, matrix_size); © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ Adapted From: £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © Q 4 David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC 2
  • 3. 26 60 6 R 34 4G 3I7 4 7 a ` B U AT A 8TP YA A@ 8 // Setup the execution configuration dim3 dimGrid(1, 1); dim3 dimBlock(Width, Width); // Launch the device computation threads! MatrixMulKerneldimGrid, dimBlock(d_M, d_N, d_P, Width); // Copy back the results from device to host cudaMemcpy(P, d_P, matrix_size, cudaMemcpyDeviceToHost); // Free up the device memory matrices cudaFree(d_P); cudaFree(d_M); cudaFree(d_N); © David Kirk/NVIDIA and Wen-mei W. Hwu, 2007-2009 © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © X 5 ECE 498AL Spring 2010, University of Illinois, Urbana-Champaign Only One Thread Block Used c One Block of threads compute Grid 1 d_N matrix d_P Block 1 2 4 d Each thread e Loads a row of matrix d_M Thread (2, 2) 2 e Loads a column of matrix d_N 6 e Perform one multiply and addition for each pair of d_M and d_N elements e Computes one element of d_P 3 2 5 4 48 Size of matrix limited by the number of threads allowed in a thread block WIDTH d_P d_M Adapted From: © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ David Kirk/NVIDIA and Wen-mei W. Hwu, UIUC ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © b 6 3
  • 4. 6 0 I 36) 6r 4 6 4 p p i 4 0D A 8 s@ @ q UT @ V PT Y hg threadIdx.x TILE_WIDTH d_P TILE_ threadIdx.y 7 Each thread is assigned to a Tile of TILE_WIDTHxTILE_WIDTH entries © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © f Solution 1: Give Each Thread More Work __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) { int start_row = threadIdx.y * TILE_WIDTH; int end_row = start_row + TILE_WIDTH; int start_col = threadIdx.x * TILE_WIDTH; int end_col = start_col + TILE_WIDTH; for (int row = start_row; row end_row; row++) { for(int col = start_col; col end_col; col++) { float P_val = 0; for (int k = 0; k Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } With one block we utilize d_p[row*Width+col] = P_val; only one multiprocessor! } © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © t } } 4
  • 5. 63 4 p 4 32 0 3I 47 F 6 0 I 36) 7 w UT @ V 8 q hv A 8 sP threadIdx.x blockIdx.x blockDim.x d_P blockDim.y blockIdx.y 9 assigned to a threadIdx.y thread assigned to a thread block © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © u Solution 2: Use Multiple Thread Blocks __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; float P_val = 0; for (int k = 0; k Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } d_p[row*Width+col] = P_val; } © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © x  5
  • 6. 26 60 6 R 34 4G 3I7 4 7 a ` B U AT A 8TP YA A@ 8 int block_size = 64; // Setup the execution configuration dim3 dimGrid(Width/block_size, Width/block_size); dim3 dimBlock(block_size, block_size); // Launch the device computation threads! MatrixMulKerneldimGrid, dimBlock(d_M, d_N, d_P, Width); … Size of matrix limited by the number of threads allowed on a device © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ ©    11 60 01 0 4 p  D 7 A 8T8 ‚ UT @ V € v yV ƒ Max Number of Threads per Block: 512 ƒ Max Number of Blocks per Streaming Multiprocessor: 8 ƒ Number of Streaming Multiprocessors: 30 ƒ Total Number of Threads Available = 30 x 8 x 512 = 122880 Let me double-check this! © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © '  6
  • 7. 6 0 I 36) 6 4p 0 0„ 16 7 A 8 †V 8 …A A B threadIdx.x blockIdx.x blockDim.x d_P blockDim.y blockIdx.y TILE_WIDTH 13 threadIdx.y TILE_WIDT © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © C  6 0 I 36) 6 4p 0 0„ 16 7 A 8 †V 8 …A A B __global__ void MatrixMulKernel(float* d_M, float* d_N, float* d_P, int Width) { int start_row = blockDim.y * blockIdx.y + threadIdx.y * TILE_WIDTH; int end_row = start_row + TILE_WIDTH; int start_col = blockDim.x * blockIdx.x + threadIdx.x * TILE_WIDTH; int end_col = start_col + TILE_WIDTH; for (int row = start_row; row end_row; row++) { for(int col = start_col; col end_col; col++) { float P_val = 0; for (int k = 0; k Width; ++k) { float M_elem = d_M[row * Width + k]; float N_elem = d_N[k * Width + col]; P_val += M_elem * N_elem; } d_p[row*Width+col] = P_val; } } } © ¤ © § ¤© ¦©¨¨§ ¤ ¦¥ ¤¢ £ ¢¡ £ ©¨© § % $ ! ¦ ©¤ # ! ! ¤ © Q  7