SlideShare ist ein Scribd-Unternehmen logo
1 von 39
Downloaden Sie, um offline zu lesen
High-Level Language Features for Low-Level Programming


Cyrus Omar
Computer Science Department
Carnegie Mellon University
http://www.cs.cmu.edu/~comar/
The Needs of Scientists and Engineers




ment group) or letting the users/stakeholders know how the                                                                                                             100

software works (open source, scientific paper publication).
            Other                                                                                                                                                               Using Tool is ’Standard’
                                                                                                                                                                           80
      Reason given for use of programming language




                                                         Only language known
3.8                                                  Non-functional requirements




                                                                                                                                                   % of respondents
                                                                                                                                                                                Tool is Easy to Use
                                                           Required                                                                                                        60
  The respondents were asked to rate a series of non-functional                                                                                                                                Open Source




                                                                                                                                      Reason for use of tools
                  Favourite
requirements on the following Likert scale:                                                                                                                                40
                                                                               Performance                                                                                                                     Cost (or lack thereof) of Tool

                                                                                                                                                                           20
  1. very unimportant                                                          Legacy                                                                                                                               Project Organisation

                                                                                       Ease of use                                                                          0
  2. unimportant                                                                                                                                                                                                                       Features




                                                                                                                                                                                 Reliability


                                                                                                                                                                                               Functionality


                                                                                                                                                                                                                  Usability


                                                                                                                                                                                                                              Availability


                                                                                                                                                                                                                                             Flexibility


                                                                                                                                                                                                                                                           Performance*


                                                                                                                                                                                                                                                                            Portability


                                                                                                                                                                                                                                                                                          Testability


                                                                                                                                                                                                                                                                                                        Maintainability


                                                                                                                                                                                                                                                                                                                              Traceability*


                                                                                                                                                                                                                                                                                                                                             Reusability
                                                                                          Developer experience
                                                                                                                                                                                                                                                                          Version Control is ’Required’
  3. neither                                                                              Features

                                                                                                Cross-platform compatibility                                                                                                                                                      Improve Ease of Coding
  4. important
                                                     0     10          20         30             40           50               60                             0                         5
                                                                                                                                                                           Very Unimportant                10    Neither                                                                  15                                       20
                                                                                                                                                                                                                                                                                                                          Very Important
                                                                                                                                                                                Unimportant Number of respondents (out of 46)
                                                                                                                                                                                                              Important
   5. very important Number of respondents
  Figure 7: Reasons for Choice of Programming Lan-                                                                                  Figure 18: Importance of non-functional require-
                                                                                                                                           Figure 9: Reasons why tools are used
   This scale was chosen so that the relative importance of
  guage                                                                                                                             ments as rated by respondents
non-functional requirements could be determined from re-                                                                                                              60
spondents’ answers. A straight ranking of non-functional re-
quirements would only indicate how important respondents
          Modelling                                                                                                                                                   50
                                                                                                                                    Table 1: Combined important and very important
considered each non-functional requirement in comparison
                                                                                                                                    ratings for non-functional requirements
                                                                                                                                      Number of respondents




          Framework
to others, but would not provide any information regard-                                                                                 40
                                                                                                                                        Ranking Requirement       Combined Important
ing how important a non-functional requirement was over-
              Bug/Change Tracking
                                                                                                                                         30                       and Very Important
all. The neutral response of ‘neither’ was included as some
                                                                                                                                                                      Ratings (%)
respondents may not consider a non-functional requirement
             Build Tools
      Tool type




                                                                                                                                         20 1     Reliability             100
or are unaware of it.
              Libraries/Packages                                                                                                            2     Functionality            95
   Non-functional requirements from the Software Require-
                                                                                                                                            3     Maintainability   [Nguyen-Hoan et al, 2010]
                                                                                                                                                                           90
                                                                                                                                         10
ments Specification Data Item described in United States
                             Testing
The State of Scientific Programming Today


  C, Fortran, CUDA, OpenCL                             MATLAB, Python, R, Perl

  Fast                                                 Productive
  Control over memory allocation                       Low syntactic overhead
  Control over data movement                           Read-eval-print loop (REPL)
  Access to hardware primitives                        Flexible data structures and abstractions
  Portability                                          Nice development environments


  Tedious                                              Slow
  Type annotations, templates, pragmas                 Dynamic lookups and indirection abound
  Obtuse compilers, linkers, preprocessors             Automatic memory management can cause problems
  No support for high-level abstractions




       Scientists relieve the tension by:
       • writing overall control flow and basic data analysis routines in a high-level language
       • calling into a low-level language for performance-critical sections (can be annoying)
The State of Scientific Programming Tomorrow


  C, Fortran, CUDA, OpenCL                             MATLAB, Python, R, Perl

  Fast                                                 Productive
  Control over memory allocation                       Low syntactic overhead
  Control over data movement                           Read-eval-print loop (REPL)
  Access to hardware primitives                        Flexible data structures and abstractions
  Portability                                          Nice development environments


  Tedious                                              Slow
  Type annotations, templates, pragmas                 Dynamic lookups and indirection abound
  Obtuse compilers, linkers, preprocessors             Automatic memory management can cause problems
  No support for high-level abstractions




       Scientists relieve any remaining tension by:
       • writing overall control flow and basic data analysis routines in a high-level language
       • calling into cl.oquence for performance-critical sections (can be annoying)
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is an environment where many different compute devices and address
  spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is an environment where many different compute devices and address
  spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is one where many different devices and address spaces must be
  managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is one where many different devices and address spaces must be
  managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  short*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  double*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  ...
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  short*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  double*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  ...
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  short*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  double*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  ...
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_di(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            My photographs tell stories of loss, human struggle, and personal exploration
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       within landscapes scarred by technology and over-use… [I] strive to metaphorically
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                  and poetically link laborious actions, idiosyncratic rituals and strangely
                                                                                                                           crude machines into tales about our modern experience.
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                                                                                      Robert ParkeHarrison

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable                                               These two libraries express the same thing.
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                  The code will run in precisely the same amount of time.
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
Two invocation models
                                               @cl.oquence.fn
 1. Standalone compilation to OpenCL           def	
  ew_op(a,	
  b,	
  dest,	
  op):
     • Use any host language that has OpenCL   	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
       bindings available                      	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
        •C                                     	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
        • C++
        • Fortran                              @cl.oquence.fn
                                               def	
  plus(a,	
  b):
        • MATLAB                               	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
        • Java                                              return	
  a	
  +	
  b
        • .NET
        • Ruby                                 @cl.oquence.fn
                                               def	
  mul(a,	
  b):
        • Python                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
                                                            return	
  a	
  *	
  b

                                               #	
  Programmatically	
  specialize	
  and	
  assign	
  types	
  to	
  
                                               #	
  any	
  externally	
  callable	
  versions	
  you	
  need.

                                               sum	
  =	
  ew_op.specialize(op=plus)
                                               prod	
  =	
  ew_op.specialize(op=mul)

                                               g_int_p	
  =	
  cl_int.global_ptr
                                               g_float_p	
  =	
  cl_float.global_ptr

                                               sum_ff	
  =	
  sum.compile(g_float_p,	
  g_float_p,	
  g_float_p)
                                               sum_ii	
  =	
  sum.compile(g_int_p,	
  g_int_p,	
  g_int_p)
Two invocation models
                                                                                                                       @cl.oquence.fn
 1. Standalone compilation to OpenCL                                                                                   def	
  ew_op(a,	
  b,	
  dest,	
  op):
     • Use any host language that has OpenCL                                                                           	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
       bindings available                                                                                              	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
        •C                                                                                                             	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
        • C++
        • Fortran                                                                                                      @cl.oquence.fn
                                                                                                                       def	
  plus(a,	
  b):
        • MATLAB                                                                                                       	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
        • Java                                                                                                                      return	
  a	
  +	
  b
        • .NET
        • Ruby                                                                                                         @cl.oquence.fn
                                                                                                                       def	
  mul(a,	
  b):
        • Python                                                                                                       	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
                                                                                                                                    return	
  a	
  *	
  b

                                                                                                                       #	
  Programmatically	
  specialize	
  and	
  assign	
  types	
  to	
  
clqcc	
  hello.clq                                                                                                     #	
  any	
  externally	
  callable	
  versions	
  you	
  need.

                                                                                                                       sum	
  =	
  ew_op.specialize(op=plus)
     creates hello.cl:                                                                                                 prod	
  =	
  ew_op.specialize(op=mul)

__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                      g_int_p	
  =	
  cl_int.global_ptr
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
                                                                                                                       g_float_p	
  =	
  cl_float.global_ptr
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                      sum_ff	
  =	
  sum.compile(g_float_p,	
  g_float_p,	
  g_float_p)
                                                                                                                       sum_ii	
  =	
  sum.compile(g_int_p,	
  g_int_p,	
  g_int_p)
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}
Two invocation models
                                                  @cl.oquence.fn
 1. Standalone compilation to OpenCL              def	
  ew_op(a,	
  b,	
  dest,	
  op):
 2. Integrated into a host language               	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy   	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                  	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                  @cl.oquence.fn
                                                  def	
  plus(a,	
  b):
                                                  	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
                                                               return	
  a	
  +	
  b

                                                  @cl.oquence.fn
                                                  def	
  mul(a,	
  b):
                                                  	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
                                                               return	
  a	
  *	
  b

                                                  #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
                                                  a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
                                                  b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                  #	
  transfer	
  data	
  to	
  device
                                                  ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
                                                  a_buf	
  =	
  ctx.to_device(a)
                                                  b_buf	
  =	
  ctx.to_device(b)
                                                  dest_buf	
  =	
  ctx.alloc(like=a)

                                                  #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                  ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus,	
  
                                                  	
  	
  	
  	
  	
  	
  global_size=a.shape,	
  local_size=(256,)).wait()

                                                  #	
  get	
  results
                                                  result	
  =	
  ctx.from_device(dest_buf)

                                                  #	
  check	
  results
                                                  print	la.norm(result	-	(a	+	b))
Two invocation models
                                                      @cl.oquence.fn
 1. Standalone compilation to OpenCL                  def	
  ew_op(a,	
  b,	
  dest,	
  op):
 2. Integrated into a host language                   	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                      #	
  transfer	
  data	
  to	
  device
                                                      ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
                                                      a_buf	
  =	
  ctx.to_device(a)
                                                      b_buf	
  =	
  ctx.to_device(b)
                                                      dest_buf	
  =	
  ctx.alloc(like=a)

                                                      #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                      ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus,	
  
                                                      	
  	
  	
  	
  	
  	
  global_size=a.shape,	
  local_size=(256,)).wait()

                                                      #	
  get	
  results
                                                      result	
  =	
  ctx.from_device(dest_buf)

                                                      #	
  check	
  results
                                                      print	la.norm(result	-	(a	+	b))
Two invocation models
                                                      @cl.oquence.fn
 1. Standalone compilation to OpenCL                  def	
  ew_op(a,	
  b,	
  dest,	
  op):
 2. Integrated into a host language                   	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                      #	
  transfer	
  data	
  to	
  device
                                                      ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
Implicit queue associated with each context.          a_buf	
  =	
  ctx.to_device(a)
                                                      b_buf	
  =	
  ctx.to_device(b)
                                                      dest_buf	
  =	
  ctx.alloc(like=a)

                                                      #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                      ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus,	
  
                                                      	
  	
  	
  	
  	
  	
  global_size=a.shape,	
  local_size=(256,)).wait()

                                                      #	
  get	
  results
                                                      result	
  =	
  ctx.from_device(dest_buf)

                                                      #	
  check	
  results
                                                      print	la.norm(result	-	(a	+	b))
Two invocation models                                 @cl.oquence.auto(lambda	
  a,	
  b,	
  dest,	
  op:	
  a.shape,	
  (256,))
 1. Standalone compilation to OpenCL                  @cl.oquence.fn
 2. Integrated into a host language                   def	
  ew_op(a,	
  b,	
  dest,	
  op):
                                                      	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                      #	
  transfer	
  data	
  to	
  device
                                                      ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
Implicit queue associated with each context.          a_buf	
  =	
  ctx.to_device(a)
                                                      b_buf	
  =	
  ctx.to_device(b)
                                                      dest_buf	
  =	
  ctx.alloc(like=a)

The auto annotation can allow you to hide the         #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
details of parallelization from the user.             ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus).wait()

                                                      #	
  get	
  results
                                                      result	
  =	
  ctx.from_device(dest_buf)

                                                      #	
  check	
  results
                                                      print	la.norm(result	-	(a	+	b))
Two invocation models                                 @cl.oquence.auto(lambda	
  a,	
  b,	
  dest,	
  op:	
  a.shape,	
  (256,))
 1. Standalone compilation to OpenCL                  @cl.oquence.fn
 2. Integrated into a host language                   def	
  ew_op(a,	
  b,	
  dest,	
  op):
                                                      	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
                                                      c	
  =	
  numpy.empty_like(a)

                                                      #	
  create	
  an	
  OpenCL	
  context
Implicit queue associated with each context.          ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)

                                                      #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                      ew_op(In(a),	
  In(b),	
  Out(c),	
  plus).wait()
The auto annotation can allow you to hide the
details of parallelization from the user.             #	
  check	
  results
                                                      print	la.norm(c	-	(a	+	b))


The In, Out and InOut constructs can help
automate data movement when convenient.
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable                                               These two libraries express the same thing.
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                  The code will run in precisely the same amount of time.
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
                                              How?
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable                                                         • cl.oquence.fn code looks like Python, but no!
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
                                                                                                                                     • Same core type system as OpenCL (C99+)
}                                                                                                                                    • Type inference to eliminate type annotations
                                                                                                                                         (not dynamic lookups)
...                                           ...                                                                                    •   Extension inference to eliminate pragmas
                                                                                                                                     •   Higher-order functions (inlined at compile-time)
[De-interleaved from the slide's two-column layout — left column: the generic cl.oquence/OpenCL `prod` source; right column: the compiler-generated specializations `prod_ff` and `prod_ii`. Reconstruction of a garbled extraction; placement of the shared kernel body is inferred.]

// Parallel elementwise product
__kernel void prod(__global float* a, __global float* b,
                   __global float* dest) {
    size_t gid = get_global_id(0);  // Get thread index
    dest[gid] = a[gid] * b[gid];
}

__kernel void prod_ff(__global float* a, __global float* b,
                      __global float* dest) {
    ...
}

__kernel void prod_ii(__global int* a, __global int* b,
                      __global int* dest) {
    ...
}

• Structural polymorphism
• All functions are generic by default
• You can call a function with any arguments that support the operations it uses.
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)

Weitere ähnliche Inhalte

Andere mochten auch

[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...npinto
 
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...npinto
 
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...npinto
 
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...npinto
 
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)npinto
 
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...npinto
 
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)npinto
 
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...npinto
 

Andere mochten auch (8)

[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
 
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
 
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
 
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
 
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
 
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
 
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
 
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
 

Mehr von npinto

"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)npinto
 
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...npinto
 
[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programming[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programmingnpinto
 
[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programming[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programmingnpinto
 
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basicsnpinto
 
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patternsnpinto
 
[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introduction[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introductionnpinto
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...npinto
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)npinto
 
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...npinto
 

Mehr von npinto (16)

"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)
 
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
 
[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programming[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programming
 
[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programming[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programming
 
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
 
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
 
[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introduction[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introduction
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
 
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
 
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
 
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
 

Kürzlich hochgeladen

Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...christianmathematics
 
The basics of sentences session 2pptx copy.pptx
The basics of sentences session 2pptx copy.pptxThe basics of sentences session 2pptx copy.pptx
The basics of sentences session 2pptx copy.pptxheathfieldcps1
 
APM Welcome, APM North West Network Conference, Synergies Across Sectors
APM Welcome, APM North West Network Conference, Synergies Across SectorsAPM Welcome, APM North West Network Conference, Synergies Across Sectors
APM Welcome, APM North West Network Conference, Synergies Across SectorsAssociation for Project Management
 
1029-Danh muc Sach Giao Khoa khoi 6.pdf
1029-Danh muc Sach Giao Khoa khoi  6.pdf1029-Danh muc Sach Giao Khoa khoi  6.pdf
1029-Danh muc Sach Giao Khoa khoi 6.pdfQucHHunhnh
 
Paris 2024 Olympic Geographies - an activity
Paris 2024 Olympic Geographies - an activityParis 2024 Olympic Geographies - an activity
Paris 2024 Olympic Geographies - an activityGeoBlogs
 
fourth grading exam for kindergarten in writing
fourth grading exam for kindergarten in writingfourth grading exam for kindergarten in writing
fourth grading exam for kindergarten in writingTeacherCyreneCayanan
 
Interactive Powerpoint_How to Master effective communication
Interactive Powerpoint_How to Master effective communicationInteractive Powerpoint_How to Master effective communication
Interactive Powerpoint_How to Master effective communicationnomboosow
 
Nutritional Needs Presentation - HLTH 104
Nutritional Needs Presentation - HLTH 104Nutritional Needs Presentation - HLTH 104
Nutritional Needs Presentation - HLTH 104misteraugie
 
Z Score,T Score, Percential Rank and Box Plot Graph
Z Score,T Score, Percential Rank and Box Plot GraphZ Score,T Score, Percential Rank and Box Plot Graph
Z Score,T Score, Percential Rank and Box Plot GraphThiyagu K
 
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...EduSkills OECD
 
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...fonyou31
 
microwave assisted reaction. General introduction
microwave assisted reaction. General introductionmicrowave assisted reaction. General introduction
microwave assisted reaction. General introductionMaksud Ahmed
 
Web & Social Media Analytics Previous Year Question Paper.pdf
Web & Social Media Analytics Previous Year Question Paper.pdfWeb & Social Media Analytics Previous Year Question Paper.pdf
Web & Social Media Analytics Previous Year Question Paper.pdfJayanti Pande
 
The Most Excellent Way | 1 Corinthians 13
The Most Excellent Way | 1 Corinthians 13The Most Excellent Way | 1 Corinthians 13
The Most Excellent Way | 1 Corinthians 13Steve Thomason
 
Key note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdfKey note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdfAdmir Softic
 
1029 - Danh muc Sach Giao Khoa 10 . pdf
1029 -  Danh muc Sach Giao Khoa 10 . pdf1029 -  Danh muc Sach Giao Khoa 10 . pdf
1029 - Danh muc Sach Giao Khoa 10 . pdfQucHHunhnh
 
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in DelhiRussian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhikauryashika82
 

Kürzlich hochgeladen (20)

Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
Explore beautiful and ugly buildings. Mathematics helps us create beautiful d...
 
The basics of sentences session 2pptx copy.pptx
The basics of sentences session 2pptx copy.pptxThe basics of sentences session 2pptx copy.pptx
The basics of sentences session 2pptx copy.pptx
 
APM Welcome, APM North West Network Conference, Synergies Across Sectors
APM Welcome, APM North West Network Conference, Synergies Across SectorsAPM Welcome, APM North West Network Conference, Synergies Across Sectors
APM Welcome, APM North West Network Conference, Synergies Across Sectors
 
1029-Danh muc Sach Giao Khoa khoi 6.pdf
1029-Danh muc Sach Giao Khoa khoi  6.pdf1029-Danh muc Sach Giao Khoa khoi  6.pdf
1029-Danh muc Sach Giao Khoa khoi 6.pdf
 
Paris 2024 Olympic Geographies - an activity
Paris 2024 Olympic Geographies - an activityParis 2024 Olympic Geographies - an activity
Paris 2024 Olympic Geographies - an activity
 
fourth grading exam for kindergarten in writing
fourth grading exam for kindergarten in writingfourth grading exam for kindergarten in writing
fourth grading exam for kindergarten in writing
 
INDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptx
INDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptxINDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptx
INDIA QUIZ 2024 RLAC DELHI UNIVERSITY.pptx
 
Interactive Powerpoint_How to Master effective communication
Interactive Powerpoint_How to Master effective communicationInteractive Powerpoint_How to Master effective communication
Interactive Powerpoint_How to Master effective communication
 
Nutritional Needs Presentation - HLTH 104
Nutritional Needs Presentation - HLTH 104Nutritional Needs Presentation - HLTH 104
Nutritional Needs Presentation - HLTH 104
 
Mattingly "AI & Prompt Design: The Basics of Prompt Design"
Mattingly "AI & Prompt Design: The Basics of Prompt Design"Mattingly "AI & Prompt Design: The Basics of Prompt Design"
Mattingly "AI & Prompt Design: The Basics of Prompt Design"
 
Z Score,T Score, Percential Rank and Box Plot Graph
Z Score,T Score, Percential Rank and Box Plot GraphZ Score,T Score, Percential Rank and Box Plot Graph
Z Score,T Score, Percential Rank and Box Plot Graph
 
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
 
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
 
microwave assisted reaction. General introduction
microwave assisted reaction. General introductionmicrowave assisted reaction. General introduction
microwave assisted reaction. General introduction
 
Web & Social Media Analytics Previous Year Question Paper.pdf
Web & Social Media Analytics Previous Year Question Paper.pdfWeb & Social Media Analytics Previous Year Question Paper.pdf
Web & Social Media Analytics Previous Year Question Paper.pdf
 
The Most Excellent Way | 1 Corinthians 13
The Most Excellent Way | 1 Corinthians 13The Most Excellent Way | 1 Corinthians 13
The Most Excellent Way | 1 Corinthians 13
 
Key note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdfKey note speaker Neum_Admir Softic_ENG.pdf
Key note speaker Neum_Admir Softic_ENG.pdf
 
1029 - Danh muc Sach Giao Khoa 10 . pdf
1029 -  Danh muc Sach Giao Khoa 10 . pdf1029 -  Danh muc Sach Giao Khoa 10 . pdf
1029 - Danh muc Sach Giao Khoa 10 . pdf
 
Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"
Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"
Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"
 
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in DelhiRussian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
 

[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)

  • 1. High-Level Language Features for Low-Level Programming Cyrus Omar Computer Science Department Carnegie Mellon University http://www.cs.cmu.edu/~comar/
  • 2. The Needs of Scientists and Engineers ment group) or letting the users/stakeholders know how the 100 software works (open source, scientific paper publication). Other Using Tool is ’Standard’ 80 Reason given for use of programming language Only language known 3.8 Non-functional requirements % of respondents Tool is Easy to Use Required 60 The respondents were asked to rate a series of non-functional Open Source Reason for use of tools Favourite requirements on the following Likert scale: 40 Performance Cost (or lack thereof) of Tool 20 1. very unimportant Legacy Project Organisation Ease of use 0 2. unimportant Features Reliability Functionality Usability Availability Flexibility Performance* Portability Testability Maintainability Tracability* Reusability Developer experience Version Control is ’Required’ 3. neither Features Cross-platform compatability Improve Ease of Coding 4. important 0 10 20 30 40 50 60 0 5 Very Unimportant 10 Neither 15 20 Very Important Unimportant Number of respondents (out of 46) Important 5. very important Number of respondents Figure 7: Reasons for Choice of Programming Lan- Figure 18: Importance why non-functional require- Figure 9: Reasons of tools are used This scale was chosen so that the relative importance of guage ments as rated by respondents non-functional requirements could be determined from re- 60 spondents’ answers. A straight ranking of non-functional re- quirements would only indicate how important respondents Modelling 50 Table 1: Combined important and very important considered each non-functional requirement in comparison ratings for non-functional requirements Number of respondents Framework to others, but would not provide any information regard- 40 Ranking Requirement Combined Important ing how importantTracking Bug/Change a non-functional requirement was over- 30 and Very Important all. 
The neutral response of ‘neither’ was included as some Ratings (%) respondents may not consider a non-functional requirement Build Tools Tool type 20 1 Reliability 100 or are unaware of it. Libraries/Packages 2 Functionality 95 Non-functional requirements from the Software Require- 3 Maintainability [Nguyen-Hoan et al, 2010] 90 10 ments Specification Data Item described in United States Testing
  • 3. The State of Scientific Programming Today C, Fortran, CUDA, OpenCL MATLAB, Python, R, Perl Fast Productive Control over memory allocation Low syntactic overhead Control over data movement Read-eval-print loop (REPL) Access to hardware primitives Flexible data structures and abstractions Portability Nice development environments Tedious Slow Type annotations, templates, pragmas Dynamic lookups and indirection abound Obtuse compilers, linkers, preprocessors Automatic memory management can cause problems No support for high-level abstractions Scientists relieve the tension by: • writing overall control flow and basic data analysis routines in a high-level language • calling into a low-level language for performance-critical sections (can be annoying)
  • 4. The State of Scientific Programming Tomorrow C, Fortran, CUDA, OpenCL MATLAB, Python, R, Perl Fast Productive Control over memory allocation Low syntactic overhead Control over data movement Read-eval-print loop (REPL) Access to hardware primitives Flexible data structures and abstractions Portability Nice development environments Tedious Slow Type annotations, templates, pragmas Dynamic lookups and indirection abound Obtuse compilers, linkers, preprocessors Automatic memory management can cause problems No support for high-level abstractions Scientists relieve any remaining tension by: • writing overall control flow and basic data analysis routines in a high-level language • calling into cl.oquence for performance-critical sections (can be annoying)
  • 5. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level language. What is a heterogeneous computing environment? A heterogeneous computing environment is an environment where many different compute devices and address spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminates annotational burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
  • 6. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level language. What is a heterogeneous computing environment? A heterogeneous computing environment is an environment where many different compute devices and address spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminates annotational burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
• 7. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level languages. What is a heterogeneous computing environment? A heterogeneous computing environment is one where many different devices and address spaces must be managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminate annotational burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
• 8. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level languages. What is a heterogeneous computing environment? A heterogeneous computing environment is one where many different devices and address spaces must be managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminate annotational burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
  • 9. OpenCL //  Parallel  elementwise  sum __kernel  void  sum(__global  float*  a,  __global  float*  b,                                      __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  int*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  short*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  float*  a,  __global  double*  b,                                      __global  float*  dest)  {        #pragma  ...        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 10. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  short*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  float*  a,  __global  double*  b,                                      __global  float*  dest)  {        #pragma  ...        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 11. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  short*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  float*  a,  __global  double*  b,                                      __global  float*  dest)  {        #pragma  ...        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 12. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_di(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 13. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 14. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 15. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 16. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 17. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,   My photographs tell stories of loss, human struggle, and personal exploration                                          __global  float*  dest)  { within landscapes scarred by technology and over-use… [I] strive to metaphorically        size_t  gid  =  get_global_id(0);   and poetically link laborious actions, idiosyncratic rituals and strangely crude machines into tales about our modern experience.        dest[gid]  =  a[gid]  +  b[gid]; } Robert ParkeHarrison __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 18. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 19. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 20. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 21. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 22. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 23. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable These two libraries express the same thing.        size_t  gid  =  get_global_id(0);   The code will run in precisely the same amount of time.        dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 24. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): • Use any host language that has OpenCL        '''Parallel  elementwise  binary  operation.''' bindings available        gid  =  get_global_id(0)                  #  Get  thread  index •C        dest[gid]  =  op(a[gid],  b[gid]) • C++ • Fortran @cl.oquence.fn def  plus(a,  b): • MATLAB        '''Adds  the  two  operands.''' • Java return  a  +  b • .NET • Ruby @cl.oquence.fn def  mul(a,  b): • Python        '''Multiplies  the  two  operands.''' return  a  *  b #  Programmatically  specialize  and  assign  types  to   #  any  externally  callable  versions  you  need. sum  =  ew_op.specialize(op=plus) prod  =  ew_op.specialize(op=mul) g_int_p  =  cl_int.global_ptr g_float_p  =  cl_float.global_ptr sum_ff  =  sum.compile(g_float_p,  g_float_p,  g_float_p) sum_ii  =  sum.compile(g_int_p,  g_int_p,  g_int_p)
  • 25. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): • Use any host language that has OpenCL        '''Parallel  elementwise  binary  operation.''' bindings available        gid  =  get_global_id(0)                  #  Get  thread  index •C        dest[gid]  =  op(a[gid],  b[gid]) • C++ • Fortran @cl.oquence.fn def  plus(a,  b): • MATLAB        '''Adds  the  two  operands.''' • Java return  a  +  b • .NET • Ruby @cl.oquence.fn def  mul(a,  b): • Python        '''Multiplies  the  two  operands.''' return  a  *  b #  Programmatically  specialize  and  assign  types  to   clqcc  hello.clq #  any  externally  callable  versions  you  need. sum  =  ew_op.specialize(op=plus) creates hello.cl: prod  =  ew_op.specialize(op=mul) __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   g_int_p  =  cl_int.global_ptr                                          __global  float*  dest)  {        size_t  gid  =  get_global_id(0); g_float_p  =  cl_float.global_ptr        dest[gid]  =  a[gid]  +  b[gid]; } sum_ff  =  sum.compile(g_float_p,  g_float_p,  g_float_p) sum_ii  =  sum.compile(g_int_p,  g_int_p,  g_int_p) __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; }
  • 26. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): 2. Integrated into a host language        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' return  a  +  b @cl.oquence.fn def  mul(a,  b):        '''Multiplies  the  two  operands.''' return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding a  =  numpy.random.rand(50000).astype(numpy.float32) b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) #  invoke  function  (automatically  specialized  as  needed) ew_op(a_buf,  b_buf,  dest_buf,  plus,              global_size=a.shape,  local_size=(256,)).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 27. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): 2. Integrated into a host language        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) #  invoke  function  (automatically  specialized  as  needed) ew_op(a_buf,  b_buf,  dest_buf,  plus,              global_size=a.shape,  local_size=(256,)).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 28. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): 2. Integrated into a host language        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) Implicit queue associated with each context. a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) #  invoke  function  (automatically  specialized  as  needed) ew_op(a_buf,  b_buf,  dest_buf,  plus,              global_size=a.shape,  local_size=(256,)).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 29. Two invocation models @cl.oquence.auto(lambda  a,  b,  dest,  op:  a.shape,  (256,)) 1. Standalone compilation to OpenCL @cl.oquence.fn 2. Integrated into a host language def  ew_op(a,  b,  dest,  op):        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) Implicit queue associated with each context. a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) The auto annotation can allow you to hide the #  invoke  function  (automatically  specialized  as  needed) details of parallelization from the user. ew_op(a_buf,  b_buf,  dest_buf,  plus).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 30. Two invocation models @cl.oquence.auto(lambda  a,  b,  dest,  op:  a.shape,  (256,)) 1. Standalone compilation to OpenCL @cl.oquence.fn 2. Integrated into a host language def  ew_op(a,  b,  dest,  op):        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) c  =  numpy.empty_like(a) #  create  an  OpenCL  context Implicit queue associated with each context. ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) #  invoke  function  (automatically  specialized  as  needed) ew_op(In(a),  In(b),  Out(c),  plus).wait() The auto annotation can allow you to hide the details of parallelization from the user. #  check  results print  la.norm(c  -­‐  (a  +  b)) The In, Out and InOut constructs can help automate data movement when convenient.
  • 31. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable These two libraries express the same thing.        size_t  gid  =  get_global_id(0);   The code will run in precisely the same amount of time.        dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 32. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,   How?                                          __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable • cl.oquence.fn code looks like Python, but no!        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; • Same core type system as OpenCL (C99+) } • Type inference to eliminate type annotations (not dynamic lookups) ... ... 
• Extension inference to eliminate pragmas • Higher-order functions (inlined at compile-time) //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,   • Structural polymorphism                                      __global  float*  dest)  {                                            __global  float*  dest)  { • All functions are generic by default        size_t  gid  =  get_global_id(0);  //  Get  thread  index • You can call a function with any arguments        dest[gid]  =  a[gid]  *  b[gid]; that support the operations it uses. } • __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {