SlideShare ist ein Scribd-Unternehmen logo
1 von 19
Downloaden Sie, um offline zu lesen
CODE	
  VECTORIZATION	
  
 for	
  mobile	
  devices	
  

                    by	
  Dmitriy	
  Vovk	
  
Hardware	
  
•  Typical hardware found in modern mobile devices:
   –  ARMv7 architecture
   –  Cortex-A8, Cortex-A9, or custom cores (Krait, Swift)
   –  800–1500 MHz
   –  1–4 cores
   –  Thumb-2 instruction set
   –  VFPv3
   –  NEON, optional for Cortex-A9. Nvidia Tegra 2 has no NEON support
  
NEON	
  
•  NEON is a general-purpose SIMD engine designed by ARM for the ARM processor architecture
•  16 registers, each 128 bits wide. Supports operations on 8-, 16-, 32- and 64-bit integers and 32-bit float values
  
NEON	
  
•  NEON can be used for:
    –  Software geometry instancing;
    –  Skinning on ES 1.1;
    –  As a general vertex processor;
    –  Other typical applications for SIMD.
  
NEON	
  
•  Some unified shader architectures, like the popular Imagination Technologies USSE1 (PowerVR SGX 530-545), are scalar, while NEON is vector by nature. Move your vertex processing from the GPU to the CPU to speed up calculations*
•  ???????
•  PROFIT!!!111

•  *NOTE: That doesn't apply to USSE2 hardware
  
NEON	
  
•  The weakest side of mobile GPUs is fill rate. Fill rate is quickly killed by blending, and 2D games are heavy on this. The PowerVR USSE engine doesn't care what it processes – vertices or fragments. Moving your vertex processing to the CPU (NEON) will leave some room for fragment processing.
  
NEON	
  
•  There are 3 ways to use NEON vectorization in your code:
    1.  Intrinsics
    2.  Handwritten NEON assembly
    3.  Autovectorization by the compiler:
        -mllvm -vectorize -mllvm -bb-vectorize-aligned-only
        compiler flags for LLVM;
        -ftree-vectorizer-verbose=4 -mfpu=neon -funsafe-math-optimizations -ftree-vectorize
        for GCC
  
DEMO	
  
Measurements	
  
•  Intrinsics:	
  
Measurements	
  
•  Assembly	
  :	
  
Measurements	
  
•  Summary:

                            Running time, ms    CPU usage, %
        Intrinsics          2764                19
        Assembly            3664                20
        FPU                 6209                25-28
        FPU autovectorized  5028                22-24

•  Intrinsics got me a 25% speedup over assembly.
•  Note that the speed of intrinsics code varies from compiler to compiler.
  
NEON	
  
•  Intrinsics advantages over assembly:
   –  Higher-level code;
   –  No need to manage registers;
   –  You can vectorize basic blocks and build a solution to every new problem from these blocks. In contrast, with assembly you have to solve each new problem from scratch;
  
NEON	
  
•  Assembly advantages over intrinsics:
   –  Code generated from intrinsics varies from compiler to compiler and can give you a really big difference in speed. Assembly code will always be the same.
  
Code	
  
void	
  Update()	
  {	
  
	
  	
  	
  	
  GLKMatrix4	
  modelviewMat	
  =	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  1,	
  0,	
  0,	
  0,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  0,	
  1,	
  0,	
  0,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  0,	
  0,	
  1,	
  0,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  0,	
  0,	
  0,	
  1	
  };	
  
	
  
	
  	
  	
  	
  const	
  float	
  Y_DELTA	
  =	
  420.0f	
  /	
  QUADS_COUNT;	
  
	
  
	
  	
  	
  	
  for	
  (int	
  i	
  =	
  0;	
  i	
  <	
  QUADS_COUNT	
  *	
  VERTS_PER_QUAD;	
  i	
  +=	
  VERTS_PER_QUAD)	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  modelviewMat.m[12]	
  =	
  random()	
  %	
  260;	
  
	
  	
  	
  	
  	
  	
  	
  	
  modelviewMat.m[13]	
  =	
  Y_DELTA	
  ;	
  
#ifdef	
  ASM	
  
	
  	
  	
  	
  	
  	
  	
  	
  CalculateSpriteVertsWorldPos((float32x4x4_t*)proj.m,	
  (float32x4x4_t*)modelviewMat.m,	
  (float32x4_t*)&data[i	
  +	
  0].pos,	
  (float32x4_t*)&data[i	
  +	
  
1].pos,	
  (float32x4_t*)&data[i	
  +	
  2].pos,	
  (float32x4_t*)&data[i	
  +	
  3].pos);	
  
#else	
  
	
  	
  	
  	
  	
  	
  	
  	
  float32x4x4_t	
  modelviewProj;	
  
	
  	
  	
  	
  	
  	
  	
  	
  Matrix4ByMatrix4((float32x4x4_t*)proj.m,	
  (float32x4x4_t*)modelviewMat.m,	
  &modelviewProj);	
  
	
  	
  	
  
	
  	
  	
  	
  	
  	
  	
  	
  for	
  (int	
  j	
  =	
  0;	
  j	
  <	
  4;	
  ++j)	
  {	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  Matrix4ByVec4(&modelviewProj,	
  (float32x4_t*)&squareVerXces[j],	
  (float32x4_t*)&data[i	
  +	
  j].pos);	
  
	
  	
  	
  	
  	
  	
  	
  	
  }	
  
#endif	
  
	
  	
  	
  	
  }	
  
	
  	
  	
  	
  glBindBuffer(GL_ARRAY_BUFFER,	
  vertexBuffer);	
  
	
  	
  	
  	
  glBufferData(GL_ARRAY_BUFFER,	
  sizeof(data),	
  data,	
  GL_STREAM_DRAW);	
  
}	
  
Code	
  
__ajribute__((always_inline))	
  void	
  Matrix4ByVec4(const	
  
float32x4x4_t*	
  __restrict__	
  mat,	
  const	
  float32x4_t*	
  
__restrict__	
  vec,	
  float32x4_t*	
  __restrict__	
  result)	
  
{	
  
	
  	
  	
  	
  (*result)	
  =	
  vmulq_n_f32((*mat).val[0],	
  (*vec)[0]);	
  
	
  	
  	
  	
  	
  
	
  	
  	
  	
  (*result)	
  =	
  vmlaq_n_f32((*result),	
  (*mat).val[1],	
  (*vec)[1]);	
  
	
  	
  	
  	
  (*result)	
  =	
  vmlaq_n_f32((*result),	
  (*mat).val[2],	
  (*vec)[2]);	
  
	
  	
  	
  	
  (*result)	
  =	
  vmlaq_n_f32((*result),	
  (*mat).val[3],	
  (*vec)[3]);	
  
}	
  
Code	
  
__ajribute__((always_inline))	
  void	
  Matrix4ByMatrix4(const	
  float32x4x4_t*	
  __restrict__	
  m1,	
  const	
  float32x4x4_t*	
  __restrict__	
  m2,	
  
float32x4x4_t*	
  __restrict__	
  r)	
  
{	
  
#ifdef	
  INTRINSICS	
  
	
  	
  	
  	
  (*r).val[0]	
  =	
  vmulq_n_f32((*m1).val[0],	
  vgetq_lane_f32((*m2).val[0],	
  0));	
  
	
  	
  	
  	
  (*r).val[1]	
  =	
  vmulq_n_f32((*m1).val[0],	
  vgetq_lane_f32((*m2).val[1],	
  0));	
  
	
  	
  	
  	
  (*r).val[2]	
  =	
  vmulq_n_f32((*m1).val[0],	
  vgetq_lane_f32((*m2).val[2],	
  0));	
  
	
  	
  	
  	
  (*r).val[3]	
  =	
  vmulq_n_f32((*m1).val[0],	
  vgetq_lane_f32((*m2).val[3],	
  0));	
  
	
  	
  	
  	
  	
  
	
  	
  	
  	
  (*r).val[0]	
  =	
  vmlaq_n_f32((*r).val[0],	
  (*m1).val[1],	
  vgetq_lane_f32((*m2).val[0],	
  1));	
  
	
  	
  	
  	
  (*r).val[1]	
  =	
  vmlaq_n_f32((*r).val[1],	
  (*m1).val[1],	
  vgetq_lane_f32((*m2).val[1],	
  1));	
  
	
  	
  	
  	
  (*r).val[2]	
  =	
  vmlaq_n_f32((*r).val[2],	
  (*m1).val[1],	
  vgetq_lane_f32((*m2).val[2],	
  1));	
  
	
  	
  	
  	
  (*r).val[3]	
  =	
  vmlaq_n_f32((*r).val[3],	
  (*m1).val[1],	
  vgetq_lane_f32((*m2).val[3],	
  1));	
  
	
  	
  	
  	
  	
  
	
  	
  	
  	
  (*r).val[0]	
  =	
  vmlaq_n_f32((*r).val[0],	
  (*m1).val[2],	
  vgetq_lane_f32((*m2).val[0],	
  2));	
  
	
  	
  	
  	
  (*r).val[1]	
  =	
  vmlaq_n_f32((*r).val[1],	
  (*m1).val[2],	
  vgetq_lane_f32((*m2).val[1],	
  2));	
  
	
  	
  	
  	
  (*r).val[2]	
  =	
  vmlaq_n_f32((*r).val[2],	
  (*m1).val[2],	
  vgetq_lane_f32((*m2).val[2],	
  2));	
  
	
  	
  	
  	
  (*r).val[3]	
  =	
  vmlaq_n_f32((*r).val[3],	
  (*m1).val[2],	
  vgetq_lane_f32((*m2).val[3],	
  2));	
  
	
  	
  	
  	
  	
  
	
  	
  	
  	
  (*r).val[0]	
  =	
  vmlaq_n_f32((*r).val[0],	
  (*m1).val[3],	
  vgetq_lane_f32((*m2).val[0],	
  3));	
  
	
  	
  	
  	
  (*r).val[1]	
  =	
  vmlaq_n_f32((*r).val[1],	
  (*m1).val[3],	
  vgetq_lane_f32((*m2).val[1],	
  3));	
  
	
  	
  	
  	
  (*r).val[2]	
  =	
  vmlaq_n_f32((*r).val[2],	
  (*m1).val[3],	
  vgetq_lane_f32((*m2).val[2],	
  3));	
  
	
  	
  	
  	
  (*r).val[3]	
  =	
  vmlaq_n_f32((*r).val[3],	
  (*m1).val[3],	
  vgetq_lane_f32((*m2).val[3],	
  3));	
  
}	
  
Code	
  
	
  __asm__	
  volaXle	
                                               	
  	
  	
  	
  	
  "vmla.f32	
  q12,	
  q11,	
  d1[1]nt"	
          	
  	
  	
  	
  	
  "vmla.f32	
  q10,	
  q13,	
  d4[1]nt"	
  
	
  	
  	
  	
  (	
                                                    	
  	
  	
  	
  	
  "vmla.f32	
  q13,	
  q11,	
  d3[1]nt"	
          	
  	
  	
  	
  	
  "vmla.f32	
  q10,	
  q14,	
  d5[0]nt"	
  
	
  	
  	
  	
  	
  "vldmia	
  %6,	
  {	
  q0-­‐q3	
  }	
  nt"	
     	
  	
  	
  	
  	
  "vmla.f32	
  q14,	
  q11,	
  d5[1]nt"	
          	
  	
  	
  	
  	
  "vmla.f32	
  q10,	
  q15,	
  d5[1]nt"	
  
	
  	
  	
  	
  	
  "vldmia	
  %0,	
  {	
  q8-­‐q11	
  }nt"	
        	
  	
  	
  	
  	
  "vmla.f32	
  q15,	
  q11,	
  d7[1]nt"	
          	
  	
  	
  	
  	
  	
  
	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  "vmla.f32	
  q11,	
  q13,	
  d6[1]nt"	
  
	
  	
  	
  	
  	
  "vmul.f32	
  q12,	
  q8,	
  d0[0]nt"	
           	
  	
  	
  	
  	
  "vldmia	
  %1,	
  {	
  q0-­‐q3	
  }	
  nt"	
     	
  	
  	
  	
  	
  "vmla.f32	
  q11,	
  q14,	
  d7[0]nt"	
  
	
  	
  	
  	
  	
  "vmul.f32	
  q13,	
  q8,	
  d2[0]nt"	
           	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  "vmla.f32	
  q11,	
  q15,	
  d7[1]nt"	
  
	
  	
  	
  	
  	
  "vmul.f32	
  q14,	
  q8,	
  d4[0]nt"	
           	
  	
  	
  	
  	
  "vmul.f32	
  q8,	
  q12,	
  d0[0]nt"	
           	
  	
  	
  	
  	
  	
  
	
  	
  	
  	
  	
  "vmul.f32	
  q15,	
  q8,	
  d6[0]nt"	
           	
  	
  	
  	
  	
  "vmul.f32	
  q9,	
  q12,	
  d2[0]nt"	
           	
  	
  	
  	
  	
  "vstmia	
  %2,	
  {	
  q8	
  }nt"	
  
	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  "vmul.f32	
  q10,	
  q12,	
  d4[0]nt"	
          	
  	
  	
  	
  	
  "vstmia	
  %3,	
  {	
  q9	
  }nt"	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q12,	
  q9,	
  d0[1]nt"	
           	
  	
  	
  	
  	
  "vmul.f32	
  q11,	
  q12,	
  d6[0]nt"	
          	
  	
  	
  	
  	
  "vstmia	
  %4,	
  {	
  q10	
  }nt"	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q13,	
  q9,	
  d2[1]nt"	
           	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  "vstmia	
  %5,	
  {	
  q11	
  }"	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q14,	
  q9,	
  d4[1]nt"	
           	
  	
  	
  	
  	
  "vmla.f32	
  q8,	
  q13,	
  d0[1]nt"	
           	
  	
  	
  	
  	
  	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q15,	
  q9,	
  d6[1]nt"	
           	
  	
  	
  	
  	
  "vmla.f32	
  q8,	
  q14,	
  d1[0]nt"	
           	
  	
  	
  	
  	
  :	
  
	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  "vmla.f32	
  q8,	
  q15,	
  d1[1]nt"	
           	
  	
  	
  	
  	
  :	
  "r"	
  (proj),	
  "r"	
  (squareVerXces),	
  "r"	
  (v1),	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q12,	
  q10,	
  d1[0]nt"	
          	
  	
  	
  	
  	
  	
                                                 "r"	
  (v2),	
  "r"	
  (v3),	
  "r"	
  (v4),	
  "r"	
  (modelView)	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q13,	
  q10,	
  d3[0]nt"	
          	
  	
  	
  	
  	
  "vmla.f32	
  q9,	
  q13,	
  d2[1]nt"	
           	
  	
  	
  	
  	
  :	
  "memory",	
  "q0",	
  "q1",	
  "q2",	
  "q3",	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q14,	
  q10,	
  d5[0]nt"	
          	
  	
  	
  	
  	
  "vmla.f32	
  q9,	
  q14,	
  d3[0]nt"	
           "q8",	
  "q9",	
  "q10",	
  "q11",	
  "q12",	
  "q13",	
  
                                                                                                                                              "q14",	
  "q15"	
  
	
  	
  	
  	
  	
  "vmla.f32	
  q15,	
  q10,	
  d7[0]nt"	
          	
  	
  	
  	
  	
  "vmla.f32	
  q9,	
  q15,	
  d3[1]nt"	
  
                                                                                                                                              	
  	
  	
  	
  	
  );	
  
	
  	
  	
  	
  	
  	
                                                 	
  	
  	
  	
  	
  	
  
Docs	
  
•  For a detailed explanation of intrinsics and assembly see:
   http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491e/CIHJBEFE.html
  
Contact	
  me	
  
               	
  
               	
  
               	
  
http://www.linkedin.com/in/dvovk/
http://nukecode.blogspot.com/
  

Weitere ähnliche Inhalte

Was ist angesagt?

Exploiting the Linux Kernel via Intel's SYSRET Implementation
Exploiting the Linux Kernel via Intel's SYSRET ImplementationExploiting the Linux Kernel via Intel's SYSRET Implementation
Exploiting the Linux Kernel via Intel's SYSRET Implementationnkslides
 
SMP implementation for OpenBSD/sgi
SMP implementation for OpenBSD/sgiSMP implementation for OpenBSD/sgi
SMP implementation for OpenBSD/sgiTakuya ASADA
 
Embedded Systems Conference 2014 Presentation
Embedded Systems Conference 2014 PresentationEmbedded Systems Conference 2014 Presentation
Embedded Systems Conference 2014 PresentationManish Jaggi
 
Austin c-c++-meetup-feb2018-spectre
Austin c-c++-meetup-feb2018-spectreAustin c-c++-meetup-feb2018-spectre
Austin c-c++-meetup-feb2018-spectreKim Phillips
 
Vc4c development of opencl compiler for videocore4
Vc4c  development of opencl compiler for videocore4Vc4c  development of opencl compiler for videocore4
Vc4c development of opencl compiler for videocore4nomaddo
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpuMarco Parenzan
 
Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!
Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!
Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!Anne Nicolas
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321Teddy Hsiung
 
OpenGL 4.4 - Scene Rendering Techniques
OpenGL 4.4 - Scene Rendering TechniquesOpenGL 4.4 - Scene Rendering Techniques
OpenGL 4.4 - Scene Rendering TechniquesNarann29
 
App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...
App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...
App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...Cyber Security Alliance
 
from Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu Worksfrom Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu WorksZhen Wei
 
NSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch Protections
NSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch ProtectionsNSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch Protections
NSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch ProtectionsNoSuchCon
 
Killing any security product … using a Mimikatz undocumented feature
Killing any security product … using a Mimikatz undocumented featureKilling any security product … using a Mimikatz undocumented feature
Killing any security product … using a Mimikatz undocumented featureCyber Security Alliance
 
Static analysis of C++ source code
Static analysis of C++ source codeStatic analysis of C++ source code
Static analysis of C++ source codeAndrey Karpov
 
Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...
Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...
Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...DefconRussia
 
Zn task - defcon russia 20
Zn task  - defcon russia 20Zn task  - defcon russia 20
Zn task - defcon russia 20DefconRussia
 
深層学習フレームワークにおけるIntel CPU/富岳向け最適化法
深層学習フレームワークにおけるIntel CPU/富岳向け最適化法深層学習フレームワークにおけるIntel CPU/富岳向け最適化法
深層学習フレームワークにおけるIntel CPU/富岳向け最適化法MITSUNARI Shigeo
 
第11回 配信講義 計算科学技術特論A(2021)
第11回 配信講義 計算科学技術特論A(2021)第11回 配信講義 計算科学技術特論A(2021)
第11回 配信講義 計算科学技術特論A(2021)RCCSRENKEI
 

Was ist angesagt? (20)

Exploiting the Linux Kernel via Intel's SYSRET Implementation
Exploiting the Linux Kernel via Intel's SYSRET ImplementationExploiting the Linux Kernel via Intel's SYSRET Implementation
Exploiting the Linux Kernel via Intel's SYSRET Implementation
 
SMP implementation for OpenBSD/sgi
SMP implementation for OpenBSD/sgiSMP implementation for OpenBSD/sgi
SMP implementation for OpenBSD/sgi
 
Embedded Systems Conference 2014 Presentation
Embedded Systems Conference 2014 PresentationEmbedded Systems Conference 2014 Presentation
Embedded Systems Conference 2014 Presentation
 
Austin c-c++-meetup-feb2018-spectre
Austin c-c++-meetup-feb2018-spectreAustin c-c++-meetup-feb2018-spectre
Austin c-c++-meetup-feb2018-spectre
 
Vc4c development of opencl compiler for videocore4
Vc4c  development of opencl compiler for videocore4Vc4c  development of opencl compiler for videocore4
Vc4c development of opencl compiler for videocore4
 
2011.02.18 marco parenzan - modelli di programmazione per le gpu
2011.02.18   marco parenzan - modelli di programmazione per le gpu2011.02.18   marco parenzan - modelli di programmazione per le gpu
2011.02.18 marco parenzan - modelli di programmazione per le gpu
 
Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!
Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!
Kernel Recipes 2014 - Writing Code: Keep It Short, Stupid!
 
AA-sort with SSE4.1
AA-sort with SSE4.1AA-sort with SSE4.1
AA-sort with SSE4.1
 
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
 
OpenGL 4.4 - Scene Rendering Techniques
OpenGL 4.4 - Scene Rendering TechniquesOpenGL 4.4 - Scene Rendering Techniques
OpenGL 4.4 - Scene Rendering Techniques
 
App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...
App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...
App secforum2014 andrivet-cplusplus11-metaprogramming_applied_to_software_obf...
 
from Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu Worksfrom Binary to Binary: How Qemu Works
from Binary to Binary: How Qemu Works
 
NSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch Protections
NSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch ProtectionsNSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch Protections
NSC #2 - D2 01 - Andrea Allievi - Windows 8.1 Patch Protections
 
Killing any security product … using a Mimikatz undocumented feature
Killing any security product … using a Mimikatz undocumented featureKilling any security product … using a Mimikatz undocumented feature
Killing any security product … using a Mimikatz undocumented feature
 
Static analysis of C++ source code
Static analysis of C++ source codeStatic analysis of C++ source code
Static analysis of C++ source code
 
Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...
Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...
Mateusz 'j00ru' Jurczyk - Windows Kernel Trap Handler and NTVDM Vulnerabiliti...
 
Zn task - defcon russia 20
Zn task  - defcon russia 20Zn task  - defcon russia 20
Zn task - defcon russia 20
 
深層学習フレームワークにおけるIntel CPU/富岳向け最適化法
深層学習フレームワークにおけるIntel CPU/富岳向け最適化法深層学習フレームワークにおけるIntel CPU/富岳向け最適化法
深層学習フレームワークにおけるIntel CPU/富岳向け最適化法
 
第11回 配信講義 計算科学技術特論A(2021)
第11回 配信講義 計算科学技術特論A(2021)第11回 配信講義 計算科学技術特論A(2021)
第11回 配信講義 計算科学技術特論A(2021)
 
Qemu JIT Code Generator and System Emulation
Qemu JIT Code Generator and System EmulationQemu JIT Code Generator and System Emulation
Qemu JIT Code Generator and System Emulation
 

Ähnlich wie Дмитрий Вовк: Векторизация кода под мобильные платформы

Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devicesSt1X
 
NIR on the Mesa i965 backend (FOSDEM 2016)
NIR on the Mesa i965 backend (FOSDEM 2016)NIR on the Mesa i965 backend (FOSDEM 2016)
NIR on the Mesa i965 backend (FOSDEM 2016)Igalia
 
BlueHat v18 || A mitigation for kernel toctou vulnerabilities
BlueHat v18 || A mitigation for kernel toctou vulnerabilitiesBlueHat v18 || A mitigation for kernel toctou vulnerabilities
BlueHat v18 || A mitigation for kernel toctou vulnerabilitiesBlueHat Security Conference
 
SAST and Application Security: how to fight vulnerabilities in the code
SAST and Application Security: how to fight vulnerabilities in the codeSAST and Application Security: how to fight vulnerabilities in the code
SAST and Application Security: how to fight vulnerabilities in the codeAndrey Karpov
 
Java Jit. Compilation and optimization by Andrey Kovalenko
Java Jit. Compilation and optimization by Andrey KovalenkoJava Jit. Compilation and optimization by Andrey Kovalenko
Java Jit. Compilation and optimization by Andrey KovalenkoValeriia Maliarenko
 
Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)Alexander Dolbilov
 
Digging for Android Kernel Bugs
Digging for Android Kernel BugsDigging for Android Kernel Bugs
Digging for Android Kernel BugsJiahong Fang
 
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...Positive Hack Days
 
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...Andrey Karpov
 
[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹
[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹
[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹GangSeok Lee
 
How Triton can help to reverse virtual machine based software protections
How Triton can help to reverse virtual machine based software protectionsHow Triton can help to reverse virtual machine based software protections
How Triton can help to reverse virtual machine based software protectionsJonathan Salwan
 
Android NDK and the x86 Platform
Android NDK and the x86 PlatformAndroid NDK and the x86 Platform
Android NDK and the x86 PlatformSebastian Mauer
 
Instruction Combine in LLVM
Instruction Combine in LLVMInstruction Combine in LLVM
Instruction Combine in LLVMWang Hsiangkai
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computingArka Ghosh
 

Ähnlich wie Дмитрий Вовк: Векторизация кода под мобильные платформы (20)

Code vectorization for mobile devices
Code vectorization for mobile devicesCode vectorization for mobile devices
Code vectorization for mobile devices
 
NIR on the Mesa i965 backend (FOSDEM 2016)
NIR on the Mesa i965 backend (FOSDEM 2016)NIR on the Mesa i965 backend (FOSDEM 2016)
NIR on the Mesa i965 backend (FOSDEM 2016)
 
Vectorization in ATLAS
Vectorization in ATLASVectorization in ATLAS
Vectorization in ATLAS
 
Andes open cl for RISC-V
Andes open cl for RISC-VAndes open cl for RISC-V
Andes open cl for RISC-V
 
BlueHat v18 || A mitigation for kernel toctou vulnerabilities
BlueHat v18 || A mitigation for kernel toctou vulnerabilitiesBlueHat v18 || A mitigation for kernel toctou vulnerabilities
BlueHat v18 || A mitigation for kernel toctou vulnerabilities
 
ADCSS 2022
ADCSS 2022ADCSS 2022
ADCSS 2022
 
SAST and Application Security: how to fight vulnerabilities in the code
SAST and Application Security: how to fight vulnerabilities in the codeSAST and Application Security: how to fight vulnerabilities in the code
SAST and Application Security: how to fight vulnerabilities in the code
 
Java Jit. Compilation and optimization by Andrey Kovalenko
Java Jit. Compilation and optimization by Andrey KovalenkoJava Jit. Compilation and optimization by Andrey Kovalenko
Java Jit. Compilation and optimization by Andrey Kovalenko
 
Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)Optimizing unity games (Google IO 2014)
Optimizing unity games (Google IO 2014)
 
Digging for Android Kernel Bugs
Digging for Android Kernel BugsDigging for Android Kernel Bugs
Digging for Android Kernel Bugs
 
Fedor Polyakov - Optimizing computer vision problems on mobile platforms
Fedor Polyakov - Optimizing computer vision problems on mobile platforms Fedor Polyakov - Optimizing computer vision problems on mobile platforms
Fedor Polyakov - Optimizing computer vision problems on mobile platforms
 
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
The System of Automatic Searching for Vulnerabilities or how to use Taint Ana...
 
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
PVS-Studio 5.00, a solution for developers of modern resource-intensive appl...
 
[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹
[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹
[2007 CodeEngn Conference 01] dual5651 - Windows 커널단의 후킹
 
How Triton can help to reverse virtual machine based software protections
How Triton can help to reverse virtual machine based software protectionsHow Triton can help to reverse virtual machine based software protections
How Triton can help to reverse virtual machine based software protections
 
Android NDK and the x86 Platform
Android NDK and the x86 PlatformAndroid NDK and the x86 Platform
Android NDK and the x86 Platform
 
RISC V in Spacer
RISC V in SpacerRISC V in Spacer
RISC V in Spacer
 
Instruction Combine in LLVM
Instruction Combine in LLVMInstruction Combine in LLVM
Instruction Combine in LLVM
 
UVM TUTORIAL;
UVM TUTORIAL;UVM TUTORIAL;
UVM TUTORIAL;
 
Vpu technology &gpgpu computing
Vpu technology &gpgpu computingVpu technology &gpgpu computing
Vpu technology &gpgpu computing
 

Mehr von DevGAMM Conference

The art of small steps, or how to make sound for games in conditions of war /...
The art of small steps, or how to make sound for games in conditions of war /...The art of small steps, or how to make sound for games in conditions of war /...
The art of small steps, or how to make sound for games in conditions of war /...DevGAMM Conference
 
Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...
Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...
Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...DevGAMM Conference
 
How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...
How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...
How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...DevGAMM Conference
 
Why indie developers should consider hyper-casual right now / Igor Gurenyov (...
Why indie developers should consider hyper-casual right now / Igor Gurenyov (...Why indie developers should consider hyper-casual right now / Igor Gurenyov (...
Why indie developers should consider hyper-casual right now / Igor Gurenyov (...DevGAMM Conference
 
AI / ML for Indies / Tyler Coleman (Retora Games)
AI / ML for Indies / Tyler Coleman (Retora Games)AI / ML for Indies / Tyler Coleman (Retora Games)
AI / ML for Indies / Tyler Coleman (Retora Games)DevGAMM Conference
 
Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...
Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...
Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...DevGAMM Conference
 
New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...
New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...
New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...DevGAMM Conference
 
Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...
Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...
Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...DevGAMM Conference
 
Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...
Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...
Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...DevGAMM Conference
 
From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)
From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)
From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)DevGAMM Conference
 
Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)
Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)
Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)DevGAMM Conference
 
Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...
Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...
Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...DevGAMM Conference
 
How to increase wishlists & game sales from China? Growth marketing tactics &...
How to increase wishlists & game sales from China? Growth marketing tactics &...How to increase wishlists & game sales from China? Growth marketing tactics &...
How to increase wishlists & game sales from China? Growth marketing tactics &...DevGAMM Conference
 
Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)
Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)
Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)DevGAMM Conference
 
Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...
Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...
Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...DevGAMM Conference
 
Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...
Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...
Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...DevGAMM Conference
 
The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...
The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...
The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...DevGAMM Conference
 
Branded Content: How to overcome players' immunity to advertising / Alex Brod...
Branded Content: How to overcome players' immunity to advertising / Alex Brod...Branded Content: How to overcome players' immunity to advertising / Alex Brod...
Branded Content: How to overcome players' immunity to advertising / Alex Brod...DevGAMM Conference
 
Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...
Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...
Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...DevGAMM Conference
 
How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...
How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...
How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...DevGAMM Conference
 

Mehr von DevGAMM Conference (20)

The art of small steps, or how to make sound for games in conditions of war /...
The art of small steps, or how to make sound for games in conditions of war /...The art of small steps, or how to make sound for games in conditions of war /...
The art of small steps, or how to make sound for games in conditions of war /...
 
Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...
Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...
Breaking up with FMOD - Why we ended things and embraced Metasounds / Daniel ...
 
How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...
How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...
How Audio Objects Improve Spatial Accuracy / Mads Maretty Sønderup (Audiokine...
 
Why indie developers should consider hyper-casual right now / Igor Gurenyov (...
Why indie developers should consider hyper-casual right now / Igor Gurenyov (...Why indie developers should consider hyper-casual right now / Igor Gurenyov (...
Why indie developers should consider hyper-casual right now / Igor Gurenyov (...
 
AI / ML for Indies / Tyler Coleman (Retora Games)
AI / ML for Indies / Tyler Coleman (Retora Games)AI / ML for Indies / Tyler Coleman (Retora Games)
AI / ML for Indies / Tyler Coleman (Retora Games)
 
Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...
Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...
Agility is the Key: Power Up Your GameDev Project Management with Agile Pract...
 
New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...
New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...
New PR Tech and AI Tools for 2023: A Game Changer for Outreach / Kirill Perev...
 
Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...
Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...
Playable Ads - Revolutionizing mobile games advertising / Jakub Kukuryk (Popc...
 
Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...
Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...
Creative Collaboration: Managing an Art Team / Nastassia Radzivonava (Glera G...
 
From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)
From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)
From Local to Global: Unleashing the Power of Payments / Jan Kuhlmannn (Xsolla)
 
Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)
Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)
Strategies and case studies to grow LTV in 2023 / Julia Iljuk (Balancy)
 
Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...
Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...
Why is ASO not working in 2023 and how to change it? / Olena Vedmedenko (Keya...
 
How to increase wishlists & game sales from China? Growth marketing tactics &...
How to increase wishlists & game sales from China? Growth marketing tactics &...How to increase wishlists & game sales from China? Growth marketing tactics &...
How to increase wishlists & game sales from China? Growth marketing tactics &...
 
Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)
Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)
Turkish Gaming Industry and HR Insights / Mustafa Mert EFE (Zindhu)
 
Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...
Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...
Building an Awesome Creative Team from Scratch, Capable of Scaling Up / Sasha...
 
Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...
Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...
Seven Reasons Why Your LiveOps Is Not Performing / Alexander Devyaterikov (Be...
 
The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...
The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...
The Power of Game and Music Collaborations: Reaching and Engaging the Masses ...
 
Branded Content: How to overcome players' immunity to advertising / Alex Brod...
Branded Content: How to overcome players' immunity to advertising / Alex Brod...Branded Content: How to overcome players' immunity to advertising / Alex Brod...
Branded Content: How to overcome players' immunity to advertising / Alex Brod...
 
Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...
Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...
Resurrecting Chasm: The Rift - A Source-less Remastering Journey / Gennadii P...
 
How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...
How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...
How NOT to do showcase events: Behind the scenes of Midnight Show / Andrew Ko...
 

Дмитрий Вовк: Векторизация кода под мобильные платформы

  • 1. CODE  VECTORIZATION   for  mobile  devices   by  Dmitriy  Vovk  
  • 2. Hardware • Typical hardware found in modern mobile devices: – ARMv7 architecture – Cortex A8 / Cortex A9 / Custom cores (Krait, Swift) – 800 – 1500 MHz – 1-4 cores – Thumb-2 instruction set – VFPv3 – NEON, optional for Cortex A9. Nvidia Tegra 2 has no NEON support
  • 3. NEON • NEON is a general purpose SIMD engine designed by ARM for the ARM processor architecture • 16 registers, 128 bits wide each. Supports operations on 8-, 16-, 32- and 64-bit integers and 32-bit float values
  • 4. NEON • NEON can be used for: – Software geometry instancing; – Skinning on ES 1.1; – As a general vertex processor; – Other, typical, applications for SIMD.
  • 5. NEON • Some unified shader architectures, like the popular Imagination Technologies USSE1 (PowerVR SGX 530-545), are scalar; NEON is vector by nature. Move your vertex processing from the GPU to the CPU to speed up calculations* • ??????? • PROFIT!!!111 • *NOTE. That doesn’t apply to USSE2 hardware
  • 6. NEON • The weakest side of mobile GPUs is fill rate. Fill rate is quickly killed by blending. 2D games are heavy on this. The PowerVR USSE engine doesn’t care what it is doing – vertex or fragment processing. Moving your vertex processing to the CPU (NEON) will leave some room for fragment processing.
  • 7. NEON • There are 3 ways to use NEON vectorization in your code: 1. Intrinsics 2. Handwritten NEON assembly 3. Autovectorization by the compiler: -mllvm -vectorize -mllvm -bb-vectorize-aligned-only compiler flags for LLVM; -ftree-vectorizer-verbose=4 -mfpu=neon -funsafe-math-optimizations -ftree-vectorize for GCC
  • 11. Measurements • Summary (running time, ms / CPU usage, %): Intrinsics 2764 / 19; Assembly 3664 / 20; FPU 6209 / 25-28; FPU autovectorized 5028 / 22-24 • Intrinsics got me a 25% speedup over assembly. • Note that the speed of intrinsics code varies from compiler to compiler.
  • 12. NEON • Intrinsics advantages over assembly: – Higher level code; – No need to manage registers; – You can vectorize basic blocks and build a solution to every new problem with these blocks. In contrast, with assembly you have to solve each new problem from scratch;
  • 13. NEON • Assembly advantages over intrinsics: – Code generated from intrinsics varies from compiler to compiler and can give you a really big difference in speed. Assembly code will always be the same.
  • 14. Code   void  Update()  {          GLKMatrix4  modelviewMat  =  {                                  1,  0,  0,  0,                                  0,  1,  0,  0,                                  0,  0,  1,  0,                                  0,  0,  0,  1  };            const  float  Y_DELTA  =  420.0f  /  QUADS_COUNT;            for  (int  i  =  0;  i  <  QUADS_COUNT  *  VERTS_PER_QUAD;  i  +=  VERTS_PER_QUAD)  {                  modelviewMat.m[12]  =  random()  %  260;                  modelviewMat.m[13]  =  Y_DELTA  ;   #ifdef  ASM                  CalculateSpriteVertsWorldPos((float32x4x4_t*)proj.m,  (float32x4x4_t*)modelviewMat.m,  (float32x4_t*)&data[i  +  0].pos,  (float32x4_t*)&data[i  +   1].pos,  (float32x4_t*)&data[i  +  2].pos,  (float32x4_t*)&data[i  +  3].pos);   #else                  float32x4x4_t  modelviewProj;                  Matrix4ByMatrix4((float32x4x4_t*)proj.m,  (float32x4x4_t*)modelviewMat.m,  &modelviewProj);                        for  (int  j  =  0;  j  <  4;  ++j)  {                          Matrix4ByVec4(&modelviewProj,  (float32x4_t*)&squareVerXces[j],  (float32x4_t*)&data[i  +  j].pos);                  }   #endif          }          glBindBuffer(GL_ARRAY_BUFFER,  vertexBuffer);          glBufferData(GL_ARRAY_BUFFER,  sizeof(data),  data,  GL_STREAM_DRAW);   }  
  • 15. Code __attribute__((always_inline)) void Matrix4ByVec4(const float32x4x4_t* __restrict__ mat, const float32x4_t* __restrict__ vec, float32x4_t* __restrict__ result) { (*result) = vmulq_n_f32((*mat).val[0], (*vec)[0]); (*result) = vmlaq_n_f32((*result), (*mat).val[1], (*vec)[1]); (*result) = vmlaq_n_f32((*result), (*mat).val[2], (*vec)[2]); (*result) = vmlaq_n_f32((*result), (*mat).val[3], (*vec)[3]); }
  • 16. Code   __ajribute__((always_inline))  void  Matrix4ByMatrix4(const  float32x4x4_t*  __restrict__  m1,  const  float32x4x4_t*  __restrict__  m2,   float32x4x4_t*  __restrict__  r)   {   #ifdef  INTRINSICS          (*r).val[0]  =  vmulq_n_f32((*m1).val[0],  vgetq_lane_f32((*m2).val[0],  0));          (*r).val[1]  =  vmulq_n_f32((*m1).val[0],  vgetq_lane_f32((*m2).val[1],  0));          (*r).val[2]  =  vmulq_n_f32((*m1).val[0],  vgetq_lane_f32((*m2).val[2],  0));          (*r).val[3]  =  vmulq_n_f32((*m1).val[0],  vgetq_lane_f32((*m2).val[3],  0));                    (*r).val[0]  =  vmlaq_n_f32((*r).val[0],  (*m1).val[1],  vgetq_lane_f32((*m2).val[0],  1));          (*r).val[1]  =  vmlaq_n_f32((*r).val[1],  (*m1).val[1],  vgetq_lane_f32((*m2).val[1],  1));          (*r).val[2]  =  vmlaq_n_f32((*r).val[2],  (*m1).val[1],  vgetq_lane_f32((*m2).val[2],  1));          (*r).val[3]  =  vmlaq_n_f32((*r).val[3],  (*m1).val[1],  vgetq_lane_f32((*m2).val[3],  1));                    (*r).val[0]  =  vmlaq_n_f32((*r).val[0],  (*m1).val[2],  vgetq_lane_f32((*m2).val[0],  2));          (*r).val[1]  =  vmlaq_n_f32((*r).val[1],  (*m1).val[2],  vgetq_lane_f32((*m2).val[1],  2));          (*r).val[2]  =  vmlaq_n_f32((*r).val[2],  (*m1).val[2],  vgetq_lane_f32((*m2).val[2],  2));          (*r).val[3]  =  vmlaq_n_f32((*r).val[3],  (*m1).val[2],  vgetq_lane_f32((*m2).val[3],  2));                    (*r).val[0]  =  vmlaq_n_f32((*r).val[0],  (*m1).val[3],  vgetq_lane_f32((*m2).val[0],  3));          (*r).val[1]  =  vmlaq_n_f32((*r).val[1],  (*m1).val[3],  vgetq_lane_f32((*m2).val[1],  3));          (*r).val[2]  =  vmlaq_n_f32((*r).val[2],  (*m1).val[3],  vgetq_lane_f32((*m2).val[2],  3));          (*r).val[3]  =  vmlaq_n_f32((*r).val[3],  (*m1).val[3],  vgetq_lane_f32((*m2).val[3],  3));   }  
  • 17. Code    __asm__  volaXle            "vmla.f32  q12,  q11,  d1[1]nt"            "vmla.f32  q10,  q13,  d4[1]nt"          (            "vmla.f32  q13,  q11,  d3[1]nt"            "vmla.f32  q10,  q14,  d5[0]nt"            "vldmia  %6,  {  q0-­‐q3  }  nt"            "vmla.f32  q14,  q11,  d5[1]nt"            "vmla.f32  q10,  q15,  d5[1]nt"            "vldmia  %0,  {  q8-­‐q11  }nt"            "vmla.f32  q15,  q11,  d7[1]nt"                                                "vmla.f32  q11,  q13,  d6[1]nt"            "vmul.f32  q12,  q8,  d0[0]nt"            "vldmia  %1,  {  q0-­‐q3  }  nt"            "vmla.f32  q11,  q14,  d7[0]nt"            "vmul.f32  q13,  q8,  d2[0]nt"                        "vmla.f32  q11,  q15,  d7[1]nt"            "vmul.f32  q14,  q8,  d4[0]nt"            "vmul.f32  q8,  q12,  d0[0]nt"                        "vmul.f32  q15,  q8,  d6[0]nt"            "vmul.f32  q9,  q12,  d2[0]nt"            "vstmia  %2,  {  q8  }nt"                        "vmul.f32  q10,  q12,  d4[0]nt"            "vstmia  %3,  {  q9  }nt"            "vmla.f32  q12,  q9,  d0[1]nt"            "vmul.f32  q11,  q12,  d6[0]nt"            "vstmia  %4,  {  q10  }nt"            "vmla.f32  q13,  q9,  d2[1]nt"                        "vstmia  %5,  {  q11  }"            "vmla.f32  q14,  q9,  d4[1]nt"            "vmla.f32  q8,  q13,  d0[1]nt"                        "vmla.f32  q15,  q9,  d6[1]nt"            "vmla.f32  q8,  q14,  d1[0]nt"            :                        "vmla.f32  q8,  q15,  d1[1]nt"            :  "r"  (proj),  "r"  (squareVerXces),  "r"  (v1),            "vmla.f32  q12,  q10,  d1[0]nt"               "r"  (v2),  "r"  (v3),  "r"  (v4),  "r"  (modelView)            "vmla.f32  q13,  q10,  d3[0]nt"            "vmla.f32  q9,  q13,  d2[1]nt"            :  "memory",  "q0",  "q1",  "q2",  "q3",            "vmla.f32  q14,  q10,  d5[0]nt"            "vmla.f32  q9,  q14,  d3[0]nt"   "q8",  "q9",  "q10",  "q11",  "q12",  "q13",   "q14",  "q15"            "vmla.f32  q15,  q10,  
d7[0]nt"            "vmla.f32  q9,  q15,  d3[1]nt"            );                          
  • 18. Docs • For a detailed explanation of intrinsics/assembly see: http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491e/CIHJBEFE.html
  • 19. Contact me http://www.linkedin.com/in/dvovk/ http://nukecode.blogspot.com/